diff --git a/.agents/skills/openclaw-qa-testing/SKILL.md b/.agents/skills/openclaw-qa-testing/SKILL.md index d0201975c56..bf006793641 100644 --- a/.agents/skills/openclaw-qa-testing/SKILL.md +++ b/.agents/skills/openclaw-qa-testing/SKILL.md @@ -57,12 +57,11 @@ Use `qa character-eval` for style/persona/vibe checks across multiple live model pnpm openclaw qa character-eval \ --model openai/gpt-5.4,thinking=xhigh \ --model openai/gpt-5.2,thinking=xhigh \ + --model openai/gpt-5,thinking=xhigh \ --model anthropic/claude-opus-4-6,thinking=high \ --model anthropic/claude-sonnet-4-6,thinking=high \ - --model minimax/MiniMax-M2.7,thinking=high \ --model zai/glm-5.1,thinking=high \ --model moonshot/kimi-k2.5,thinking=high \ - --model qwen/qwen3.5-plus,thinking=high \ --model google/gemini-3.1-pro-preview,thinking=high \ --judge-model openai/gpt-5.4,thinking=xhigh,fast \ --judge-model anthropic/claude-opus-4-6,thinking=high \ @@ -74,7 +73,7 @@ pnpm openclaw qa character-eval \ - Runs local QA gateway child processes, not Docker. - Preferred model spec syntax is `provider/model,thinking=[,fast|,no-fast|,fast=]` for both `--model` and `--judge-model`. - Do not add new examples with separate `--model-thinking`; keep that flag as legacy compatibility only. -- Defaults to candidate models `openai/gpt-5.4`, `openai/gpt-5.2`, `anthropic/claude-opus-4-6`, `anthropic/claude-sonnet-4-6`, `minimax/MiniMax-M2.7`, `zai/glm-5.1`, `moonshot/kimi-k2.5`, `qwen/qwen3.5-plus`, and `google/gemini-3.1-pro-preview` when no `--model` is passed. +- Defaults to candidate models `openai/gpt-5.4`, `openai/gpt-5.2`, `openai/gpt-5`, `anthropic/claude-opus-4-6`, `anthropic/claude-sonnet-4-6`, `zai/glm-5.1`, `moonshot/kimi-k2.5`, and `google/gemini-3.1-pro-preview` when no `--model` is passed. - Candidate thinking defaults to `high`, with `xhigh` for OpenAI models that support it. Prefer inline `--model provider/model,thinking=`; `--thinking ` and `--model-thinking ` remain compatibility shims. - OpenAI candidate refs default to fast mode so priority processing is used where supported. Use inline `,fast`, `,no-fast`, or `,fast=false` for one model; use `--fast` only to force fast mode for every candidate. - Judges default to `openai/gpt-5.4,thinking=xhigh,fast` and `anthropic/claude-opus-4-6,thinking=high`. diff --git a/docs/concepts/qa-e2e-automation.md b/docs/concepts/qa-e2e-automation.md index 27eca3a5ccd..becd26cd1d6 100644 --- a/docs/concepts/qa-e2e-automation.md +++ b/docs/concepts/qa-e2e-automation.md @@ -89,12 +89,11 @@ refs and write a judged Markdown report: pnpm openclaw qa character-eval \ --model openai/gpt-5.4,thinking=xhigh \ --model openai/gpt-5.2,thinking=xhigh \ + --model openai/gpt-5,thinking=xhigh \ --model anthropic/claude-opus-4-6,thinking=high \ --model anthropic/claude-sonnet-4-6,thinking=high \ - --model minimax/MiniMax-M2.7,thinking=high \ --model zai/glm-5.1,thinking=high \ --model moonshot/kimi-k2.5,thinking=high \ - --model qwen/qwen3.5-plus,thinking=high \ --model google/gemini-3.1-pro-preview,thinking=high \ --judge-model openai/gpt-5.4,thinking=xhigh,fast \ --judge-model anthropic/claude-opus-4-6,thinking=high \ @@ -128,9 +127,9 @@ Candidate and judge model runs both default to concurrency 16. Lower `--concurrency` or `--judge-concurrency` when provider limits or local gateway pressure make a run too noisy. When no candidate `--model` is passed, the character eval defaults to -`openai/gpt-5.4`, `openai/gpt-5.2`, `anthropic/claude-opus-4-6`, -`anthropic/claude-sonnet-4-6`, `minimax/MiniMax-M2.7`, `zai/glm-5.1`, -`moonshot/kimi-k2.5`, `qwen/qwen3.5-plus`, and +`openai/gpt-5.4`, `openai/gpt-5.2`, `openai/gpt-5`, `anthropic/claude-opus-4-6`, +`anthropic/claude-sonnet-4-6`, `zai/glm-5.1`, +`moonshot/kimi-k2.5`, and `google/gemini-3.1-pro-preview` when no `--model` is passed. When no `--judge-model` is passed, the judges default to `openai/gpt-5.4,thinking=xhigh,fast` and diff --git a/extensions/qa-lab/src/character-eval.test.ts b/extensions/qa-lab/src/character-eval.test.ts index eaebb06b004..259b53d89ad 100644 --- a/extensions/qa-lab/src/character-eval.test.ts +++ b/extensions/qa-lab/src/character-eval.test.ts @@ -185,13 +185,12 @@ describe("runQaCharacterEval", () => { rankings: [ { model: "openai/gpt-5.4", rank: 1, score: 8, summary: "ok" }, { model: "openai/gpt-5.2", rank: 2, score: 7.5, summary: "ok" }, - { model: "anthropic/claude-opus-4-6", rank: 3, score: 7, summary: "ok" }, - { model: "anthropic/claude-sonnet-4-6", rank: 4, score: 6.8, summary: "ok" }, - { model: "minimax/MiniMax-M2.7", rank: 5, score: 6.5, summary: "ok" }, + { model: "openai/gpt-5", rank: 3, score: 7.2, summary: "ok" }, + { model: "anthropic/claude-opus-4-6", rank: 4, score: 7, summary: "ok" }, + { model: "anthropic/claude-sonnet-4-6", rank: 5, score: 6.8, summary: "ok" }, { model: "zai/glm-5.1", rank: 6, score: 6.3, summary: "ok" }, { model: "moonshot/kimi-k2.5", rank: 7, score: 6.2, summary: "ok" }, - { model: "qwen/qwen3.5-plus", rank: 8, score: 6.1, summary: "ok" }, - { model: "google/gemini-3.1-pro-preview", rank: 9, score: 6, summary: "ok" }, + { model: "google/gemini-3.1-pro-preview", rank: 8, score: 6, summary: "ok" }, ], }), ); @@ -204,23 +203,21 @@ describe("runQaCharacterEval", () => { runJudge, }); - expect(runSuite).toHaveBeenCalledTimes(9); + expect(runSuite).toHaveBeenCalledTimes(8); expect(runSuite.mock.calls.map(([params]) => params.primaryModel)).toEqual([ "openai/gpt-5.4", "openai/gpt-5.2", + "openai/gpt-5", "anthropic/claude-opus-4-6", "anthropic/claude-sonnet-4-6", - "minimax/MiniMax-M2.7", "zai/glm-5.1", "moonshot/kimi-k2.5", - "qwen/qwen3.5-plus", "google/gemini-3.1-pro-preview", ]); expect(runSuite.mock.calls.map(([params]) => params.thinkingDefault)).toEqual([ "xhigh", "xhigh", - "high", - "high", + "xhigh", "high", "high", "high", @@ -230,8 +227,7 @@ describe("runQaCharacterEval", () => { expect(runSuite.mock.calls.map(([params]) => params.fastMode)).toEqual([ true, true, - false, - false, + true, false, false, false, diff --git a/extensions/qa-lab/src/character-eval.ts b/extensions/qa-lab/src/character-eval.ts index 384a3eeed07..8fb5fe63dfa 100644 --- a/extensions/qa-lab/src/character-eval.ts +++ b/extensions/qa-lab/src/character-eval.ts @@ -10,12 +10,11 @@ const DEFAULT_CHARACTER_SCENARIO_ID = "character-vibes-gollum"; const DEFAULT_CHARACTER_EVAL_MODELS = Object.freeze([ "openai/gpt-5.4", "openai/gpt-5.2", + "openai/gpt-5", "anthropic/claude-opus-4-6", "anthropic/claude-sonnet-4-6", - "minimax/MiniMax-M2.7", "zai/glm-5.1", "moonshot/kimi-k2.5", - "qwen/qwen3.5-plus", "google/gemini-3.1-pro-preview", ]); const DEFAULT_CHARACTER_THINKING: QaThinkingLevel = "high"; @@ -24,6 +23,7 @@ const DEFAULT_CHARACTER_THINKING_BY_MODEL: Readonly