agents: split GPT-5 prompt and retry behavior (#65597)

* agents: split GPT-5 prompt and retry behavior * agents: fix GPT-5 review follow-ups * agents: address GPT-5 review follow-ups * agents: avoid replaying side-effectful GPT retries * agents: mark subagent control as mutating * agents: fail closed on single-action retries * commands: stabilize channel legacy doctor migration test * agents: narrow single-action retry promise trigger
2026-04-18 15:23:23 +02:00 · 2026-04-12 18:52:22 -07:00
parent d0c83777fb
commit c848ebc8ce
8 changed files with 352 additions and 17 deletions
--- a/extensions/openai/index.test.ts
+++ b/extensions/openai/index.test.ts
@@ -14,6 +14,7 @@ import {
  OPENAI_FRIENDLY_PROMPT_OVERLAY,
  OPENAI_GPT5_EXECUTION_BIAS,
  OPENAI_GPT5_OUTPUT_CONTRACT,
+  OPENAI_GPT5_TOOL_CALL_STYLE,
 } from "./prompt-overlay.js";

 const runtimeMocks = vi.hoisted(() => ({
@@ -365,7 +366,7 @@ describe("openai plugin", () => {
    };

    expect(openaiProvider.resolveSystemPromptContribution?.(contributionContext)).toEqual({
-      stablePrefix: OPENAI_GPT5_OUTPUT_CONTRACT,
+      stablePrefix: [OPENAI_GPT5_OUTPUT_CONTRACT, OPENAI_GPT5_TOOL_CALL_STYLE].join("\n\n"),
      sectionOverrides: {
        interaction_style: OPENAI_FRIENDLY_PROMPT_OVERLAY,
        execution_bias: OPENAI_GPT5_EXECUTION_BIAS,
@@ -382,7 +383,7 @@ describe("openai plugin", () => {
      "Occasional emoji are welcome when they fit naturally, especially for warmth or brief celebration; keep them sparse.",
    );
    expect(codexProvider.resolveSystemPromptContribution?.(contributionContext)).toEqual({
-      stablePrefix: OPENAI_GPT5_OUTPUT_CONTRACT,
+      stablePrefix: [OPENAI_GPT5_OUTPUT_CONTRACT, OPENAI_GPT5_TOOL_CALL_STYLE].join("\n\n"),
      sectionOverrides: {
        interaction_style: OPENAI_FRIENDLY_PROMPT_OVERLAY,
        execution_bias: OPENAI_GPT5_EXECUTION_BIAS,
@@ -454,9 +455,22 @@ describe("openai plugin", () => {
    expect(OPENAI_FRIENDLY_PROMPT_OVERLAY).toContain(
      "Occasional emoji are welcome when they fit naturally, especially for warmth or brief celebration; keep them sparse.",
    );
+    expect(OPENAI_GPT5_EXECUTION_BIAS).toContain(
+      "Use a real tool call or concrete action FIRST when the task is actionable. Do not stop at a plan or promise-to-act reply.",
+    );
+    expect(OPENAI_GPT5_EXECUTION_BIAS).toContain(
+      "If the work will take multiple steps, keep calling tools until the task is done or you hit a real blocker. Do not stop after one step to ask permission.",
+    );
    expect(OPENAI_GPT5_EXECUTION_BIAS).toContain(
      "Do prerequisite lookup or discovery before dependent actions.",
    );
+    expect(OPENAI_GPT5_TOOL_CALL_STYLE).toContain(
+      "Call tools directly without narrating what you are about to do. Do not describe a plan before each tool call.",
+    );
+    expect(OPENAI_GPT5_TOOL_CALL_STYLE).toContain(
+      "When a first-class tool exists for an action, use the tool instead of asking the user to run a command.",
+    );
+    expect(OPENAI_GPT5_TOOL_CALL_STYLE).not.toContain("/approve");
    expect(OPENAI_GPT5_OUTPUT_CONTRACT).toContain(
      "Return the requested sections only, in the requested order.",
    );
@@ -486,7 +500,7 @@ describe("openai plugin", () => {
        agentId: undefined,
      }),
    ).toEqual({
-      stablePrefix: OPENAI_GPT5_OUTPUT_CONTRACT,
+      stablePrefix: [OPENAI_GPT5_OUTPUT_CONTRACT, OPENAI_GPT5_TOOL_CALL_STYLE].join("\n\n"),
      sectionOverrides: {
        interaction_style: OPENAI_FRIENDLY_PROMPT_OVERLAY,
        execution_bias: OPENAI_GPT5_EXECUTION_BIAS,
@@ -514,7 +528,7 @@ describe("openai plugin", () => {
        agentId: undefined,
      }),
    ).toEqual({
-      stablePrefix: OPENAI_GPT5_OUTPUT_CONTRACT,
+      stablePrefix: [OPENAI_GPT5_OUTPUT_CONTRACT, OPENAI_GPT5_TOOL_CALL_STYLE].join("\n\n"),
      sectionOverrides: {
        execution_bias: OPENAI_GPT5_EXECUTION_BIAS,
      },
@@ -540,7 +554,7 @@ describe("openai plugin", () => {
        agentId: undefined,
      }),
    ).toEqual({
-      stablePrefix: OPENAI_GPT5_OUTPUT_CONTRACT,
+      stablePrefix: [OPENAI_GPT5_OUTPUT_CONTRACT, OPENAI_GPT5_TOOL_CALL_STYLE].join("\n\n"),
      sectionOverrides: {
        execution_bias: OPENAI_GPT5_EXECUTION_BIAS,
      },
@@ -567,7 +581,7 @@ describe("openai plugin", () => {
        agentId: undefined,
      }),
    ).toEqual({
-      stablePrefix: OPENAI_GPT5_OUTPUT_CONTRACT,
+      stablePrefix: [OPENAI_GPT5_OUTPUT_CONTRACT, OPENAI_GPT5_TOOL_CALL_STYLE].join("\n\n"),
      sectionOverrides: {
        interaction_style: OPENAI_FRIENDLY_PROMPT_OVERLAY,
        execution_bias: OPENAI_GPT5_EXECUTION_BIAS,
@@ -594,7 +608,7 @@ describe("openai plugin", () => {
        agentId: undefined,
      }),
    ).toEqual({
-      stablePrefix: OPENAI_GPT5_OUTPUT_CONTRACT,
+      stablePrefix: [OPENAI_GPT5_OUTPUT_CONTRACT, OPENAI_GPT5_TOOL_CALL_STYLE].join("\n\n"),
      sectionOverrides: {
        interaction_style: OPENAI_FRIENDLY_PROMPT_OVERLAY,
        execution_bias: OPENAI_GPT5_EXECUTION_BIAS,
--- a/extensions/openai/prompt-overlay.ts
+++ b/extensions/openai/prompt-overlay.ts
@@ -64,11 +64,20 @@ Do not use em dashes unless the user explicitly asks for them or they are requir

 export const OPENAI_GPT5_EXECUTION_BIAS = `## Execution Bias

-Start the real work in the same turn when the next step is clear.
+Use a real tool call or concrete action FIRST when the task is actionable. Do not stop at a plan or promise-to-act reply.
+Commentary-only turns are incomplete when tools are available and the next action is clear.
+If the work will take multiple steps, keep calling tools until the task is done or you hit a real blocker. Do not stop after one step to ask permission.
 Do prerequisite lookup or discovery before dependent actions.
-If another tool call would likely improve correctness or completeness, keep going instead of stopping at partial progress.
 Multi-part requests stay incomplete until every requested item is handled or clearly marked blocked.
-Before the final answer, quickly verify correctness, coverage, formatting, and obvious side effects.`;
+Act first, then verify if needed. Do not pause to summarize or verify before taking the next action.`;
+
+export const OPENAI_GPT5_TOOL_CALL_STYLE = `## Tool Call Style
+
+Call tools directly without narrating what you are about to do. Do not describe a plan before each tool call.
+When a first-class tool exists for an action, use the tool instead of asking the user to run a command.
+If multiple tool calls are needed, call them in sequence without stopping to explain between calls.
+Default: do not narrate routine, low-risk tool calls (just call the tool).
+Narrate only when it genuinely helps: complex multi-step work, sensitive actions like deletions, or when the user explicitly asks for commentary.`;

 export type OpenAIPromptOverlayMode = "friendly" | "off";

@@ -103,8 +112,14 @@ export function resolveOpenAISystemPromptContribution(params: {
  ) {
    return undefined;
  }
+  // tool_call_style is NOT overridden via sectionOverrides because the
+  // default section includes dynamic channel-specific approval guidance
+  // from buildExecApprovalPromptGuidance() that varies per runtime
+  // channel. Overriding it with a static string would lose that dynamic
+  // content. Instead, the tool-first reinforcement lives in stablePrefix
+  // so it's always present alongside the default tool_call_style section.
  return {
-    stablePrefix: OPENAI_GPT5_OUTPUT_CONTRACT,
+    stablePrefix: [OPENAI_GPT5_OUTPUT_CONTRACT, OPENAI_GPT5_TOOL_CALL_STYLE].join("\n\n"),
    sectionOverrides: {
      execution_bias: OPENAI_GPT5_EXECUTION_BIAS,
      ...(params.mode === "friendly" ? { interaction_style: OPENAI_FRIENDLY_PROMPT_OVERLAY } : {}),
--- a/src/agents/pi-embedded-runner/run.incomplete-turn.test.ts
+++ b/src/agents/pi-embedded-runner/run.incomplete-turn.test.ts
@@ -10,8 +10,10 @@ import {
  resetRunOverflowCompactionHarnessMocks,
 } from "./run.overflow-compaction.harness.js";
 import {
+  buildAttemptReplayMetadata,
  extractPlanningOnlyPlanDetails,
  isLikelyExecutionAckPrompt,
+  PLANNING_ONLY_RETRY_INSTRUCTION,
  resolveAckExecutionFastPathInstruction,
  resolvePlanningOnlyRetryLimit,
  resolvePlanningOnlyRetryInstruction,
@@ -281,7 +283,10 @@ describe("runEmbeddedPiAgent incomplete-turn safety", () => {
      timedOut: false,
      attempt: makeAttemptResult({
        assistantTexts: ["I'll inspect the code, make the change, and run the checks."],
-        toolMetas: [{ toolName: "bash", meta: "ls" }],
+        toolMetas: [
+          { toolName: "read", meta: "path=src/index.ts" },
+          { toolName: "search", meta: "pattern=runEmbeddedPiAgent" },
+        ],
      }),
    });

@@ -436,3 +441,156 @@ describe("runEmbeddedPiAgent incomplete-turn safety", () => {
    ).toBe("paused");
  });
 });
+
+describe("resolvePlanningOnlyRetryInstruction single-action loophole", () => {
+  const openaiParams = { provider: "openai", modelId: "gpt-5.4" } as const;
+
+  function makeAttemptWithTools(
+    toolNames: string[],
+    assistantText: string,
+  ): Parameters<typeof resolvePlanningOnlyRetryInstruction>[0]["attempt"] {
+    const toolMetas = toolNames.map((toolName) => ({ toolName }));
+    return {
+      toolMetas,
+      assistantTexts: [assistantText],
+      lastAssistant: { stopReason: "stop" },
+      itemLifecycle: { startedCount: toolNames.length },
+      replayMetadata: buildAttemptReplayMetadata({
+        toolMetas,
+        didSendViaMessagingTool: false,
+      }),
+      clientToolCall: null,
+      yieldDetected: false,
+      didSendDeterministicApprovalPrompt: false,
+      didSendViaMessagingTool: false,
+      lastToolError: null,
+    } as unknown as Parameters<typeof resolvePlanningOnlyRetryInstruction>[0]["attempt"];
+  }
+
+  it("retries when exactly 1 non-plan tool call plus 'i can do that' prose is detected", () => {
+    const result = resolvePlanningOnlyRetryInstruction({
+      ...openaiParams,
+      aborted: false,
+      timedOut: false,
+      attempt: makeAttemptWithTools(["read"], "I can do that next."),
+    });
+
+    expect(result).toBe(PLANNING_ONLY_RETRY_INSTRUCTION);
+  });
+
+  it("retries when exactly 1 non-plan tool call plus planning prose is detected", () => {
+    const result = resolvePlanningOnlyRetryInstruction({
+      ...openaiParams,
+      aborted: false,
+      timedOut: false,
+      attempt: makeAttemptWithTools(["read"], "I'll analyze the structure next."),
+    });
+
+    expect(result).toBe(PLANNING_ONLY_RETRY_INSTRUCTION);
+  });
+
+  it("does not retry when 2+ non-plan tool calls are present", () => {
+    const result = resolvePlanningOnlyRetryInstruction({
+      ...openaiParams,
+      aborted: false,
+      timedOut: false,
+      attempt: makeAttemptWithTools(["read", "search"], "I'll verify the output."),
+    });
+
+    expect(result).toBeNull();
+  });
+
+  it("does not retry when 1 tool call plus completion language is present", () => {
+    const result = resolvePlanningOnlyRetryInstruction({
+      ...openaiParams,
+      aborted: false,
+      timedOut: false,
+      attempt: makeAttemptWithTools(["read"], "Done. The file looks correct."),
+    });
+
+    expect(result).toBeNull();
+  });
+
+  it("does not retry when 1 tool call plus 'let me know' handoff is present", () => {
+    const result = resolvePlanningOnlyRetryInstruction({
+      ...openaiParams,
+      aborted: false,
+      timedOut: false,
+      attempt: makeAttemptWithTools(["read"], "Let me know if you need anything else."),
+    });
+
+    expect(result).toBeNull();
+  });
+
+  it("does not retry when 1 tool call plus an answer-style summary is present", () => {
+    const result = resolvePlanningOnlyRetryInstruction({
+      ...openaiParams,
+      aborted: false,
+      timedOut: false,
+      attempt: makeAttemptWithTools(
+        ["read"],
+        "I'll summarize the root cause: the provider auth scope is missing.",
+      ),
+    });
+
+    expect(result).toBeNull();
+  });
+
+  it("does not retry when 1 tool call plus a future-tense description is present", () => {
+    const result = resolvePlanningOnlyRetryInstruction({
+      ...openaiParams,
+      aborted: false,
+      timedOut: false,
+      attempt: makeAttemptWithTools(
+        ["read"],
+        "I'll describe the issue: the provider auth scope is missing.",
+      ),
+    });
+
+    expect(result).toBeNull();
+  });
+
+  it("does not retry when 1 safe tool call is followed by answer prose joined with 'and'", () => {
+    const result = resolvePlanningOnlyRetryInstruction({
+      ...openaiParams,
+      aborted: false,
+      timedOut: false,
+      attempt: makeAttemptWithTools(["read"], "I'll explain and recommend a fix."),
+    });
+
+    expect(result).toBeNull();
+  });
+
+  it("does not retry when 1 tool call plus a bare 'i can do that' reply is present", () => {
+    const result = resolvePlanningOnlyRetryInstruction({
+      ...openaiParams,
+      aborted: false,
+      timedOut: false,
+      attempt: makeAttemptWithTools(["read"], "I can do that."),
+    });
+
+    expect(result).toBeNull();
+  });
+
+  it("does not retry when the lone tool call already had side effects", () => {
+    const result = resolvePlanningOnlyRetryInstruction({
+      ...openaiParams,
+      aborted: false,
+      timedOut: false,
+      attempt: makeAttemptWithTools(["sessions_spawn"], "I'll continue from there next."),
+    });
+
+    expect(result).toBeNull();
+  });
+
+  it("does not retry when the lone tool call is unclassified", () => {
+    const result = resolvePlanningOnlyRetryInstruction({
+      ...openaiParams,
+      aborted: false,
+      timedOut: false,
+      attempt: makeAttemptWithTools(["vendor_widget"], "I'll continue from there next."),
+    });
+
+    expect(result).toBeNull();
+  });
+});
--- a/src/agents/pi-embedded-runner/run/incomplete-turn.ts
+++ b/src/agents/pi-embedded-runner/run/incomplete-turn.ts
@@ -54,6 +54,21 @@ const PLANNING_ONLY_COMPLETION_RE =
  /\b(?:done|finished|implemented|updated|fixed|changed|ran|verified|found|here(?:'s| is) what|blocked by|the blocker is)\b/i;
 const PLANNING_ONLY_HEADING_RE = /^(?:plan|steps?|next steps?)\s*:/i;
 const PLANNING_ONLY_BULLET_RE = /^(?:[-*•]\s+|\d+[.)]\s+)/u;
+const PLANNING_ONLY_MAX_VISIBLE_TEXT = 700;
+const SINGLE_ACTION_EXPLICIT_CONTINUATION_RE =
+  /\b(?:going to|first[, ]+i(?:'ll| will)|next[, ]+i(?:'ll| will)|then[, ]+i(?:'ll| will)|i can do that next|let me (?!know\b)\w+(?:\s+\w+){0,3}\s+(?:next|then|first)\b)/i;
+const SINGLE_ACTION_MULTI_STEP_PROMISE_RE =
+  /\bi(?:'ll| will)\b(?=[^.!?]{0,160}\b(?:next|then|after(?:wards)?|once)\b)/i;
+const SINGLE_ACTION_RESULT_STYLE_RE =
+  /\b(?:i(?:'ll| will)\s+(?:summarize|explain|share|show|report|describe|clarify|answer|recap)(?:\s+\w+){0,4}\s*:|(?:here(?:'s| is)|summary|result|answer|findings?|root cause)\s*:)/i;
+const SINGLE_ACTION_RETRY_SAFE_TOOL_NAMES = new Set([
+  "read",
+  "search",
+  "find",
+  "grep",
+  "glob",
+  "ls",
+]);
 const DEFAULT_PLANNING_ONLY_RETRY_LIMIT = 1;
 const STRICT_AGENTIC_PLANNING_ONLY_RETRY_LIMIT = 2;
 const ACK_EXECUTION_NORMALIZED_SET = new Set([
@@ -284,10 +299,51 @@ function countPlanOnlyToolMetas(toolMetas: PlanningOnlyAttempt["toolMetas"]): nu
  return toolMetas.filter((entry) => entry.toolName === "update_plan").length;
 }

+function countNonPlanToolCalls(toolMetas: PlanningOnlyAttempt["toolMetas"]): number {
+  return toolMetas.filter((entry) => entry.toolName !== "update_plan").length;
+}
+
 function hasNonPlanToolActivity(toolMetas: PlanningOnlyAttempt["toolMetas"]): boolean {
  return toolMetas.some((entry) => entry.toolName !== "update_plan");
 }

+function hasSingleRetrySafeNonPlanTool(toolMetas: PlanningOnlyAttempt["toolMetas"]): boolean {
+  const nonPlanToolNames = toolMetas
+    .map((entry) => normalizeLowercaseStringOrEmpty(entry.toolName))
+    .filter((toolName) => toolName && toolName !== "update_plan");
+  return (
+    nonPlanToolNames.length === 1 &&
+    SINGLE_ACTION_RETRY_SAFE_TOOL_NAMES.has(nonPlanToolNames[0] ?? "")
+  );
+}
+
+/**
+ * Treat a turn with exactly one non-plan tool call plus visible "I'll do X
+ * next" prose as effectively planning-only from the user's perspective. This
+ * closes the one-action-then-narrative loophole without changing the 2+ tool
+ * call path, which still counts as real multi-step progress.
+ */
+function isSingleActionThenNarrativePattern(params: {
+  toolMetas: PlanningOnlyAttempt["toolMetas"];
+  assistantTexts: readonly string[];
+}): boolean {
+  const nonPlanCount = countNonPlanToolCalls(params.toolMetas);
+  if (nonPlanCount !== 1) {
+    return false;
+  }
+  const text = params.assistantTexts.join("\n\n").trim();
+  if (!text || text.length > PLANNING_ONLY_MAX_VISIBLE_TEXT) {
+    return false;
+  }
+  if (SINGLE_ACTION_RESULT_STYLE_RE.test(text)) {
+    return false;
+  }
+  return (
+    SINGLE_ACTION_EXPLICIT_CONTINUATION_RE.test(text) ||
+    SINGLE_ACTION_MULTI_STEP_PROMISE_RE.test(text)
+  );
+}
+
 export function resolvePlanningOnlyRetryLimit(
  executionContract?: EmbeddedPiExecutionContract,
 ): number {
@@ -304,6 +360,12 @@ export function resolvePlanningOnlyRetryInstruction(params: {
  attempt: PlanningOnlyAttempt;
 }): string | null {
  const planOnlyToolMetaCount = countPlanOnlyToolMetas(params.attempt.toolMetas);
+  const singleActionNarrative = isSingleActionThenNarrativePattern({
+    toolMetas: params.attempt.toolMetas,
+    assistantTexts: params.attempt.assistantTexts,
+  });
+  const allowSingleActionRetryBypass =
+    singleActionNarrative && hasSingleRetrySafeNonPlanTool(params.attempt.toolMetas);
  if (
    !shouldApplyPlanningOnlyRetryGuard({
      provider: params.provider,
@@ -316,8 +378,9 @@ export function resolvePlanningOnlyRetryInstruction(params: {
    params.attempt.didSendDeterministicApprovalPrompt ||
    params.attempt.didSendViaMessagingTool ||
    params.attempt.lastToolError ||
-    hasNonPlanToolActivity(params.attempt.toolMetas) ||
-    params.attempt.itemLifecycle.startedCount > planOnlyToolMetaCount ||
+    (hasNonPlanToolActivity(params.attempt.toolMetas) && !allowSingleActionRetryBypass) ||
+    (params.attempt.itemLifecycle.startedCount > planOnlyToolMetaCount &&
+      !allowSingleActionRetryBypass) ||
    params.attempt.replayMetadata.hadPotentialSideEffects
  ) {
    return null;
@@ -329,7 +392,7 @@ export function resolvePlanningOnlyRetryInstruction(params: {
  }

  const text = params.attempt.assistantTexts.join("\n\n").trim();
-  if (!text || text.length > 700 || text.includes("```")) {
+  if (!text || text.length > PLANNING_ONLY_MAX_VISIBLE_TEXT || text.includes("```")) {
    return null;
  }
  if (!PLANNING_ONLY_PROMISE_RE.test(text) && !hasStructuredPlanningOnlyFormat(text)) {
--- a/src/agents/pi-embedded-subscribe.handlers.tools.test.ts
+++ b/src/agents/pi-embedded-subscribe.handlers.tools.test.ts
@@ -283,6 +283,71 @@ describe("handleToolExecutionEnd mutating failure recovery", () => {
    });
  });

+  it("marks successful subagents control actions as replay-invalid", async () => {
+    const { ctx } = createTestContext();
+
+    await handleToolExecutionStart(
+      ctx as never,
+      {
+        type: "tool_execution_start",
+        toolName: "subagents",
+        toolCallId: "tool-subagents-kill",
+        args: {
+          action: "kill",
+          target: "worker-1",
+        },
+      } as never,
+    );
+
+    await handleToolExecutionEnd(
+      ctx as never,
+      {
+        type: "tool_execution_end",
+        toolName: "subagents",
+        toolCallId: "tool-subagents-kill",
+        isError: false,
+        result: { status: "ok", action: "kill", target: "worker-1" },
+      } as never,
+    );
+
+    expect(ctx.state.replayState).toEqual({
+      replayInvalid: true,
+      hadPotentialSideEffects: true,
+    });
+  });
+
+  it("keeps read-only subagents list actions replay-safe", async () => {
+    const { ctx } = createTestContext();
+
+    await handleToolExecutionStart(
+      ctx as never,
+      {
+        type: "tool_execution_start",
+        toolName: "subagents",
+        toolCallId: "tool-subagents-list",
+        args: {
+          action: "list",
+        },
+      } as never,
+    );
+
+    await handleToolExecutionEnd(
+      ctx as never,
+      {
+        type: "tool_execution_end",
+        toolName: "subagents",
+        toolCallId: "tool-subagents-list",
+        isError: false,
+        result: { status: "ok", action: "list", total: 0, text: "no active subagents." },
+      } as never,
+    );
+
+    expect(ctx.state.replayState).toEqual({
+      replayInvalid: false,
+      hadPotentialSideEffects: false,
+    });
+  });
+
  it("keeps successful mutating retries replay-invalid after an earlier tool failure", async () => {
    const { ctx } = createTestContext();

--- a/src/agents/tool-mutation.test.ts
+++ b/src/agents/tool-mutation.test.ts
@@ -58,6 +58,13 @@ describe("tool mutation helpers", () => {
      buildToolMutationState("message", { action: "send", to: "telegram:1" }).mutatingAction,
    ).toBe(true);
    expect(buildToolMutationState("browser", { action: "list" }).mutatingAction).toBe(false);
+    expect(
+      buildToolMutationState("subagents", { action: "kill", target: "worker-1" }).mutatingAction,
+    ).toBe(true);
+    expect(
+      buildToolMutationState("subagents", { action: "steer", target: "worker-1" }).mutatingAction,
+    ).toBe(true);
+    expect(buildToolMutationState("subagents", { action: "list" }).mutatingAction).toBe(false);
  });

  it("matches tool actions by fingerprint and fails closed on asymmetric data", () => {
@@ -82,6 +89,7 @@ describe("tool mutation helpers", () => {
  });

  it("keeps legacy name-only mutating heuristics for payload fallback", () => {
+    expect(isLikelyMutatingToolName("sessions_spawn")).toBe(true);
    expect(isLikelyMutatingToolName("sessions_send")).toBe(true);
    expect(isLikelyMutatingToolName("browser_actions")).toBe(true);
    expect(isLikelyMutatingToolName("message_slack")).toBe(true);
--- a/src/agents/tool-mutation.ts
+++ b/src/agents/tool-mutation.ts
@@ -12,6 +12,7 @@ const MUTATING_TOOL_NAMES = new Set([
  "bash",
  "process",
  "message",
+  "sessions_spawn",
  "sessions_send",
  "cron",
  "gateway",
@@ -129,6 +130,8 @@ export function isMutatingToolCall(toolName: string, args: unknown): boolean {
        typeof record?.content === "string" ||
        typeof record?.message === "string"
      );
+    case "subagents":
+      return action === "kill" || action === "steer";
    case "session_status":
      return typeof record?.model === "string" && record.model.trim().length > 0;
    default: {
--- a/src/commands/doctor/shared/channel-legacy-config-migrate.test.ts
+++ b/src/commands/doctor/shared/channel-legacy-config-migrate.test.ts
@@ -1,5 +1,4 @@
-import { describe, expect, it, vi } from "vitest";
-import { applyChannelDoctorCompatibilityMigrations } from "./channel-legacy-config-migrate.js";
+import { beforeAll, describe, expect, it, vi } from "vitest";

 const applyPluginDoctorCompatibilityMigrations = vi.hoisted(() => vi.fn());

@@ -8,6 +7,16 @@ vi.mock("../../../plugins/doctor-contract-registry.js", () => ({
    applyPluginDoctorCompatibilityMigrations(...args),
 }));

+let applyChannelDoctorCompatibilityMigrations: typeof import("./channel-legacy-config-migrate.js").applyChannelDoctorCompatibilityMigrations;
+
+beforeAll(async () => {
+  // Commands runs on the shared non-isolated worker, so reload after installing
+  // this file's mock to avoid inheriting a cached real registry import.
+  vi.resetModules();
+  ({ applyChannelDoctorCompatibilityMigrations } =
+    await import("./channel-legacy-config-migrate.js"));
+});
+
 describe("bundled channel legacy config migrations", () => {
  it("normalizes legacy private-network aliases exposed through bundled contract surfaces", () => {
    applyPluginDoctorCompatibilityMigrations.mockReturnValueOnce({