agents: split GPT-5 prompt and retry behavior (#65597)

* agents: split GPT-5 prompt and retry behavior

* agents: fix GPT-5 review follow-ups

* agents: address GPT-5 review follow-ups

* agents: avoid replaying side-effectful GPT retries

* agents: mark subagent control as mutating

* agents: fail closed on single-action retries

* commands: stabilize channel legacy doctor migration test

* agents: narrow single-action retry promise trigger
This commit is contained in:
pashpashpash
2026-04-12 18:52:22 -07:00
committed by GitHub
parent d0c83777fb
commit c848ebc8ce
8 changed files with 352 additions and 17 deletions

View File

@@ -14,6 +14,7 @@ import {
OPENAI_FRIENDLY_PROMPT_OVERLAY,
OPENAI_GPT5_EXECUTION_BIAS,
OPENAI_GPT5_OUTPUT_CONTRACT,
OPENAI_GPT5_TOOL_CALL_STYLE,
} from "./prompt-overlay.js";
const runtimeMocks = vi.hoisted(() => ({
@@ -365,7 +366,7 @@ describe("openai plugin", () => {
};
expect(openaiProvider.resolveSystemPromptContribution?.(contributionContext)).toEqual({
stablePrefix: OPENAI_GPT5_OUTPUT_CONTRACT,
stablePrefix: [OPENAI_GPT5_OUTPUT_CONTRACT, OPENAI_GPT5_TOOL_CALL_STYLE].join("\n\n"),
sectionOverrides: {
interaction_style: OPENAI_FRIENDLY_PROMPT_OVERLAY,
execution_bias: OPENAI_GPT5_EXECUTION_BIAS,
@@ -382,7 +383,7 @@ describe("openai plugin", () => {
"Occasional emoji are welcome when they fit naturally, especially for warmth or brief celebration; keep them sparse.",
);
expect(codexProvider.resolveSystemPromptContribution?.(contributionContext)).toEqual({
stablePrefix: OPENAI_GPT5_OUTPUT_CONTRACT,
stablePrefix: [OPENAI_GPT5_OUTPUT_CONTRACT, OPENAI_GPT5_TOOL_CALL_STYLE].join("\n\n"),
sectionOverrides: {
interaction_style: OPENAI_FRIENDLY_PROMPT_OVERLAY,
execution_bias: OPENAI_GPT5_EXECUTION_BIAS,
@@ -454,9 +455,22 @@ describe("openai plugin", () => {
expect(OPENAI_FRIENDLY_PROMPT_OVERLAY).toContain(
"Occasional emoji are welcome when they fit naturally, especially for warmth or brief celebration; keep them sparse.",
);
expect(OPENAI_GPT5_EXECUTION_BIAS).toContain(
"Use a real tool call or concrete action FIRST when the task is actionable. Do not stop at a plan or promise-to-act reply.",
);
expect(OPENAI_GPT5_EXECUTION_BIAS).toContain(
"If the work will take multiple steps, keep calling tools until the task is done or you hit a real blocker. Do not stop after one step to ask permission.",
);
expect(OPENAI_GPT5_EXECUTION_BIAS).toContain(
"Do prerequisite lookup or discovery before dependent actions.",
);
expect(OPENAI_GPT5_TOOL_CALL_STYLE).toContain(
"Call tools directly without narrating what you are about to do. Do not describe a plan before each tool call.",
);
expect(OPENAI_GPT5_TOOL_CALL_STYLE).toContain(
"When a first-class tool exists for an action, use the tool instead of asking the user to run a command.",
);
expect(OPENAI_GPT5_TOOL_CALL_STYLE).not.toContain("/approve");
expect(OPENAI_GPT5_OUTPUT_CONTRACT).toContain(
"Return the requested sections only, in the requested order.",
);
@@ -486,7 +500,7 @@ describe("openai plugin", () => {
agentId: undefined,
}),
).toEqual({
stablePrefix: OPENAI_GPT5_OUTPUT_CONTRACT,
stablePrefix: [OPENAI_GPT5_OUTPUT_CONTRACT, OPENAI_GPT5_TOOL_CALL_STYLE].join("\n\n"),
sectionOverrides: {
interaction_style: OPENAI_FRIENDLY_PROMPT_OVERLAY,
execution_bias: OPENAI_GPT5_EXECUTION_BIAS,
@@ -514,7 +528,7 @@ describe("openai plugin", () => {
agentId: undefined,
}),
).toEqual({
stablePrefix: OPENAI_GPT5_OUTPUT_CONTRACT,
stablePrefix: [OPENAI_GPT5_OUTPUT_CONTRACT, OPENAI_GPT5_TOOL_CALL_STYLE].join("\n\n"),
sectionOverrides: {
execution_bias: OPENAI_GPT5_EXECUTION_BIAS,
},
@@ -540,7 +554,7 @@ describe("openai plugin", () => {
agentId: undefined,
}),
).toEqual({
stablePrefix: OPENAI_GPT5_OUTPUT_CONTRACT,
stablePrefix: [OPENAI_GPT5_OUTPUT_CONTRACT, OPENAI_GPT5_TOOL_CALL_STYLE].join("\n\n"),
sectionOverrides: {
execution_bias: OPENAI_GPT5_EXECUTION_BIAS,
},
@@ -567,7 +581,7 @@ describe("openai plugin", () => {
agentId: undefined,
}),
).toEqual({
stablePrefix: OPENAI_GPT5_OUTPUT_CONTRACT,
stablePrefix: [OPENAI_GPT5_OUTPUT_CONTRACT, OPENAI_GPT5_TOOL_CALL_STYLE].join("\n\n"),
sectionOverrides: {
interaction_style: OPENAI_FRIENDLY_PROMPT_OVERLAY,
execution_bias: OPENAI_GPT5_EXECUTION_BIAS,
@@ -594,7 +608,7 @@ describe("openai plugin", () => {
agentId: undefined,
}),
).toEqual({
stablePrefix: OPENAI_GPT5_OUTPUT_CONTRACT,
stablePrefix: [OPENAI_GPT5_OUTPUT_CONTRACT, OPENAI_GPT5_TOOL_CALL_STYLE].join("\n\n"),
sectionOverrides: {
interaction_style: OPENAI_FRIENDLY_PROMPT_OVERLAY,
execution_bias: OPENAI_GPT5_EXECUTION_BIAS,

View File

@@ -64,11 +64,20 @@ Do not use em dashes unless the user explicitly asks for them or they are requir
export const OPENAI_GPT5_EXECUTION_BIAS = `## Execution Bias
Start the real work in the same turn when the next step is clear.
Use a real tool call or concrete action FIRST when the task is actionable. Do not stop at a plan or promise-to-act reply.
Commentary-only turns are incomplete when tools are available and the next action is clear.
If the work will take multiple steps, keep calling tools until the task is done or you hit a real blocker. Do not stop after one step to ask permission.
Do prerequisite lookup or discovery before dependent actions.
If another tool call would likely improve correctness or completeness, keep going instead of stopping at partial progress.
Multi-part requests stay incomplete until every requested item is handled or clearly marked blocked.
Before the final answer, quickly verify correctness, coverage, formatting, and obvious side effects.`;
Act first, then verify if needed. Do not pause to summarize or verify before taking the next action.`;
export const OPENAI_GPT5_TOOL_CALL_STYLE = `## Tool Call Style
Call tools directly without narrating what you are about to do. Do not describe a plan before each tool call.
When a first-class tool exists for an action, use the tool instead of asking the user to run a command.
If multiple tool calls are needed, call them in sequence without stopping to explain between calls.
Default: do not narrate routine, low-risk tool calls (just call the tool).
Narrate only when it genuinely helps: complex multi-step work, sensitive actions like deletions, or when the user explicitly asks for commentary.`;
export type OpenAIPromptOverlayMode = "friendly" | "off";
@@ -103,8 +112,14 @@ export function resolveOpenAISystemPromptContribution(params: {
) {
return undefined;
}
// tool_call_style is NOT overridden via sectionOverrides because the
// default section includes dynamic channel-specific approval guidance
// from buildExecApprovalPromptGuidance() that varies per runtime
// channel. Overriding it with a static string would lose that dynamic
// content. Instead, the tool-first reinforcement lives in stablePrefix
// so it's always present alongside the default tool_call_style section.
return {
stablePrefix: OPENAI_GPT5_OUTPUT_CONTRACT,
stablePrefix: [OPENAI_GPT5_OUTPUT_CONTRACT, OPENAI_GPT5_TOOL_CALL_STYLE].join("\n\n"),
sectionOverrides: {
execution_bias: OPENAI_GPT5_EXECUTION_BIAS,
...(params.mode === "friendly" ? { interaction_style: OPENAI_FRIENDLY_PROMPT_OVERLAY } : {}),

View File

@@ -10,8 +10,10 @@ import {
resetRunOverflowCompactionHarnessMocks,
} from "./run.overflow-compaction.harness.js";
import {
buildAttemptReplayMetadata,
extractPlanningOnlyPlanDetails,
isLikelyExecutionAckPrompt,
PLANNING_ONLY_RETRY_INSTRUCTION,
resolveAckExecutionFastPathInstruction,
resolvePlanningOnlyRetryLimit,
resolvePlanningOnlyRetryInstruction,
@@ -281,7 +283,10 @@ describe("runEmbeddedPiAgent incomplete-turn safety", () => {
timedOut: false,
attempt: makeAttemptResult({
assistantTexts: ["I'll inspect the code, make the change, and run the checks."],
toolMetas: [{ toolName: "bash", meta: "ls" }],
toolMetas: [
{ toolName: "read", meta: "path=src/index.ts" },
{ toolName: "search", meta: "pattern=runEmbeddedPiAgent" },
],
}),
});
@@ -436,3 +441,156 @@ describe("runEmbeddedPiAgent incomplete-turn safety", () => {
).toBe("paused");
});
});
describe("resolvePlanningOnlyRetryInstruction single-action loophole", () => {
const openaiParams = { provider: "openai", modelId: "gpt-5.4" } as const;
function makeAttemptWithTools(
toolNames: string[],
assistantText: string,
): Parameters<typeof resolvePlanningOnlyRetryInstruction>[0]["attempt"] {
const toolMetas = toolNames.map((toolName) => ({ toolName }));
return {
toolMetas,
assistantTexts: [assistantText],
lastAssistant: { stopReason: "stop" },
itemLifecycle: { startedCount: toolNames.length },
replayMetadata: buildAttemptReplayMetadata({
toolMetas,
didSendViaMessagingTool: false,
}),
clientToolCall: null,
yieldDetected: false,
didSendDeterministicApprovalPrompt: false,
didSendViaMessagingTool: false,
lastToolError: null,
} as unknown as Parameters<typeof resolvePlanningOnlyRetryInstruction>[0]["attempt"];
}
it("retries when exactly 1 non-plan tool call plus 'i can do that' prose is detected", () => {
const result = resolvePlanningOnlyRetryInstruction({
...openaiParams,
aborted: false,
timedOut: false,
attempt: makeAttemptWithTools(["read"], "I can do that next."),
});
expect(result).toBe(PLANNING_ONLY_RETRY_INSTRUCTION);
});
it("retries when exactly 1 non-plan tool call plus planning prose is detected", () => {
const result = resolvePlanningOnlyRetryInstruction({
...openaiParams,
aborted: false,
timedOut: false,
attempt: makeAttemptWithTools(["read"], "I'll analyze the structure next."),
});
expect(result).toBe(PLANNING_ONLY_RETRY_INSTRUCTION);
});
it("does not retry when 2+ non-plan tool calls are present", () => {
const result = resolvePlanningOnlyRetryInstruction({
...openaiParams,
aborted: false,
timedOut: false,
attempt: makeAttemptWithTools(["read", "search"], "I'll verify the output."),
});
expect(result).toBeNull();
});
it("does not retry when 1 tool call plus completion language is present", () => {
const result = resolvePlanningOnlyRetryInstruction({
...openaiParams,
aborted: false,
timedOut: false,
attempt: makeAttemptWithTools(["read"], "Done. The file looks correct."),
});
expect(result).toBeNull();
});
it("does not retry when 1 tool call plus 'let me know' handoff is present", () => {
const result = resolvePlanningOnlyRetryInstruction({
...openaiParams,
aborted: false,
timedOut: false,
attempt: makeAttemptWithTools(["read"], "Let me know if you need anything else."),
});
expect(result).toBeNull();
});
it("does not retry when 1 tool call plus an answer-style summary is present", () => {
const result = resolvePlanningOnlyRetryInstruction({
...openaiParams,
aborted: false,
timedOut: false,
attempt: makeAttemptWithTools(
["read"],
"I'll summarize the root cause: the provider auth scope is missing.",
),
});
expect(result).toBeNull();
});
it("does not retry when 1 tool call plus a future-tense description is present", () => {
const result = resolvePlanningOnlyRetryInstruction({
...openaiParams,
aborted: false,
timedOut: false,
attempt: makeAttemptWithTools(
["read"],
"I'll describe the issue: the provider auth scope is missing.",
),
});
expect(result).toBeNull();
});
it("does not retry when 1 safe tool call is followed by answer prose joined with 'and'", () => {
const result = resolvePlanningOnlyRetryInstruction({
...openaiParams,
aborted: false,
timedOut: false,
attempt: makeAttemptWithTools(["read"], "I'll explain and recommend a fix."),
});
expect(result).toBeNull();
});
it("does not retry when 1 tool call plus a bare 'i can do that' reply is present", () => {
const result = resolvePlanningOnlyRetryInstruction({
...openaiParams,
aborted: false,
timedOut: false,
attempt: makeAttemptWithTools(["read"], "I can do that."),
});
expect(result).toBeNull();
});
it("does not retry when the lone tool call already had side effects", () => {
const result = resolvePlanningOnlyRetryInstruction({
...openaiParams,
aborted: false,
timedOut: false,
attempt: makeAttemptWithTools(["sessions_spawn"], "I'll continue from there next."),
});
expect(result).toBeNull();
});
it("does not retry when the lone tool call is unclassified", () => {
const result = resolvePlanningOnlyRetryInstruction({
...openaiParams,
aborted: false,
timedOut: false,
attempt: makeAttemptWithTools(["vendor_widget"], "I'll continue from there next."),
});
expect(result).toBeNull();
});
});

View File

@@ -54,6 +54,21 @@ const PLANNING_ONLY_COMPLETION_RE =
/\b(?:done|finished|implemented|updated|fixed|changed|ran|verified|found|here(?:'s| is) what|blocked by|the blocker is)\b/i;
const PLANNING_ONLY_HEADING_RE = /^(?:plan|steps?|next steps?)\s*:/i;
const PLANNING_ONLY_BULLET_RE = /^(?:[-*]\s+|\d+[.)]\s+)/u;
const PLANNING_ONLY_MAX_VISIBLE_TEXT = 700;
const SINGLE_ACTION_EXPLICIT_CONTINUATION_RE =
/\b(?:going to|first[, ]+i(?:'ll| will)|next[, ]+i(?:'ll| will)|then[, ]+i(?:'ll| will)|i can do that next|let me (?!know\b)\w+(?:\s+\w+){0,3}\s+(?:next|then|first)\b)/i;
const SINGLE_ACTION_MULTI_STEP_PROMISE_RE =
/\bi(?:'ll| will)\b(?=[^.!?]{0,160}\b(?:next|then|after(?:wards)?|once)\b)/i;
const SINGLE_ACTION_RESULT_STYLE_RE =
/\b(?:i(?:'ll| will)\s+(?:summarize|explain|share|show|report|describe|clarify|answer|recap)(?:\s+\w+){0,4}\s*:|(?:here(?:'s| is)|summary|result|answer|findings?|root cause)\s*:)/i;
const SINGLE_ACTION_RETRY_SAFE_TOOL_NAMES = new Set([
"read",
"search",
"find",
"grep",
"glob",
"ls",
]);
const DEFAULT_PLANNING_ONLY_RETRY_LIMIT = 1;
const STRICT_AGENTIC_PLANNING_ONLY_RETRY_LIMIT = 2;
const ACK_EXECUTION_NORMALIZED_SET = new Set([
@@ -284,10 +299,51 @@ function countPlanOnlyToolMetas(toolMetas: PlanningOnlyAttempt["toolMetas"]): nu
return toolMetas.filter((entry) => entry.toolName === "update_plan").length;
}
function countNonPlanToolCalls(toolMetas: PlanningOnlyAttempt["toolMetas"]): number {
return toolMetas.filter((entry) => entry.toolName !== "update_plan").length;
}
function hasNonPlanToolActivity(toolMetas: PlanningOnlyAttempt["toolMetas"]): boolean {
return toolMetas.some((entry) => entry.toolName !== "update_plan");
}
function hasSingleRetrySafeNonPlanTool(toolMetas: PlanningOnlyAttempt["toolMetas"]): boolean {
const nonPlanToolNames = toolMetas
.map((entry) => normalizeLowercaseStringOrEmpty(entry.toolName))
.filter((toolName) => toolName && toolName !== "update_plan");
return (
nonPlanToolNames.length === 1 &&
SINGLE_ACTION_RETRY_SAFE_TOOL_NAMES.has(nonPlanToolNames[0] ?? "")
);
}
/**
* Treat a turn with exactly one non-plan tool call plus visible "I'll do X
* next" prose as effectively planning-only from the user's perspective. This
* closes the one-action-then-narrative loophole without changing the 2+ tool
* call path, which still counts as real multi-step progress.
*/
function isSingleActionThenNarrativePattern(params: {
toolMetas: PlanningOnlyAttempt["toolMetas"];
assistantTexts: readonly string[];
}): boolean {
const nonPlanCount = countNonPlanToolCalls(params.toolMetas);
if (nonPlanCount !== 1) {
return false;
}
const text = params.assistantTexts.join("\n\n").trim();
if (!text || text.length > PLANNING_ONLY_MAX_VISIBLE_TEXT) {
return false;
}
if (SINGLE_ACTION_RESULT_STYLE_RE.test(text)) {
return false;
}
return (
SINGLE_ACTION_EXPLICIT_CONTINUATION_RE.test(text) ||
SINGLE_ACTION_MULTI_STEP_PROMISE_RE.test(text)
);
}
export function resolvePlanningOnlyRetryLimit(
executionContract?: EmbeddedPiExecutionContract,
): number {
@@ -304,6 +360,12 @@ export function resolvePlanningOnlyRetryInstruction(params: {
attempt: PlanningOnlyAttempt;
}): string | null {
const planOnlyToolMetaCount = countPlanOnlyToolMetas(params.attempt.toolMetas);
const singleActionNarrative = isSingleActionThenNarrativePattern({
toolMetas: params.attempt.toolMetas,
assistantTexts: params.attempt.assistantTexts,
});
const allowSingleActionRetryBypass =
singleActionNarrative && hasSingleRetrySafeNonPlanTool(params.attempt.toolMetas);
if (
!shouldApplyPlanningOnlyRetryGuard({
provider: params.provider,
@@ -316,8 +378,9 @@ export function resolvePlanningOnlyRetryInstruction(params: {
params.attempt.didSendDeterministicApprovalPrompt ||
params.attempt.didSendViaMessagingTool ||
params.attempt.lastToolError ||
hasNonPlanToolActivity(params.attempt.toolMetas) ||
params.attempt.itemLifecycle.startedCount > planOnlyToolMetaCount ||
(hasNonPlanToolActivity(params.attempt.toolMetas) && !allowSingleActionRetryBypass) ||
(params.attempt.itemLifecycle.startedCount > planOnlyToolMetaCount &&
!allowSingleActionRetryBypass) ||
params.attempt.replayMetadata.hadPotentialSideEffects
) {
return null;
@@ -329,7 +392,7 @@ export function resolvePlanningOnlyRetryInstruction(params: {
}
const text = params.attempt.assistantTexts.join("\n\n").trim();
if (!text || text.length > 700 || text.includes("```")) {
if (!text || text.length > PLANNING_ONLY_MAX_VISIBLE_TEXT || text.includes("```")) {
return null;
}
if (!PLANNING_ONLY_PROMISE_RE.test(text) && !hasStructuredPlanningOnlyFormat(text)) {

View File

@@ -283,6 +283,71 @@ describe("handleToolExecutionEnd mutating failure recovery", () => {
});
});
it("marks successful subagents control actions as replay-invalid", async () => {
const { ctx } = createTestContext();
await handleToolExecutionStart(
ctx as never,
{
type: "tool_execution_start",
toolName: "subagents",
toolCallId: "tool-subagents-kill",
args: {
action: "kill",
target: "worker-1",
},
} as never,
);
await handleToolExecutionEnd(
ctx as never,
{
type: "tool_execution_end",
toolName: "subagents",
toolCallId: "tool-subagents-kill",
isError: false,
result: { status: "ok", action: "kill", target: "worker-1" },
} as never,
);
expect(ctx.state.replayState).toEqual({
replayInvalid: true,
hadPotentialSideEffects: true,
});
});
it("keeps read-only subagents list actions replay-safe", async () => {
const { ctx } = createTestContext();
await handleToolExecutionStart(
ctx as never,
{
type: "tool_execution_start",
toolName: "subagents",
toolCallId: "tool-subagents-list",
args: {
action: "list",
},
} as never,
);
await handleToolExecutionEnd(
ctx as never,
{
type: "tool_execution_end",
toolName: "subagents",
toolCallId: "tool-subagents-list",
isError: false,
result: { status: "ok", action: "list", total: 0, text: "no active subagents." },
} as never,
);
expect(ctx.state.replayState).toEqual({
replayInvalid: false,
hadPotentialSideEffects: false,
});
});
it("keeps successful mutating retries replay-invalid after an earlier tool failure", async () => {
const { ctx } = createTestContext();

View File

@@ -58,6 +58,13 @@ describe("tool mutation helpers", () => {
buildToolMutationState("message", { action: "send", to: "telegram:1" }).mutatingAction,
).toBe(true);
expect(buildToolMutationState("browser", { action: "list" }).mutatingAction).toBe(false);
expect(
buildToolMutationState("subagents", { action: "kill", target: "worker-1" }).mutatingAction,
).toBe(true);
expect(
buildToolMutationState("subagents", { action: "steer", target: "worker-1" }).mutatingAction,
).toBe(true);
expect(buildToolMutationState("subagents", { action: "list" }).mutatingAction).toBe(false);
});
it("matches tool actions by fingerprint and fails closed on asymmetric data", () => {
@@ -82,6 +89,7 @@ describe("tool mutation helpers", () => {
});
it("keeps legacy name-only mutating heuristics for payload fallback", () => {
expect(isLikelyMutatingToolName("sessions_spawn")).toBe(true);
expect(isLikelyMutatingToolName("sessions_send")).toBe(true);
expect(isLikelyMutatingToolName("browser_actions")).toBe(true);
expect(isLikelyMutatingToolName("message_slack")).toBe(true);

View File

@@ -12,6 +12,7 @@ const MUTATING_TOOL_NAMES = new Set([
"bash",
"process",
"message",
"sessions_spawn",
"sessions_send",
"cron",
"gateway",
@@ -129,6 +130,8 @@ export function isMutatingToolCall(toolName: string, args: unknown): boolean {
typeof record?.content === "string" ||
typeof record?.message === "string"
);
case "subagents":
return action === "kill" || action === "steer";
case "session_status":
return typeof record?.model === "string" && record.model.trim().length > 0;
default: {

View File

@@ -1,5 +1,4 @@
import { describe, expect, it, vi } from "vitest";
import { applyChannelDoctorCompatibilityMigrations } from "./channel-legacy-config-migrate.js";
import { beforeAll, describe, expect, it, vi } from "vitest";
const applyPluginDoctorCompatibilityMigrations = vi.hoisted(() => vi.fn());
@@ -8,6 +7,16 @@ vi.mock("../../../plugins/doctor-contract-registry.js", () => ({
applyPluginDoctorCompatibilityMigrations(...args),
}));
let applyChannelDoctorCompatibilityMigrations: typeof import("./channel-legacy-config-migrate.js").applyChannelDoctorCompatibilityMigrations;
beforeAll(async () => {
// Commands runs on the shared non-isolated worker, so reload after installing
// this file's mock to avoid inheriting a cached real registry import.
vi.resetModules();
({ applyChannelDoctorCompatibilityMigrations } =
await import("./channel-legacy-config-migrate.js"));
});
describe("bundled channel legacy config migrations", () => {
it("normalizes legacy private-network aliases exposed through bundled contract surfaces", () => {
applyPluginDoctorCompatibilityMigrations.mockReturnValueOnce({