From c60282421524ef26c9367aac66331bd7686817c0 Mon Sep 17 00:00:00 2001 From: Mariano <132747814+mbelinky@users.noreply.github.com> Date: Mon, 13 Apr 2026 20:10:03 +0200 Subject: [PATCH] fix(cron): stop unresolved next-run refire loops (#66083) Merged via squash. Prepared head SHA: b86ba58d3bff3f290995c830e992a739e5510665 --- CHANGELOG.md | 1 + src/cron/service.armtimer-tight-loop.test.ts | 37 ++++++ ...ce.issue-66019-unresolved-next-run.test.ts | 114 ++++++++++++++++++ src/cron/service/timer.regression.test.ts | 89 +++++++++++++- src/cron/service/timer.ts | 50 +++++++- 5 files changed, 285 insertions(+), 6 deletions(-) create mode 100644 src/cron/service.issue-66019-unresolved-next-run.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c21d7bbd20..b0e271da3e0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ Docs: https://docs.openclaw.ai - Browser/CDP: let managed local Chrome readiness, status probes, and managed loopback CDP control bypass browser SSRF policy for their own loopback control plane, so OpenClaw no longer misclassifies a healthy child browser as "not reachable after start". (#65695, #66043) Thanks @mbelinky. - Gateway/sessions: stop heartbeat, cron-event, and exec-event turns from overwriting shared-session routing and origin metadata, preventing synthetic `heartbeat` targets from poisoning later cron or user delivery. (#63733, #35300) - Browser/CDP: let local attach-only `manual-cdp` profiles reuse the local loopback CDP control plane under strict default policy and remote-class probe timeouts, so tabs/snapshot stop falsely reporting a live local browser session as not running. (#65611, #66080) Thanks @mbelinky. +- Cron/scheduler: stop inventing short retries when cron next-run calculation returns no valid future slot, and keep a maintenance wake armed so enabled unscheduled jobs recover without entering a refire loop. (#66019, #66083) Thanks @mbelinky. ## 2026.4.12 diff --git a/src/cron/service.armtimer-tight-loop.test.ts b/src/cron/service.armtimer-tight-loop.test.ts index b725adc78d6..c8cca51af98 100644 --- a/src/cron/service.armtimer-tight-loop.test.ts +++ b/src/cron/service.armtimer-tight-loop.test.ts @@ -141,6 +141,43 @@ describe("CronService - armTimer tight loop prevention", () => { timeoutSpy.mockRestore(); }); + it("keeps a maintenance wake armed when enabled jobs have no nextRunAtMs", () => { + const timeoutSpy = vi.spyOn(globalThis, "setTimeout"); + const now = Date.parse("2026-02-28T12:32:00.000Z"); + + const state = createTimerState({ + storePath: "/tmp/test-cron/jobs.json", + now, + }); + state.store = { + version: 1, + jobs: [ + { + id: "missing-next-run", + name: "missing-next-run", + enabled: true, + deleteAfterRun: false, + createdAtMs: now - 60_000, + updatedAtMs: now - 60_000, + schedule: { kind: "cron", expr: "*/15 * * * *" }, + sessionTarget: "isolated" as const, + wakeMode: "next-heartbeat" as const, + payload: { kind: "agentTurn" as const, message: "test" }, + delivery: { mode: "none" as const }, + state: {}, + }, + ], + }; + + armTimer(state); + + expect(state.timer).not.toBeNull(); + const delays = extractTimeoutDelays(timeoutSpy); + expect(delays).toContain(60_000); + + timeoutSpy.mockRestore(); + }); + it("breaks the onTimer→armTimer hot-loop with stuck runningAtMs", async () => { const timeoutSpy = vi.spyOn(globalThis, "setTimeout"); const store = await makeStorePath(); diff --git a/src/cron/service.issue-66019-unresolved-next-run.test.ts b/src/cron/service.issue-66019-unresolved-next-run.test.ts new file mode 100644 index 00000000000..18f447c7cbe --- /dev/null +++ b/src/cron/service.issue-66019-unresolved-next-run.test.ts @@ -0,0 +1,114 @@ +import { describe, expect, it, vi } from "vitest"; +import { + createDefaultIsolatedRunner, + createIsolatedRegressionJob, + noopLogger, + setupCronRegressionFixtures, + writeCronJobs, +} from "../../test/helpers/cron/service-regression-fixtures.js"; +import * as schedule from "./schedule.js"; +import { createCronServiceState } from "./service/state.js"; +import { onTimer } from "./service/timer.js"; + +const issue66019Fixtures = setupCronRegressionFixtures({ prefix: "cron-66019-" }); + +describe("#66019 unresolved next-run repro", () => { + it("does not refire a recurring cron job 2s later when next-run resolution returns undefined", async () => { + const store = issue66019Fixtures.makeStorePath(); + const scheduledAt = Date.parse("2026-04-13T15:40:00.000Z"); + let now = scheduledAt; + + const cronJob = createIsolatedRegressionJob({ + id: "cron-66019-minimal-success", + name: "cron-66019-minimal-success", + scheduledAt, + schedule: { kind: "cron", expr: "0 7 * * *", tz: "Asia/Shanghai" }, + payload: { kind: "agentTurn", message: "ping" }, + state: { nextRunAtMs: scheduledAt - 1_000 }, + }); + await writeCronJobs(store.storePath, [cronJob]); + + const runIsolatedAgentJob = createDefaultIsolatedRunner(); + const nextRunSpy = vi.spyOn(schedule, "computeNextRunAtMs").mockReturnValue(undefined); + const state = createCronServiceState({ + cronEnabled: true, + storePath: store.storePath, + log: noopLogger, + nowMs: () => now, + enqueueSystemEvent: vi.fn(), + requestHeartbeatNow: vi.fn(), + runIsolatedAgentJob, + }); + + try { + await onTimer(state); + expect(runIsolatedAgentJob).toHaveBeenCalledTimes(1); + expect(state.store?.jobs[0]?.state.nextRunAtMs).toBeUndefined(); + + // Before the fix, applyJobResult would synthesize endedAt + 2_000 here, + // so a second tick a couple seconds later would refire the same job. + now = scheduledAt + 2_001; + await onTimer(state); + + expect(runIsolatedAgentJob).toHaveBeenCalledTimes(1); + expect(state.store?.jobs[0]?.state.nextRunAtMs).toBeUndefined(); + } finally { + nextRunSpy.mockRestore(); + if (state.timer) { + clearTimeout(state.timer); + state.timer = null; + } + } + }); + + it("does not refire a recurring errored cron job after the first backoff window when next-run resolution returns undefined", async () => { + const store = issue66019Fixtures.makeStorePath(); + const scheduledAt = Date.parse("2026-04-13T15:45:00.000Z"); + let now = scheduledAt; + + const cronJob = createIsolatedRegressionJob({ + id: "cron-66019-minimal-error", + name: "cron-66019-minimal-error", + scheduledAt, + schedule: { kind: "cron", expr: "0 7 * * *", tz: "Asia/Shanghai" }, + payload: { kind: "agentTurn", message: "ping" }, + state: { nextRunAtMs: scheduledAt - 1_000 }, + }); + await writeCronJobs(store.storePath, [cronJob]); + + const runIsolatedAgentJob = vi.fn().mockResolvedValue({ + status: "error", + error: "synthetic failure", + }); + const nextRunSpy = vi.spyOn(schedule, "computeNextRunAtMs").mockReturnValue(undefined); + const state = createCronServiceState({ + cronEnabled: true, + storePath: store.storePath, + log: noopLogger, + nowMs: () => now, + enqueueSystemEvent: vi.fn(), + requestHeartbeatNow: vi.fn(), + runIsolatedAgentJob, + }); + + try { + await onTimer(state); + expect(runIsolatedAgentJob).toHaveBeenCalledTimes(1); + expect(state.store?.jobs[0]?.state.nextRunAtMs).toBeUndefined(); + + // Before the fix, the error branch would synthesize the first backoff + // retry (30s), so the next tick after that window would rerun the job. + now = scheduledAt + 30_001; + await onTimer(state); + + expect(runIsolatedAgentJob).toHaveBeenCalledTimes(1); + expect(state.store?.jobs[0]?.state.nextRunAtMs).toBeUndefined(); + } finally { + nextRunSpy.mockRestore(); + if (state.timer) { + clearTimeout(state.timer); + state.timer = null; + } + } + }); +}); diff --git a/src/cron/service/timer.regression.test.ts b/src/cron/service/timer.regression.test.ts index a9ba523407e..f010df729bc 100644 --- a/src/cron/service/timer.regression.test.ts +++ b/src/cron/service/timer.regression.test.ts @@ -1061,11 +1061,11 @@ describe("cron service timer regressions", () => { expect(job.state.lastStatus).toBe("ok"); expect(job.state.scheduleErrorCount).toBe(1); expect(job.state.lastError).toMatch(/^schedule error:/); - expect(job.state.nextRunAtMs).toBe(endedAt + 2_000); + expect(job.state.nextRunAtMs).toBeUndefined(); expect(job.enabled).toBe(true); }); - it("falls back to backoff schedule when cron next-run computation throws on error path (#30905)", () => { + it("keeps state updates when cron next-run computation throws on error path (#30905)", () => { const startedAt = Date.parse("2026-03-02T12:05:00.000Z"); const endedAt = startedAt + 25; const state = createCronServiceState({ @@ -1100,10 +1100,93 @@ describe("cron service timer regressions", () => { expect(job.state.consecutiveErrors).toBe(1); expect(job.state.scheduleErrorCount).toBe(1); expect(job.state.lastError).toMatch(/^schedule error:/); - expect(job.state.nextRunAtMs).toBe(endedAt + 30_000); + expect(job.state.nextRunAtMs).toBeUndefined(); expect(job.enabled).toBe(true); }); + it("does not synthesize a 2s retry when cron schedule computation returns undefined (#66019)", () => { + const startedAt = Date.parse("2026-04-13T15:40:00.000Z"); + const endedAt = startedAt + 50; + const state = createCronServiceState({ + cronEnabled: true, + storePath: "/tmp/cron-66019-success.json", + log: noopLogger, + nowMs: () => endedAt, + enqueueSystemEvent: vi.fn(), + requestHeartbeatNow: vi.fn(), + runIsolatedAgentJob: createDefaultIsolatedRunner(), + }); + const job = createIsolatedRegressionJob({ + id: "cron-66019-success", + name: "cron-66019-success", + scheduledAt: startedAt, + schedule: { kind: "cron", expr: "0 7 * * *", tz: "Asia/Shanghai" }, + payload: { kind: "agentTurn", message: "ping" }, + state: { nextRunAtMs: startedAt - 1_000, runningAtMs: startedAt - 500 }, + }); + const nextRunSpy = vi.spyOn(schedule, "computeNextRunAtMs").mockReturnValue(undefined); + + try { + const shouldDelete = applyJobResult(state, job, { + status: "ok", + delivered: true, + startedAt, + endedAt, + }); + + expect(shouldDelete).toBe(false); + expect(job.state.runningAtMs).toBeUndefined(); + expect(job.state.lastRunAtMs).toBe(startedAt); + expect(job.state.lastStatus).toBe("ok"); + expect(job.state.nextRunAtMs).toBeUndefined(); + expect(job.enabled).toBe(true); + } finally { + nextRunSpy.mockRestore(); + } + }); + + it("does not synthesize backoff retries when cron schedule computation returns undefined (#66019)", () => { + const startedAt = Date.parse("2026-04-13T15:45:00.000Z"); + const endedAt = startedAt + 25; + const state = createCronServiceState({ + cronEnabled: true, + storePath: "/tmp/cron-66019-error.json", + log: noopLogger, + nowMs: () => endedAt, + enqueueSystemEvent: vi.fn(), + requestHeartbeatNow: vi.fn(), + runIsolatedAgentJob: createDefaultIsolatedRunner(), + }); + const job = createIsolatedRegressionJob({ + id: "cron-66019-error", + name: "cron-66019-error", + scheduledAt: startedAt, + schedule: { kind: "cron", expr: "0 7 * * *", tz: "Asia/Shanghai" }, + payload: { kind: "agentTurn", message: "ping" }, + state: { nextRunAtMs: startedAt - 1_000, runningAtMs: startedAt - 500 }, + }); + const nextRunSpy = vi.spyOn(schedule, "computeNextRunAtMs").mockReturnValue(undefined); + + try { + const shouldDelete = applyJobResult(state, job, { + status: "error", + error: "synthetic failure", + startedAt, + endedAt, + }); + + expect(shouldDelete).toBe(false); + expect(job.state.runningAtMs).toBeUndefined(); + expect(job.state.lastRunAtMs).toBe(startedAt); + expect(job.state.lastStatus).toBe("error"); + expect(job.state.consecutiveErrors).toBe(1); + expect(job.state.nextRunAtMs).toBeUndefined(); + expect(job.enabled).toBe(true); + } finally { + nextRunSpy.mockRestore(); + } + }); + it("force run preserves 'every' anchor while recording manual lastRunAtMs", () => { const nowMs = Date.now(); const everyMs = 24 * 60 * 60 * 1_000; diff --git a/src/cron/service/timer.ts b/src/cron/service/timer.ts index 8cfe3b19270..5bfd0c9e7a1 100644 --- a/src/cron/service/timer.ts +++ b/src/cron/service/timer.ts @@ -240,6 +240,27 @@ function isTransientCronError(error: string | undefined, retryOn?: CronRetryOn[] return keys.some((k) => TRANSIENT_PATTERNS[k]?.test(error)); } +function resolveCronNextRunWithLowerBound(params: { + state: CronServiceState; + job: CronJob; + naturalNext: number | undefined; + lowerBoundMs: number; + context: "completion" | "error_backoff"; +}): number | undefined { + if (params.naturalNext === undefined) { + params.state.deps.log.warn( + { + jobId: params.job.id, + jobName: params.job.name, + context: params.context, + }, + "cron: next run unresolved; clearing schedule to avoid a refire loop", + ); + return undefined; + } + return Math.max(params.naturalNext, params.lowerBoundMs); +} + function resolveRetryConfig(cronConfig?: CronConfig) { const retry = cronConfig?.retry; return { @@ -518,7 +539,17 @@ export function applyJobResult( const backoffNext = result.endedAt + backoff; // Use whichever is later: the natural next run or the backoff delay. job.state.nextRunAtMs = - normalNext !== undefined ? Math.max(normalNext, backoffNext) : backoffNext; + job.schedule.kind === "cron" + ? resolveCronNextRunWithLowerBound({ + state, + job, + naturalNext: normalNext, + lowerBoundMs: backoffNext, + context: "error_backoff", + }) + : normalNext !== undefined + ? Math.max(normalNext, backoffNext) + : backoffNext; state.deps.log.info( { jobId: job.id, @@ -547,8 +578,13 @@ export function applyJobResult( // schedule computation lands in the same second due to // timezone/croner edge cases (see #17821). const minNext = result.endedAt + MIN_REFIRE_GAP_MS; - job.state.nextRunAtMs = - naturalNext !== undefined ? Math.max(naturalNext, minNext) : minNext; + job.state.nextRunAtMs = resolveCronNextRunWithLowerBound({ + state, + job, + naturalNext, + lowerBoundMs: minNext, + context: "completion", + }); } else { job.state.nextRunAtMs = naturalNext; } @@ -609,6 +645,14 @@ export function armTimer(state: CronServiceState) { const withNextRun = state.store?.jobs.filter((j) => j.enabled && hasScheduledNextRunAtMs(j.state.nextRunAtMs)) .length ?? 0; + if (enabledCount > 0) { + armRunningRecheckTimer(state); + state.deps.log.debug( + { jobCount, enabledCount, withNextRun, delayMs: MAX_TIMER_DELAY_MS }, + "cron: timer armed for maintenance recheck", + ); + return; + } state.deps.log.debug( { jobCount, enabledCount, withNextRun }, "cron: armTimer skipped - no jobs with nextRunAtMs",