From c60282421524ef26c9367aac66331bd7686817c0 Mon Sep 17 00:00:00 2001
From: Mariano <132747814+mbelinky@users.noreply.github.com>
Date: Mon, 13 Apr 2026 20:10:03 +0200
Subject: [PATCH] fix(cron): stop unresolved next-run refire loops (#66083)

Merged via squash.

Prepared head SHA: b86ba58d3bff3f290995c830e992a739e5510665
---
 CHANGELOG.md                                  |   1 +
 src/cron/service.armtimer-tight-loop.test.ts  |  37 ++++++
 ...ce.issue-66019-unresolved-next-run.test.ts | 114 ++++++++++++++++++
 src/cron/service/timer.regression.test.ts     |  89 +++++++++++++-
 src/cron/service/timer.ts                     |  50 +++++++-
 5 files changed, 285 insertions(+), 6 deletions(-)
 create mode 100644 src/cron/service.issue-66019-unresolved-next-run.test.ts

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5c21d7bbd20..b0e271da3e0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,7 @@ Docs: https://docs.openclaw.ai
 - Browser/CDP: let managed local Chrome readiness, status probes, and managed loopback CDP control bypass browser SSRF policy for their own loopback control plane, so OpenClaw no longer misclassifies a healthy child browser as "not reachable after start". (#65695, #66043) Thanks @mbelinky.
 - Gateway/sessions: stop heartbeat, cron-event, and exec-event turns from overwriting shared-session routing and origin metadata, preventing synthetic `heartbeat` targets from poisoning later cron or user delivery. (#63733, #35300)
 - Browser/CDP: let local attach-only `manual-cdp` profiles reuse the local loopback CDP control plane under strict default policy and remote-class probe timeouts, so tabs/snapshot stop falsely reporting a live local browser session as not running. (#65611, #66080) Thanks @mbelinky.
+- Cron/scheduler: stop inventing short retries when cron next-run calculation returns no valid future slot, and keep a maintenance wake armed so enabled unscheduled jobs recover without entering a refire loop. (#66019, #66083) Thanks @mbelinky.
 
 ## 2026.4.12
 
diff --git a/src/cron/service.armtimer-tight-loop.test.ts b/src/cron/service.armtimer-tight-loop.test.ts
index b725adc78d6..c8cca51af98 100644
--- a/src/cron/service.armtimer-tight-loop.test.ts
+++ b/src/cron/service.armtimer-tight-loop.test.ts
@@ -141,6 +141,43 @@ describe("CronService - armTimer tight loop prevention", () => {
     timeoutSpy.mockRestore();
   });
 
+  it("keeps a maintenance wake armed when enabled jobs have no nextRunAtMs", () => {
+    const timeoutSpy = vi.spyOn(globalThis, "setTimeout");
+    const now = Date.parse("2026-02-28T12:32:00.000Z");
+
+    const state = createTimerState({
+      storePath: "/tmp/test-cron/jobs.json",
+      now,
+    });
+    state.store = {
+      version: 1,
+      jobs: [
+        {
+          id: "missing-next-run",
+          name: "missing-next-run",
+          enabled: true,
+          deleteAfterRun: false,
+          createdAtMs: now - 60_000,
+          updatedAtMs: now - 60_000,
+          schedule: { kind: "cron", expr: "*/15 * * * *" },
+          sessionTarget: "isolated" as const,
+          wakeMode: "next-heartbeat" as const,
+          payload: { kind: "agentTurn" as const, message: "test" },
+          delivery: { mode: "none" as const },
+          state: {},
+        },
+      ],
+    };
+
+    armTimer(state);
+
+    expect(state.timer).not.toBeNull();
+    const delays = extractTimeoutDelays(timeoutSpy);
+    expect(delays).toContain(60_000);
+
+    timeoutSpy.mockRestore();
+  });
+
   it("breaks the onTimer→armTimer hot-loop with stuck runningAtMs", async () => {
     const timeoutSpy = vi.spyOn(globalThis, "setTimeout");
     const store = await makeStorePath();
diff --git a/src/cron/service.issue-66019-unresolved-next-run.test.ts b/src/cron/service.issue-66019-unresolved-next-run.test.ts
new file mode 100644
index 00000000000..18f447c7cbe
--- /dev/null
+++ b/src/cron/service.issue-66019-unresolved-next-run.test.ts
@@ -0,0 +1,114 @@
+import { describe, expect, it, vi } from "vitest";
+import {
+  createDefaultIsolatedRunner,
+  createIsolatedRegressionJob,
+  noopLogger,
+  setupCronRegressionFixtures,
+  writeCronJobs,
+} from "../../test/helpers/cron/service-regression-fixtures.js";
+import * as schedule from "./schedule.js";
+import { createCronServiceState } from "./service/state.js";
+import { onTimer } from "./service/timer.js";
+
+const issue66019Fixtures = setupCronRegressionFixtures({ prefix: "cron-66019-" });
+
+describe("#66019 unresolved next-run repro", () => {
+  it("does not refire a recurring cron job 2s later when next-run resolution returns undefined", async () => {
+    const store = issue66019Fixtures.makeStorePath();
+    const scheduledAt = Date.parse("2026-04-13T15:40:00.000Z");
+    let now = scheduledAt;
+
+    const cronJob = createIsolatedRegressionJob({
+      id: "cron-66019-minimal-success",
+      name: "cron-66019-minimal-success",
+      scheduledAt,
+      schedule: { kind: "cron", expr: "0 7 * * *", tz: "Asia/Shanghai" },
+      payload: { kind: "agentTurn", message: "ping" },
+      state: { nextRunAtMs: scheduledAt - 1_000 },
+    });
+    await writeCronJobs(store.storePath, [cronJob]);
+
+    const runIsolatedAgentJob = createDefaultIsolatedRunner();
+    const nextRunSpy = vi.spyOn(schedule, "computeNextRunAtMs").mockReturnValue(undefined);
+    const state = createCronServiceState({
+      cronEnabled: true,
+      storePath: store.storePath,
+      log: noopLogger,
+      nowMs: () => now,
+      enqueueSystemEvent: vi.fn(),
+      requestHeartbeatNow: vi.fn(),
+      runIsolatedAgentJob,
+    });
+
+    try {
+      await onTimer(state);
+      expect(runIsolatedAgentJob).toHaveBeenCalledTimes(1);
+      expect(state.store?.jobs[0]?.state.nextRunAtMs).toBeUndefined();
+
+      // Before the fix, applyJobResult would synthesize endedAt + 2_000 here,
+      // so a second tick a couple seconds later would refire the same job.
+      now = scheduledAt + 2_001;
+      await onTimer(state);
+
+      expect(runIsolatedAgentJob).toHaveBeenCalledTimes(1);
+      expect(state.store?.jobs[0]?.state.nextRunAtMs).toBeUndefined();
+    } finally {
+      nextRunSpy.mockRestore();
+      if (state.timer) {
+        clearTimeout(state.timer);
+        state.timer = null;
+      }
+    }
+  });
+
+  it("does not refire a recurring errored cron job after the first backoff window when next-run resolution returns undefined", async () => {
+    const store = issue66019Fixtures.makeStorePath();
+    const scheduledAt = Date.parse("2026-04-13T15:45:00.000Z");
+    let now = scheduledAt;
+
+    const cronJob = createIsolatedRegressionJob({
+      id: "cron-66019-minimal-error",
+      name: "cron-66019-minimal-error",
+      scheduledAt,
+      schedule: { kind: "cron", expr: "0 7 * * *", tz: "Asia/Shanghai" },
+      payload: { kind: "agentTurn", message: "ping" },
+      state: { nextRunAtMs: scheduledAt - 1_000 },
+    });
+    await writeCronJobs(store.storePath, [cronJob]);
+
+    const runIsolatedAgentJob = vi.fn().mockResolvedValue({
+      status: "error",
+      error: "synthetic failure",
+    });
+    const nextRunSpy = vi.spyOn(schedule, "computeNextRunAtMs").mockReturnValue(undefined);
+    const state = createCronServiceState({
+      cronEnabled: true,
+      storePath: store.storePath,
+      log: noopLogger,
+      nowMs: () => now,
+      enqueueSystemEvent: vi.fn(),
+      requestHeartbeatNow: vi.fn(),
+      runIsolatedAgentJob,
+    });
+
+    try {
+      await onTimer(state);
+      expect(runIsolatedAgentJob).toHaveBeenCalledTimes(1);
+      expect(state.store?.jobs[0]?.state.nextRunAtMs).toBeUndefined();
+
+      // Before the fix, the error branch would synthesize the first backoff
+      // retry (30s), so the next tick after that window would rerun the job.
+      now = scheduledAt + 30_001;
+      await onTimer(state);
+
+      expect(runIsolatedAgentJob).toHaveBeenCalledTimes(1);
+      expect(state.store?.jobs[0]?.state.nextRunAtMs).toBeUndefined();
+    } finally {
+      nextRunSpy.mockRestore();
+      if (state.timer) {
+        clearTimeout(state.timer);
+        state.timer = null;
+      }
+    }
+  });
+});
diff --git a/src/cron/service/timer.regression.test.ts b/src/cron/service/timer.regression.test.ts
index a9ba523407e..f010df729bc 100644
--- a/src/cron/service/timer.regression.test.ts
+++ b/src/cron/service/timer.regression.test.ts
@@ -1061,11 +1061,11 @@ describe("cron service timer regressions", () => {
     expect(job.state.lastStatus).toBe("ok");
     expect(job.state.scheduleErrorCount).toBe(1);
     expect(job.state.lastError).toMatch(/^schedule error:/);
-    expect(job.state.nextRunAtMs).toBe(endedAt + 2_000);
+    expect(job.state.nextRunAtMs).toBeUndefined();
     expect(job.enabled).toBe(true);
   });
 
-  it("falls back to backoff schedule when cron next-run computation throws on error path (#30905)", () => {
+  it("keeps state updates when cron next-run computation throws on error path (#30905)", () => {
     const startedAt = Date.parse("2026-03-02T12:05:00.000Z");
     const endedAt = startedAt + 25;
     const state = createCronServiceState({
@@ -1100,10 +1100,93 @@ describe("cron service timer regressions", () => {
     expect(job.state.consecutiveErrors).toBe(1);
     expect(job.state.scheduleErrorCount).toBe(1);
     expect(job.state.lastError).toMatch(/^schedule error:/);
-    expect(job.state.nextRunAtMs).toBe(endedAt + 30_000);
+    expect(job.state.nextRunAtMs).toBeUndefined();
     expect(job.enabled).toBe(true);
   });
 
+  it("does not synthesize a 2s retry when cron schedule computation returns undefined (#66019)", () => {
+    const startedAt = Date.parse("2026-04-13T15:40:00.000Z");
+    const endedAt = startedAt + 50;
+    const state = createCronServiceState({
+      cronEnabled: true,
+      storePath: "/tmp/cron-66019-success.json",
+      log: noopLogger,
+      nowMs: () => endedAt,
+      enqueueSystemEvent: vi.fn(),
+      requestHeartbeatNow: vi.fn(),
+      runIsolatedAgentJob: createDefaultIsolatedRunner(),
+    });
+    const job = createIsolatedRegressionJob({
+      id: "cron-66019-success",
+      name: "cron-66019-success",
+      scheduledAt: startedAt,
+      schedule: { kind: "cron", expr: "0 7 * * *", tz: "Asia/Shanghai" },
+      payload: { kind: "agentTurn", message: "ping" },
+      state: { nextRunAtMs: startedAt - 1_000, runningAtMs: startedAt - 500 },
+    });
+    const nextRunSpy = vi.spyOn(schedule, "computeNextRunAtMs").mockReturnValue(undefined);
+
+    try {
+      const shouldDelete = applyJobResult(state, job, {
+        status: "ok",
+        delivered: true,
+        startedAt,
+        endedAt,
+      });
+
+      expect(shouldDelete).toBe(false);
+      expect(job.state.runningAtMs).toBeUndefined();
+      expect(job.state.lastRunAtMs).toBe(startedAt);
+      expect(job.state.lastStatus).toBe("ok");
+      expect(job.state.nextRunAtMs).toBeUndefined();
+      expect(job.enabled).toBe(true);
+    } finally {
+      nextRunSpy.mockRestore();
+    }
+  });
+
+  it("does not synthesize backoff retries when cron schedule computation returns undefined (#66019)", () => {
+    const startedAt = Date.parse("2026-04-13T15:45:00.000Z");
+    const endedAt = startedAt + 25;
+    const state = createCronServiceState({
+      cronEnabled: true,
+      storePath: "/tmp/cron-66019-error.json",
+      log: noopLogger,
+      nowMs: () => endedAt,
+      enqueueSystemEvent: vi.fn(),
+      requestHeartbeatNow: vi.fn(),
+      runIsolatedAgentJob: createDefaultIsolatedRunner(),
+    });
+    const job = createIsolatedRegressionJob({
+      id: "cron-66019-error",
+      name: "cron-66019-error",
+      scheduledAt: startedAt,
+      schedule: { kind: "cron", expr: "0 7 * * *", tz: "Asia/Shanghai" },
+      payload: { kind: "agentTurn", message: "ping" },
+      state: { nextRunAtMs: startedAt - 1_000, runningAtMs: startedAt - 500 },
+    });
+    const nextRunSpy = vi.spyOn(schedule, "computeNextRunAtMs").mockReturnValue(undefined);
+
+    try {
+      const shouldDelete = applyJobResult(state, job, {
+        status: "error",
+        error: "synthetic failure",
+        startedAt,
+        endedAt,
+      });
+
+      expect(shouldDelete).toBe(false);
+      expect(job.state.runningAtMs).toBeUndefined();
+      expect(job.state.lastRunAtMs).toBe(startedAt);
+      expect(job.state.lastStatus).toBe("error");
+      expect(job.state.consecutiveErrors).toBe(1);
+      expect(job.state.nextRunAtMs).toBeUndefined();
+      expect(job.enabled).toBe(true);
+    } finally {
+      nextRunSpy.mockRestore();
+    }
+  });
+
   it("force run preserves 'every' anchor while recording manual lastRunAtMs", () => {
     const nowMs = Date.now();
     const everyMs = 24 * 60 * 60 * 1_000;
diff --git a/src/cron/service/timer.ts b/src/cron/service/timer.ts
index 8cfe3b19270..5bfd0c9e7a1 100644
--- a/src/cron/service/timer.ts
+++ b/src/cron/service/timer.ts
@@ -240,6 +240,27 @@ function isTransientCronError(error: string | undefined, retryOn?: CronRetryOn[]
   return keys.some((k) => TRANSIENT_PATTERNS[k]?.test(error));
 }
 
+function resolveCronNextRunWithLowerBound(params: {
+  state: CronServiceState;
+  job: CronJob;
+  naturalNext: number | undefined;
+  lowerBoundMs: number;
+  context: "completion" | "error_backoff";
+}): number | undefined {
+  if (params.naturalNext === undefined) {
+    params.state.deps.log.warn(
+      {
+        jobId: params.job.id,
+        jobName: params.job.name,
+        context: params.context,
+      },
+      "cron: next run unresolved; clearing schedule to avoid a refire loop",
+    );
+    return undefined;
+  }
+  return Math.max(params.naturalNext, params.lowerBoundMs);
+}
+
 function resolveRetryConfig(cronConfig?: CronConfig) {
   const retry = cronConfig?.retry;
   return {
@@ -518,7 +539,17 @@ export function applyJobResult(
       const backoffNext = result.endedAt + backoff;
       // Use whichever is later: the natural next run or the backoff delay.
       job.state.nextRunAtMs =
-        normalNext !== undefined ? Math.max(normalNext, backoffNext) : backoffNext;
+        job.schedule.kind === "cron"
+          ? resolveCronNextRunWithLowerBound({
+              state,
+              job,
+              naturalNext: normalNext,
+              lowerBoundMs: backoffNext,
+              context: "error_backoff",
+            })
+          : normalNext !== undefined
+            ? Math.max(normalNext, backoffNext)
+            : backoffNext;
       state.deps.log.info(
         {
           jobId: job.id,
@@ -547,8 +578,13 @@ export function applyJobResult(
         // schedule computation lands in the same second due to
         // timezone/croner edge cases (see #17821).
         const minNext = result.endedAt + MIN_REFIRE_GAP_MS;
-        job.state.nextRunAtMs =
-          naturalNext !== undefined ? Math.max(naturalNext, minNext) : minNext;
+        job.state.nextRunAtMs = resolveCronNextRunWithLowerBound({
+          state,
+          job,
+          naturalNext,
+          lowerBoundMs: minNext,
+          context: "completion",
+        });
       } else {
         job.state.nextRunAtMs = naturalNext;
       }
@@ -609,6 +645,14 @@ export function armTimer(state: CronServiceState) {
     const withNextRun =
       state.store?.jobs.filter((j) => j.enabled && hasScheduledNextRunAtMs(j.state.nextRunAtMs))
         .length ?? 0;
+    if (enabledCount > 0) {
+      armRunningRecheckTimer(state);
+      state.deps.log.debug(
+        { jobCount, enabledCount, withNextRun, delayMs: MAX_TIMER_DELAY_MS },
+        "cron: timer armed for maintenance recheck",
+      );
+      return;
+    }
     state.deps.log.debug(
       { jobCount, enabledCount, withNextRun },
       "cron: armTimer skipped - no jobs with nextRunAtMs",