test: speed up Docker live scheduling

2026-04-28 04:28:58 +02:00 · 2026-04-25 10:01:41 +01:00
parent 6bdf87de87
commit 70b3ba2fed
4 changed files with 207 additions and 30 deletions
--- a/docs/reference/test.md
+++ b/docs/reference/test.md
@@ -32,7 +32,8 @@ title: "Tests"
 - Gateway integration: opt-in via `OPENCLAW_TEST_INCLUDE_GATEWAY=1 pnpm test` or `pnpm test:gateway`.
 - `pnpm test:e2e`: Runs gateway end-to-end smoke tests (multi-instance WS/HTTP/node pairing). Defaults to `threads` + `isolate: false` with adaptive workers in `vitest.e2e.config.ts`; tune with `OPENCLAW_E2E_WORKERS=<n>` and set `OPENCLAW_E2E_VERBOSE=1` for verbose logs.
 - `pnpm test:live`: Runs provider live tests (minimax/zai). Requires API keys and `LIVE=1` (or provider-specific `*_LIVE_TEST=1`) to unskip.
- `pnpm test:docker:all`: Builds the shared live-test image and Docker E2E image once, then runs the Docker smoke lanes with `OPENCLAW_SKIP_DOCKER_BUILD=1` through a weighted scheduler. `OPENCLAW_DOCKER_ALL_PARALLELISM=<n>` controls process slots and defaults to 10; `OPENCLAW_DOCKER_ALL_TAIL_PARALLELISM=<n>` controls the provider-sensitive tail pool and defaults to 10. Heavy lane caps default to `OPENCLAW_DOCKER_ALL_LIVE_LIMIT=6`, `OPENCLAW_DOCKER_ALL_NPM_LIMIT=8`, and `OPENCLAW_DOCKER_ALL_SERVICE_LIMIT=7`; use `OPENCLAW_DOCKER_ALL_WEIGHT_LIMIT` or `OPENCLAW_DOCKER_ALL_DOCKER_LIMIT` for larger hosts. Lane starts are staggered by 2 seconds by default to avoid local Docker daemon create storms; override with `OPENCLAW_DOCKER_ALL_START_STAGGER_MS=<ms>`. The runner preflights Docker by default, cleans stale OpenClaw E2E containers, emits active-lane status every 30 seconds, and stores lane timings in `.artifacts/docker-tests/lane-timings.json` for longest-first ordering on later runs. Use `OPENCLAW_DOCKER_ALL_DRY_RUN=1` to print the lane manifest without running Docker, `OPENCLAW_DOCKER_ALL_STATUS_INTERVAL_MS=<ms>` to tune status output, or `OPENCLAW_DOCKER_ALL_TIMINGS=0` to disable timing reuse. The runner stops scheduling new pooled lanes after the first failure unless `OPENCLAW_DOCKER_ALL_FAIL_FAST=0` is set, and each lane has a 120-minute fallback timeout overrideable with `OPENCLAW_DOCKER_ALL_LANE_TIMEOUT_MS`; selected live/tail lanes use tighter per-lane caps. Per-lane logs are written under `.artifacts/docker-tests/<run-id>/`.
+- `pnpm test:docker:all`: Builds the shared live-test image and Docker E2E image once, then runs the Docker smoke lanes with `OPENCLAW_SKIP_DOCKER_BUILD=1` through a weighted scheduler. `OPENCLAW_DOCKER_ALL_PARALLELISM=<n>` controls process slots and defaults to 10; `OPENCLAW_DOCKER_ALL_TAIL_PARALLELISM=<n>` controls the provider-sensitive tail pool and defaults to 10. Heavy lane caps default to `OPENCLAW_DOCKER_ALL_LIVE_LIMIT=9`, `OPENCLAW_DOCKER_ALL_NPM_LIMIT=10`, and `OPENCLAW_DOCKER_ALL_SERVICE_LIMIT=7`; provider caps default to four live lanes per provider via `OPENCLAW_DOCKER_ALL_LIVE_CLAUDE_LIMIT=4`, `OPENCLAW_DOCKER_ALL_LIVE_CODEX_LIMIT=4`, and `OPENCLAW_DOCKER_ALL_LIVE_GEMINI_LIMIT=4`. Use `OPENCLAW_DOCKER_ALL_WEIGHT_LIMIT` or `OPENCLAW_DOCKER_ALL_DOCKER_LIMIT` for larger hosts. Lane starts are staggered by 2 seconds by default to avoid local Docker daemon create storms; override with `OPENCLAW_DOCKER_ALL_START_STAGGER_MS=<ms>`. The runner preflights Docker by default, cleans stale OpenClaw E2E containers, emits active-lane status every 30 seconds, shares provider CLI tool caches between compatible lanes, retries transient live-provider failures once by default (`OPENCLAW_DOCKER_ALL_LIVE_RETRIES=<n>`), and stores lane timings in `.artifacts/docker-tests/lane-timings.json` for longest-first ordering on later runs. Use `OPENCLAW_DOCKER_ALL_DRY_RUN=1` to print the lane manifest without running Docker, `OPENCLAW_DOCKER_ALL_STATUS_INTERVAL_MS=<ms>` to tune status output, or `OPENCLAW_DOCKER_ALL_TIMINGS=0` to disable timing reuse. Use `OPENCLAW_DOCKER_ALL_LIVE_MODE=skip` for deterministic/local lanes only or `OPENCLAW_DOCKER_ALL_LIVE_MODE=only` for live-provider lanes only; package aliases are `pnpm test:docker:local:all` and `pnpm test:docker:live:all`. Live-only mode merges main and tail live lanes into one longest-first pool so provider buckets can pack Claude, Codex, and Gemini work together. The runner stops scheduling new pooled lanes after the first failure unless `OPENCLAW_DOCKER_ALL_FAIL_FAST=0` is set, and each lane has a 120-minute fallback timeout overrideable with `OPENCLAW_DOCKER_ALL_LANE_TIMEOUT_MS`; selected live/tail lanes use tighter per-lane caps. CLI backend Docker setup commands have their own timeout via `OPENCLAW_LIVE_CLI_BACKEND_SETUP_TIMEOUT_SECONDS` (default 180). Per-lane logs are written under `.artifacts/docker-tests/<run-id>/`.
+- CLI backend live Docker probes can be run as focused lanes, for example `pnpm test:docker:live-cli-backend:codex`, `pnpm test:docker:live-cli-backend:codex:resume`, or `pnpm test:docker:live-cli-backend:codex:mcp`. Claude and Gemini have matching `:resume` and `:mcp` aliases.
 - `pnpm test:docker:openwebui`: Starts Dockerized OpenClaw + Open WebUI, signs in through Open WebUI, checks `/api/models`, then runs a real proxied chat through `/api/chat/completions`. Requires a usable live model key (for example OpenAI in `~/.profile`), pulls an external Open WebUI image, and is not expected to be CI-stable like the normal unit/e2e suites.
 - `pnpm test:docker:mcp-channels`: Starts a seeded Gateway container and a second client container that spawns `openclaw mcp serve`, then verifies routed conversation discovery, transcript reads, attachment metadata, live event queue behavior, outbound send routing, and Claude-style channel + permission notifications over the real stdio bridge. The Claude notification assertion reads the raw stdio MCP frames directly so the smoke reflects what the bridge actually emits.

--- a/package.json
+++ b/package.json
@@ -1492,8 +1492,14 @@
    "test:docker:live-cli-backend": "bash scripts/test-live-cli-backend-docker.sh",
    "test:docker:live-cli-backend:claude": "OPENCLAW_LIVE_CLI_BACKEND_MODEL=claude-cli/claude-sonnet-4-6 bash scripts/test-live-cli-backend-docker.sh",
    "test:docker:live-cli-backend:claude-subscription": "OPENCLAW_LIVE_CLI_BACKEND_AUTH=subscription OPENCLAW_LIVE_CLI_BACKEND_MODEL=claude-cli/claude-sonnet-4-6 OPENCLAW_LIVE_CLI_BACKEND_DISABLE_MCP_CONFIG=1 OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE=0 OPENCLAW_LIVE_CLI_BACKEND_RESUME_PROBE=1 OPENCLAW_LIVE_CLI_BACKEND_IMAGE_PROBE=0 OPENCLAW_LIVE_CLI_BACKEND_MCP_PROBE=0 bash scripts/test-live-cli-backend-docker.sh",
+    "test:docker:live-cli-backend:claude:mcp": "OPENCLAW_LIVE_CLI_BACKEND_MODEL=claude-cli/claude-sonnet-4-6 OPENCLAW_LIVE_CLI_BACKEND_MCP_PROBE=1 bash scripts/test-live-cli-backend-docker.sh",
+    "test:docker:live-cli-backend:claude:resume": "OPENCLAW_LIVE_CLI_BACKEND_MODEL=claude-cli/claude-sonnet-4-6 OPENCLAW_LIVE_CLI_BACKEND_RESUME_PROBE=1 bash scripts/test-live-cli-backend-docker.sh",
    "test:docker:live-cli-backend:codex": "OPENCLAW_LIVE_CLI_BACKEND_MODEL=codex-cli/gpt-5.2 bash scripts/test-live-cli-backend-docker.sh",
+    "test:docker:live-cli-backend:codex:mcp": "OPENCLAW_LIVE_CLI_BACKEND_MODEL=codex-cli/gpt-5.2 OPENCLAW_LIVE_CLI_BACKEND_MCP_PROBE=1 bash scripts/test-live-cli-backend-docker.sh",
+    "test:docker:live-cli-backend:codex:resume": "OPENCLAW_LIVE_CLI_BACKEND_MODEL=codex-cli/gpt-5.2 OPENCLAW_LIVE_CLI_BACKEND_RESUME_PROBE=1 bash scripts/test-live-cli-backend-docker.sh",
    "test:docker:live-cli-backend:gemini": "OPENCLAW_LIVE_CLI_BACKEND_MODEL=google-gemini-cli/gemini-3-flash-preview bash scripts/test-live-cli-backend-docker.sh",
+    "test:docker:live-cli-backend:gemini:mcp": "OPENCLAW_LIVE_CLI_BACKEND_MODEL=google-gemini-cli/gemini-3-flash-preview OPENCLAW_LIVE_CLI_BACKEND_MCP_PROBE=1 bash scripts/test-live-cli-backend-docker.sh",
+    "test:docker:live-cli-backend:gemini:resume": "OPENCLAW_LIVE_CLI_BACKEND_MODEL=google-gemini-cli/gemini-3-flash-preview OPENCLAW_LIVE_CLI_BACKEND_RESUME_PROBE=1 bash scripts/test-live-cli-backend-docker.sh",
    "test:docker:live-codex-bind": "OPENCLAW_LIVE_CODEX_BIND=1 OPENCLAW_LIVE_CODEX_TEST_FILES=src/gateway/gateway-codex-bind.live.test.ts bash scripts/test-live-codex-harness-docker.sh",
    "test:docker:live-codex-harness": "bash scripts/test-live-codex-harness-docker.sh",
    "test:docker:live-gateway": "bash scripts/test-live-gateway-models-docker.sh",
@@ -1504,6 +1510,8 @@
    "test:docker:live-models:claude": "OPENCLAW_LIVE_PROVIDERS=claude-cli OPENCLAW_LIVE_MODELS=claude-cli/claude-sonnet-4-6 bash scripts/test-live-models-docker.sh",
    "test:docker:live-models:codex": "OPENCLAW_LIVE_PROVIDERS=codex-cli OPENCLAW_LIVE_MODELS=codex-cli/gpt-5.2 bash scripts/test-live-models-docker.sh",
    "test:docker:live-models:gemini": "OPENCLAW_LIVE_PROVIDERS=google-gemini-cli OPENCLAW_LIVE_MODELS=google-gemini-cli/gemini-3.1-pro-preview bash scripts/test-live-models-docker.sh",
+    "test:docker:live:all": "OPENCLAW_DOCKER_ALL_LIVE_MODE=only node scripts/test-docker-all.mjs",
+    "test:docker:local:all": "OPENCLAW_DOCKER_ALL_LIVE_MODE=skip node scripts/test-docker-all.mjs",
    "test:docker:mcp-channels": "bash scripts/e2e/mcp-channels-docker.sh",
    "test:docker:npm-onboard-channel-agent": "bash scripts/e2e/npm-onboard-channel-agent-docker.sh",
    "test:docker:npm-telegram-live": "bash scripts/e2e/npm-telegram-live-docker.sh",
--- a/scripts/test-docker-all.mjs
+++ b/scripts/test-docker-all.mjs
@@ -11,6 +11,7 @@ const DEFAULT_TAIL_PARALLELISM = 10;
 const DEFAULT_FAILURE_TAIL_LINES = 80;
 const DEFAULT_LANE_TIMEOUT_MS = 120 * 60 * 1000;
 const DEFAULT_LANE_START_STAGGER_MS = 2_000;
+const DEFAULT_LIVE_RETRIES = 1;
 const DEFAULT_STATUS_INTERVAL_MS = 30_000;
 const DEFAULT_PREFLIGHT_RUN_TIMEOUT_MS = 60_000;
 const DEFAULT_TIMINGS_FILE = path.join(ROOT_DIR, ".artifacts/docker-tests/lane-timings.json");
@@ -21,29 +22,71 @@ const OPENWEBUI_TIMEOUT_MS = 20 * 60 * 1000;
 const BUNDLED_UPDATE_TIMEOUT_MS = 20 * 60 * 1000;
 const DEFAULT_RESOURCE_LIMITS = {
  docker: DEFAULT_PARALLELISM,
-  live: 6,
-  npm: 8,
+  live: 9,
+  "live:claude": 4,
+  "live:codex": 4,
+  "live:gemini": 4,
+  npm: 10,
  service: 7,
 };
+const LIVE_RETRY_PATTERNS = [ // log signatures treated as transient, retryable live failures
+  /\b529\b/i, // standalone 529 only; never the tail of a longer number such as a pid or port
+  /overloaded/i,
+  /capacity/i,
+  /rate.?limit/i,
+  /gateway closed \(1000 normal closure\)/i,
+  /ECONNRESET|ETIMEDOUT|ENOTFOUND/i,
+];

 const bundledChannelLaneCommand =
  "OPENCLAW_SKIP_DOCKER_BUILD=1 OPENCLAW_BUNDLED_CHANNEL_UPDATE_SCENARIO=0 OPENCLAW_BUNDLED_CHANNEL_ROOT_OWNED_SCENARIO=0 OPENCLAW_BUNDLED_CHANNEL_SETUP_ENTRY_SCENARIO=0 OPENCLAW_BUNDLED_CHANNEL_LOAD_FAILURE_SCENARIO=0 OPENCLAW_BUNDLED_CHANNEL_DISABLED_CONFIG_SCENARIO=0 pnpm test:docker:bundled-channel-deps";

 function lane(name, command, options = {}) {
  return {
+    cacheKey: options.cacheKey,
    command,
    estimateSeconds: options.estimateSeconds,
+    live: options.live === true,
    name,
+    retryPatterns: options.retryPatterns ?? [],
+    retries: options.retries ?? 0,
    resources: options.resources ?? [],
    timeoutMs: options.timeoutMs,
    weight: options.weight ?? 1,
  };
 }

+function liveProviderResource(provider) {
+  // Resolves a provider id to its per-provider scheduler resource name.
+  if (!provider) {
+    return undefined;
+  }
+  // Long CLI ids and their short aliases land in the same resource bucket.
+  const knownResources = new Map([
+    ["claude-cli", "live:claude"],
+    ["claude", "live:claude"],
+    ["codex-cli", "live:codex"],
+    ["codex", "live:codex"],
+    ["google-gemini-cli", "live:gemini"],
+    ["gemini", "live:gemini"],
+    ["openai", "live:openai"],
+  ]);
+  // Any other id gets a bucket named after itself.
+  return knownResources.get(provider) ?? `live:${provider}`;
+}
+
+function liveProviderResources(options) { // `providers` list wins over the single `provider` id
+  const ids = options.providers ?? (options.provider ? [options.provider] : []);
+  return ids.flatMap((id) => liveProviderResource(id) || []);
+}
+
 function liveLane(name, command, options = {}) {
  return lane(name, command, {
    ...options,
-    resources: ["live", ...(options.resources ?? [])],
+    live: true,
+    resources: ["live", ...liveProviderResources(options), ...(options.resources ?? [])],
+    retryPatterns: options.retryPatterns ?? LIVE_RETRY_PATTERNS,
+    retries: options.retries ?? DEFAULT_LIVE_RETRIES,
    weight: options.weight ?? 3,
  });
 }
@@ -132,22 +175,36 @@ const bundledScenarioLanes = [

 const lanes = [
  liveLane("live-models", "OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:live-models", {
+    providers: ["claude-cli", "codex-cli", "google-gemini-cli"],
    timeoutMs: LIVE_PROFILE_TIMEOUT_MS,
    weight: 4,
  }),
  liveLane("live-gateway", "OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:live-gateway", {
+    providers: ["claude-cli", "codex-cli", "google-gemini-cli"],
    timeoutMs: LIVE_PROFILE_TIMEOUT_MS,
    weight: 4,
  }),
  liveLane(
    "live-cli-backend-claude",
    "OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:live-cli-backend:claude",
-    { resources: ["npm"], timeoutMs: LIVE_CLI_TIMEOUT_MS, weight: 3 },
+    {
+      cacheKey: "cli-backend-claude",
+      provider: "claude-cli",
+      resources: ["npm"],
+      timeoutMs: LIVE_CLI_TIMEOUT_MS,
+      weight: 3,
+    },
  ),
  liveLane(
    "live-cli-backend-gemini",
    "OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:live-cli-backend:gemini",
-    { resources: ["npm"], timeoutMs: LIVE_CLI_TIMEOUT_MS, weight: 3 },
+    {
+      cacheKey: "cli-backend-gemini",
+      provider: "google-gemini-cli",
+      resources: ["npm"],
+      timeoutMs: LIVE_CLI_TIMEOUT_MS,
+      weight: 3,
+    },
  ),
  serviceLane("openwebui", "OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:openwebui", {
    timeoutMs: OPENWEBUI_TIMEOUT_MS,
@@ -204,9 +261,17 @@ const exclusiveLanes = [
  liveLane(
    "live-codex-harness",
    "OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:live-codex-harness",
-    { resources: ["npm"], timeoutMs: LIVE_ACP_TIMEOUT_MS, weight: 3 },
+    {
+      cacheKey: "codex-harness",
+      provider: "codex-cli",
+      resources: ["npm"],
+      timeoutMs: LIVE_ACP_TIMEOUT_MS,
+      weight: 3,
+    },
  ),
  liveLane("live-codex-bind", "OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:live-codex-bind", {
+    cacheKey: "codex-harness",
+    provider: "codex-cli",
    resources: ["npm"],
    timeoutMs: LIVE_ACP_TIMEOUT_MS,
    weight: 3,
@@ -214,22 +279,46 @@ const exclusiveLanes = [
  liveLane(
    "live-cli-backend-codex",
    "OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:live-cli-backend:codex",
-    { resources: ["npm"], timeoutMs: LIVE_CLI_TIMEOUT_MS, weight: 3 },
+    {
+      cacheKey: "cli-backend-codex",
+      provider: "codex-cli",
+      resources: ["npm"],
+      timeoutMs: LIVE_CLI_TIMEOUT_MS,
+      weight: 3,
+    },
  ),
  liveLane(
    "live-acp-bind-claude",
    "OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:live-acp-bind:claude",
-    { resources: ["npm"], timeoutMs: LIVE_ACP_TIMEOUT_MS, weight: 3 },
+    {
+      cacheKey: "acp-bind-claude",
+      provider: "claude-cli",
+      resources: ["npm"],
+      timeoutMs: LIVE_ACP_TIMEOUT_MS,
+      weight: 3,
+    },
  ),
  liveLane(
    "live-acp-bind-codex",
    "OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:live-acp-bind:codex",
-    { resources: ["npm"], timeoutMs: LIVE_ACP_TIMEOUT_MS, weight: 3 },
+    {
+      cacheKey: "acp-bind-codex",
+      provider: "codex-cli",
+      resources: ["npm"],
+      timeoutMs: LIVE_ACP_TIMEOUT_MS,
+      weight: 3,
+    },
  ),
  liveLane(
    "live-acp-bind-gemini",
    "OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:live-acp-bind:gemini",
-    { resources: ["npm"], timeoutMs: LIVE_ACP_TIMEOUT_MS, weight: 3 },
+    {
+      cacheKey: "acp-bind-gemini",
+      provider: "google-gemini-cli",
+      resources: ["npm"],
+      timeoutMs: LIVE_ACP_TIMEOUT_MS,
+      weight: 3,
+    },
  ),
 ];

@@ -264,8 +353,39 @@ function parseBool(raw, fallback) {
  return !/^(?:0|false|no)$/i.test(raw);
 }

+function parseLiveMode(raw) {
+  const requested = raw || "all"; // unset or empty selects every lane
+  if (!["all", "skip", "only"].includes(requested)) {
+    throw new Error(
+      `OPENCLAW_DOCKER_ALL_LIVE_MODE must be one of: all, skip, only. Got: ${JSON.stringify(raw)}`,
+    );
+  }
+  return requested;
+}
+
+function applyLiveMode(poolLanes, mode) {
+  // "all" hands back the input array untouched; "only" keeps live lanes, anything else drops them.
+  if (mode === "all") return poolLanes;
+  const keepLive = mode === "only";
+  return poolLanes.filter((entry) => Boolean(entry.live) === keepLive);
+}
+
+function applyLiveRetries(poolLanes, retries) { // non-live lanes pass through; live lanes are copied
+  return poolLanes.map((entry) => (!entry.live ? entry : { ...entry, retries }));
+}
+
+function resourceLimitsSummary(resourceLimits) { // space-separated `name=limit` pairs
+  const pairs = [];
+  for (const [name, cap] of Object.entries(resourceLimits)) pairs.push(`${name}=${String(cap)}`);
+  return pairs.join(" ");
+}
+
+function resourceLimitEnvName(resource) { // "live:claude" -> OPENCLAW_DOCKER_ALL_LIVE_CLAUDE_LIMIT
+  return ["OPENCLAW_DOCKER_ALL", resource.toUpperCase().replace(/[^A-Z0-9]+/g, "_"), "LIMIT"].join("_");
+}
+
 function parseResourceLimit(env, resource, parallelism, fallback) {
-  const envName = `OPENCLAW_DOCKER_ALL_${resource.toUpperCase()}_LIMIT`;
+  const envName = resourceLimitEnvName(resource);
  return parsePositiveInt(env[envName], Math.min(parallelism, fallback), envName);
 }

@@ -275,13 +395,12 @@ function parseSchedulerOptions(env, parallelism) {
    parallelism,
    "OPENCLAW_DOCKER_ALL_WEIGHT_LIMIT",
  );
+  const resourceLimits = {};
+  for (const [resource, fallback] of Object.entries(DEFAULT_RESOURCE_LIMITS)) {
+    resourceLimits[resource] = parseResourceLimit(env, resource, parallelism, fallback);
+  }
  return {
-    resourceLimits: {
-      docker: parseResourceLimit(env, "docker", parallelism, parallelism),
-      live: parseResourceLimit(env, "live", parallelism, DEFAULT_RESOURCE_LIMITS.live),
-      npm: parseResourceLimit(env, "npm", parallelism, DEFAULT_RESOURCE_LIMITS.npm),
-      service: parseResourceLimit(env, "service", parallelism, DEFAULT_RESOURCE_LIMITS.service),
-    },
+    resourceLimits,
    weightLimit,
  };
 }
@@ -297,7 +416,9 @@ function laneResources(poolLane) {
 function laneSummary(poolLane) {
  const resources = laneResources(poolLane).join(",");
  const timeout = poolLane.timeoutMs ? ` timeout=${Math.round(poolLane.timeoutMs / 1000)}s` : "";
-  return `${poolLane.name}(w=${laneWeight(poolLane)} r=${resources}${timeout})`;
+  const retries = poolLane.retries > 0 ? ` retries=${poolLane.retries}` : "";
+  const cache = poolLane.cacheKey ? ` cache=${poolLane.cacheKey}` : "";
+  return `${poolLane.name}(w=${laneWeight(poolLane)} r=${resources}${timeout}${retries}${cache})`;
 }

 function sleep(ms) {
@@ -638,15 +759,16 @@ async function prepareBundledChannelPackage(baseEnv, logDir) {
  console.log(`==> Bundled channel package: ${baseEnv.OPENCLAW_BUNDLED_CHANNEL_PACKAGE_TGZ}`);
 }

-function laneEnv(name, baseEnv, logDir) {
+function laneEnv(name, baseEnv, logDir, cacheKey) {
  const env = {
    ...baseEnv,
  };
+  const cacheName = cacheKey || name;
  if (!process.env.OPENCLAW_DOCKER_CLI_TOOLS_DIR) {
-    env.OPENCLAW_DOCKER_CLI_TOOLS_DIR = path.join(logDir, `${name}-cli-tools`);
+    env.OPENCLAW_DOCKER_CLI_TOOLS_DIR = path.join(logDir, `${cacheName}-cli-tools`);
  }
  if (!process.env.OPENCLAW_DOCKER_CACHE_HOME_DIR) {
-    env.OPENCLAW_DOCKER_CACHE_HOME_DIR = path.join(logDir, `${name}-cache`);
+    env.OPENCLAW_DOCKER_CACHE_HOME_DIR = path.join(logDir, `${cacheName}-cache`);
  }
  return env;
 }
@@ -655,7 +777,7 @@ async function runLane(lane, baseEnv, logDir, fallbackTimeoutMs) {
  const { command, name } = lane;
  const timeoutMs = lane.timeoutMs ?? fallbackTimeoutMs;
  const logFile = path.join(logDir, `${name}.log`);
-  const env = laneEnv(name, baseEnv, logDir);
+  const env = laneEnv(name, baseEnv, logDir, lane.cacheKey);
  await mkdir(env.OPENCLAW_DOCKER_CLI_TOOLS_DIR, { recursive: true });
  await mkdir(env.OPENCLAW_DOCKER_CACHE_HOME_DIR, { recursive: true });
  await fs.promises.writeFile(
@@ -664,12 +786,29 @@ async function runLane(lane, baseEnv, logDir, fallbackTimeoutMs) {
      `==> [${name}] cli tools dir: ${env.OPENCLAW_DOCKER_CLI_TOOLS_DIR}`,
      `==> [${name}] cache dir: ${env.OPENCLAW_DOCKER_CACHE_HOME_DIR}`,
      `==> [${name}] timeout: ${timeoutMs}ms`,
+      `==> [${name}] retries: ${lane.retries ?? 0}`,
      "",
    ].join("\n"),
  );
  console.log(`==> [${name}] start`);
  const startedAt = Date.now();
-  const result = await runShellCommand({ command, env, label: name, logFile, timeoutMs });
+  let result;
+  const maxAttempts = 1 + Math.max(0, lane.retries ?? 0);
+  for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
+    if (attempt > 1) {
+      await fs.promises.appendFile(logFile, `\n==> [${name}] retry attempt ${attempt}\n`);
+      console.log(`==> [${name}] retry ${attempt}/${maxAttempts}`);
+    }
+    result = await runShellCommand({ command, env, label: name, logFile, timeoutMs });
+    if (result.status === 0 || attempt >= maxAttempts) {
+      break;
+    }
+    const retryable =
+      result.timedOut || (await laneLogMatchesRetryPattern(logFile, lane.retryPatterns));
+    if (!retryable) {
+      break;
+    }
+  }
  const elapsedSeconds = Math.round((Date.now() - startedAt) / 1000);
  if (result.status === 0) {
    console.log(`==> [${name}] pass ${elapsedSeconds}s`);
@@ -847,6 +986,14 @@ async function tailFile(file, lines) {
  return tail.trimEnd();
 }

+async function laneLogMatchesRetryPattern(logFile, patterns) {
+  // A lane without retry patterns is never retryable on log content alone.
+  if (!patterns || patterns.length === 0) return false;
+  // Only the 160-line tail requested from tailFile is matched, not the whole log.
+  const recentOutput = await tailFile(logFile, 160);
+  return patterns.some((candidate) => candidate.test(recentOutput));
+}
+
 async function printFailureSummary(failures, tailLines) {
  console.error(`ERROR: ${failures.length} Docker lane(s) failed.`);
  for (const failure of failures) {
@@ -927,6 +1074,12 @@ async function main() {
  const preflightEnabled = parseBool(process.env.OPENCLAW_DOCKER_ALL_PREFLIGHT, true);
  const preflightCleanup = parseBool(process.env.OPENCLAW_DOCKER_ALL_PREFLIGHT_CLEANUP, true);
  const timingsEnabled = parseBool(process.env.OPENCLAW_DOCKER_ALL_TIMINGS, true);
+  const liveMode = parseLiveMode(process.env.OPENCLAW_DOCKER_ALL_LIVE_MODE);
+  const liveRetries = parseNonNegativeInt(
+    process.env.OPENCLAW_DOCKER_ALL_LIVE_RETRIES,
+    DEFAULT_LIVE_RETRIES,
+    "OPENCLAW_DOCKER_ALL_LIVE_RETRIES",
+  );
  const timingsFile = path.resolve(
    process.env.OPENCLAW_DOCKER_ALL_TIMINGS_FILE || DEFAULT_TIMINGS_FILE,
  );
@@ -945,13 +1098,22 @@ async function main() {
  appendExtension(baseEnv, "codex");

  const timingStore = await loadTimingStore(timingsFile, timingsEnabled);
-  const orderedLanes = orderLanes(lanes, timingStore);
-  const orderedTailLanes = orderLanes(tailLanes, timingStore);
+  const retriedMainLanes = applyLiveRetries(lanes, liveRetries);
+  const retriedTailLanes = applyLiveRetries(tailLanes, liveRetries);
+  const configuredLanes =
+    liveMode === "only"
+      ? applyLiveMode([...retriedMainLanes, ...retriedTailLanes], liveMode)
+      : applyLiveMode(retriedMainLanes, liveMode);
+  const configuredTailLanes = liveMode === "only" ? [] : applyLiveMode(retriedTailLanes, liveMode);
+  const orderedLanes = orderLanes(configuredLanes, timingStore);
+  const orderedTailLanes = orderLanes(configuredTailLanes, timingStore);

  console.log(`==> Docker test logs: ${logDir}`);
  console.log(`==> Parallelism: ${parallelism}`);
  console.log(`==> Tail parallelism: ${tailParallelism}`);
  console.log(`==> Lane timeout: ${laneTimeoutMs}ms`);
+  console.log(`==> Live mode: ${liveMode}`);
+  console.log(`==> Live retries: ${liveRetries}`);
  console.log(`==> Lane start stagger: ${laneStartStaggerMs}ms`);
  console.log(`==> Status interval: ${statusIntervalMs}ms`);
  console.log(`==> Fail fast: ${failFast ? "yes" : "no"}`);
@@ -966,10 +1128,10 @@ async function main() {
  const schedulerOptions = parseSchedulerOptions(process.env, parallelism);
  const tailSchedulerOptions = parseSchedulerOptions(process.env, tailParallelism);
  console.log(
-    `==> Scheduler: weight=${schedulerOptions.weightLimit} docker=${schedulerOptions.resourceLimits.docker} live=${schedulerOptions.resourceLimits.live} npm=${schedulerOptions.resourceLimits.npm} service=${schedulerOptions.resourceLimits.service}`,
+    `==> Scheduler: weight=${schedulerOptions.weightLimit} ${resourceLimitsSummary(schedulerOptions.resourceLimits)}`,
  );
  console.log(
-    `==> Tail scheduler: weight=${tailSchedulerOptions.weightLimit} docker=${tailSchedulerOptions.resourceLimits.docker} live=${tailSchedulerOptions.resourceLimits.live} npm=${tailSchedulerOptions.resourceLimits.npm} service=${tailSchedulerOptions.resourceLimits.service}`,
+    `==> Tail scheduler: weight=${tailSchedulerOptions.weightLimit} ${resourceLimitsSummary(tailSchedulerOptions.resourceLimits)}`,
  );
  printLaneManifest("Main", orderedLanes, timingStore);
  printLaneManifest("Tail", orderedTailLanes, timingStore);
--- a/scripts/test-live-cli-backend-docker.sh
+++ b/scripts/test-live-cli-backend-docker.sh
@@ -13,6 +13,7 @@ CLI_MODEL="${OPENCLAW_LIVE_CLI_BACKEND_MODEL:-}"
 CLI_PROVIDER="${CLI_MODEL%%/*}"
 CLI_DISABLE_MCP_CONFIG="${OPENCLAW_LIVE_CLI_BACKEND_DISABLE_MCP_CONFIG:-}"
 CLI_AUTH_MODE="${OPENCLAW_LIVE_CLI_BACKEND_AUTH:-auto}"
+CLI_SETUP_TIMEOUT_SECONDS="${OPENCLAW_LIVE_CLI_BACKEND_SETUP_TIMEOUT_SECONDS:-180}"
 TEMP_DIRS=()
 DOCKER_USER="${OPENCLAW_DOCKER_USER:-node}"
 DOCKER_HOME_MOUNT=()
@@ -236,6 +237,9 @@ export npm_config_cache="$NPM_CONFIG_CACHE"
 mkdir -p "$NPM_CONFIG_PREFIX" "$XDG_CACHE_HOME" "$COREPACK_HOME" "$NPM_CONFIG_CACHE"
 chmod 700 "$XDG_CACHE_HOME" "$COREPACK_HOME" "$NPM_CONFIG_CACHE" || true
 export PATH="$NPM_CONFIG_PREFIX/bin:$PATH"
+run_setup_command() { # Run the given command under the setup timeout (seconds, default 180); KILL follows 10s after TERM.
+  timeout --foreground --kill-after=10s "${OPENCLAW_LIVE_CLI_BACKEND_SETUP_TIMEOUT_SECONDS:-180}s" "$@"
+}
 if [ "${OPENCLAW_DOCKER_AUTH_PRESTAGED:-0}" != "1" ]; then
  IFS=',' read -r -a auth_dirs <<<"${OPENCLAW_DOCKER_AUTH_DIRS_RESOLVED:-}"
  IFS=',' read -r -a auth_files <<<"${OPENCLAW_DOCKER_AUTH_FILES_RESOLVED:-}"
@@ -285,9 +289,9 @@ package_has_explicit_version() {
  esac
 }
 if [ -n "${OPENCLAW_LIVE_CLI_BACKEND_COMMAND:-}" ] && [ ! -x "${OPENCLAW_LIVE_CLI_BACKEND_COMMAND}" ] && [ -n "$docker_package" ]; then
-  npm install -g "$docker_package"
+  run_setup_command npm install -g "$docker_package"
 elif [ -n "$docker_package" ] && package_has_explicit_version "$docker_package"; then
-  npm install -g "$docker_package"
+  run_setup_command npm install -g "$docker_package"
 fi
 if [ "$provider" = "codex-cli" ] && [ "${OPENCLAW_LIVE_CLI_BACKEND_AUTH:-auto}" = "api-key" ]; then
  codex_login_command="${OPENCLAW_LIVE_CLI_BACKEND_COMMAND:-$NPM_CONFIG_PREFIX/bin/codex}"
@@ -397,6 +401,7 @@ echo "==> Run CLI backend live test in Docker"
 echo "==> Model: $CLI_MODEL"
 echo "==> Provider: $CLI_PROVIDER"
 echo "==> Auth mode: $CLI_AUTH_MODE"
+echo "==> Setup timeout: ${CLI_SETUP_TIMEOUT_SECONDS}s"
 echo "==> Profile file: $PROFILE_STATUS"
 if [[ "$CLI_PROVIDER" == "codex-cli" ]]; then
  echo "==> CI-safe Codex config: $CLI_USE_CI_SAFE_CODEX_CONFIG"
@@ -449,6 +454,7 @@ DOCKER_RUN_ARGS=(docker run --rm -t \
  -e OPENCLAW_DOCKER_AUTH_FILES_RESOLVED="$AUTH_FILES_CSV" \
  -e OPENCLAW_LIVE_DOCKER_SOURCE_STAGE_MODE="${OPENCLAW_LIVE_DOCKER_SOURCE_STAGE_MODE:-copy}" \
  -e OPENCLAW_LIVE_CLI_BACKEND_USE_CI_SAFE_CODEX_CONFIG="$CLI_USE_CI_SAFE_CODEX_CONFIG" \
+  -e OPENCLAW_LIVE_CLI_BACKEND_SETUP_TIMEOUT_SECONDS="$CLI_SETUP_TIMEOUT_SECONDS" \
  -e OPENCLAW_DOCKER_CLI_BACKEND_PROVIDER="$CLI_PROVIDER" \
  -e OPENCLAW_DOCKER_CLI_BACKEND_COMMAND_DEFAULT="$CLI_DEFAULT_COMMAND" \
  -e OPENCLAW_DOCKER_CLI_BACKEND_NPM_PACKAGE="$CLI_DOCKER_NPM_PACKAGE" \