diff --git a/dispatcher-strategies.js b/dispatcher-strategies.js index 62dce99..4634ddb 100644 --- a/dispatcher-strategies.js +++ b/dispatcher-strategies.js @@ -1206,11 +1206,16 @@ export async function executeAgent(job, ctx, deps) { const { waitForGateway, updateRunSession, setAgentStatus, buildJobPrompt, runAgentTurnWithActivityTimeout, + // Sanctioned isolated dispatch primitive. Falls back to the activity-aware + // runner when callers (e.g. tests) wire only the older name -- both helpers + // share the same HTTP-only contract, no subprocess spawn. + runIsolatedAgentTurn, updateContextSummary, releaseDispatch, releaseIdempotencyKey, updateJob, matchesSentinel, detectTransientError, listSessions, sqliteNow, log, } = deps; + const dispatchAgentTurn = runIsolatedAgentTurn || runAgentTurnWithActivityTimeout; const result = makeDefaultResult(); // Gateway health check @@ -1304,7 +1309,12 @@ export async function executeAgent(job, ctx, deps) { } } - const turnResult = await runAgentTurnWithActivityTimeout({ + // Isolated dispatch primitive: HTTP-only chat completions call. The + // scheduler must never fork a sibling `openclaw` process to spawn an + // isolated session -- that variant has historically SIGTERM'd the + // launchd-tracked gateway parent and orphaned a node process on port + // 18789 (see ISOLATED_DISPATCH_PRIMITIVE in gateway.js). + const turnResult = await dispatchAgentTurn({ message: prompt, agentId: job.agent_id || 'main', sessionKey, diff --git a/dispatcher.js b/dispatcher.js index 87753f3..03b518e 100644 --- a/dispatcher.js +++ b/dispatcher.js @@ -51,7 +51,8 @@ import { import { buildRetrievalContext } from './retrieval.js'; import { upsertAgent, setAgentStatus } from './agents.js'; import { - runAgentTurnWithActivityTimeout, sendSystemEvent, getAllSubAgentSessions, listSessions, + runAgentTurnWithActivityTimeout, runIsolatedAgentTurn, + sendSystemEvent, getAllSubAgentSessions, listSessions, deliverMessage, checkGatewayHealth, waitForGateway, resolveDeliveryAlias, applyAuthProfileToSessionStore, syncAuthStoreToSession, @@ -306,6 +307,10 @@ function buildDispatchDeps() { // Agent waitForGateway, updateRunSession, setAgentStatus, buildJobPrompt, runAgentTurnWithActivityTimeout, + // Isolated cron-dispatch primitive: HTTP-only wrapper around the + // chat-completions API; never forks a sibling openclaw process that + // could SIGTERM the launchd-tracked gateway parent. + runIsolatedAgentTurn, updateContextSummary, releaseIdempotencyKey, matchesSentinel, detectTransientError, listSessions, diff --git a/gateway.js b/gateway.js index 2d5c1a6..1d381fe 100644 --- a/gateway.js +++ b/gateway.js @@ -9,6 +9,22 @@ const GATEWAY_URL = process.env.OPENCLAW_GATEWAY_URL || 'http://127.0.0.1:18789' const HOME_DIR = process.env.HOME || homedir(); export const TELEGRAM_MAX_MESSAGE_LENGTH = 4096; +// -- Isolated dispatch primitive contract -------------------- +// +// Cron jobs with session_target=isolated must reach the gateway via the +// public HTTP API only. Forking a sibling `openclaw` process to spawn the +// session is rejected: in production that primitive has SIGTERM'd the +// launchd-tracked gateway parent (the child inherits the parent's listening +// socket on port 18789 and the parent dies), leaving an orphan node process +// holding the port. See rh-bot.lan zombie-cascade incident report. +// +// runIsolatedAgentTurn is the only sanctioned dispatch primitive for +// session_target=isolated cron jobs. It MUST NOT spawn, fork, or exec any +// child process. Any future change that needs subprocess execution belongs +// behind a different, explicitly-named helper so reviewers can keep this +// contract intact. +export const ISOLATED_DISPATCH_PRIMITIVE = 'http-chat-completions'; + let _cachedToken; let _tokenLoaded = false; @@ -246,6 +262,29 @@ export async function runAgentTurnWithActivityTimeout(opts) { } } +// -- Isolated dispatch primitive ----------------------------- + +/** + * Sanctioned dispatch primitive for session_target=isolated cron jobs. + * + * This is a thin wrapper around runAgentTurnWithActivityTimeout that names + * the contract: HTTP-only request to the gateway, no child process spawn. + * The scheduler routes every session_target=isolated job through this + * helper so the no-fork invariant is reviewable at one call site and + * testable in isolation (see the no-subprocess regression test in test.js). + * + * Why a named wrapper instead of calling runAgentTurnWithActivityTimeout + * directly: the dispatch primitive is the load-bearing surface that the + * rh-bot.lan zombie-on-port outage cascaded through. A named entry point + * gives operators and reviewers a single grep target ("runIsolatedAgentTurn") + * to audit the no-spawn invariant. + * + * Accepts the same options as runAgentTurnWithActivityTimeout. + */ +export async function runIsolatedAgentTurn(opts) { + return await runAgentTurnWithActivityTimeout(opts); +} + // -- System Events (main session) ---------------------------- /** diff --git a/package-lock.json b/package-lock.json index 25b424b..0534f35 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "openclaw-scheduler", - "version": "0.2.4", + "version": "0.2.5", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "openclaw-scheduler", - "version": "0.2.4", + "version": "0.2.5", "license": "MIT", "dependencies": { "better-sqlite3": "^11.10.0", @@ -24,7 +24,7 @@ "typescript": "^5.9.2" }, "engines": { - "node": ">=20" + "node": ">=22" } }, "node_modules/@bcoe/v8-coverage": { diff --git a/test.js b/test.js index 3d9cb8f..60eabb9 100644 --- a/test.js +++ b/test.js @@ -3625,12 +3625,27 @@ console.log('\n-- Sync Auth Store Integration with executeAgent --'); const strategiesSrc = readFileSync(join(import.meta.dirname || '.', 'dispatcher-strategies.js'), 'utf-8'); const syncCallIdx = strategiesSrc.indexOf('syncAuthStoreToSession: syncAuth'); assert(syncCallIdx > -1, 'dispatcher-strategies.js destructures syncAuthStoreToSession'); - // Find the actual syncAuth() invocation and the actual turnResult = await runAgentTurnWithActivityTimeout() call + // Find the actual syncAuth() invocation and the dispatch-primitive invocation. + // executeAgent calls the sanctioned isolated dispatch primitive via a local + // alias `dispatchAgentTurn` that resolves to runIsolatedAgentTurn (or its + // legacy alias runAgentTurnWithActivityTimeout) -- accept either name so the + // ordering invariant stays meaningful as the helper is renamed. const syncInvokeIdx = strategiesSrc.indexOf('syncAuth(job.agent_id'); - const turnInvokeIdx = strategiesSrc.indexOf('await runAgentTurnWithActivityTimeout('); + const turnInvokeIdx = (() => { + const candidates = [ + 'await dispatchAgentTurn(', + 'await runIsolatedAgentTurn(', + 'await runAgentTurnWithActivityTimeout(', + ]; + for (const needle of candidates) { + const idx = strategiesSrc.indexOf(needle); + if (idx > -1) return idx; + } + return -1; + })(); assert(syncInvokeIdx > -1, 'syncAuth invocation found'); - assert(turnInvokeIdx > -1, 'runAgentTurnWithActivityTimeout invocation found'); - assert(turnInvokeIdx > syncInvokeIdx, 'syncAuth is called before runAgentTurnWithActivityTimeout'); + assert(turnInvokeIdx > -1, 'isolated dispatch primitive invocation found'); + assert(turnInvokeIdx > syncInvokeIdx, 'syncAuth is called before the isolated dispatch primitive'); // Verify the sync call is NOT inside an if(resolvedAuthProfile) guard // Extract the syncAuth call context (the ~20 lines around it) @@ -3647,6 +3662,148 @@ console.log('\n-- Sync Auth Store Integration with executeAgent --'); console.log(' sync auth store integration with executeAgent: pass'); } +console.log('\n-- Isolated dispatch primitive: no subprocess spawn --'); +{ + // Regression guard. The session_target=isolated cron dispatch path must + // reach the gateway via the HTTP /v1/chat/completions endpoint only. A + // prior variant forked a sibling `openclaw` CLI to spawn the session and + // SIGTERM'd the launchd-tracked gateway parent (rh-bot.lan zombie-on-port + // outage, ~30 cascades per week). This test pins the no-fork contract two + // ways: + // 1. Source check -- the isolated dispatch primitive defined in + // gateway.js (runIsolatedAgentTurn) and the executeAgent strategy + // neither import nor reference child_process. + // 2. Behavior check -- executeAgent dispatched against the real gateway + // HTTP client hits /v1/chat/completions via fetch and surfaces the + // stubbed assistant reply. + + const { executeAgent } = await import('./dispatcher-strategies.js'); + const gateway = await import('./gateway.js'); + + // 1. Source-level check: the isolated dispatch primitive must not pull in + // child_process. Note: gateway.js still imports execFileSync for the + // legacy executeMain/fire-and-forget sendSystemEvent path, so the assertion + // narrows to the primitive's own definition rather than the whole module. + const gatewaySrc = readFileSync(join(import.meta.dirname || '.', 'gateway.js'), 'utf-8'); + const strategiesSrc = readFileSync(join(import.meta.dirname || '.', 'dispatcher-strategies.js'), 'utf-8'); + + const isolatedFnStart = gatewaySrc.indexOf('export async function runIsolatedAgentTurn'); + assert(isolatedFnStart > -1, 'runIsolatedAgentTurn is defined in gateway.js'); + // Capture the function body bounded by the next top-level export so the + // check is local to the primitive rather than the surrounding module. + const isolatedFnEnd = (() => { + const after = gatewaySrc.indexOf('\nexport ', isolatedFnStart + 1); + return after === -1 ? gatewaySrc.length : after; + })(); + const isolatedFnSrc = gatewaySrc.slice(isolatedFnStart, isolatedFnEnd); + for (const banned of ['child_process', 'execFile', 'spawn(', 'fork(', 'execSync']) { + assert(!isolatedFnSrc.includes(banned), + `runIsolatedAgentTurn body does not reference ${banned}`); + } + + const executeAgentStart = strategiesSrc.indexOf('export async function executeAgent'); + assert(executeAgentStart > -1, 'executeAgent is defined in dispatcher-strategies.js'); + const executeAgentEnd = (() => { + const after = strategiesSrc.indexOf('\nexport ', executeAgentStart + 1); + return after === -1 ? strategiesSrc.length : after; + })(); + const executeAgentSrc = strategiesSrc.slice(executeAgentStart, executeAgentEnd); + for (const banned of ['child_process', 'execFile', 'spawn(', 'fork(', 'execSync']) { + assert(!executeAgentSrc.includes(banned), + `executeAgent body does not reference ${banned}`); + } + + // 2. Behavior check: stub global fetch so the dispatch primitive can run + // without contacting a real gateway. The stub records every URL so we can + // assert /v1/chat/completions was hit and no other dispatch transport was + // used. Set a sentinel gateway token before triggering the first fetch so + // gateway.js seeds its module-level token cache with a real value rather + // than null; otherwise downstream tests that swap globalThis.fetch later + // see the cached null token and stop emitting the auth/scope headers they + // assert on. + if (!process.env.OPENCLAW_GATEWAY_TOKEN) { + process.env.OPENCLAW_GATEWAY_TOKEN = 'isolated-dispatch-test-token'; + } + const fetchCalls = []; + const originalFetch = globalThis.fetch; + globalThis.fetch = async (url, init) => { + fetchCalls.push({ url: String(url), method: init?.method || 'GET' }); + return { + ok: true, + status: 200, + headers: { get: () => null }, + json: async () => ({ + choices: [{ message: { role: 'assistant', content: 'isolated-ok' } }], + usage: { total_tokens: 7 }, + }), + text: async () => '', + }; + }; + + let executionError = null; + let executionResult = null; + try { + const job = { + id: 'isolated-no-spawn', + name: 'isolated-no-spawn', + agent_id: 'main', + session_target: 'isolated', + payload_kind: 'agentTurn', + payload_message: 'noop', + delivery_mode: 'none', + run_timeout_ms: 10_000, + payload_timeout_seconds: 5, + }; + const ctx = { + run: { id: 'run-isolated-no-spawn', started_at: new Date().toISOString() }, + dispatchRecord: null, + idemKey: null, + v02Outcomes: null, + }; + const deps = { + waitForGateway: async () => true, + updateRunSession: () => {}, + setAgentStatus: () => {}, + buildJobPrompt: () => ({ prompt: 'noop', contextMeta: {} }), + // Using the real exported helper proves the production code path goes + // through HTTP, not a test-local mock. + runIsolatedAgentTurn: gateway.runIsolatedAgentTurn, + runAgentTurnWithActivityTimeout: gateway.runAgentTurnWithActivityTimeout, + updateContextSummary: () => {}, + releaseDispatch: () => {}, + releaseIdempotencyKey: () => {}, + updateJob: () => {}, + matchesSentinel: () => false, + detectTransientError: () => false, + listSessions: async () => ({ result: { sessions: [] } }), + sqliteNow: (offsetMs = 0) => new Date(Date.now() + offsetMs).toISOString(), + log: () => {}, + finishRun: () => {}, + syncAuthStoreToSession: () => ({ ok: true }), + applyAuthProfileToSessionStore: () => ({ ok: true }), + }; + executionResult = await executeAgent(job, ctx, deps); + } catch (err) { + executionError = err; + } finally { + globalThis.fetch = originalFetch; + } + + assert(executionError === null, `executeAgent ran without error (got: ${executionError?.message || 'none'})`); + const chatCall = fetchCalls.find(c => c.url.includes('/v1/chat/completions')); + assert(chatCall !== undefined, 'isolated dispatch primitive hit /v1/chat/completions via HTTP'); + assert(chatCall?.method === 'POST', 'isolated dispatch primitive used HTTP POST'); + assert(executionResult?.content === 'isolated-ok', 'executeAgent surfaced the assistant reply'); + assert(executionResult?.status === 'ok', 'executeAgent reported ok status for the stubbed reply'); + + // Module-level contract marker is exported and stable so reviewers grepping + // for the primitive name find the dispatch path quickly. + assert(gateway.ISOLATED_DISPATCH_PRIMITIVE === 'http-chat-completions', + 'gateway.ISOLATED_DISPATCH_PRIMITIVE names the HTTP primitive'); + assert(typeof gateway.runIsolatedAgentTurn === 'function', + 'gateway.runIsolatedAgentTurn is exported as the sanctioned isolated dispatch helper'); +} + console.log('\n-- Migration Guard --'); { const legacyDir = mkdtempSync(join(tmpdir(), 'scheduler-migrate-'));