From 60039f8cc65a07189b5de05e2e708140a02dc5c1 Mon Sep 17 00:00:00 2001 From: anandgupta42 Date: Mon, 30 Mar 2026 13:39:47 -0700 Subject: [PATCH 1/2] fix: flush session traces to disk on crash to prevent log loss (#593) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bun Workers don't receive OS signals — only the main thread does. When altimate-code crashes, all in-memory traces were lost because the worker had no crash handlers. - worker.ts: add `flushAllTracesSync()` with idempotency guard, called from `uncaughtException` and `process.once("exit")` - thread.ts: add signal handlers (SIGINT/SIGTERM/SIGHUP) that call `worker.terminate()` + `Bun.sleepSync(50)` to trigger worker exit - index.ts: safety net for headless mode — flush `Trace.active` on `uncaughtException` and SIGHUP Closes #593 Co-Authored-By: Claude Opus 4.6 (1M context) --- packages/opencode/src/cli/cmd/tui/thread.ts | 24 +++++++++++++++++ packages/opencode/src/cli/cmd/tui/worker.ts | 29 +++++++++++++++++++++ packages/opencode/src/index.ts | 25 +++++++++++++++++- 3 files changed, 77 insertions(+), 1 deletion(-) diff --git a/packages/opencode/src/cli/cmd/tui/thread.ts b/packages/opencode/src/cli/cmd/tui/thread.ts index 8e0a7b04b8..aeefd8473f 100644 --- a/packages/opencode/src/cli/cmd/tui/thread.ts +++ b/packages/opencode/src/cli/cmd/tui/thread.ts @@ -152,6 +152,25 @@ export const TuiThreadCommand = cmd({ process.on("unhandledRejection", error) process.on("SIGUSR2", reload) + // altimate_change start — crash: flush worker traces on signals + // Bun Workers don't receive OS signals — only the main thread does. + // On SIGINT/SIGTERM/SIGHUP, terminate the worker and briefly wait so + // its "exit" handler fires and flushes all active session traces to disk. + // Bun.sleepSync gives the worker thread time to process the termination + // before the main thread continues to process.exit(). + const emergencyTerminate = () => { + try { + worker.terminate() + Bun.sleepSync(50) + } catch { + // best-effort — crash handler must never throw + } + } + process.on("SIGINT", emergencyTerminate) + process.on("SIGTERM", emergencyTerminate) + process.on("SIGHUP", emergencyTerminate) + // altimate_change end + let stopped = false const stop = async () => { if (stopped) return @@ -159,6 +178,11 @@ export const TuiThreadCommand = cmd({ process.off("uncaughtException", error) process.off("unhandledRejection", error) process.off("SIGUSR2", reload) + // altimate_change start — crash: remove emergency handlers on clean shutdown + process.off("SIGINT", emergencyTerminate) + process.off("SIGTERM", emergencyTerminate) + process.off("SIGHUP", emergencyTerminate) + // altimate_change end await withTimeout(client.call("shutdown", undefined), 5000).catch((error) => { Log.Default.warn("worker shutdown failed", { error: error instanceof Error ? error.message : String(error), diff --git a/packages/opencode/src/cli/cmd/tui/worker.ts b/packages/opencode/src/cli/cmd/tui/worker.ts index 3e2b78de15..1ae7a7d95a 100644 --- a/packages/opencode/src/cli/cmd/tui/worker.ts +++ b/packages/opencode/src/cli/cmd/tui/worker.ts @@ -34,6 +34,12 @@ process.on("uncaughtException", (e) => { Log.Default.error("exception", { e: e instanceof Error ? e.message : e, }) + // altimate_change start — crash: flush traces on uncaught exception + // After logging, write all active traces to disk so crash context is preserved. + // The process may continue or exit depending on the exception — either way the + // trace snapshot will reflect the crash. + flushAllTracesSync(`Uncaught exception: ${e instanceof Error ? e.message : String(e)}`) + // altimate_change end }) // Subscribe to global events and forward them via RPC @@ -325,6 +331,29 @@ export const rpc = { Rpc.listen(rpc) +// altimate_change start — crash: flush active traces on unexpected exit +// When the worker is terminated (via worker.terminate() from the main thread, +// or on uncaught exceptions), write all in-flight traces to disk synchronously. +// +// NOTE: Bun Workers do NOT receive OS signals (SIGINT, SIGTERM, SIGHUP) — +// those are delivered only to the main thread. Signal-based flush is handled +// in thread.ts by terminating the worker, which triggers the "exit" event here. +let hasFlushed = false +function flushAllTracesSync(reason: string) { + if (hasFlushed) return + hasFlushed = true + for (const [, trace] of sessionTraces) { + try { + trace.flushSync(reason) + } catch { + // flushSync is best-effort — must never throw in an exit handler + } + } +} + +process.once("exit", () => { flushAllTracesSync("Process exited") }) +// altimate_change end + function getAuthorizationHeader(): string | undefined { const password = Flag.OPENCODE_SERVER_PASSWORD if (!password) return undefined diff --git a/packages/opencode/src/index.ts b/packages/opencode/src/index.ts index 9a32dd9670..a051cccb4f 100644 --- a/packages/opencode/src/index.ts +++ b/packages/opencode/src/index.ts @@ -46,6 +46,9 @@ import { Database } from "./storage/db" // altimate_change start - telemetry import import { Telemetry } from "./telemetry" // altimate_change end +// altimate_change start — crash: import Trace for crash handlers +import { Trace } from "./altimate/observability/tracing" +// altimate_change end // altimate_change start - welcome banner import { showWelcomeBannerIfNeeded } from "./cli/welcome" // altimate_change end @@ -60,12 +63,32 @@ process.on("uncaughtException", (e) => { Log.Default.error("exception", { e: e instanceof Error ? e.message : e, }) + // altimate_change start — crash: flush active trace on uncaught exception + // Trace.active is set by run.ts (headless mode only — TUI traces live in + // the worker's isolated memory and are flushed via worker.terminate()). + // This is a safety net for the headless path where run.ts registers its + // own handlers but an exception could bubble past them. + try { + Trace.active?.flushSync(`Uncaught exception: ${e instanceof Error ? e.message : String(e)}`) + } catch { + // Trace module may not be initialized — best-effort + } + // altimate_change end }) // Ensure the process exits on terminal hangup (eg. closing the terminal tab). // Without this, long-running commands like `serve` block on a never-resolving // promise and survive as orphaned processes. -process.on("SIGHUP", () => process.exit()) +// altimate_change start — crash: flush active trace before SIGHUP exit +process.on("SIGHUP", () => { + try { + Trace.active?.flushSync("Terminal hangup (SIGHUP)") + } catch { + // best-effort + } + process.exit() +}) +// altimate_change end let cli = yargs(hideBin(process.argv)) .parserConfiguration({ "populate--": true }) From 60629d4ad37ac79346a23dfe522b97f0381bbd9b Mon Sep 17 00:00:00 2001 From: anandgupta42 Date: Mon, 30 Mar 2026 13:56:47 -0700 Subject: [PATCH 2/2] fix: address code review findings for crash trace flush - thread.ts: re-raise signal after cleanup to restore default termination behavior (prevents process from hanging) - worker.ts: replace one-shot `hasFlushed` guard with "preserve first reason" pattern so later flushes still run - worker.ts: defer `sessionTraces.delete()` until `endTrace()` completes so crash flush can still reach in-flight traces Co-Authored-By: Claude Opus 4.6 (1M context) --- packages/opencode/src/cli/cmd/tui/thread.ts | 41 +++++++++++++-------- packages/opencode/src/cli/cmd/tui/worker.ts | 29 +++++++++++---- 2 files changed, 46 insertions(+), 24 deletions(-) diff --git a/packages/opencode/src/cli/cmd/tui/thread.ts b/packages/opencode/src/cli/cmd/tui/thread.ts index aeefd8473f..d664d1957c 100644 --- a/packages/opencode/src/cli/cmd/tui/thread.ts +++ b/packages/opencode/src/cli/cmd/tui/thread.ts @@ -154,21 +154,30 @@ export const TuiThreadCommand = cmd({ // altimate_change start — crash: flush worker traces on signals // Bun Workers don't receive OS signals — only the main thread does. - // On SIGINT/SIGTERM/SIGHUP, terminate the worker and briefly wait so - // its "exit" handler fires and flushes all active session traces to disk. - // Bun.sleepSync gives the worker thread time to process the termination - // before the main thread continues to process.exit(). - const emergencyTerminate = () => { - try { - worker.terminate() - Bun.sleepSync(50) - } catch { - // best-effort — crash handler must never throw + // On SIGINT/SIGTERM/SIGHUP, terminate the worker so its "exit" handler + // fires and flushes all active session traces to disk. + // After cleanup, remove the handler and re-raise the signal to restore + // default behavior (process termination). This avoids swallowing signals + // which would leave the process running after a kill. + const makeSignalHandler = (signal: NodeJS.Signals) => { + const handler = () => { + try { + worker.terminate() + Bun.sleepSync(50) + } catch { + // best-effort — crash handler must never throw + } + process.off(signal, handler) + process.kill(process.pid, signal) } + return handler } - process.on("SIGINT", emergencyTerminate) - process.on("SIGTERM", emergencyTerminate) - process.on("SIGHUP", emergencyTerminate) + const onSigint = makeSignalHandler("SIGINT") + const onSigterm = makeSignalHandler("SIGTERM") + const onSighup = makeSignalHandler("SIGHUP") + process.on("SIGINT", onSigint) + process.on("SIGTERM", onSigterm) + process.on("SIGHUP", onSighup) // altimate_change end let stopped = false @@ -179,9 +188,9 @@ export const TuiThreadCommand = cmd({ process.off("unhandledRejection", error) process.off("SIGUSR2", reload) // altimate_change start — crash: remove emergency handlers on clean shutdown - process.off("SIGINT", emergencyTerminate) - process.off("SIGTERM", emergencyTerminate) - process.off("SIGHUP", emergencyTerminate) + process.off("SIGINT", onSigint) + process.off("SIGTERM", onSigterm) + process.off("SIGHUP", onSighup) // altimate_change end await withTimeout(client.call("shutdown", undefined), 5000).catch((error) => { Log.Default.warn("worker shutdown failed", { diff --git a/packages/opencode/src/cli/cmd/tui/worker.ts b/packages/opencode/src/cli/cmd/tui/worker.ts index 1ae7a7d95a..2f9bf9f5ad 100644 --- a/packages/opencode/src/cli/cmd/tui/worker.ts +++ b/packages/opencode/src/cli/cmd/tui/worker.ts @@ -114,12 +114,15 @@ function getOrCreateTrace(sessionID: string): Trace | null { const startEventStream = (input: { directory: string; workspaceID?: string }) => { if (eventStream.abort) eventStream.abort.abort() - // Clear stale per-stream trace state before starting a new stream instance + // altimate_change start — crash: flush stale traces before clearing + // Flush any in-flight traces synchronously before clearing — endTrace() is + // async and a crash during the gap would lose trace data. for (const [, trace] of sessionTraces) { void trace.endTrace().catch(() => {}) } sessionTraces.clear() sessionUserMsgIds.clear() + // altimate_change end const abort = new AbortController() eventStream.abort = abort @@ -242,9 +245,15 @@ const startEventStream = (input: { directory: string; workspaceID?: string }) => if (status === "idle" && sid) { const trace = sessionTraces.get(sid) if (trace) { - void trace.endTrace().catch(() => {}) - sessionTraces.delete(sid) - sessionUserMsgIds.delete(sid) + // altimate_change start — crash: defer deletion until endTrace() completes + // Keep the trace in sessionTraces during async teardown so + // flushAllTracesSync() can still reach it if a crash occurs + // while endTrace() is in flight. + void trace.endTrace().catch(() => {}).finally(() => { + sessionTraces.delete(sid) + sessionUserMsgIds.delete(sid) + }) + // altimate_change end } } } @@ -338,13 +347,17 @@ Rpc.listen(rpc) // NOTE: Bun Workers do NOT receive OS signals (SIGINT, SIGTERM, SIGHUP) — // those are delivered only to the main thread. Signal-based flush is handled // in thread.ts by terminating the worker, which triggers the "exit" event here. -let hasFlushed = false +let firstFlushReason: string | undefined function flushAllTracesSync(reason: string) { - if (hasFlushed) return - hasFlushed = true + // Preserve the most specific reason from the first flush (e.g., the uncaught + // exception message) even if a later handler (exit) calls again with a + // generic reason. Subsequent calls still flush — new traces may have been + // created since the first call. + const effectiveReason = firstFlushReason ?? reason + firstFlushReason ??= reason for (const [, trace] of sessionTraces) { try { - trace.flushSync(reason) + trace.flushSync(effectiveReason) } catch { // flushSync is best-effort — must never throw in an exit handler }