From 1cf008d5ab3705efe09bc362360c3d69fec2c4b5 Mon Sep 17 00:00:00 2001 From: Mason <31372737+Ovaculos@users.noreply.github.com> Date: Wed, 13 May 2026 14:57:09 -0500 Subject: [PATCH 1/5] feat(observability): surface bundle start failures at boot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Boot-time bundle startup failures previously went only to container stderr — operators had to grep to discover that a workspace bundle silently never came up. The user-visible symptom was a missing tool with no event trail in the workspace log, no SSE notification, and no entry in `/v1/health`. Three changes, all on the catch path in `startWorkspaceBundles`: 1. New `bundle.start_failed` engine event. Routed workspace-scoped via `SSE_ROUTES` (drives SSE fan-out) and `WORKSPACE_EVENTS` (persists to the workspace log). 2. `startWorkspaceBundles` returns a `failures: BundleStartFailure[]` array alongside `entries`. Runtime stashes it on `_bundleStartFailures` and exposes it via `bundleStartFailures()`. 3. `HealthMonitor` takes a `startFailures` option at construction; `getStatus()` merges them in as terminal `dead` entries so `/v1/health` reflects the failed bundle instead of omitting it. This is distinct from `bundle.crashed`, which requires a running source that went away. A start failure means no `McpSource` ever existed, so the record can't be restarted by the periodic health-check loop; the `dead` entry is terminal. Tests cover the catch path leaving siblings unaffected, the merged status in `getStatus()`, SSE routing, and workspace-log persistence. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/adapters/workspace-log-sink.ts | 1 + src/api/events.ts | 1 + src/api/server.ts | 4 +- src/engine/types.ts | 6 + src/runtime/runtime.ts | 32 ++++- src/runtime/workspace-runtime.ts | 31 ++++- src/tools/health-monitor.ts | 19 ++- test/unit/health-monitor.test.ts | 66 +++++++++++ test/unit/runtime/workspace-runtime.test.ts | 125 ++++++++++++++++++++ test/unit/sse-event-manager.test.ts | 20 ++++ test/unit/workspace-log-sink.test.ts | 23 ++++ 11 files changed, 321 insertions(+), 7 deletions(-) diff --git a/src/adapters/workspace-log-sink.ts b/src/adapters/workspace-log-sink.ts index 81e0dab3..a4bdedb8 100644 --- a/src/adapters/workspace-log-sink.ts +++ b/src/adapters/workspace-log-sink.ts @@ -9,6 +9,7 @@ const WORKSPACE_EVENTS = new Set([ "bundle.crashed", "bundle.recovered", "bundle.dead", + "bundle.start_failed", "data.changed", "config.changed", "skill.created", diff --git a/src/api/events.ts b/src/api/events.ts index 16528e84..9daffa42 100644 --- a/src/api/events.ts +++ b/src/api/events.ts @@ -49,6 +49,7 @@ const SSE_ROUTES: Partial> = { "bundle.crashed": { scope: "workspace", wsIdField: "wsId" }, "bundle.recovered": { scope: "workspace", wsIdField: "wsId" }, "bundle.dead": { scope: "workspace", wsIdField: "wsId" }, + "bundle.start_failed": { scope: "workspace", wsIdField: "wsId" }, // Per-principal connection state — workspace-scoped. Drives the // pending-auth banner; without forwarding here, the banner never auto-clears // after a user completes interactive OAuth. 
diff --git a/src/api/server.ts b/src/api/server.ts index 1ce7bc04..a96c3b18 100644 --- a/src/api/server.ts +++ b/src/api/server.ts @@ -62,7 +62,9 @@ export function startServer(options: ServerOptions): ServerHandle { const internalToken = runtime.getInternalToken(); const mcpSources = runtime.mcpSources(); - const healthMonitor = new HealthMonitor(mcpSources, runtime.getEventSink()); + const healthMonitor = new HealthMonitor(mcpSources, runtime.getEventSink(), { + startFailures: runtime.bundleStartFailures(), + }); healthMonitor.start(); // SSE event manager — listens to runtime events and broadcasts to clients diff --git a/src/engine/types.ts b/src/engine/types.ts index 1729b34d..b96b0df6 100644 --- a/src/engine/types.ts +++ b/src/engine/types.ts @@ -66,6 +66,12 @@ export type EngineEventType = | "bundle.crashed" | "bundle.recovered" | "bundle.dead" + /** + * Bundle failed to start at boot (no McpSource was ever produced). + * Distinct from `bundle.crashed`, which requires a running source that + * went away. Payload: { wsId, serverName, bundleName, error }. + */ + | "bundle.start_failed" /** * Per-principal connection state change for a remote URL bundle. * Payload: { wsId, serverName, principalId, state, authorizationUrl? }. diff --git a/src/runtime/runtime.ts b/src/runtime/runtime.ts index 5814140d..06b93686 100644 --- a/src/runtime/runtime.ts +++ b/src/runtime/runtime.ts @@ -192,6 +192,13 @@ export class Runtime { _systemSource: import("../tools/types.ts").ToolSource | null; /** Platform sources (home, conversations, files, etc.) — retained for JIT workspace registration. */ private _platformSources: import("../tools/types.ts").ToolSource[] = []; + /** + * Boot-time bundle startup failures recorded by `startWorkspaceBundles`. + * These never produced an McpSource; HealthMonitor reads this list at + * construction so `/v1/health` reports the failures as terminal `dead` + * entries rather than silently omitting them. 
+ */ + private _bundleStartFailures: import("./workspace-runtime.ts").BundleStartFailure[] = []; /** * Domain-context getter for the automations bundle. Set by the * automations source factory; consumed by internal callers (CLI's @@ -540,13 +547,24 @@ export class Runtime { // Phase 3: Start workspace bundles with per-workspace registries const configDir = config.configPath ? dirname(config.configPath) : undefined; - const { registries: workspaceRegistries, entries: workspaceBundleEntries } = - await startWorkspaceBundles(workspaceStore, platformSources, systemTools, events, configDir, { + const { + registries: workspaceRegistries, + entries: workspaceBundleEntries, + failures: bundleStartFailures, + } = await startWorkspaceBundles( + workspaceStore, + platformSources, + systemTools, + events, + configDir, + { workDir: resolveWorkDir(config), allowInsecureRemotes: config.allowInsecureRemotes, - }); + }, + ); rt._workspaceRegistries = workspaceRegistries; rt._platformSources = platformSources; + rt._bundleStartFailures = bundleStartFailures; // Wire the workspace registries into lifecycle so workspace-scope // startAuth / disconnect / install can add+remove sources without @@ -1083,6 +1101,14 @@ export class Runtime { return [...names]; } + /** + * Boot-time bundle startup failures. Read once at HealthMonitor + * construction so failed bundles appear as `dead` in `/v1/health`. + */ + bundleStartFailures(): import("./workspace-runtime.ts").BundleStartFailure[] { + return this._bundleStartFailures; + } + /** Get MCP sources across all workspace registries (for health monitoring). 
*/ mcpSources(): McpSource[] { const sources: McpSource[] = []; diff --git a/src/runtime/workspace-runtime.ts b/src/runtime/workspace-runtime.ts index d4f1a009..2abd4a7a 100644 --- a/src/runtime/workspace-runtime.ts +++ b/src/runtime/workspace-runtime.ts @@ -22,6 +22,19 @@ import type { WorkspaceStore } from "../workspace/workspace-store.ts"; // Types // --------------------------------------------------------------------------- +/** + * A boot-time bundle startup failure — recorded when `startBundleSource` + * throws. Surfaced via `bundle.start_failed` event (workspace log + SSE) + * and threaded into HealthMonitor so `/v1/health` reports the failed + * bundle as `state: "dead"` rather than omitting it entirely. + */ +export interface BundleStartFailure { + wsId: string; + serverName: string; + bundleName: string; + error: string; +} + /** A single entry in the process inventory — one per (workspace, bundle) pair. */ export interface ProcessInventoryEntry { /** Workspace id (e.g., "ws_engineering"). */ @@ -133,7 +146,11 @@ export async function startWorkspaceBundles( allowInsecureRemotes?: boolean; workDir?: string; }, -): Promise<{ registries: Map; entries: ProcessInventoryEntry[] }> { +): Promise<{ + registries: Map; + entries: ProcessInventoryEntry[]; + failures: BundleStartFailure[]; +}> { const workDir = opts?.workDir ?? join(process.env.NB_WORK_DIR ?? 
"", ".nimblebrain"); const workspaces = await workspaceStore.list(); const inventory = buildProcessInventory(workspaces, workDir); @@ -174,6 +191,7 @@ export async function startWorkspaceBundles( wsEntries.map((entry) => ({ wsId, entry })), ); const resultEntries: ProcessInventoryEntry[] = new Array(flat.length); + const failures: BundleStartFailure[] = []; const concurrency = resolveBundleStartConcurrency(); const startMs = Date.now(); @@ -260,9 +278,18 @@ export async function startWorkspaceBundles( resultEntries[idx] = { ...entry, serverName: result.sourceName, meta: result.meta }; } catch (err) { const msg = err instanceof Error ? err.message : String(err); + const bundleName = bundleNameFromRef(entry.bundle); process.stderr.write( `[workspace-runtime] Failed to start ${entry.serverName} in ${wsId}: ${msg}\n`, ); + // Persistent observability: workspace log (via WorkspaceLogSink) + + // SSE broadcast (via SseEventManager). Without this, operators have + // to grep container stderr to discover a failed boot. 
+ eventSink.emit({ + type: "bundle.start_failed", + data: { wsId, serverName: entry.serverName, bundleName, error: msg }, + }); + failures.push({ wsId, serverName: entry.serverName, bundleName, error: msg }); } }); @@ -273,7 +300,7 @@ export async function startWorkspaceBundles( `[workspace-runtime] Started ${finalEntries.length}/${flat.length} bundles in ${elapsedMs}ms (concurrency=${concurrency})`, ); } - return { registries, entries: finalEntries }; + return { registries, entries: finalEntries, failures }; } /** diff --git a/src/tools/health-monitor.ts b/src/tools/health-monitor.ts index 3d1266e9..387f7a30 100644 --- a/src/tools/health-monitor.ts +++ b/src/tools/health-monitor.ts @@ -1,4 +1,5 @@ import type { EventSink } from "../engine/types.ts"; +import type { BundleStartFailure } from "../runtime/workspace-runtime.ts"; import type { McpSource } from "./mcp-source.ts"; export type BundleState = "healthy" | "restarting" | "dead"; @@ -23,6 +24,13 @@ const DEFAULT_CHECK_INTERVAL_MS = 30_000; export interface HealthMonitorOptions { checkIntervalMs?: number; baseDelayMs?: number; + /** + * Boot-time start failures from `startWorkspaceBundles`. These never + * produced an `McpSource`, so they can't be restarted; we surface them + * as terminal `dead` entries in `getStatus()` so `/v1/health` reflects + * the failure instead of silently omitting the bundle. + */ + startFailures?: BundleStartFailure[]; } /** @@ -31,6 +39,7 @@ export interface HealthMonitorOptions { */ export class HealthMonitor { private records: BundleRecord[]; + private startFailures: BundleStartFailure[]; private timer: ReturnType | null = null; private checkIntervalMs: number; private baseDelayMs: number; @@ -47,6 +56,7 @@ export class HealthMonitor { state: "healthy" as BundleState, restartCount: 0, })); + this.startFailures = opts.startFailures ?? []; } /** Start the periodic health check loop. */ @@ -71,12 +81,19 @@ export class HealthMonitor { /** Get per-bundle health info. 
*/ getStatus(): BundleHealth[] { - return this.records.map((r) => ({ + const live: BundleHealth[] = this.records.map((r) => ({ name: r.source.name, state: r.state, uptime: r.source.uptime(), restartCount: r.restartCount, })); + const dead: BundleHealth[] = this.startFailures.map((f) => ({ + name: f.serverName, + state: "dead" as const, + uptime: null, + restartCount: 0, + })); + return [...live, ...dead]; } private async checkOne(record: BundleRecord): Promise { diff --git a/test/unit/health-monitor.test.ts b/test/unit/health-monitor.test.ts index baf59b45..bc3933a7 100644 --- a/test/unit/health-monitor.test.ts +++ b/test/unit/health-monitor.test.ts @@ -147,6 +147,72 @@ describe("HealthMonitor", () => { monitor.stop(); }); + it("includes boot-time start failures as dead entries in getStatus", async () => { + const sink = makeEventCollector(); + const monitor = new HealthMonitor([], sink, { + checkIntervalMs: 60_000, + baseDelayMs: 1, + startFailures: [ + { wsId: "ws_a", serverName: "broken", bundleName: "@nb/broken", error: "no manifest" }, + { wsId: "ws_b", serverName: "remote-x", bundleName: "https://x", error: "ECONNREFUSED" }, + ], + }); + + const status = monitor.getStatus(); + expect(status).toHaveLength(2); + + const broken = status.find((s) => s.name === "broken"); + expect(broken?.state).toBe("dead"); + expect(broken?.uptime).toBeNull(); + expect(broken?.restartCount).toBe(0); + + const remote = status.find((s) => s.name === "remote-x"); + expect(remote?.state).toBe("dead"); + + monitor.stop(); + }); + + it("does not attempt to restart boot-time failed bundles", async () => { + const sink = makeEventCollector(); + const monitor = new HealthMonitor([], sink, { + checkIntervalMs: 60_000, + baseDelayMs: 1, + startFailures: [ + { wsId: "ws_a", serverName: "broken", bundleName: "@nb/broken", error: "no manifest" }, + ], + }); + + await monitor.check(); + + // No restart, no extra events — these never produced a source to restart. 
+ expect(sink.events).toHaveLength(0); + const status = monitor.getStatus(); + expect(status[0]!.state).toBe("dead"); + + monitor.stop(); + }); + + it("merges live sources and start failures in getStatus", async () => { + const source = makeMockSource("live-one"); + const sink = makeEventCollector(); + const monitor = new HealthMonitor([source], sink, { + checkIntervalMs: 60_000, + baseDelayMs: 1, + startFailures: [ + { wsId: "ws_a", serverName: "dead-one", bundleName: "@nb/dead", error: "boom" }, + ], + }); + + const status = monitor.getStatus(); + expect(status).toHaveLength(2); + const names = status.map((s) => s.name).sort(); + expect(names).toEqual(["dead-one", "live-one"]); + expect(status.find((s) => s.name === "live-one")?.state).toBe("healthy"); + expect(status.find((s) => s.name === "dead-one")?.state).toBe("dead"); + + monitor.stop(); + }); + it("stop() clears the interval so no more checks run", async () => { const source = makeMockSource("interval-bundle"); const sink = makeEventCollector(); diff --git a/test/unit/runtime/workspace-runtime.test.ts b/test/unit/runtime/workspace-runtime.test.ts index 8137ca89..49d5fb09 100644 --- a/test/unit/runtime/workspace-runtime.test.ts +++ b/test/unit/runtime/workspace-runtime.test.ts @@ -1,13 +1,18 @@ import { afterEach, beforeEach, describe, expect, it } from "bun:test"; +import { mkdtempSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; import { join } from "node:path"; import type { BundleRef } from "../../../src/bundles/types.ts"; +import type { EngineEvent, EventSink } from "../../../src/engine/types.ts"; import type { Workspace } from "../../../src/workspace/types.ts"; import { buildProcessInventory, mapWithConcurrency, type ProcessInventoryEntry, resolveBundleStartConcurrency, + startWorkspaceBundles, } from "../../../src/runtime/workspace-runtime.ts"; +import type { WorkspaceStore } from "../../../src/workspace/workspace-store.ts"; // 
--------------------------------------------------------------------------- // Fixtures @@ -301,3 +306,123 @@ describe("mapWithConcurrency", () => { ).rejects.toBe(err); }); }); + +// --------------------------------------------------------------------------- +// startWorkspaceBundles — bundle.start_failed observability +// --------------------------------------------------------------------------- + +function makeStore(workspaces: Workspace[]): WorkspaceStore { + return { + async list() { + return workspaces; + }, + } as unknown as WorkspaceStore; +} + +function makeEventCollector(): EventSink & { events: EngineEvent[] } { + const events: EngineEvent[] = []; + return { + events, + emit(event: EngineEvent) { + events.push(event); + }, + }; +} + +describe("startWorkspaceBundles — bundle.start_failed", () => { + let workDir: string; + + beforeEach(() => { + workDir = mkdtempSync(join(tmpdir(), "ws-runtime-test-")); + }); + + afterEach(() => { + rmSync(workDir, { recursive: true, force: true }); + }); + + it("emits bundle.start_failed when a bundle fails to start", async () => { + // Path-based bundle pointing at a nonexistent directory: startBundleSource + // throws "Local bundle not found" from buildLocalSource. No fs setup needed. + const ws: Workspace = { + id: "ws_test", + name: "Test", + members: [], + bundles: [{ path: join(workDir, "does-not-exist") }], + createdAt: "2026-01-01T00:00:00Z", + updatedAt: "2026-01-01T00:00:00Z", + }; + const sink = makeEventCollector(); + + const result = await startWorkspaceBundles(makeStore([ws]), [], null, sink, undefined, { + workDir, + }); + + // Catch path keeps siblings unaffected — registry exists, just no source registered. 
+ expect(result.registries.get("ws_test")).toBeDefined(); + expect(result.entries).toHaveLength(0); + + const failedEvents = sink.events.filter((e) => e.type === "bundle.start_failed"); + expect(failedEvents).toHaveLength(1); + const { data } = failedEvents[0]!; + expect(data.wsId).toBe("ws_test"); + expect(data.serverName).toBe("does-not-exist"); + expect(typeof data.error).toBe("string"); + expect(data.error as string).toContain("Local bundle not found"); + }); + + it("returns failures array alongside entries", async () => { + const ws: Workspace = { + id: "ws_test", + name: "Test", + members: [], + bundles: [{ path: join(workDir, "missing-bundle") }], + createdAt: "2026-01-01T00:00:00Z", + updatedAt: "2026-01-01T00:00:00Z", + }; + const sink = makeEventCollector(); + + const result = await startWorkspaceBundles(makeStore([ws]), [], null, sink, undefined, { + workDir, + }); + + expect(result.failures).toBeDefined(); + expect(result.failures).toHaveLength(1); + const failure = result.failures![0]!; + expect(failure.wsId).toBe("ws_test"); + expect(failure.serverName).toBe("missing-bundle"); + expect(failure.error).toContain("Local bundle not found"); + }); + + it("a failed bundle does not abort siblings — workspace still gets a registry", async () => { + const ws1: Workspace = { + id: "ws_failing", + name: "Failing", + members: [], + bundles: [{ path: join(workDir, "broken") }], + createdAt: "2026-01-01T00:00:00Z", + updatedAt: "2026-01-01T00:00:00Z", + }; + const ws2: Workspace = { + id: "ws_empty", + name: "Empty", + members: [], + bundles: [], + createdAt: "2026-01-01T00:00:00Z", + updatedAt: "2026-01-01T00:00:00Z", + }; + const sink = makeEventCollector(); + + const result = await startWorkspaceBundles( + makeStore([ws1, ws2]), + [], + null, + sink, + undefined, + { workDir }, + ); + + expect(result.registries.get("ws_failing")).toBeDefined(); + expect(result.registries.get("ws_empty")).toBeDefined(); + expect(result.failures).toHaveLength(1); + }); +}); diff 
--git a/test/unit/sse-event-manager.test.ts b/test/unit/sse-event-manager.test.ts index ffcc7d4a..ef560c24 100644 --- a/test/unit/sse-event-manager.test.ts +++ b/test/unit/sse-event-manager.test.ts @@ -117,6 +117,26 @@ describe("SseEventManager — routing table", () => { expect(wsB.events).toEqual(["bundle.crashed"]); }); + test("bundle.start_failed is forwarded to the matching workspace only", async () => { + const wsA = collect(mgr.addClient("ws_a")); + const wsB = collect(mgr.addClient("ws_b")); + released.push(wsA.release, wsB.release); + + mgr.emit({ + type: "bundle.start_failed", + data: { + wsId: "ws_a", + serverName: "broken", + bundleName: "@nb/broken", + error: "spawn failed", + }, + }); + await flush(); + + expect(wsA.events).toContain("bundle.start_failed"); + expect(wsB.events).not.toContain("bundle.start_failed"); + }); + test("workspace-scoped event with missing wsId is dropped (no global fan-out)", async () => { const wsA = collect(mgr.addClient("ws_a")); const wsB = collect(mgr.addClient("ws_b")); diff --git a/test/unit/workspace-log-sink.test.ts b/test/unit/workspace-log-sink.test.ts index fd181484..ab8c5009 100644 --- a/test/unit/workspace-log-sink.test.ts +++ b/test/unit/workspace-log-sink.test.ts @@ -91,6 +91,7 @@ describe("WorkspaceLogSink", () => { "bundle.crashed", "bundle.recovered", "bundle.dead", + "bundle.start_failed", "data.changed", "config.changed", "skill.created", @@ -113,6 +114,28 @@ describe("WorkspaceLogSink", () => { expect(lines).toHaveLength(workspaceTypes.length); }); + it("writes bundle.start_failed event with error details", () => { + const sink = new WorkspaceLogSink({ dir }); + sink.emit( + makeEvent("bundle.start_failed", { + wsId: "ws_a", + serverName: "broken", + bundleName: "@nb/broken", + error: "ENOENT: manifest.json missing", + }), + ); + + const files = readdirSync(join(dir, "workspace")); + const lines = readFileSync(join(dir, "workspace", files[0]!), "utf-8").trim().split("\n"); + expect(lines).toHaveLength(1); 
+ + const record = JSON.parse(lines[0]!); + expect(record.event).toBe("bundle.start_failed"); + expect(record.wsId).toBe("ws_a"); + expect(record.serverName).toBe("broken"); + expect(record.error).toBe("ENOENT: manifest.json missing"); + }); + it("close() is a no-op", () => { const sink = new WorkspaceLogSink({ dir }); expect(() => sink.close()).not.toThrow(); From f7372c100ed921a01eb85899658fad648190e966 Mon Sep 17 00:00:00 2001 From: Mason <31372737+Ovaculos@users.noreply.github.com> Date: Wed, 13 May 2026 14:57:36 -0500 Subject: [PATCH 2/5] feat(health-monitor): propagate wsId on dead BundleHealth entries `BundleHealth.name` is the source name (slugged from manifest or URL). Two workspaces installing the same connector produce identical names, so a same-named start failure across workspaces would render as indistinguishable `dead` rows in `/v1/health`. Add an optional `wsId?: string` to `BundleHealth`. Populated only for boot-time start failures (the data is on `BundleStartFailure`); live entries leave it undefined because `McpSource` doesn't carry a wsId. Consumers can disambiguate without a schema migration on the live path. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/tools/health-monitor.ts | 7 +++++++ test/unit/health-monitor.test.ts | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/src/tools/health-monitor.ts b/src/tools/health-monitor.ts index 387f7a30..d348a6e2 100644 --- a/src/tools/health-monitor.ts +++ b/src/tools/health-monitor.ts @@ -9,6 +9,12 @@ export interface BundleHealth { state: BundleState; uptime: number | null; restartCount: number; + /** + * Workspace id — populated only for boot-time start failures (live + * sources don't carry a wsId on `McpSource`). Lets `/v1/health` + * consumers distinguish same-named failed bundles across workspaces. 
+ */ + wsId?: string; } interface BundleRecord { @@ -92,6 +98,7 @@ export class HealthMonitor { state: "dead" as const, uptime: null, restartCount: 0, + wsId: f.wsId, })); return [...live, ...dead]; } diff --git a/test/unit/health-monitor.test.ts b/test/unit/health-monitor.test.ts index bc3933a7..327b1645 100644 --- a/test/unit/health-monitor.test.ts +++ b/test/unit/health-monitor.test.ts @@ -165,9 +165,11 @@ describe("HealthMonitor", () => { expect(broken?.state).toBe("dead"); expect(broken?.uptime).toBeNull(); expect(broken?.restartCount).toBe(0); + expect(broken?.wsId).toBe("ws_a"); const remote = status.find((s) => s.name === "remote-x"); expect(remote?.state).toBe("dead"); + expect(remote?.wsId).toBe("ws_b"); monitor.stop(); }); @@ -208,7 +210,9 @@ describe("HealthMonitor", () => { const names = status.map((s) => s.name).sort(); expect(names).toEqual(["dead-one", "live-one"]); expect(status.find((s) => s.name === "live-one")?.state).toBe("healthy"); + expect(status.find((s) => s.name === "live-one")?.wsId).toBeUndefined(); expect(status.find((s) => s.name === "dead-one")?.state).toBe("dead"); + expect(status.find((s) => s.name === "dead-one")?.wsId).toBe("ws_a"); monitor.stop(); }); From 4170a2d8397df092a0c491cdd6a6aa2c78457a4e Mon Sep 17 00:00:00 2001 From: Mason <31372737+Ovaculos@users.noreply.github.com> Date: Wed, 13 May 2026 14:57:47 -0500 Subject: [PATCH 3/5] refactor(runtime): defensive-copy bundleStartFailures() return value Return a shallow copy instead of the internal array reference, matching the pattern used by `bundleNames()` one method up. HealthMonitor stores the reference and doesn't mutate it today, but exposing the live array invites future callers to splice or push into it and silently corrupt the boot-time failure record. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/runtime/runtime.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/runtime.ts b/src/runtime/runtime.ts index 06b93686..98eb55f6 100644 --- a/src/runtime/runtime.ts +++ b/src/runtime/runtime.ts @@ -1106,7 +1106,7 @@ export class Runtime { * construction so failed bundles appear as `dead` in `/v1/health`. */ bundleStartFailures(): import("./workspace-runtime.ts").BundleStartFailure[] { - return this._bundleStartFailures; + return [...this._bundleStartFailures]; } /** Get MCP sources across all workspace registries (for health monitoring). */ From af8c59f22441547fdb1604714d0a4f07ebc0c31e Mon Sep 17 00:00:00 2001 From: Mason <31372737+Ovaculos@users.noreply.github.com> Date: Wed, 13 May 2026 14:58:50 -0500 Subject: [PATCH 4/5] refactor(events): rename bundle.start_failed to bundle.startFailed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Match the casing convention of sibling bundle.* events (`installed`, `uninstalled`, `crashed`, `recovered`, `dead`) which use a single camel/lowercase token after the dot. Pure rename — no payload or routing change. Internal-only event with no external subscribers yet, so safe to rename without a deprecation window. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/adapters/workspace-log-sink.ts | 2 +- src/api/events.ts | 2 +- src/engine/types.ts | 2 +- src/runtime/workspace-runtime.ts | 4 ++-- test/unit/runtime/workspace-runtime.test.ts | 8 ++++---- test/unit/sse-event-manager.test.ts | 8 ++++---- test/unit/workspace-log-sink.test.ts | 8 ++++---- 7 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/adapters/workspace-log-sink.ts b/src/adapters/workspace-log-sink.ts index a4bdedb8..95cfd753 100644 --- a/src/adapters/workspace-log-sink.ts +++ b/src/adapters/workspace-log-sink.ts @@ -9,7 +9,7 @@ const WORKSPACE_EVENTS = new Set([ "bundle.crashed", "bundle.recovered", "bundle.dead", - "bundle.start_failed", + "bundle.startFailed", "data.changed", "config.changed", "skill.created", diff --git a/src/api/events.ts b/src/api/events.ts index 9daffa42..8dbadb18 100644 --- a/src/api/events.ts +++ b/src/api/events.ts @@ -49,7 +49,7 @@ const SSE_ROUTES: Partial> = { "bundle.crashed": { scope: "workspace", wsIdField: "wsId" }, "bundle.recovered": { scope: "workspace", wsIdField: "wsId" }, "bundle.dead": { scope: "workspace", wsIdField: "wsId" }, - "bundle.start_failed": { scope: "workspace", wsIdField: "wsId" }, + "bundle.startFailed": { scope: "workspace", wsIdField: "wsId" }, // Per-principal connection state — workspace-scoped. Drives the // pending-auth banner; without forwarding here, the banner never auto-clears // after a user completes interactive OAuth. diff --git a/src/engine/types.ts b/src/engine/types.ts index b96b0df6..6ffa11e6 100644 --- a/src/engine/types.ts +++ b/src/engine/types.ts @@ -71,7 +71,7 @@ export type EngineEventType = * Distinct from `bundle.crashed`, which requires a running source that * went away. Payload: { wsId, serverName, bundleName, error }. */ - | "bundle.start_failed" + | "bundle.startFailed" /** * Per-principal connection state change for a remote URL bundle. * Payload: { wsId, serverName, principalId, state, authorizationUrl? }. 
diff --git a/src/runtime/workspace-runtime.ts b/src/runtime/workspace-runtime.ts index 2abd4a7a..f3cf85dc 100644 --- a/src/runtime/workspace-runtime.ts +++ b/src/runtime/workspace-runtime.ts @@ -24,7 +24,7 @@ import type { WorkspaceStore } from "../workspace/workspace-store.ts"; /** * A boot-time bundle startup failure — recorded when `startBundleSource` - * throws. Surfaced via `bundle.start_failed` event (workspace log + SSE) + * throws. Surfaced via `bundle.startFailed` event (workspace log + SSE) * and threaded into HealthMonitor so `/v1/health` reports the failed * bundle as `state: "dead"` rather than omitting it entirely. */ @@ -286,7 +286,7 @@ export async function startWorkspaceBundles( // SSE broadcast (via SseEventManager). Without this, operators have // to grep container stderr to discover a failed boot. eventSink.emit({ - type: "bundle.start_failed", + type: "bundle.startFailed", data: { wsId, serverName: entry.serverName, bundleName, error: msg }, }); failures.push({ wsId, serverName: entry.serverName, bundleName, error: msg }); diff --git a/test/unit/runtime/workspace-runtime.test.ts b/test/unit/runtime/workspace-runtime.test.ts index 49d5fb09..465bdf2b 100644 --- a/test/unit/runtime/workspace-runtime.test.ts +++ b/test/unit/runtime/workspace-runtime.test.ts @@ -308,7 +308,7 @@ describe("mapWithConcurrency", () => { }); // --------------------------------------------------------------------------- -// startWorkspaceBundles — bundle.start_failed observability +// startWorkspaceBundles — bundle.startFailed observability // --------------------------------------------------------------------------- function makeStore(workspaces: Workspace[]): WorkspaceStore { @@ -329,7 +329,7 @@ function makeEventCollector(): EventSink & { events: EngineEvent[] } { }; } -describe("startWorkspaceBundles — bundle.start_failed", () => { +describe("startWorkspaceBundles — bundle.startFailed", () => { let workDir: string; beforeEach(() => { @@ -340,7 +340,7 @@ 
describe("startWorkspaceBundles — bundle.start_failed", () => { rmSync(workDir, { recursive: true, force: true }); }); - it("emits bundle.start_failed when a bundle fails to start", async () => { + it("emits bundle.startFailed when a bundle fails to start", async () => { // Path-based bundle pointing at a nonexistent directory: startBundleSource // throws "Local bundle not found" from buildLocalSource. No fs setup needed. const ws: Workspace = { @@ -361,7 +361,7 @@ describe("startWorkspaceBundles — bundle.start_failed", () => { expect(result.registries.get("ws_test")).toBeDefined(); expect(result.entries).toHaveLength(0); - const failedEvents = sink.events.filter((e) => e.type === "bundle.start_failed"); + const failedEvents = sink.events.filter((e) => e.type === "bundle.startFailed"); expect(failedEvents).toHaveLength(1); const { data } = failedEvents[0]!; expect(data.wsId).toBe("ws_test"); diff --git a/test/unit/sse-event-manager.test.ts b/test/unit/sse-event-manager.test.ts index ef560c24..bf56908b 100644 --- a/test/unit/sse-event-manager.test.ts +++ b/test/unit/sse-event-manager.test.ts @@ -117,13 +117,13 @@ describe("SseEventManager — routing table", () => { expect(wsB.events).toEqual(["bundle.crashed"]); }); - test("bundle.start_failed is forwarded to the matching workspace only", async () => { + test("bundle.startFailed is forwarded to the matching workspace only", async () => { const wsA = collect(mgr.addClient("ws_a")); const wsB = collect(mgr.addClient("ws_b")); released.push(wsA.release, wsB.release); mgr.emit({ - type: "bundle.start_failed", + type: "bundle.startFailed", data: { wsId: "ws_a", serverName: "broken", @@ -133,8 +133,8 @@ describe("SseEventManager — routing table", () => { }); await flush(); - expect(wsA.events).toContain("bundle.start_failed"); - expect(wsB.events).not.toContain("bundle.start_failed"); + expect(wsA.events).toContain("bundle.startFailed"); + expect(wsB.events).not.toContain("bundle.startFailed"); }); test("workspace-scoped 
event with missing wsId is dropped (no global fan-out)", async () => { diff --git a/test/unit/workspace-log-sink.test.ts b/test/unit/workspace-log-sink.test.ts index ab8c5009..c2a066b6 100644 --- a/test/unit/workspace-log-sink.test.ts +++ b/test/unit/workspace-log-sink.test.ts @@ -91,7 +91,7 @@ describe("WorkspaceLogSink", () => { "bundle.crashed", "bundle.recovered", "bundle.dead", - "bundle.start_failed", + "bundle.startFailed", "data.changed", "config.changed", "skill.created", @@ -114,10 +114,10 @@ describe("WorkspaceLogSink", () => { expect(lines).toHaveLength(workspaceTypes.length); }); - it("writes bundle.start_failed event with error details", () => { + it("writes bundle.startFailed event with error details", () => { const sink = new WorkspaceLogSink({ dir }); sink.emit( - makeEvent("bundle.start_failed", { + makeEvent("bundle.startFailed", { wsId: "ws_a", serverName: "broken", bundleName: "@nb/broken", @@ -130,7 +130,7 @@ describe("WorkspaceLogSink", () => { expect(lines).toHaveLength(1); const record = JSON.parse(lines[0]!); - expect(record.event).toBe("bundle.start_failed"); + expect(record.event).toBe("bundle.startFailed"); expect(record.wsId).toBe("ws_a"); expect(record.serverName).toBe("broken"); expect(record.error).toBe("ENOENT: manifest.json missing"); From e55eefb43052da06f397946ab3c73d4608d3011b Mon Sep 17 00:00:00 2001 From: Mason <31372737+Ovaculos@users.noreply.github.com> Date: Wed, 13 May 2026 14:59:19 -0500 Subject: [PATCH 5/5] test(workspace-runtime): assert shape over error message string MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The startWorkspaceBundles failure tests asserted on a substring of the error message ("Local bundle not found") emitted by buildLocalSource. That couples the test to wording inside a different module — rewording the error there would break tests here for no behavioral reason. 
Switch to shape assertions: error and bundleName are non-empty strings, plus the existing wsId / serverName equality checks. The behavior under test is "a failure was recorded with the expected fields populated," not "the message reads exactly this." Co-Authored-By: Claude Opus 4.7 (1M context) --- test/unit/runtime/workspace-runtime.test.ts | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/test/unit/runtime/workspace-runtime.test.ts b/test/unit/runtime/workspace-runtime.test.ts index 465bdf2b..4ba25400 100644 --- a/test/unit/runtime/workspace-runtime.test.ts +++ b/test/unit/runtime/workspace-runtime.test.ts @@ -342,7 +342,7 @@ describe("startWorkspaceBundles — bundle.startFailed", () => { it("emits bundle.startFailed when a bundle fails to start", async () => { // Path-based bundle pointing at a nonexistent directory: startBundleSource - // throws "Local bundle not found" from buildLocalSource. No fs setup needed. + // throws from buildLocalSource. No fs setup needed. 
const ws: Workspace = { id: "ws_test", name: "Test", @@ -366,8 +366,10 @@ describe("startWorkspaceBundles — bundle.startFailed", () => { const { data } = failedEvents[0]!; expect(data.wsId).toBe("ws_test"); expect(data.serverName).toBe("does-not-exist"); + expect(typeof data.bundleName).toBe("string"); + expect((data.bundleName as string).length).toBeGreaterThan(0); expect(typeof data.error).toBe("string"); - expect(data.error as string).toContain("Local bundle not found"); + expect((data.error as string).length).toBeGreaterThan(0); }); it("returns failures array alongside entries", async () => { @@ -390,7 +392,10 @@ describe("startWorkspaceBundles — bundle.startFailed", () => { const failure = result.failures![0]!; expect(failure.wsId).toBe("ws_test"); expect(failure.serverName).toBe("missing-bundle"); - expect(failure.error).toContain("Local bundle not found"); + expect(typeof failure.bundleName).toBe("string"); + expect(failure.bundleName.length).toBeGreaterThan(0); + expect(typeof failure.error).toBe("string"); + expect(failure.error.length).toBeGreaterThan(0); }); it("a failed bundle does not abort siblings — workspace still gets a registry", async () => {