Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions container/agent-runner/src/poll-loop.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { initTestSessionDb, closeSessionDb, getInboundDb, getOutboundDb } from '
import { getPendingMessages, markCompleted } from './db/messages-in.js';
import { getUndeliveredMessages } from './db/messages-out.js';
import { formatMessages, extractRouting } from './formatter.js';
import { isCorruptionError } from './poll-loop.js';
import { MockProvider } from './providers/mock.js';

beforeEach(() => {
Expand Down Expand Up @@ -377,3 +378,20 @@ describe('end-to-end with mock provider', () => {
expect(outMessages[0].in_reply_to).toBe('m1');
});
});

describe('isCorruptionError', () => {
it('matches the Docker Desktop macOS torn-read symptom', () => {
expect(isCorruptionError('database disk image is malformed')).toBe(true);
});

it('matches wrapped SQLite corruption codes', () => {
expect(isCorruptionError('SqliteError: SQLITE_CORRUPT_VTAB: ...')).toBe(true);
expect(isCorruptionError('file is not a database')).toBe(true);
});

it('returns false for unrelated errors', () => {
expect(isCorruptionError('database is locked')).toBe(false);
expect(isCorruptionError('no such table: messages_in')).toBe(false);
expect(isCorruptionError('')).toBe(false);
});
});
50 changes: 50 additions & 0 deletions container/agent-runner/src/poll-loop.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,30 @@ import type { AgentProvider, AgentQuery, ProviderEvent } from './providers/types
const POLL_INTERVAL_MS = 1000;
const ACTIVE_POLL_INTERVAL_MS = 500;

/**
* Number of consecutive `database disk image is malformed` errors after which
* the follow-up poll gives up and exits the process. At ACTIVE_POLL_INTERVAL_MS
* = 500ms this is roughly 5 seconds — long enough to dodge a transient torn
* read during a host write, short enough to recover quickly from a poisoned
* page cache (host-sweep then respawns with a fresh mount).
*/
const CORRUPTION_STREAK_EXIT = 10;

/**
* True for SQLite errors that indicate a corrupt READ view — almost always a
* cross-mount page-cache coherency issue on Docker Desktop macOS rather than
* actual file damage (host-side integrity_check passes). Reopening the DB
* handle inside this process does NOT recover; only a fresh container mount
* does. Caller's job is to exit so host-sweep respawns the container.
*/
export function isCorruptionError(msg: string): boolean {
return (
msg.includes('database disk image is malformed') ||
msg.includes('SQLITE_CORRUPT') ||
msg.includes('file is not a database')
);
}

function log(msg: string): void {
console.error(`[poll-loop] ${msg}`);
}
Expand Down Expand Up @@ -291,6 +315,7 @@ async function processQuery(
// will kill the container and messages get reset to pending.
let pollInFlight = false;
let endedForCommand = false;
let corruptionStreak = 0;
const pollHandle = setInterval(() => {
if (done || pollInFlight || endedForCommand) return;
pollInFlight = true;
Expand Down Expand Up @@ -362,6 +387,31 @@ async function processQuery(
// path is not, so it needs its own.
const errMsg = err instanceof Error ? err.message : String(err);
log(`Follow-up poll error: ${errMsg}`);

// Detect SQLite cross-mount corruption (Docker Desktop macOS virtiofs /
// gRPC-FUSE coherency bug — the kernel page cache for the inbound.db
// bind mount can latch a torn snapshot mid-host-write, after which
// every fresh openInboundDb() in this process sees the same broken
// view. Reopening inside the container does NOT recover; only a fresh
// container mount does. Exit so the host sweep respawns us.
if (isCorruptionError(errMsg)) {
corruptionStreak += 1;
if (corruptionStreak >= CORRUPTION_STREAK_EXIT) {
log(
`Follow-up poll: ${corruptionStreak} consecutive '${errMsg}' errors — ` +
`inbound.db page cache is poisoned. Exiting so host respawns with a fresh mount.`,
);
// Stop touching the heartbeat so host-sweep stale detection fires
// promptly even if exit() races with in-flight async work.
done = true;
clearInterval(pollHandle);
// Defer exit one tick so this log line flushes through Docker's
// log driver before the process dies.
setTimeout(() => process.exit(75), 100);
}
} else {
corruptionStreak = 0;
}
} finally {
pollInFlight = false;
}
Expand Down