diff --git a/bin/lib/registry.js b/bin/lib/registry.js index a3e2692b1..79df06c3d 100644 --- a/bin/lib/registry.js +++ b/bin/lib/registry.js @@ -7,6 +7,102 @@ const fs = require("fs"); const path = require("path"); const REGISTRY_FILE = path.join(process.env.HOME || "/tmp", ".nemoclaw", "sandboxes.json"); +const LOCK_DIR = REGISTRY_FILE + ".lock"; +const LOCK_OWNER = path.join(LOCK_DIR, "owner"); +const LOCK_STALE_MS = 10_000; +const LOCK_RETRY_MS = 100; +const LOCK_MAX_RETRIES = 120; + +/** + * Acquire an advisory lock using mkdir (atomic on POSIX). + * Writes an owner file with PID for stale-lock detection via process liveness. + */ +function acquireLock() { + fs.mkdirSync(path.dirname(REGISTRY_FILE), { recursive: true, mode: 0o700 }); + const sleepBuf = new Int32Array(new SharedArrayBuffer(4)); + for (let i = 0; i < LOCK_MAX_RETRIES; i++) { + try { + fs.mkdirSync(LOCK_DIR); + const ownerTmp = LOCK_OWNER + ".tmp." + process.pid; + try { + fs.writeFileSync(ownerTmp, String(process.pid), { mode: 0o600 }); + fs.renameSync(ownerTmp, LOCK_OWNER); + } catch (ownerErr) { + // Remove the directory we just created so it doesn't look like a stale lock + try { fs.unlinkSync(ownerTmp); } catch { /* best effort */ } + try { fs.unlinkSync(LOCK_OWNER); } catch { /* best effort */ } + try { fs.rmdirSync(LOCK_DIR); } catch { /* best effort */ } + throw ownerErr; + } + return; + } catch (err) { + if (err.code !== "EEXIST") throw err; + // Check if the lock owner is still alive + let ownerChecked = false; + try { + const ownerPid = parseInt(fs.readFileSync(LOCK_OWNER, "utf-8").trim(), 10); + if (Number.isFinite(ownerPid) && ownerPid > 0) { + ownerChecked = true; + let alive; + try { + process.kill(ownerPid, 0); + alive = true; + } catch (killErr) { + // EPERM means the process exists but we lack permission — still alive + alive = killErr.code === "EPERM"; + } + if (!alive) { + // Verify PID hasn't changed (TOCTOU guard) + const recheck = parseInt(fs.readFileSync(LOCK_OWNER, "utf-8").trim(), 10); + if (recheck === ownerPid) { + fs.rmSync(LOCK_DIR, { recursive: true, force: true }); + continue; + } + } + } + // Owner file empty/corrupt — another process may be mid-write + // (between mkdirSync and renameSync). Fall through to mtime check. + } catch { + // No owner file or lock dir released — fall through to mtime staleness + } + if (!ownerChecked) { + // No valid owner PID available — use mtime as fallback + try { + const stat = fs.statSync(LOCK_DIR); + if (Date.now() - stat.mtimeMs > LOCK_STALE_MS) { + fs.rmSync(LOCK_DIR, { recursive: true, force: true }); + continue; + } + } catch { + // Lock was released between our check — retry immediately + continue; + } + } + Atomics.wait(sleepBuf, 0, 0, LOCK_RETRY_MS); + } + } + throw new Error(`Failed to acquire lock on ${REGISTRY_FILE} after ${LOCK_MAX_RETRIES} retries`); +} + +function releaseLock() { + try { fs.unlinkSync(LOCK_OWNER); } catch (err) { + if (err.code !== "ENOENT") throw err; + } + // rmSync handles leftover tmp files from crashed acquireLock attempts + try { fs.rmSync(LOCK_DIR, { recursive: true, force: true }); } catch (err) { + if (err.code !== "ENOENT") throw err; + } +} + +/** Run fn while holding the registry lock. Returns fn's return value. */ +function withLock(fn) { + acquireLock(); + try { + return fn(); + } finally { + releaseLock(); + } +} function load() { try { @@ -17,10 +113,19 @@ function load() { return { sandboxes: {}, defaultSandbox: null }; } +/** Atomic write: tmp file + rename on the same filesystem. */ function save(data) { const dir = path.dirname(REGISTRY_FILE); fs.mkdirSync(dir, { recursive: true, mode: 0o700 }); - fs.writeFileSync(REGISTRY_FILE, JSON.stringify(data, null, 2), { mode: 0o600 }); + const tmp = REGISTRY_FILE + ".tmp." + process.pid; + try { + fs.writeFileSync(tmp, JSON.stringify(data, null, 2), { mode: 0o600 }); + fs.renameSync(tmp, REGISTRY_FILE); + } catch (err) { + // Clean up partial temp file on failure + try { fs.unlinkSync(tmp); } catch { /* best effort */ } + throw err; + } } function getSandbox(name) { @@ -39,40 +144,49 @@ function getDefault() { } function registerSandbox(entry) { - const data = load(); - data.sandboxes[entry.name] = { - name: entry.name, - createdAt: entry.createdAt || new Date().toISOString(), - model: entry.model || null, - nimContainer: entry.nimContainer || null, - provider: entry.provider || null, - gpuEnabled: entry.gpuEnabled || false, - policies: entry.policies || [], - }; - if (!data.defaultSandbox) { - data.defaultSandbox = entry.name; - } - save(data); + return withLock(() => { + const data = load(); + data.sandboxes[entry.name] = { + name: entry.name, + createdAt: entry.createdAt || new Date().toISOString(), + model: entry.model || null, + nimContainer: entry.nimContainer || null, + provider: entry.provider || null, + gpuEnabled: entry.gpuEnabled || false, + policies: entry.policies || [], + }; + if (!data.defaultSandbox) { + data.defaultSandbox = entry.name; + } + save(data); + }); } function updateSandbox(name, updates) { - const data = load(); - if (!data.sandboxes[name]) return false; - Object.assign(data.sandboxes[name], updates); - save(data); - return true; + return withLock(() => { + const data = load(); + if (!data.sandboxes[name]) return false; + if (Object.prototype.hasOwnProperty.call(updates, "name") && updates.name !== name) { + return false; + } + Object.assign(data.sandboxes[name], updates); + save(data); + return true; + }); } function removeSandbox(name) { - const data = load(); - if (!data.sandboxes[name]) return false; - delete data.sandboxes[name]; - if (data.defaultSandbox === name) { - const remaining = Object.keys(data.sandboxes); - data.defaultSandbox = remaining.length > 0 ? remaining[0] : null; - } - save(data); - return true; + return withLock(() => { + const data = load(); + if (!data.sandboxes[name]) return false; + delete data.sandboxes[name]; + if (data.defaultSandbox === name) { + const remaining = Object.keys(data.sandboxes); + data.defaultSandbox = remaining.length > 0 ? remaining[0] : null; + } + save(data); + return true; + }); } function listSandboxes() { @@ -84,11 +198,13 @@ function listSandboxes() { } function setDefault(name) { - const data = load(); - if (!data.sandboxes[name]) return false; - data.defaultSandbox = name; - save(data); - return true; + return withLock(() => { + const data = load(); + if (!data.sandboxes[name]) return false; + data.defaultSandbox = name; + save(data); + return true; + }); } module.exports = { @@ -101,4 +217,8 @@ module.exports = { removeSandbox, listSandboxes, setDefault, + // Exported for testing + acquireLock, + releaseLock, + withLock, }; diff --git a/test/registry.test.js b/test/registry.test.js index 720209460..25a3138f2 100644 --- a/test/registry.test.js +++ b/test/registry.test.js @@ -66,6 +66,15 @@ describe("registry", () => { expect(registry.updateSandbox("nope", {})).toBe(false); }); + it("updateSandbox rejects name changes", () => { + registry.registerSandbox({ name: "orig" }); + expect(registry.updateSandbox("orig", { name: "renamed" })).toBe(false); + // Original entry unchanged + expect(registry.getSandbox("orig").name).toBe("orig"); + // No ghost entry under new name + expect(registry.getSandbox("renamed")).toBe(null); + }); + it("removeSandbox deletes and shifts default", () => { registry.registerSandbox({ name: "x" }); registry.registerSandbox({ name: "y" }); @@ -106,3 +115,174 @@ describe("registry", () => { expect(sandboxes.length).toBe(0); }); }); + +describe("atomic writes", () => { + const regDir = path.dirname(regFile); + + beforeEach(() => { + if (fs.existsSync(regFile)) fs.unlinkSync(regFile); + // Clean up any leftover tmp files + if (fs.existsSync(regDir)) { + for (const f of fs.readdirSync(regDir)) { + if (f.startsWith("sandboxes.json.tmp.")) { + fs.unlinkSync(path.join(regDir, f)); + } + } + } + }); + + it("save() writes via temp file + rename (no partial writes on disk)", () => { + registry.registerSandbox({ name: "atomic-test" }); + // File must exist and be valid JSON after save + const raw = fs.readFileSync(regFile, "utf-8"); + const data = JSON.parse(raw); + expect(data.sandboxes["atomic-test"].name).toBe("atomic-test"); + // No leftover .tmp files + const tmpFiles = fs.readdirSync(regDir).filter((f) => f.startsWith("sandboxes.json.tmp.")); + expect(tmpFiles).toHaveLength(0); + }); + + it("save() cleans up temp file when rename fails", () => { + fs.mkdirSync(regDir, { recursive: true }); + fs.writeFileSync(regFile, '{"sandboxes":{},"defaultSandbox":null}', { mode: 0o600 }); + + // Stub renameSync so writeFileSync succeeds (temp file is created) + // but the rename step throws — exercising the cleanup branch. + const original = fs.renameSync; + fs.renameSync = () => { throw Object.assign(new Error("EACCES"), { code: "EACCES" }); }; + try { + expect(() => registry.save({ sandboxes: {}, defaultSandbox: null })).toThrow("EACCES"); + } finally { + fs.renameSync = original; + } + // The save() catch block should have removed the temp file + const tmpFiles = fs.readdirSync(regDir).filter((f) => f.startsWith("sandboxes.json.tmp.")); + expect(tmpFiles).toHaveLength(0); + }); +}); + +describe("advisory file locking", () => { + const lockDir = regFile + ".lock"; + const ownerFile = path.join(lockDir, "owner"); + + beforeEach(() => { + if (fs.existsSync(regFile)) fs.unlinkSync(regFile); + fs.rmSync(lockDir, { recursive: true, force: true }); + }); + + it("acquireLock creates lock directory with owner file and releaseLock removes both", () => { + registry.acquireLock(); + expect(fs.existsSync(lockDir)).toBe(true); + expect(fs.existsSync(ownerFile)).toBe(true); + expect(fs.readFileSync(ownerFile, "utf-8").trim()).toBe(String(process.pid)); + registry.releaseLock(); + expect(fs.existsSync(lockDir)).toBe(false); + }); + + it("withLock releases lock even when callback throws", () => { + expect(() => { + registry.withLock(() => { + expect(fs.existsSync(lockDir)).toBe(true); + throw new Error("intentional"); + }); + }).toThrow("intentional"); + expect(fs.existsSync(lockDir)).toBe(false); + }); + + it("acquireLock cleans up lock dir when owner file write fails", () => { + const origWrite = fs.writeFileSync; + let firstCall = true; + fs.writeFileSync = (...args) => { + // Fail only the first writeFileSync targeting the owner tmp file + if (String(args[0]).includes("owner.tmp.") && firstCall) { + firstCall = false; + throw Object.assign(new Error("ENOSPC"), { code: "ENOSPC" }); + } + return origWrite.apply(fs, args); + }; + try { + // First attempt should throw, but no stale lock dir left behind + expect(() => registry.acquireLock()).toThrow("ENOSPC"); + expect(fs.existsSync(lockDir)).toBe(false); + } finally { + fs.writeFileSync = origWrite; + } + }); + + it("acquireLock removes stale lock owned by dead process", () => { + // Create a lock with a PID that doesn't exist (99999999) + fs.mkdirSync(lockDir, { recursive: true }); + fs.writeFileSync(ownerFile, "99999999", { mode: 0o600 }); + + // Should succeed by detecting the dead owner and removing the stale lock + registry.acquireLock(); + expect(fs.existsSync(lockDir)).toBe(true); + expect(fs.readFileSync(ownerFile, "utf-8").trim()).toBe(String(process.pid)); + registry.releaseLock(); + }); + + it("mutating operations acquire and release the lock", () => { + const mkdirCalls = []; + const rmCalls = []; + const origMkdir = fs.mkdirSync; + const origRm = fs.rmSync; + fs.mkdirSync = (...args) => { + if (args[0] === lockDir) mkdirCalls.push(args[0]); + return origMkdir.apply(fs, args); + }; + fs.rmSync = (...args) => { + if (args[0] === lockDir) rmCalls.push(args[0]); + return origRm.apply(fs, args); + }; + try { + registry.registerSandbox({ name: "lock-test" }); + } finally { + fs.mkdirSync = origMkdir; + fs.rmSync = origRm; + } + expect(mkdirCalls.length).toBeGreaterThanOrEqual(1); + expect(rmCalls.length).toBeGreaterThanOrEqual(1); + expect(registry.getSandbox("lock-test").name).toBe("lock-test"); + }); + + it("concurrent writers do not corrupt the registry", () => { + const { spawnSync } = require("child_process"); + const registryPath = path.resolve(path.join(import.meta.dirname, "..", "bin", "lib", "registry.js")); + const homeDir = path.dirname(path.dirname(regFile)); + // Script that spawns 4 workers in parallel, each writing 5 sandboxes + const orchestrator = ` + const { spawn } = require("child_process"); + const workerScript = \` + process.env.HOME = ${JSON.stringify(homeDir)}; + const reg = require(${JSON.stringify(registryPath)}); + const id = process.argv[1]; + for (let i = 0; i < 5; i++) { + reg.registerSandbox({ name: id + "-" + i, model: "m" }); + } + \`; + const workers = []; + for (let w = 0; w < 4; w++) { + workers.push(spawn(process.execPath, ["-e", workerScript, "w" + w])); + } + let exitCount = 0; + let allOk = true; + for (const child of workers) { + child.on("exit", (code) => { + if (code !== 0) allOk = false; + exitCount++; + if (exitCount === workers.length) { + process.exit(allOk ? 0 : 1); + } + }); + } + `; + const result = spawnSync(process.execPath, ["-e", orchestrator], { + encoding: "utf-8", + timeout: 30_000, + }); + expect(result.status, result.stderr).toBe(0); + // All 20 sandboxes (4 workers × 5 each) must be present + const { sandboxes } = registry.listSandboxes(); + expect(sandboxes.length).toBe(20); + }); +});