-
Notifications
You must be signed in to change notification settings - Fork 10
fix(turn-orchestrator): serialize concurrent approval wakes #198
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,69 @@ | ||
| /** | ||
| * Per-session mutual-exclusion lease for turn FSM transitions. | ||
| * | ||
| * The `turn-step` durable queue has no per-session ordering — `Enqueue` takes | ||
| * only a queue name (see iii-sdk `TriggerAction.Enqueue`). So two | ||
| * `turn::function_awaiting_approval` wakes for one session (one per | ||
| * `approval::resolve` write, fanned out by the `turn::on_approval` state | ||
| * trigger) can run concurrently. Without serialization both load the same | ||
| * parked `turn_state`, execute every call, and finalize — duplicating side | ||
| * effects (the function runs twice) and emitting duplicate | ||
| * `function_execution_end` / `turn_end` frames, which wedges the turn. | ||
| * | ||
| * The only atomic primitive the state worker exposes is `state::update` with | ||
| * `increment` — a locked read-modify-write that returns the prior value (the | ||
| * kv adapter holds the store write-lock for the whole op). Acquire increments | ||
| * a per-session holder counter; the caller that observes prior `0` (or a | ||
| * missing key) won and may proceed. Release resets the counter to `0`. | ||
| * | ||
| * Crash recovery: a holder that dies mid-transition never resets the counter, | ||
| * which would wedge the session forever. A contender whose acquire fails | ||
| * therefore steals a lease whose recorded acquire time is older than | ||
| * {@link LEASE_TTL_MS}. The steal is best-effort (a post-crash window can let | ||
| * two contenders through), but that degrades to the pre-fix behavior only | ||
| * briefly after a crash — far better than a permanent deadlock. | ||
| */ | ||
|
|
||
| import type { ISdk } from '../../runtime/iii.js'; | ||
| import { stateGet, stateSet, stateUpdate } from '../../runtime/state.js'; | ||
|
|
||
| export const LEASE_SCOPE = 'turn_lease'; | ||
| export const LEASE_AT_SCOPE = 'turn_lease_at'; | ||
| /** A holder older than this is assumed crashed and may be stolen. */ | ||
| export const LEASE_TTL_MS = 30_000; | ||
|
|
||
| /** Atomically bump the holder counter; returns the prior count (0 when free/missing). */ | ||
| async function bumpHolders(iii: ISdk, session_id: string): Promise<number> { | ||
| const res = await stateUpdate(iii, LEASE_SCOPE, session_id, [ | ||
| { type: 'increment', path: '', by: 1 }, | ||
| ]); | ||
| const prior = (res as { old_value?: unknown } | null)?.old_value; | ||
| return typeof prior === 'number' ? prior : 0; | ||
| } | ||
|
|
||
| /** | ||
| * Try to acquire the session lease. Returns `true` when this caller holds it | ||
| * and must call {@link releaseSessionLease}; `false` when another transition | ||
| * holds it (the caller should back off / retry). | ||
| */ | ||
| export async function acquireSessionLease(iii: ISdk, session_id: string): Promise<boolean> { | ||
| if ((await bumpHolders(iii, session_id)) === 0) { | ||
| await stateSet(iii, LEASE_AT_SCOPE, session_id, Date.now()); | ||
| return true; | ||
| } | ||
| // Contended — recover a lease abandoned by a crashed holder. | ||
| const acquiredAt = await stateGet(iii, LEASE_AT_SCOPE, session_id); | ||
| if (typeof acquiredAt === 'number' && Date.now() - acquiredAt > LEASE_TTL_MS) { | ||
| await stateSet(iii, LEASE_SCOPE, session_id, 0); | ||
| if ((await bumpHolders(iii, session_id)) === 0) { | ||
| await stateSet(iii, LEASE_AT_SCOPE, session_id, Date.now()); | ||
| return true; | ||
| } | ||
| } | ||
| return false; | ||
| } | ||
|
|
||
| /** Release the session lease. Safe to call only from the holder. */ | ||
| export async function releaseSessionLease(iii: ISdk, session_id: string): Promise<void> { | ||
| await stateSet(iii, LEASE_SCOPE, session_id, 0); | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,6 +9,7 @@ import { | |
| handleApprovalStateWrite, | ||
| handleAwaitingApproval, | ||
| } from '../../src/turn-orchestrator/function-awaiting-approval/process.js'; | ||
| import { TransientError } from '../../src/turn-orchestrator/errors.js'; | ||
| import { handleExecute } from '../../src/turn-orchestrator/function-execute/process.js'; | ||
| import { enterFunctionExecute } from '../../src/turn-orchestrator/function-execute/run.js'; | ||
| import { runTransition } from '../../src/turn-orchestrator/run-transition.js'; | ||
|
|
@@ -77,14 +78,39 @@ async function runTurnStep(iii: ISdk, function_id: string, session_id: string): | |
| return; | ||
| } | ||
| if (function_id === 'turn::function_awaiting_approval') { | ||
| await runTransition(iii, 'function_awaiting_approval', handleAwaitingApproval, payload); | ||
| await runTransition(iii, 'function_awaiting_approval', handleAwaitingApproval, payload, { | ||
| serialize: true, | ||
| }); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Model the durable queue's TransientError retry: a wake that loses the | ||
| * per-session lease re-runs after the holder releases. Yields between attempts | ||
| * so the lease holder can make progress. | ||
| */ | ||
| async function runTurnStepWithRetry( | ||
| iii: ISdk, | ||
| function_id: string, | ||
| session_id: string, | ||
| ): Promise<void> { | ||
| for (let attempt = 0; attempt < 100; attempt += 1) { | ||
| try { | ||
| await runTurnStep(iii, function_id, session_id); | ||
| return; | ||
| } catch (err) { | ||
| if (err instanceof TransientError) { | ||
| await flushMicrotasks(); | ||
| continue; | ||
| } | ||
| throw err; | ||
| } | ||
| } | ||
| } | ||
|
Comment on lines
+97
to
109
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't silently succeed after retry exhaustion. If all 100 attempts hit Proposed fix async function runTurnStepWithRetry(
iii: ISdk,
function_id: string,
session_id: string,
): Promise<void> {
+ let lastTransient: TransientError | null = null;
for (let attempt = 0; attempt < 100; attempt += 1) {
try {
await runTurnStep(iii, function_id, session_id);
return;
} catch (err) {
if (err instanceof TransientError) {
+ lastTransient = err;
await flushMicrotasks();
continue;
}
throw err;
}
}
+ throw lastTransient ?? new Error(`retry budget exhausted for ${function_id}`);
}🤖 Prompt for AI Agents |
||
|
|
||
| export function createParallelApprovalHarness(): ParallelApprovalHarness { | ||
| const stateStore = new Map<string, unknown>(); | ||
| const emitted: AgentEvent[] = []; | ||
| let eventSeq = 0; | ||
|
|
||
| const iii = { | ||
| trigger: vi.fn( | ||
|
|
@@ -126,12 +152,35 @@ export function createParallelApprovalHarness(): ParallelApprovalHarness { | |
| } | ||
|
|
||
| if (function_id === 'state::update') { | ||
| eventSeq += 1; | ||
| return { old_value: eventSeq - 1 }; | ||
| // Faithful atomic read-modify-write per (scope, key): the engine's | ||
| // kv adapter holds the store write-lock for the whole op, so | ||
| // increment returns the prior value (null/absent → treated as 0). | ||
| // Both the event counter and the per-session lease depend on this. | ||
| const p = payload as { | ||
| scope: string; | ||
| key: string; | ||
| ops?: Array<{ type: string; path?: string; by?: number }>; | ||
| }; | ||
| const storeKey = `${p.scope}/${p.key}`; | ||
| const old_value = stateStore.has(storeKey) | ||
| ? structuredClone(stateStore.get(storeKey)) | ||
| : null; | ||
| let next: unknown = old_value; | ||
| for (const op of p.ops ?? []) { | ||
| if (op.type === 'increment' && (op.path ?? '') === '') { | ||
| next = (typeof next === 'number' ? next : 0) + (op.by ?? 1); | ||
| } | ||
| } | ||
| stateStore.set(storeKey, next); | ||
| return { old_value, new_value: structuredClone(next) }; | ||
| } | ||
|
|
||
| if (function_id === 'stream::set') { | ||
| const p = payload as { data: AgentEvent }; | ||
| const p = payload as { stream_name?: string; data: AgentEvent }; | ||
| // events.ts mirrors every turn_end onto a second `agent::turn_end` | ||
| // stream for compaction. Record only the primary `agent::events` | ||
| // stream so `emitted` is a faithful one-entry-per-event log. | ||
| if (p.stream_name === 'agent::turn_end') return null; | ||
| emitted.push(p.data); | ||
| return null; | ||
| } | ||
|
|
@@ -154,7 +203,7 @@ export function createParallelApprovalHarness(): ParallelApprovalHarness { | |
|
|
||
| if (function_id.startsWith('turn::') && action != null) { | ||
| const p = payload as { session_id: string }; | ||
| await runTurnStep(iii as unknown as ISdk, function_id, p.session_id); | ||
| await runTurnStepWithRetry(iii as unknown as ISdk, function_id, p.session_id); | ||
| return null; | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Split-brain lease acquisition is still possible in the normal (non-crash) path.
At Line 50, the holder counter is acquired before the new acquire timestamp is published (Line 51). A concurrent contender can then hit the stale-steal branch at Lines 56-59 using an old
LEASE_AT_SCOPEvalue and acquire too, so two workers may proceed concurrently.This reintroduces duplicate transition execution risk. The lease needs ownership fencing (e.g., token/epoch validation on acquire/steal/release) or native KV lock primitives to avoid this race.