Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions src/app/api/run-artifacts/[chatId]/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,13 @@ interface SwapEntry {
ts: number;
fromErrorKind?: string;
fromErrorMessage?: string;
/** True iff the TO voice's run wrote `_stats.json` for this slot
* (lineage + model stamp matches the slot's final _stats.json).
* False when the chain advanced past this entry via collision or
* exhaustion. UI uses this to gate the fallback banner so a slot
* whose primary produced the displayed answer doesn't render a
* misleading "fallback fired / actually ran" badge. */
actuallyRan?: boolean;
}

/**
Expand Down Expand Up @@ -138,6 +145,31 @@ function readChatSwaps(chatId: string): SwapEntry[] {
// the swap's fromModel. Lets the UI render "kimi-k2.6 failed:
// cli_failed — model not found" instead of a bare arrow.
const attemptsByModel = readAttemptsByModel(partDir);
// Read the slot's `_stats.json` to find which lineage/model
// actually produced the final answer.md. reviewer.ts stamps
// `lineage` + `model` on every successful completion; the
// LAST writer wins. We use this to mark `actuallyRan: true`
// on only the swap entry whose `to` matches the final
// writer. Collision-exhausted chains (gemini → sonnet where
// sonnet was claimed elsewhere) leave _stats.json carrying
// the primary's stamp, so no swap entry gets actuallyRan —
// and the UI suppresses the misleading banners.
let finalLineage: string | undefined;
let finalModel: string | undefined;
try {
const statsPath = path.join(partDir, "_stats.json");
if (fs.existsSync(statsPath)) {
const stats = JSON.parse(fs.readFileSync(statsPath, "utf-8"));
if (typeof stats?.lineage === "string") {
finalLineage = stats.lineage;
}
if (typeof stats?.model === "string") {
finalModel = stats.model;
}
}
} catch {
/* malformed stats — leave finalLineage undefined */
}
for (const entry of parsed) {
const valid = isValidSwapEntry(entry);
if (!valid) continue;
Expand All @@ -146,6 +178,10 @@ function readChatSwaps(chatId: string): SwapEntry[] {
valid.fromErrorKind = att.errorKind;
valid.fromErrorMessage = att.errorMessage;
}
const toMatchesFinal =
finalLineage === valid.toLineage &&
(finalModel ?? null) === (valid.toModel ?? null);
valid.actuallyRan = toMatchesFinal;
out.push(valid);
}
}
Expand Down
33 changes: 20 additions & 13 deletions src/app/runs/[runId]/page.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -44,23 +44,23 @@ async function getRunData(runId: string) {
return { chat, template };
}

const AGENT_TO_LINEAGE: Record<string, "claude" | "codex" | "gemini" | "opencode" | "kimi" | "openrouter"> = {
"claude-code": "claude",
"codex-cli": "codex",
"gemini-cli": "gemini",
"opencode-cli": "opencode",
"kimi-cli": "kimi",
// HTTP-dispatched shim — runner creates `reviewer-openrouter-N` dirs;
// without this entry the lineage fell through to "claude" and rendered
// OpenRouter answers with the wrong brand on the run page.
openrouter: "openrouter",
};
// Single-source-of-truth map shared with the cockpit API route and the
// run-viewer types. The previous inline copy here was missing
// `antigravity-cli` and `grok-cli` — those participants fell through to
// the `?? "claude"` default at the lookup site and the SSR snapshot
// rendered phantom CLAUDE cards alongside the synthesized ANTIGRAVITY
// placeholder. The /api/run-artifacts route reads AGENT_TO_UI_LINEAGE
// (which has the full map), so a client-side poll later reclassified the
// participant and the phantom vanished — "appears on refresh, goes away
// after seconds" was exactly that drift.
import { AGENT_TO_UI_LINEAGE as AGENT_TO_LINEAGE } from "@/lib/agent-name-map";
import type { ReviewerLineage } from "@/lib/types";

interface ParticipantSnapshot {
participant: string;
role: "doer" | "reviewer";
agentName: string;
lineage: "claude" | "codex" | "gemini" | "opencode" | "kimi" | "openrouter";
lineage: ReviewerLineage;
hasAnswer: boolean;
answer?: string;
findingsPreview?: string[];
Expand Down Expand Up @@ -100,7 +100,14 @@ function readChatRounds(chatId: string): RoundSnapshot[] {
.map((d) => {
const role: "doer" | "reviewer" = d.name.startsWith("doer-") ? "doer" : "reviewer";
const rawAgent = d.name.replace(/^(doer-|reviewer-)/, "").replace(/-\d+$/, "");
const lineage = AGENT_TO_LINEAGE[rawAgent] ?? "claude";
// Mirror /api/run-artifacts route: pass the raw agent name
// through when unknown so the run-page banner shows the actual
// CLI name. The hardcoded "claude" fallback misclassified every
// future CLI as a phantom claude card. The cast widens the lookup
// result back to the ReviewerLineage union — `displayLineage` and
// the lineage-maps tolerate unknown strings by falling through to
// the lowercased label.
const lineage = (AGENT_TO_LINEAGE[rawAgent] ?? rawAgent) as ReviewerLineage;
const answerPath = path.join(roundDir, d.name, "answer.md");
// hasAnswer must mirror the API route: gated on the `## DONE`
// sentinel so a mid-stream doer doesn't render as "done · no
Expand Down
26 changes: 19 additions & 7 deletions src/components/run-viewer/participant-card.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
*/
export function ParticipantCard({
participant,
isActive,

Check warning on line 46 in src/components/run-viewer/participant-card.tsx

View workflow job for this annotation

GitHub Actions / typecheck · test · build (Node 20)

'isActive' is defined but never used

Check warning on line 46 in src/components/run-viewer/participant-card.tsx

View workflow job for this annotation

GitHub Actions / typecheck · test · build (Node 22)

'isActive' is defined but never used
liveTail,
chatTerminal,
chatId,
Expand Down Expand Up @@ -203,16 +203,28 @@
</div>

{swaps && swaps.length > 0 && (() => {
// Only the LAST entry's `to` voice actually produced an answer;
// intermediate `to` voices were attempted and themselves failed
// (which is what triggered the next swap). Showing "actually ran"
// on every row is wrong for chains of length > 1.
const sorted = swaps.slice().sort((a, b) => a.fallbackIdx - b.fallbackIdx);
// Suppress the banner block entirely when the primary produced
// the displayed answer AND no swap target actually ran. This
// is the gemini bowerbird case (verdict_ambiguous false-
// positive → fallback fired → collision → chain exhausted)
// where the slot card was rendering DONE + content + three
// contradictory amber banners. The primary's content stands
// on its own; the failed-fallback breadcrumbs are noise.
const someSwapActuallyRan = sorted.some((s) => s.actuallyRan);
if (participant.hasAnswer && !someSwapActuallyRan) return null;
return (
<div className="space-y-1.5 border-b border-amber-500/30 bg-amber-500/5 px-4 py-2 text-[11px]">
{sorted.map((s, i) => {
const isCross = s.reason === "lineage_fallback";
const isLast = i === sorted.length - 1;
// `actuallyRan` is the truth-bearing signal — derived
// server-side from the slot's `_stats.json` lineage+
// model stamp matching this entry's `to` voice. The
// pre-fix `isLast` heuristic was wrong for collision-
// exhausted chains (where the last entry's TO voice
// never ran). Strikethrough + dim styling applies to
// any entry where the TO didn't run.
const ran = s.actuallyRan === true;
return (
<div
key={`${s.fromLineage}-${s.fromModel}-${i}`}
Expand All @@ -230,14 +242,14 @@
<ArrowRight className="h-3 w-3 shrink-0 text-amber-300" />
<span
className={
isLast
ran
? "font-medium text-amber-100"
: "text-amber-100/60 line-through"
}
>
{s.toLineage}/{s.toModel}
</span>
{isLast && (
{ran && (
<span className="rounded bg-amber-500/15 px-1 py-0.5 font-mono text-[9px] text-amber-200">
actually ran
</span>
Expand Down
11 changes: 11 additions & 0 deletions src/components/run-viewer/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,17 @@ export interface FallbackSwap {
fromErrorKind?: string;
/** One-line message from the failing attempt; trimmed to ~200 chars. */
fromErrorMessage?: string;
/** True iff the TO voice was the LAST writer of `_stats.json` for
* this slot (the runner stamps lineage+model on every successful
* `message_done`). Not "did it run at all" — a fallback that
* ran-and-failed before message_done won't stamp stats and will
* report false here. For the UI gate (suppress banner when primary
* has answer + no swap actuallyRan), this is the right semantic:
* the banner mismatch only matters when the displayed answer is the
* primary's, which means no successful fallback writer overrode it.
* Previously the UI inferred "actually ran" from `isLast` in the
* swaps array — wrong for collision/exhausted chains. */
actuallyRan?: boolean;
}

export interface RoundSnapshot {
Expand Down
33 changes: 33 additions & 0 deletions src/daemon/runner/reviewer-driver.ts
Original file line number Diff line number Diff line change
Expand Up @@ -621,6 +621,39 @@ async function runReviewer(
return null;
} catch (err) {
threw = true;
// Thrown exceptions bypass reviewer.ts's _attempts.jsonl
// writer (the finally there only writes on errored=true,
// and a throw from spawn/timeout/abort never sets that
// flag). Without this row, the operator sees "fallback
// fired" with no record of why. Append a structurally-
// identical diagnostic before re-throwing so post-mortem
// grep across _attempts.jsonl is uniform across every
// failure mode (errored, silent, thrown).
const message =
err instanceof Error ? err.message : String(err);
try {
const attemptsFile = path.join(reviewerDir, '_attempts.jsonl');
fs.appendFileSync(
attemptsFile,
JSON.stringify({
ts: Date.now(),
round,
lineage: entry.lineage,
model: entry.model ?? null,
errorKind: 'runtime_error',
errorMessage: message,
durationMs: 0,
}) + '\n',
);
} catch {
/* best-effort */
}
console.warn(
`[reviewer] attempt threw chat=${chatId} round=${round} ` +
`slot=${agentName}-${reviewerIdx} ` +
`lineage=${entry.lineage} model=${entry.model ?? '(default)'} ` +
`kind=runtime_error message=${JSON.stringify(message).slice(0, 300)}`,
);
throw err;
} finally {
if (threw) {
Expand Down
131 changes: 103 additions & 28 deletions src/daemon/runner/reviewer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,15 @@ export async function runReviewerHeadless(args: {
path.join(reviewerDir, '_stats.json'),
JSON.stringify({
durationMs: Date.now() - startedAt,
// Lineage + model stamp lets the run-artifacts route
// tell whether the LAST voice that succeeded was the
// primary or a fallback. Without this stamp the cockpit
// can't distinguish "fallback actually ran" from
// "fallback collided, primary's answer is what's shown"
// — and renders contradictory banners on collision-
// exhausted slots like the bowerbird gemini case.
lineage: candidateLineage,
...(candidateModel ? { model: candidateModel } : {}),
...(usageForStats ? { usage: usageForStats } : {}),
}),
'utf-8',
Expand Down Expand Up @@ -434,6 +443,13 @@ export async function runReviewerHeadless(args: {
// lands regardless of how the run exits (success, error, abort).
// Append-only JSONL keyed by (round, model) so multi-step fallback
// chains leave a trail.
//
// Errored path only — silent failures (empty content, no error event,
// verdict_ambiguous) are written below at the null-return sites so
// zero rows in _attempts.jsonl genuinely means "no failure to log",
// not "diagnostic gap". Bowerbird chat 019E6E3318873C26DCA60409B84F90E9
// had a kimi slot that fell through to fallback with NO _attempts row
// and NO daemon log line — that gap is what this refactor closes.
if (errored) {
const errorKind = errorSummary?.kind ?? 'unknown';
const errorMessage = errorSummary?.message ?? '(no message captured)';
Expand All @@ -445,33 +461,16 @@ export async function runReviewerHeadless(args: {
lastError.kind = errorKind;
lastError.message = errorMessage;
}
const durationMs = Date.now() - startedAt;
try {
const attemptsFile = path.join(reviewerDir, '_attempts.jsonl');
const entry = {
ts: Date.now(),
round,
lineage: candidateLineage,
model: candidateModel ?? null,
errorKind,
errorMessage,
durationMs,
};
fs.appendFileSync(attemptsFile, JSON.stringify(entry) + '\n');
} catch {
/* best-effort — diagnostics shouldn't fail the run */
}
// Daemon-log line — same content as the JSONL row but at the
// daemon level so a single tail of ~/.chorus/logs/daemon.log
// shows every failed reviewer attempt across every chat without
// walking per-chat dirs. Grep-friendly key=value format mirrors
// the openrouter shim's own warn lines.
console.warn(
`[reviewer] attempt failed chat=${chatId} round=${round} ` +
`lineage=${candidateLineage} model=${candidateModel ?? '(default)'} ` +
`kind=${errorKind} duration_ms=${durationMs} ` +
`message=${JSON.stringify(errorMessage).slice(0, 300)}`,
);
writeAttemptRow({
reviewerDir,
chatId,
round,
lineage: candidateLineage,
model: candidateModel,
kind: errorKind,
message: errorMessage,
durationMs: Date.now() - startedAt,
});
}
// Mirror runDoerHeadless: surface answer.md write failures as a
// cli_warning so the user sees "stream stopped writing" instead of
Expand Down Expand Up @@ -513,7 +512,31 @@ export async function runReviewerHeadless(args: {
const streamed = finalText && finalText.length > 0 ? finalText : accumulated;
const content = onDisk.trim().length > 0 ? onDisk : streamed;
if (errored && content.trim().length === 0) return null;
if (content.trim().length === 0) return null;
if (content.trim().length === 0) {
// Silent-failure path: stream ended without an error event AND
// produced no usable content. Previously this returned null with no
// _attempts.jsonl row and no daemon log line, leaving the operator
// with no record of why a fallback fired. Synthesise a kind +
// diagnostic row so every null return is traceable.
if (lastError && !lastError.kind) {
lastError.kind = 'empty_no_error';
lastError.message =
'CLI exited without writing usable content to answer.md and without emitting an error event.';
}
writeAttemptRow({
reviewerDir,
chatId,
round,
lineage: candidateLineage,
model: candidateModel,
kind: lastError?.kind ?? 'empty_no_error',
message:
lastError?.message ??
'CLI exited without writing usable content to answer.md and without emitting an error event.',
durationMs: Date.now() - startedAt,
});
return null;
}

// Record healthy on success so a stale quota_exhausted / auth_invalid
// record from a prior session clears. Without this, the Reviewer
Expand Down Expand Up @@ -557,6 +580,58 @@ export async function runReviewerHeadless(args: {
lastError.kind = 'verdict_ambiguous';
lastError.message =
'Reviewer wrote non-empty content but no approve/reject verdict was detected.';
writeAttemptRow({
reviewerDir,
chatId,
round,
lineage: candidateLineage,
model: candidateModel,
kind: 'verdict_ambiguous',
message: lastError.message,
durationMs: Date.now() - startedAt,
});
}
return verdict;
}

/**
 * Record one failed reviewer attempt in two places: a JSON line appended
 * to `<reviewerDir>/_attempts.jsonl`, and a `[reviewer] attempt failed …`
 * key=value line on `console.warn`.
 *
 * The sidecar write is best-effort: any error raised while building or
 * appending the row is swallowed so diagnostics can never fail the run.
 * The warn line is emitted whether or not the file write succeeded.
 *
 * @param args.reviewerDir Directory that holds the `_attempts.jsonl` sidecar.
 * @param args.chatId Chat identifier; appears only in the log line.
 * @param args.round Round number, written to both the row and the log line.
 * @param args.lineage Lineage of the attempt being recorded.
 * @param args.model Model of the attempt; `undefined` is stored as `null`
 *   in the row and printed as `(default)` in the log line.
 * @param args.kind Failure kind, stored under the `errorKind` key.
 * @param args.message Failure message, stored in full under `errorMessage`;
 *   the log line carries its JSON-quoted form cut to 300 characters.
 * @param args.durationMs Attempt duration in milliseconds.
 */
function writeAttemptRow(args: {
  reviewerDir: string;
  chatId: string;
  round: number;
  lineage: string;
  model: string | undefined;
  kind: string;
  message: string;
  durationMs: number;
}): void {
  try {
    // Key order is kept stable so rows stay byte-comparable across writers.
    const row = JSON.stringify({
      ts: Date.now(),
      round: args.round,
      lineage: args.lineage,
      model: args.model ?? null,
      errorKind: args.kind,
      errorMessage: args.message,
      durationMs: args.durationMs,
    });
    fs.appendFileSync(path.join(args.reviewerDir, '_attempts.jsonl'), row + '\n');
  } catch {
    /* best-effort — diagnostics shouldn't fail the run */
  }

  // Space-separated key=value fields; the message is JSON-quoted first and
  // then truncated, so a long message may lose its closing quote.
  const logFields = [
    '[reviewer] attempt failed',
    `chat=${args.chatId}`,
    `round=${args.round}`,
    `lineage=${args.lineage}`,
    `model=${args.model ?? '(default)'}`,
    `kind=${args.kind}`,
    `duration_ms=${args.durationMs}`,
    `message=${JSON.stringify(args.message).slice(0, 300)}`,
  ];
  console.warn(logFields.join(' '));
}
Loading
Loading