From 3cc50b4b5b45406dabd63a04c9d1d68f5e822192 Mon Sep 17 00:00:00 2001
From: chorus-codes <280607145+chorus-codes@users.noreply.github.com>
Date: Thu, 28 May 2026 22:23:49 +1000
Subject: [PATCH] fix(reviewer): verdict severity tier + diagnostic write + UI
 banner + SSR lineage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Five connected defects surfaced by live bowerbird chat
019E6E3318873C26DCA60409B84F90E9 (gemini wrote 29 lines of severity findings,
billed $0.05, was falsely marked verdict_ambiguous → fallback chain fired →
collision → contradictory UI banners; kimi failed silently with no
diagnostic trace; SSR phantom CLAUDE card for antigravity slots).

1. verdict.ts — recognise `### CRITICAL` / `**HIGH**` headers (with >=200-
   char body) as implicit request_changes. Trailing lookahead requires a
   heading terminator (`:`, `\n`, `**`, EOL) so prose like
   `### High-level review` or `**High-traffic endpoint**` doesn't false-
   positive. Codex + gemini both caught the hyphen-suffix issue in the
   chorus self-review 019E6E7A5D1DF943AD275D5595460D9A.

2. reviewer.ts + reviewer-driver.ts — factored `writeAttemptRow()` helper.
   Every null return (errored, new `empty_no_error`, `verdict_ambiguous`)
   AND every thrown exception now writes a row to `_attempts.jsonl` plus a
   `[reviewer]` daemon log line. Zero rows in a slot's _attempts.jsonl is
   now a true bug signal, not a diagnostic gap. Also stamps
   lineage+model into `_stats.json` on every successful completion.

3. run-artifacts route + participant-card — API derives per-swap
   `actuallyRan: boolean` by matching the swap's `to` against the slot's
   final _stats.json lineage stamp. UI suppresses the swap banner block
   entirely when primary produced the displayed answer AND no swap
   actually ran. Per-entry `actuallyRan` (not `isLast`) drives the
   strikethrough + "actually ran" badge.

4. runs/[runId]/page.tsx — SSR readChatRounds had its own inline
   AGENT_TO_LINEAGE missing `antigravity-cli` / `grok-cli` and defaulting
   unknown agents to literal "claude". Initial server-rendered HTML
   classified antigravity participants as claude → phantom CLAUDE card
   alongside the synthesised ANTIGRAVITY placeholder, vanishing on the
   first client-side /api/run-artifacts poll. Now uses the shared
   AGENT_TO_UI_LINEAGE map and passes raw agent name through on miss.

Tests: 993 pass (was 989; added 4 hyphen-suffix false-positive cases to
verdict tests, including a verbatim replay of the real bowerbird gemini
review classifying as request_changes).
---
 src/app/api/run-artifacts/[chatId]/route.ts   |  36 +++++
 src/app/runs/[runId]/page.tsx                 |  33 +++--
 .../run-viewer/participant-card.tsx           |  26 +++-
 src/components/run-viewer/types.ts            |  11 ++
 src/daemon/runner/reviewer-driver.ts          |  33 +++++
 src/daemon/runner/reviewer.ts                 | 131 ++++++++++++++----
 src/daemon/runner/verdict.ts                  |  35 +++++
 tests/verdict.test.ts                         | 100 +++++++++++++
 8 files changed, 357 insertions(+), 48 deletions(-)

diff --git a/src/app/api/run-artifacts/[chatId]/route.ts b/src/app/api/run-artifacts/[chatId]/route.ts
index 8c19efa..8044894 100644
--- a/src/app/api/run-artifacts/[chatId]/route.ts
+++ b/src/app/api/run-artifacts/[chatId]/route.ts
@@ -43,6 +43,13 @@ interface SwapEntry {
   ts: number;
   fromErrorKind?: string;
   fromErrorMessage?: string;
+  /** True iff the TO voice's run wrote `_stats.json` for this slot
+   *  (lineage + model stamp matches the slot's final _stats.json).
+   *  False when the chain advanced past this entry via collision or
+   *  exhaustion. UI uses this to gate the fallback banner so a slot
+   *  whose primary produced the displayed answer doesn't render a
+   *  misleading "fallback fired / actually ran" badge. */
+  actuallyRan?: boolean;
 }
 
 /**
@@ -138,6 +145,31 @@ function readChatSwaps(chatId: string): SwapEntry[] {
           // the swap's fromModel. Lets the UI render "kimi-k2.6 failed:
           // cli_failed — model not found" instead of a bare arrow.
           const attemptsByModel = readAttemptsByModel(partDir);
+          // Read the slot's `_stats.json` to find which lineage/model
+          // actually produced the final answer.md. reviewer.ts stamps
+          // `lineage` + `model` on every successful completion; the
+          // LAST writer wins. We use this to mark `actuallyRan: true`
+          // on only the swap entry whose `to` matches the final
+          // writer. Collision-exhausted chains (gemini → sonnet where
+          // sonnet was claimed elsewhere) leave _stats.json carrying
+          // the primary's stamp, so no swap entry gets actuallyRan —
+          // and the UI suppresses the misleading banners.
+          let finalLineage: string | undefined;
+          let finalModel: string | undefined;
+          try {
+            const statsPath = path.join(partDir, "_stats.json");
+            if (fs.existsSync(statsPath)) {
+              const stats = JSON.parse(fs.readFileSync(statsPath, "utf-8"));
+              if (typeof stats?.lineage === "string") {
+                finalLineage = stats.lineage;
+              }
+              if (typeof stats?.model === "string") {
+                finalModel = stats.model;
+              }
+            }
+          } catch {
+            /* malformed stats — leave finalLineage undefined */
+          }
           for (const entry of parsed) {
             const valid = isValidSwapEntry(entry);
             if (!valid) continue;
@@ -146,6 +178,10 @@ function readChatSwaps(chatId: string): SwapEntry[] {
               valid.fromErrorKind = att.errorKind;
               valid.fromErrorMessage = att.errorMessage;
             }
+            const toMatchesFinal =
+              finalLineage === valid.toLineage &&
+              (finalModel ?? null) === (valid.toModel ?? null);
+            valid.actuallyRan = toMatchesFinal;
             out.push(valid);
           }
         }
diff --git a/src/app/runs/[runId]/page.tsx b/src/app/runs/[runId]/page.tsx
index 43c0e5e..4acee46 100644
--- a/src/app/runs/[runId]/page.tsx
+++ b/src/app/runs/[runId]/page.tsx
@@ -44,23 +44,23 @@ async function getRunData(runId: string) {
   return { chat, template };
 }
 
-const AGENT_TO_LINEAGE: Record<string, "claude" | "codex" | "gemini" | "opencode" | "kimi" | "openrouter"> = {
-  "claude-code": "claude",
-  "codex-cli": "codex",
-  "gemini-cli": "gemini",
-  "opencode-cli": "opencode",
-  "kimi-cli": "kimi",
-  // HTTP-dispatched shim — runner creates `reviewer-openrouter-N` dirs;
-  // without this entry the lineage fell through to "claude" and rendered
-  // OpenRouter answers with the wrong brand on the run page.
-  openrouter: "openrouter",
-};
+// Single-source-of-truth map shared with the cockpit API route and the
+// run-viewer types. The previous inline copy here was missing
+// `antigravity-cli` and `grok-cli` — those participants fell through to
+// the `?? "claude"` default at the lookup site and the SSR snapshot
+// rendered phantom CLAUDE cards alongside the synthesized ANTIGRAVITY
+// placeholder. The /api/run-artifacts route reads AGENT_TO_UI_LINEAGE
+// (which has the full map), so a client-side poll later reclassified the
+// participant and the phantom vanished — "appears on refresh, goes away
+// after seconds" was exactly that drift.
+import { AGENT_TO_UI_LINEAGE as AGENT_TO_LINEAGE } from "@/lib/agent-name-map";
+import type { ReviewerLineage } from "@/lib/types";
 
 interface ParticipantSnapshot {
   participant: string;
   role: "doer" | "reviewer";
   agentName: string;
-  lineage: "claude" | "codex" | "gemini" | "opencode" | "kimi" | "openrouter";
+  lineage: ReviewerLineage;
   hasAnswer: boolean;
   answer?: string;
   findingsPreview?: string[];
@@ -100,7 +100,14 @@ function readChatRounds(chatId: string): RoundSnapshot[] {
       .map((d) => {
         const role: "doer" | "reviewer" = d.name.startsWith("doer-") ? "doer" : "reviewer";
         const rawAgent = d.name.replace(/^(doer-|reviewer-)/, "").replace(/-\d+$/, "");
-        const lineage = AGENT_TO_LINEAGE[rawAgent] ?? "claude";
+        // Mirror /api/run-artifacts route: pass the raw agent name
+        // through when unknown so the run-page banner shows the actual
+        // CLI name. The hardcoded "claude" fallback misclassified every
+        // future CLI as a phantom claude card. The cast widens the lookup
+        // result back to the ReviewerLineage union — `displayLineage` and
+        // the lineage-maps tolerate unknown strings by falling through to
+        // the lowercased label.
+        const lineage = (AGENT_TO_LINEAGE[rawAgent] ?? rawAgent) as ReviewerLineage;
         const answerPath = path.join(roundDir, d.name, "answer.md");
         // hasAnswer must mirror the API route: gated on the `## DONE`
         // sentinel so a mid-stream doer doesn't render as "done · no
diff --git a/src/components/run-viewer/participant-card.tsx b/src/components/run-viewer/participant-card.tsx
index 5d786f9..21c548c 100644
--- a/src/components/run-viewer/participant-card.tsx
+++ b/src/components/run-viewer/participant-card.tsx
@@ -203,16 +203,28 @@ export function ParticipantCard({
       </div>
 
       {swaps && swaps.length > 0 && (() => {
-        // Only the LAST entry's `to` voice actually produced an answer;
-        // intermediate `to` voices were attempted and themselves failed
-        // (which is what triggered the next swap). Showing "actually ran"
-        // on every row is wrong for chains of length > 1.
         const sorted = swaps.slice().sort((a, b) => a.fallbackIdx - b.fallbackIdx);
+        // Suppress the banner block entirely when the primary produced
+        // the displayed answer AND no swap target actually ran. This
+        // is the gemini bowerbird case (verdict_ambiguous false-
+        // positive → fallback fired → collision → chain exhausted)
+        // where the slot card was rendering DONE + content + three
+        // contradictory amber banners. The primary's content stands
+        // on its own; the failed-fallback breadcrumbs are noise.
+        const someSwapActuallyRan = sorted.some((s) => s.actuallyRan);
+        if (participant.hasAnswer && !someSwapActuallyRan) return null;
         return (
           <div className="space-y-1.5 border-b border-amber-500/30 bg-amber-500/5 px-4 py-2 text-[11px]">
             {sorted.map((s, i) => {
               const isCross = s.reason === "lineage_fallback";
-              const isLast = i === sorted.length - 1;
+              // `actuallyRan` is the truth-bearing signal — derived
+              // server-side from the slot's `_stats.json` lineage+
+              // model stamp matching this entry's `to` voice. The
+              // pre-fix `isLast` heuristic was wrong for collision-
+              // exhausted chains (where the last entry's TO voice
+              // never ran). Strikethrough + dim styling applies to
+              // any entry where the TO didn't run.
+              const ran = s.actuallyRan === true;
               return (
                 <div
                   key={`${s.fromLineage}-${s.fromModel}-${i}`}
@@ -230,14 +242,14 @@ export function ParticipantCard({
                       <ArrowRight className="h-3 w-3 shrink-0 text-amber-300" />
                       <span
                         className={
-                          isLast
+                          ran
                             ? "font-medium text-amber-100"
                             : "text-amber-100/60 line-through"
                         }
                       >
                         {s.toLineage}/{s.toModel}
                       </span>
-                      {isLast && (
+                      {ran && (
                         <span className="rounded bg-amber-500/15 px-1 py-0.5 font-mono text-[9px] text-amber-200">
                           actually ran
                         </span>
diff --git a/src/components/run-viewer/types.ts b/src/components/run-viewer/types.ts
index 83a7ab4..03fe183 100644
--- a/src/components/run-viewer/types.ts
+++ b/src/components/run-viewer/types.ts
@@ -111,6 +111,17 @@ export interface FallbackSwap {
   fromErrorKind?: string;
   /** One-line message from the failing attempt; trimmed to ~200 chars. */
   fromErrorMessage?: string;
+  /** True iff the TO voice was the LAST writer of `_stats.json` for
+   *  this slot (the runner stamps lineage+model on every successful
+   *  `message_done`). Not "did it run at all" — a fallback that
+   *  ran-and-failed before message_done won't stamp stats and will
+   *  report false here. For the UI gate (suppress banner when primary
+   *  has answer + no swap actuallyRan), this is the right semantic:
+   *  the banner mismatch only matters when the displayed answer is the
+   *  primary's, which means no successful fallback writer overrode it.
+   *  Previously the UI inferred "actually ran" from `isLast` in the
+   *  swaps array — wrong for collision/exhausted chains. */
+  actuallyRan?: boolean;
 }
 
 export interface RoundSnapshot {
diff --git a/src/daemon/runner/reviewer-driver.ts b/src/daemon/runner/reviewer-driver.ts
index d1ce491..b59914d 100644
--- a/src/daemon/runner/reviewer-driver.ts
+++ b/src/daemon/runner/reviewer-driver.ts
@@ -621,6 +621,39 @@ async function runReviewer(
             return null;
           } catch (err) {
             threw = true;
+            // Thrown exceptions bypass reviewer.ts's _attempts.jsonl
+            // writer (the finally there only writes on errored=true,
+            // and a throw from spawn/timeout/abort never sets that
+            // flag). Without this row, the operator sees "fallback
+            // fired" with no record of why. Append a structurally-
+            // identical diagnostic before re-throwing so post-mortem
+            // grep across _attempts.jsonl is uniform across crash
+            // mode (errored, silent, thrown).
+            const message =
+              err instanceof Error ? err.message : String(err);
+            try {
+              const attemptsFile = path.join(reviewerDir, '_attempts.jsonl');
+              fs.appendFileSync(
+                attemptsFile,
+                JSON.stringify({
+                  ts: Date.now(),
+                  round,
+                  lineage: entry.lineage,
+                  model: entry.model ?? null,
+                  errorKind: 'runtime_error',
+                  errorMessage: message,
+                  durationMs: 0,
+                }) + '\n',
+              );
+            } catch {
+              /* best-effort */
+            }
+            console.warn(
+              `[reviewer] attempt threw chat=${chatId} round=${round} ` +
+                `slot=${agentName}-${reviewerIdx} ` +
+                `lineage=${entry.lineage} model=${entry.model ?? '(default)'} ` +
+                `kind=runtime_error message=${JSON.stringify(message).slice(0, 300)}`,
+            );
             throw err;
           } finally {
             if (threw) {
diff --git a/src/daemon/runner/reviewer.ts b/src/daemon/runner/reviewer.ts
index 0b9e2e9..6d3d54b 100644
--- a/src/daemon/runner/reviewer.ts
+++ b/src/daemon/runner/reviewer.ts
@@ -223,6 +223,15 @@ export async function runReviewerHeadless(args: {
             path.join(reviewerDir, '_stats.json'),
             JSON.stringify({
               durationMs: Date.now() - startedAt,
+              // Lineage + model stamp lets the run-artifacts route
+              // tell whether the LAST voice that succeeded was the
+              // primary or a fallback. Without this stamp the cockpit
+              // can't distinguish "fallback actually ran" from
+              // "fallback collided, primary's answer is what's shown"
+              // — and renders contradictory banners on collision-
+              // exhausted slots like the bowerbird gemini case.
+              lineage: candidateLineage,
+              ...(candidateModel ? { model: candidateModel } : {}),
               ...(usageForStats ? { usage: usageForStats } : {}),
             }),
             'utf-8',
@@ -434,6 +443,13 @@ export async function runReviewerHeadless(args: {
     // lands regardless of how the run exits (success, error, abort).
     // Append-only JSONL keyed by (round, model) so multi-step fallback
     // chains leave a trail.
+    //
+    // Errored path only — silent failures (empty content, no error event,
+    // verdict_ambiguous) are written below at the null-return sites so
+    // zero rows in _attempts.jsonl genuinely means "no failure to log",
+    // not "diagnostic gap". Bowerbird chat 019E6E3318873C26DCA60409B84F90E9
+    // had a kimi slot that fell through to fallback with NO _attempts row
+    // and NO daemon log line — that gap is what this refactor closes.
     if (errored) {
       const errorKind = errorSummary?.kind ?? 'unknown';
       const errorMessage = errorSummary?.message ?? '(no message captured)';
@@ -445,33 +461,16 @@ export async function runReviewerHeadless(args: {
         lastError.kind = errorKind;
         lastError.message = errorMessage;
       }
-      const durationMs = Date.now() - startedAt;
-      try {
-        const attemptsFile = path.join(reviewerDir, '_attempts.jsonl');
-        const entry = {
-          ts: Date.now(),
-          round,
-          lineage: candidateLineage,
-          model: candidateModel ?? null,
-          errorKind,
-          errorMessage,
-          durationMs,
-        };
-        fs.appendFileSync(attemptsFile, JSON.stringify(entry) + '\n');
-      } catch {
-        /* best-effort — diagnostics shouldn't fail the run */
-      }
-      // Daemon-log line — same content as the JSONL row but at the
-      // daemon level so a single tail of ~/.chorus/logs/daemon.log
-      // shows every failed reviewer attempt across every chat without
-      // walking per-chat dirs. Grep-friendly key=value format mirrors
-      // the openrouter shim's own warn lines.
-      console.warn(
-        `[reviewer] attempt failed chat=${chatId} round=${round} ` +
-          `lineage=${candidateLineage} model=${candidateModel ?? '(default)'} ` +
-          `kind=${errorKind} duration_ms=${durationMs} ` +
-          `message=${JSON.stringify(errorMessage).slice(0, 300)}`,
-      );
+      writeAttemptRow({
+        reviewerDir,
+        chatId,
+        round,
+        lineage: candidateLineage,
+        model: candidateModel,
+        kind: errorKind,
+        message: errorMessage,
+        durationMs: Date.now() - startedAt,
+      });
     }
     // Mirror runDoerHeadless: surface answer.md write failures as a
     // cli_warning so the user sees "stream stopped writing" instead of
@@ -513,7 +512,31 @@ export async function runReviewerHeadless(args: {
   const streamed = finalText && finalText.length > 0 ? finalText : accumulated;
   const content = onDisk.trim().length > 0 ? onDisk : streamed;
   if (errored && content.trim().length === 0) return null;
-  if (content.trim().length === 0) return null;
+  if (content.trim().length === 0) {
+    // Silent-failure path: stream ended without an error event AND
+    // produced no usable content. Previously this returned null with no
+    // _attempts.jsonl row and no daemon log line, leaving the operator
+    // with no record of why a fallback fired. Synthesise a kind +
+    // diagnostic row so every null return is traceable.
+    if (lastError && !lastError.kind) {
+      lastError.kind = 'empty_no_error';
+      lastError.message =
+        'CLI exited without writing usable content to answer.md and without emitting an error event.';
+    }
+    writeAttemptRow({
+      reviewerDir,
+      chatId,
+      round,
+      lineage: candidateLineage,
+      model: candidateModel,
+      kind: lastError?.kind ?? 'empty_no_error',
+      message:
+        lastError?.message ??
+        'CLI exited without writing usable content to answer.md and without emitting an error event.',
+      durationMs: Date.now() - startedAt,
+    });
+    return null;
+  }
 
   // Record healthy on success so a stale quota_exhausted / auth_invalid
   // record from a prior session clears. Without this, the Reviewer
@@ -557,6 +580,58 @@ export async function runReviewerHeadless(args: {
     lastError.kind = 'verdict_ambiguous';
     lastError.message =
       'Reviewer wrote non-empty content but no approve/reject verdict was detected.';
+    writeAttemptRow({
+      reviewerDir,
+      chatId,
+      round,
+      lineage: candidateLineage,
+      model: candidateModel,
+      kind: 'verdict_ambiguous',
+      message: lastError.message,
+      durationMs: Date.now() - startedAt,
+    });
   }
   return verdict;
 }
+
+/**
+ * Append a diagnostic row to `_attempts.jsonl` and emit the matching
+ * `[reviewer] attempt failed …` daemon log line. Called from every path
+ * that returns a non-positive result (errored, empty-content, verdict-
+ * ambiguous) so post-mortem grep across reviewer-slot _attempts.jsonl
+ * sidecars shows every reason a slot fell back. Best-effort writes —
+ * diagnostic failure must never fail the run.
+ */
+function writeAttemptRow(args: {
+  reviewerDir: string;
+  chatId: string;
+  round: number;
+  lineage: string;
+  model: string | undefined;
+  kind: string;
+  message: string;
+  durationMs: number;
+}): void {
+  const { reviewerDir, chatId, round, lineage, model, kind, message, durationMs } = args;
+  try {
+    const attemptsFile = path.join(reviewerDir, '_attempts.jsonl');
+    const entry = {
+      ts: Date.now(),
+      round,
+      lineage,
+      model: model ?? null,
+      errorKind: kind,
+      errorMessage: message,
+      durationMs,
+    };
+    fs.appendFileSync(attemptsFile, JSON.stringify(entry) + '\n');
+  } catch {
+    /* best-effort — diagnostics shouldn't fail the run */
+  }
+  console.warn(
+    `[reviewer] attempt failed chat=${chatId} round=${round} ` +
+      `lineage=${lineage} model=${model ?? '(default)'} ` +
+      `kind=${kind} duration_ms=${durationMs} ` +
+      `message=${JSON.stringify(message).slice(0, 300)}`,
+  );
+}
diff --git a/src/daemon/runner/verdict.ts b/src/daemon/runner/verdict.ts
index c62cb74..28b61f4 100644
--- a/src/daemon/runner/verdict.ts
+++ b/src/daemon/runner/verdict.ts
@@ -47,6 +47,30 @@ export function verdictFromReviewerText(content: string): boolean | null {
   const positives =
     /\b(approve(?:d|s)?|lgtm|looks good to me|no concerns|ship it|ack)\b/;
 
+  // Severity-style reviews: gemini, grok, and several opencode-go models
+  // routinely structure findings as `### CRITICAL` / `**HIGH**` sections
+  // with substantive bodies but no explicit approve/reject prose. The
+  // earlier verdict heuristic returned null on these — burning the
+  // reviewer's $0.05+ spend, firing the fallback chain, and producing
+  // contradictory cockpit cards. A reviewer who flags CRITICAL/HIGH
+  // severity issues clearly wants changes; treat as implicit
+  // request_changes. MEDIUM/LOW are advisory and stay ambiguous.
+  //
+  // Trailing lookahead — `(?=\s*(?:[:\n]|\*\*|$))` — requires the
+  // severity word to be followed by a heading terminator: colon, end
+  // of line, end of string, or closing bold. Without it, prose like
+  // `### High-level review` or `**High-traffic endpoint**` matched via
+  // `\b` (hyphen is a word boundary) and forced an implicit reject.
+  // Codex + gemini both caught this in the chorus self-review
+  // 019E6E7A5D1DF943AD275D5595460D9A — convergent-dissenter rule
+  // applied. Stricter regex misses some valid loose forms (e.g.
+  // `### CRITICAL bug here`) but mid-text false-positive rejections
+  // are worse than missed implicit signals — when in doubt, leave
+  // verdict null and let the reviewer be explicit. 200-char floor
+  // still applies on top.
+  const severityHeader =
+    /(?:^|\n)\s*(?:#{1,6}\s+|\*\*\s*)(?:[\d.]+\s+)?(critical|high)(?=\s*(?:[:\n]|\*\*|$))/i;
+
   const whole = stripped.toLowerCase();
   const tail = stripped.slice(-400).toLowerCase();
 
@@ -60,6 +84,17 @@ export function verdictFromReviewerText(content: string): boolean | null {
   // anywhere in the text.
   if (negatives.test(whole)) return false;
 
+  // Severity-section second pass — between negatives and positives so
+  // an explicit `LGTM` tail still wins over the implicit signal (a
+  // contradictory review like "### CRITICAL bug … LGTM" is rare and
+  // best surfaced as positive so the reviewer's explicit final word
+  // counts). Tested against the original stripped text (not lowercased)
+  // because the regex is case-insensitive and section headers are
+  // typically uppercase in real reviews.
+  if (stripped.length >= 200 && severityHeader.test(stripped)) {
+    if (!positives.test(tail) && !positives.test(whole)) return false;
+  }
+
   // Positives prefer the tail so an analytical review mentioning
   // "good practice" mid-paragraph doesn't get auto-approved without an
   // explicit verdict at the end. Whole-text fallback catches the case
diff --git a/tests/verdict.test.ts b/tests/verdict.test.ts
index 7289143..af8d095 100644
--- a/tests/verdict.test.ts
+++ b/tests/verdict.test.ts
@@ -182,4 +182,104 @@ describe('verdictFromReviewerText', () => {
       ).toBe(true);
     });
   });
+
+  describe('severity-style reviews — implicit request_changes', () => {
+    // The actual gemini-3.1-pro-preview output from chat
+    // 019E6E3318873C26DCA60409B84F90E9 that the heuristic previously
+    // misclassified as null → verdict_ambiguous → wasted fallback chain.
+    const realGeminiReview =
+      `Here are the findings from reviewing the dual-source enrichment pipeline.\n\n` +
+      `### CRITICAL\n\n` +
+      `**1. PCA Index Builder is Non-Functional (Always Empty)**\n` +
+      `In \`pca_index.py\`, the \`build_index\` function never actually calls the \`_entry_from_item\` helper to normalize the data. ` +
+      `It iterates over the raw PCA API response and immediately attempts to use \`out[entry["key"]] = entry\`. ` +
+      `Since the raw PCA dictionary lacks a normalized \`"key"\` field, this will either raise a KeyError or use an incorrect internal ID.\n\n` +
+      `### HIGH\n\n` +
+      `**2. Unhandled PCA Exceptions Crash the Entire Item's Enrichment**\n` +
+      `In \`enrich_listing.py\`, the future completion loop lacks a generic except block for the PCA thread.\n## DONE`;
+
+    it('classifies the real gemini severity-style review as request_changes', () => {
+      expect(verdictFromReviewerText(realGeminiReview)).toBe(false);
+    });
+
+    it('### CRITICAL header with substantive body returns false', () => {
+      const text =
+        '### CRITICAL\n\nThe migration drops the index without rebuilding it, ' +
+        PAD.repeat(2) +
+        '\n## DONE';
+      expect(verdictFromReviewerText(text)).toBe(false);
+    });
+
+    it('**HIGH** bold header with substantive body returns false', () => {
+      const text =
+        '**HIGH**: race condition in queue claim — ' +
+        PAD.repeat(2) +
+        '\n## DONE';
+      expect(verdictFromReviewerText(text)).toBe(false);
+    });
+
+    it('### MEDIUM alone (no CRITICAL/HIGH) stays ambiguous (null)', () => {
+      // MEDIUM is advisory — without an explicit request_changes prose,
+      // the verdict stays null so the slot doesn't auto-block.
+      const text =
+        '### MEDIUM\n\nConsider adding a comment here. ' +
+        PAD.repeat(2) +
+        '\n## DONE';
+      expect(verdictFromReviewerText(text)).toBeNull();
+    });
+
+    it('### CRITICAL with short body (<200 chars) stays ambiguous', () => {
+      // Guards against a passing-mention "critical bug" comment being
+      // misclassified as a substantive rejection.
+      expect(verdictFromReviewerText('### CRITICAL\nnit\n## DONE')).toBeNull();
+    });
+
+    // Codex + gemini caught this false-positive class in the chorus
+    // self-review 019E6E7A5D1DF943AD275D5595460D9A. The trailing
+    // lookahead in verdict.ts now requires the severity word to be
+    // followed by a heading terminator (`:`, `\n`, `**`, EOL) so
+    // hyphen-prefix prose can't masquerade as a severity header.
+    it('`### High-level review` is NOT a severity header (codex review)', () => {
+      const text =
+        '### High-level review\n\nThe approach is sound and the implementation is clean. ' +
+        PAD.repeat(2) +
+        '\n## DONE';
+      expect(verdictFromReviewerText(text)).toBeNull();
+    });
+
+    it('`### High confidence` is NOT a severity header (codex review)', () => {
+      const text =
+        '### High confidence\n\nI have high confidence in this change. ' +
+        PAD.repeat(2) +
+        '\n## DONE';
+      expect(verdictFromReviewerText(text)).toBeNull();
+    });
+
+    it('`**High-traffic endpoint**` is NOT a severity header (codex review)', () => {
+      const text =
+        '**High-traffic endpoint**: this code path serves the homepage. ' +
+        PAD.repeat(2) +
+        '\n## DONE';
+      expect(verdictFromReviewerText(text)).toBeNull();
+    });
+
+    it('`### Critical considerations` is NOT a severity header', () => {
+      const text =
+        '### Critical considerations\n\nA few things to think about. ' +
+        PAD.repeat(2) +
+        '\n## DONE';
+      expect(verdictFromReviewerText(text)).toBeNull();
+    });
+
+    it('### CRITICAL followed by explicit LGTM tail still approves', () => {
+      // Contradictory review — explicit positive verdict in the tail
+      // wins over the implicit severity signal. Reviewer's final word
+      // is the truth-bearing signal.
+      const text =
+        '### CRITICAL\n\nThe issue I called out is now resolved. ' +
+        PAD.repeat(2) +
+        '\nOverall LGTM.\n## DONE';
+      expect(verdictFromReviewerText(text)).toBe(true);
+    });
+  });
 });