From 3cc50b4b5b45406dabd63a04c9d1d68f5e822192 Mon Sep 17 00:00:00 2001 From: chorus-codes <280607145+chorus-codes@users.noreply.github.com> Date: Thu, 28 May 2026 22:23:49 +1000 Subject: [PATCH] fix(reviewer): verdict severity tier + diagnostic write + UI banner + SSR lineage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five connected defects surfaced by live bowerbird chat 019E6E3318873C26DCA60409B84F90E9 (gemini wrote 29 lines of severity findings, billed $0.05, was falsely marked verdict_ambiguous → fallback chain fired → collision → contradictory UI banners; kimi failed silently with no diagnostic trace; SSR phantom CLAUDE card for antigravity slots). 1. verdict.ts — recognise `### CRITICAL` / `**HIGH**` headers (with >=200- char body) as implicit request_changes. Trailing lookahead requires a heading terminator (`:`, `\n`, `**`, EOL) so prose like `### High-level review` or `**High-traffic endpoint**` doesn't false- positive. Codex + gemini both caught the hyphen-suffix issue in the chorus self-review 019E6E7A5D1DF943AD275D5595460D9A. 2. reviewer.ts + reviewer-driver.ts — factored `writeAttemptRow()` helper. Every null return (errored, new `empty_no_error`, `verdict_ambiguous`) AND every thrown exception now writes a row to `_attempts.jsonl` plus a `[reviewer]` daemon log line. Zero rows in a slot's _attempts.jsonl is now a true bug signal, not a diagnostic gap. Also stamps lineage+model into `_stats.json` on every successful completion. 3. run-artifacts route + participant-card — API derives per-swap `actuallyRan: boolean` by matching the swap's `to` against the slot's final _stats.json lineage stamp. UI suppresses the swap banner block entirely when primary produced the displayed answer AND no swap actually ran. Per-entry `actuallyRan` (not `isLast`) drives the strikethrough + "actually ran" badge. 4. runs/[runId]/page.tsx — SSR readChatRounds had its own inline AGENT_TO_LINEAGE missing `antigravity-cli` / `grok-cli` and defaulting unknown agents to literal "claude". Initial server-rendered HTML classified antigravity participants as claude → phantom CLAUDE card alongside the synthesised ANTIGRAVITY placeholder, vanishing on the first client-side /api/run-artifacts poll. Now uses the shared AGENT_TO_UI_LINEAGE map and passes raw agent name through on miss. Tests: 993 pass (was 989; added 4 hyphen-suffix false-positive cases to verdict tests, including a verbatim replay of the real bowerbird gemini review classifying as request_changes). --- src/app/api/run-artifacts/[chatId]/route.ts | 36 +++++ src/app/runs/[runId]/page.tsx | 33 +++-- .../run-viewer/participant-card.tsx | 26 +++- src/components/run-viewer/types.ts | 11 ++ src/daemon/runner/reviewer-driver.ts | 33 +++++ src/daemon/runner/reviewer.ts | 131 ++++++++++++++---- src/daemon/runner/verdict.ts | 35 +++++ tests/verdict.test.ts | 100 +++++++++++++ 8 files changed, 357 insertions(+), 48 deletions(-) diff --git a/src/app/api/run-artifacts/[chatId]/route.ts b/src/app/api/run-artifacts/[chatId]/route.ts index 8c19efa..8044894 100644 --- a/src/app/api/run-artifacts/[chatId]/route.ts +++ b/src/app/api/run-artifacts/[chatId]/route.ts @@ -43,6 +43,13 @@ interface SwapEntry { ts: number; fromErrorKind?: string; fromErrorMessage?: string; + /** True iff the TO voice's run wrote `_stats.json` for this slot + * (lineage + model stamp matches the slot's final _stats.json). + * False when the chain advanced past this entry via collision or + * exhaustion. UI uses this to gate the fallback banner so a slot + * whose primary produced the displayed answer doesn't render a + * misleading "fallback fired / actually ran" badge. */ + actuallyRan?: boolean; } /** @@ -138,6 +145,31 @@ function readChatSwaps(chatId: string): SwapEntry[] { // the swap's fromModel. Lets the UI render "kimi-k2.6 failed: // cli_failed — model not found" instead of a bare arrow. const attemptsByModel = readAttemptsByModel(partDir); + // Read the slot's `_stats.json` to find which lineage/model + // actually produced the final answer.md. reviewer.ts stamps + // `lineage` + `model` on every successful completion; the + // LAST writer wins. We use this to mark `actuallyRan: true` + // on only the swap entry whose `to` matches the final + // writer. Collision-exhausted chains (gemini → sonnet where + // sonnet was claimed elsewhere) leave _stats.json carrying + // the primary's stamp, so no swap entry gets actuallyRan — + // and the UI suppresses the misleading banners. + let finalLineage: string | undefined; + let finalModel: string | undefined; + try { + const statsPath = path.join(partDir, "_stats.json"); + if (fs.existsSync(statsPath)) { + const stats = JSON.parse(fs.readFileSync(statsPath, "utf-8")); + if (typeof stats?.lineage === "string") { + finalLineage = stats.lineage; + } + if (typeof stats?.model === "string") { + finalModel = stats.model; + } + } + } catch { + /* malformed stats — leave finalLineage undefined */ + } for (const entry of parsed) { const valid = isValidSwapEntry(entry); if (!valid) continue; @@ -146,6 +178,10 @@ function readChatSwaps(chatId: string): SwapEntry[] { valid.fromErrorKind = att.errorKind; valid.fromErrorMessage = att.errorMessage; } + const toMatchesFinal = + finalLineage === valid.toLineage && + (finalModel ?? null) === (valid.toModel ?? null); + valid.actuallyRan = toMatchesFinal; out.push(valid); } } diff --git a/src/app/runs/[runId]/page.tsx b/src/app/runs/[runId]/page.tsx index 43c0e5e..4acee46 100644 --- a/src/app/runs/[runId]/page.tsx +++ b/src/app/runs/[runId]/page.tsx @@ -44,23 +44,23 @@ async function getRunData(runId: string) { return { chat, template }; } -const AGENT_TO_LINEAGE: Record = { - "claude-code": "claude", - "codex-cli": "codex", - "gemini-cli": "gemini", - "opencode-cli": "opencode", - "kimi-cli": "kimi", - // HTTP-dispatched shim — runner creates `reviewer-openrouter-N` dirs; - // without this entry the lineage fell through to "claude" and rendered - // OpenRouter answers with the wrong brand on the run page. - openrouter: "openrouter", -}; +// Single-source-of-truth map shared with the cockpit API route and the +// run-viewer types. The previous inline copy here was missing +// `antigravity-cli` and `grok-cli` — those participants fell through to +// the `?? "claude"` default at the lookup site and the SSR snapshot +// rendered phantom CLAUDE cards alongside the synthesized ANTIGRAVITY +// placeholder. The /api/run-artifacts route reads AGENT_TO_UI_LINEAGE +// (which has the full map), so a client-side poll later reclassified the +// participant and the phantom vanished — "appears on refresh, goes away +// after seconds" was exactly that drift. +import { AGENT_TO_UI_LINEAGE as AGENT_TO_LINEAGE } from "@/lib/agent-name-map"; +import type { ReviewerLineage } from "@/lib/types"; interface ParticipantSnapshot { participant: string; role: "doer" | "reviewer"; agentName: string; - lineage: "claude" | "codex" | "gemini" | "opencode" | "kimi" | "openrouter"; + lineage: ReviewerLineage; hasAnswer: boolean; answer?: string; findingsPreview?: string[]; @@ -100,7 +100,14 @@ function readChatRounds(chatId: string): RoundSnapshot[] { .map((d) => { const role: "doer" | "reviewer" = d.name.startsWith("doer-") ? "doer" : "reviewer"; const rawAgent = d.name.replace(/^(doer-|reviewer-)/, "").replace(/-\d+$/, ""); - const lineage = AGENT_TO_LINEAGE[rawAgent] ?? "claude"; + // Mirror /api/run-artifacts route: pass the raw agent name + // through when unknown so the run-page banner shows the actual + // CLI name. The hardcoded "claude" fallback misclassified every + // future CLI as a phantom claude card. The cast widens the lookup + // result back to the ReviewerLineage union — `displayLineage` and + // the lineage-maps tolerate unknown strings by falling through to + // the lowercased label. + const lineage = (AGENT_TO_LINEAGE[rawAgent] ?? rawAgent) as ReviewerLineage; const answerPath = path.join(roundDir, d.name, "answer.md"); // hasAnswer must mirror the API route: gated on the `## DONE` // sentinel so a mid-stream doer doesn't render as "done · no diff --git a/src/components/run-viewer/participant-card.tsx b/src/components/run-viewer/participant-card.tsx index 5d786f9..21c548c 100644 --- a/src/components/run-viewer/participant-card.tsx +++ b/src/components/run-viewer/participant-card.tsx @@ -203,16 +203,28 @@ export function ParticipantCard({ {swaps && swaps.length > 0 && (() => { - // Only the LAST entry's `to` voice actually produced an answer; - // intermediate `to` voices were attempted and themselves failed - // (which is what triggered the next swap). Showing "actually ran" - // on every row is wrong for chains of length > 1. const sorted = swaps.slice().sort((a, b) => a.fallbackIdx - b.fallbackIdx); + // Suppress the banner block entirely when the primary produced + // the displayed answer AND no swap target actually ran. This + // is the gemini bowerbird case (verdict_ambiguous false- + // positive → fallback fired → collision → chain exhausted) + // where the slot card was rendering DONE + content + three + // contradictory amber banners. The primary's content stands + // on its own; the failed-fallback breadcrumbs are noise. + const someSwapActuallyRan = sorted.some((s) => s.actuallyRan); + if (participant.hasAnswer && !someSwapActuallyRan) return null; return (
{sorted.map((s, i) => { const isCross = s.reason === "lineage_fallback"; - const isLast = i === sorted.length - 1; + // `actuallyRan` is the truth-bearing signal — derived + // server-side from the slot's `_stats.json` lineage+ + // model stamp matching this entry's `to` voice. The + // pre-fix `isLast` heuristic was wrong for collision- + // exhausted chains (where the last entry's TO voice + // never ran). Strikethrough + dim styling applies to + // any entry where the TO didn't run. + const ran = s.actuallyRan === true; return (
{s.toLineage}/{s.toModel} - {isLast && ( + {ran && ( actually ran diff --git a/src/components/run-viewer/types.ts b/src/components/run-viewer/types.ts index 83a7ab4..03fe183 100644 --- a/src/components/run-viewer/types.ts +++ b/src/components/run-viewer/types.ts @@ -111,6 +111,17 @@ export interface FallbackSwap { fromErrorKind?: string; /** One-line message from the failing attempt; trimmed to ~200 chars. */ fromErrorMessage?: string; + /** True iff the TO voice was the LAST writer of `_stats.json` for + * this slot (the runner stamps lineage+model on every successful + * `message_done`). Not "did it run at all" — a fallback that + * ran-and-failed before message_done won't stamp stats and will + * report false here. For the UI gate (suppress banner when primary + * has answer + no swap actuallyRan), this is the right semantic: + * the banner mismatch only matters when the displayed answer is the + * primary's, which means no successful fallback writer overrode it. + * Previously the UI inferred "actually ran" from `isLast` in the + * swaps array — wrong for collision/exhausted chains. */ + actuallyRan?: boolean; } export interface RoundSnapshot { diff --git a/src/daemon/runner/reviewer-driver.ts b/src/daemon/runner/reviewer-driver.ts index d1ce491..b59914d 100644 --- a/src/daemon/runner/reviewer-driver.ts +++ b/src/daemon/runner/reviewer-driver.ts @@ -621,6 +621,39 @@ async function runReviewer( return null; } catch (err) { threw = true; + // Thrown exceptions bypass reviewer.ts's _attempts.jsonl + // writer (the finally there only writes on errored=true, + // and a throw from spawn/timeout/abort never sets that + // flag). Without this row, the operator sees "fallback + // fired" with no record of why. Append a structurally- + // identical diagnostic before re-throwing so post-mortem + // grep across _attempts.jsonl is uniform across crash + // mode (errored, silent, thrown). + const message = + err instanceof Error ? err.message : String(err); + try { + const attemptsFile = path.join(reviewerDir, '_attempts.jsonl'); + fs.appendFileSync( + attemptsFile, + JSON.stringify({ + ts: Date.now(), + round, + lineage: entry.lineage, + model: entry.model ?? null, + errorKind: 'runtime_error', + errorMessage: message, + durationMs: 0, + }) + '\n', + ); + } catch { + /* best-effort */ + } + console.warn( + `[reviewer] attempt threw chat=${chatId} round=${round} ` + + `slot=${agentName}-${reviewerIdx} ` + + `lineage=${entry.lineage} model=${entry.model ?? '(default)'} ` + + `kind=runtime_error message=${JSON.stringify(message).slice(0, 300)}`, + ); throw err; } finally { if (threw) { diff --git a/src/daemon/runner/reviewer.ts b/src/daemon/runner/reviewer.ts index 0b9e2e9..6d3d54b 100644 --- a/src/daemon/runner/reviewer.ts +++ b/src/daemon/runner/reviewer.ts @@ -223,6 +223,15 @@ export async function runReviewerHeadless(args: { path.join(reviewerDir, '_stats.json'), JSON.stringify({ durationMs: Date.now() - startedAt, + // Lineage + model stamp lets the run-artifacts route + // tell whether the LAST voice that succeeded was the + // primary or a fallback. Without this stamp the cockpit + // can't distinguish "fallback actually ran" from + // "fallback collided, primary's answer is what's shown" + // — and renders contradictory banners on collision- + // exhausted slots like the bowerbird gemini case. + lineage: candidateLineage, + ...(candidateModel ? { model: candidateModel } : {}), ...(usageForStats ? { usage: usageForStats } : {}), }), 'utf-8', @@ -434,6 +443,13 @@ export async function runReviewerHeadless(args: { // lands regardless of how the run exits (success, error, abort). // Append-only JSONL keyed by (round, model) so multi-step fallback // chains leave a trail. + // + // Errored path only — silent failures (empty content, no error event, + // verdict_ambiguous) are written below at the null-return sites so + // zero rows in _attempts.jsonl genuinely means "no failure to log", + // not "diagnostic gap". Bowerbird chat 019E6E3318873C26DCA60409B84F90E9 + // had a kimi slot that fell through to fallback with NO _attempts row + // and NO daemon log line — that gap is what this refactor closes. if (errored) { const errorKind = errorSummary?.kind ?? 'unknown'; const errorMessage = errorSummary?.message ?? '(no message captured)'; @@ -445,33 +461,16 @@ export async function runReviewerHeadless(args: { lastError.kind = errorKind; lastError.message = errorMessage; } - const durationMs = Date.now() - startedAt; - try { - const attemptsFile = path.join(reviewerDir, '_attempts.jsonl'); - const entry = { - ts: Date.now(), - round, - lineage: candidateLineage, - model: candidateModel ?? null, - errorKind, - errorMessage, - durationMs, - }; - fs.appendFileSync(attemptsFile, JSON.stringify(entry) + '\n'); - } catch { - /* best-effort — diagnostics shouldn't fail the run */ - } - // Daemon-log line — same content as the JSONL row but at the - // daemon level so a single tail of ~/.chorus/logs/daemon.log - // shows every failed reviewer attempt across every chat without - // walking per-chat dirs. Grep-friendly key=value format mirrors - // the openrouter shim's own warn lines. - console.warn( - `[reviewer] attempt failed chat=${chatId} round=${round} ` + - `lineage=${candidateLineage} model=${candidateModel ?? '(default)'} ` + - `kind=${errorKind} duration_ms=${durationMs} ` + - `message=${JSON.stringify(errorMessage).slice(0, 300)}`, - ); + writeAttemptRow({ + reviewerDir, + chatId, + round, + lineage: candidateLineage, + model: candidateModel, + kind: errorKind, + message: errorMessage, + durationMs: Date.now() - startedAt, + }); } // Mirror runDoerHeadless: surface answer.md write failures as a // cli_warning so the user sees "stream stopped writing" instead of @@ -513,7 +512,31 @@ export async function runReviewerHeadless(args: { const streamed = finalText && finalText.length > 0 ? finalText : accumulated; const content = onDisk.trim().length > 0 ? onDisk : streamed; if (errored && content.trim().length === 0) return null; - if (content.trim().length === 0) return null; + if (content.trim().length === 0) { + // Silent-failure path: stream ended without an error event AND + // produced no usable content. Previously this returned null with no + // _attempts.jsonl row and no daemon log line, leaving the operator + // with no record of why a fallback fired. Synthesise a kind + + // diagnostic row so every null return is traceable. + if (lastError && !lastError.kind) { + lastError.kind = 'empty_no_error'; + lastError.message = + 'CLI exited without writing usable content to answer.md and without emitting an error event.'; + } + writeAttemptRow({ + reviewerDir, + chatId, + round, + lineage: candidateLineage, + model: candidateModel, + kind: lastError?.kind ?? 'empty_no_error', + message: + lastError?.message ?? + 'CLI exited without writing usable content to answer.md and without emitting an error event.', + durationMs: Date.now() - startedAt, + }); + return null; + } // Record healthy on success so a stale quota_exhausted / auth_invalid // record from a prior session clears. Without this, the Reviewer @@ -557,6 +580,58 @@ export async function runReviewerHeadless(args: { lastError.kind = 'verdict_ambiguous'; lastError.message = 'Reviewer wrote non-empty content but no approve/reject verdict was detected.'; + writeAttemptRow({ + reviewerDir, + chatId, + round, + lineage: candidateLineage, + model: candidateModel, + kind: 'verdict_ambiguous', + message: lastError.message, + durationMs: Date.now() - startedAt, + }); } return verdict; } + +/** + * Append a diagnostic row to `_attempts.jsonl` and emit the matching + * `[reviewer] attempt failed …` daemon log line. Called from every path + * that returns a non-positive result (errored, empty-content, verdict- + * ambiguous) so post-mortem grep across reviewer-slot _attempts.jsonl + * sidecars shows every reason a slot fell back. Best-effort writes — + * diagnostic failure must never fail the run. + */ +function writeAttemptRow(args: { + reviewerDir: string; + chatId: string; + round: number; + lineage: string; + model: string | undefined; + kind: string; + message: string; + durationMs: number; +}): void { + const { reviewerDir, chatId, round, lineage, model, kind, message, durationMs } = args; + try { + const attemptsFile = path.join(reviewerDir, '_attempts.jsonl'); + const entry = { + ts: Date.now(), + round, + lineage, + model: model ?? null, + errorKind: kind, + errorMessage: message, + durationMs, + }; + fs.appendFileSync(attemptsFile, JSON.stringify(entry) + '\n'); + } catch { + /* best-effort — diagnostics shouldn't fail the run */ + } + console.warn( + `[reviewer] attempt failed chat=${chatId} round=${round} ` + + `lineage=${lineage} model=${model ?? '(default)'} ` + + `kind=${kind} duration_ms=${durationMs} ` + + `message=${JSON.stringify(message).slice(0, 300)}`, + ); +} diff --git a/src/daemon/runner/verdict.ts b/src/daemon/runner/verdict.ts index c62cb74..28b61f4 100644 --- a/src/daemon/runner/verdict.ts +++ b/src/daemon/runner/verdict.ts @@ -47,6 +47,30 @@ export function verdictFromReviewerText(content: string): boolean | null { const positives = /\b(approve(?:d|s)?|lgtm|looks good to me|no concerns|ship it|ack)\b/; + // Severity-style reviews: gemini, grok, and several opencode-go models + // routinely structure findings as `### CRITICAL` / `**HIGH**` sections + // with substantive bodies but no explicit approve/reject prose. The + // earlier verdict heuristic returned null on these — burning the + // reviewer's $0.05+ spend, firing the fallback chain, and producing + // contradictory cockpit cards. A reviewer who flags CRITICAL/HIGH + // severity issues clearly wants changes; treat as implicit + // request_changes. MEDIUM/LOW are advisory and stay ambiguous. + // + // Trailing lookahead — `(?=\s*(?:[:\n]|\*\*|$))` — requires the + // severity word to be followed by a heading terminator: colon, end + // of line, end of string, or closing bold. Without it, prose like + // `### High-level review` or `**High-traffic endpoint**` matched via + // `\b` (hyphen is a word boundary) and forced an implicit reject. + // Codex + gemini both caught this in the chorus self-review + // 019E6E7A5D1DF943AD275D5595460D9A — convergent-dissenter rule + // applied. Stricter regex misses some valid loose forms (e.g. + // `### CRITICAL bug here`) but mid-text false-positive rejections + // are worse than missed implicit signals — when in doubt, leave + // verdict null and let the reviewer be explicit. 200-char floor + // still applies on top. + const severityHeader = + /(?:^|\n)\s*(?:#{1,6}\s+|\*\*\s*)(?:[\d.]+\s+)?(critical|high)(?=\s*(?:[:\n]|\*\*|$))/i; + const whole = stripped.toLowerCase(); const tail = stripped.slice(-400).toLowerCase(); @@ -60,6 +84,17 @@ export function verdictFromReviewerText(content: string): boolean | null { // anywhere in the text. if (negatives.test(whole)) return false; + // Severity-section second pass — between negatives and positives so + // an explicit `LGTM` tail still wins over the implicit signal (a + // contradictory review like "### CRITICAL bug … LGTM" is rare and + // best surfaced as positive so the reviewer's explicit final word + // counts). Tested against the original stripped text (not lowercased) + // because the regex is case-insensitive and section headers are + // typically uppercase in real reviews. + if (stripped.length >= 200 && severityHeader.test(stripped)) { + if (!positives.test(tail) && !positives.test(whole)) return false; + } + // Positives prefer the tail so an analytical review mentioning // "good practice" mid-paragraph doesn't get auto-approved without an // explicit verdict at the end. Whole-text fallback catches the case diff --git a/tests/verdict.test.ts b/tests/verdict.test.ts index 7289143..af8d095 100644 --- a/tests/verdict.test.ts +++ b/tests/verdict.test.ts @@ -182,4 +182,104 @@ describe('verdictFromReviewerText', () => { ).toBe(true); }); }); + + describe('severity-style reviews — implicit request_changes', () => { + // The actual gemini-3.1-pro-preview output from chat + // 019E6E3318873C26DCA60409B84F90E9 that the heuristic previously + // misclassified as null → verdict_ambiguous → wasted fallback chain. + const realGeminiReview = + `Here are the findings from reviewing the dual-source enrichment pipeline.\n\n` + + `### CRITICAL\n\n` + + `**1. PCA Index Builder is Non-Functional (Always Empty)**\n` + + `In \`pca_index.py\`, the \`build_index\` function never actually calls the \`_entry_from_item\` helper to normalize the data. ` + + `It iterates over the raw PCA API response and immediately attempts to use \`out[entry["key"]] = entry\`. ` + + `Since the raw PCA dictionary lacks a normalized \`"key"\` field, this will either raise a KeyError or use an incorrect internal ID.\n\n` + + `### HIGH\n\n` + + `**2. Unhandled PCA Exceptions Crash the Entire Item's Enrichment**\n` + + `In \`enrich_listing.py\`, the future completion loop lacks a generic except block for the PCA thread.\n## DONE`; + + it('classifies the real gemini severity-style review as request_changes', () => { + expect(verdictFromReviewerText(realGeminiReview)).toBe(false); + }); + + it('### CRITICAL header with substantive body returns false', () => { + const text = + '### CRITICAL\n\nThe migration drops the index without rebuilding it, ' + + PAD.repeat(2) + + '\n## DONE'; + expect(verdictFromReviewerText(text)).toBe(false); + }); + + it('**HIGH** bold header with substantive body returns false', () => { + const text = + '**HIGH**: race condition in queue claim — ' + + PAD.repeat(2) + + '\n## DONE'; + expect(verdictFromReviewerText(text)).toBe(false); + }); + + it('### MEDIUM alone (no CRITICAL/HIGH) stays ambiguous (null)', () => { + // MEDIUM is advisory — without an explicit request_changes prose, + // the verdict stays null so the slot doesn't auto-block. + const text = + '### MEDIUM\n\nConsider adding a comment here. ' + + PAD.repeat(2) + + '\n## DONE'; + expect(verdictFromReviewerText(text)).toBeNull(); + }); + + it('### CRITICAL with short body (<200 chars) stays ambiguous', () => { + // Guards against a passing-mention "critical bug" comment being + // misclassified as a substantive rejection. + expect(verdictFromReviewerText('### CRITICAL\nnit\n## DONE')).toBeNull(); + }); + + // Codex + gemini caught this false-positive class in the chorus + // self-review 019E6E7A5D1DF943AD275D5595460D9A. The trailing + // lookahead in verdict.ts now requires the severity word to be + // followed by a heading terminator (`:`, `\n`, `**`, EOL) so + // hyphen-prefix prose can't masquerade as a severity header. + it('`### High-level review` is NOT a severity header (codex review)', () => { + const text = + '### High-level review\n\nThe approach is sound and the implementation is clean. ' + + PAD.repeat(2) + + '\n## DONE'; + expect(verdictFromReviewerText(text)).toBeNull(); + }); + + it('`### High confidence` is NOT a severity header (codex review)', () => { + const text = + '### High confidence\n\nI have high confidence in this change. ' + + PAD.repeat(2) + + '\n## DONE'; + expect(verdictFromReviewerText(text)).toBeNull(); + }); + + it('`**High-traffic endpoint**` is NOT a severity header (codex review)', () => { + const text = + '**High-traffic endpoint**: this code path serves the homepage. ' + + PAD.repeat(2) + + '\n## DONE'; + expect(verdictFromReviewerText(text)).toBeNull(); + }); + + it('`### Critical considerations` is NOT a severity header', () => { + const text = + '### Critical considerations\n\nA few things to think about. ' + + PAD.repeat(2) + + '\n## DONE'; + expect(verdictFromReviewerText(text)).toBeNull(); + }); + + it('### CRITICAL followed by explicit LGTM tail still approves', () => { + // Contradictory review — explicit positive verdict in the tail + // wins over the implicit severity signal. Reviewer's final word + // is the truth-bearing signal. + const text = + '### CRITICAL\n\nThe issue I called out is now resolved. ' + + PAD.repeat(2) + + '\nOverall LGTM.\n## DONE'; + expect(verdictFromReviewerText(text)).toBe(true); + }); + }); });