From af20c67953fd4a8694d8e5b0e8213c919d0becfd Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sat, 3 Jan 2026 19:59:51 +0000 Subject: [PATCH 01/72] docs: Add fallacy checker refactor plan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Based on user feedback from LessWrong/EA Forum about false positives, aggressive flagging, and missing context issues. Key changes planned: - Single-pass full document extraction (replaces chunking) - Multi-stage filtering (charity, supported elsewhere, dedup) - Simplified review (summarization only) πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- ...5-12-15-fact-fallacy-check-improvements.md | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/research/ideation/2025-12-15-fact-fallacy-check-improvements.md b/research/ideation/2025-12-15-fact-fallacy-check-improvements.md index 21ea817f..f454a3af 100644 --- a/research/ideation/2025-12-15-fact-fallacy-check-improvements.md +++ b/research/ideation/2025-12-15-fact-fallacy-check-improvements.md @@ -84,3 +84,40 @@ Per-collection dimensions: - Results stored in DB (`MetaEvaluation` table) - CLI shell in `meta-evals/` for dev/testing - Future: run in production, show to users, enable voting + +--- + +## Part 3: Fallacy Checker Refactor (2025-01) + +Based on user feedback (LessWrong/EA Forum): too aggressive, flags intro claims supported later, misses rhetorical context. + +### Architecture + +``` +Extract (single-pass, wide net) + ↓ +Filter (multi-stage) + - Principle of Charity + - Supported Elsewhere? + - Dedup / severity threshold + ↓ +Comment (pure transformation) + ↓ +Review (summarize only β€” no filtering) +``` + +### 3.1 Single-Pass Extraction + +Replace chunked extraction with single LLM call on full document. Cast wide net. + +### 3.2 Filter: Principle of Charity + +Separate filtering step. 
For each issue: "Does this hold under the strongest interpretation of the argument?" + +### 3.3 Filter: Supported Elsewhere? + +"Is this claim supported, explained, or qualified elsewhere in the document?" + +### 3.4 Simplify Review + +Remove filtering logic from review prompt. Focus only on generating summaries. From ac79e4d242f4bc5b0552fb64ef7cfca32111c1ec Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sat, 3 Jan 2026 20:40:33 +0000 Subject: [PATCH 02/72] fix: Correct motte-bailey fallacy definition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Was backwards: "defending weak claim by switching to strong one" Now correct: "defending controversial claim by retreating to defensible one" πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- internal-packages/ai/src/tools/fallacy-extractor/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal-packages/ai/src/tools/fallacy-extractor/index.ts b/internal-packages/ai/src/tools/fallacy-extractor/index.ts index 40cba3eb..e250a10c 100644 --- a/internal-packages/ai/src/tools/fallacy-extractor/index.ts +++ b/internal-packages/ai/src/tools/fallacy-extractor/index.ts @@ -164,7 +164,7 @@ export class FallacyExtractorTool extends Tool< 2. 
**Sophisticated Logical Fallacies** - False dichotomy (only presenting two options) - - Motte-bailey (defending weak claim by switching to strong one) + - Motte-bailey (defending controversial claim by retreating to defensible one) - Circular reasoning (conclusion in premises) - Hasty generalization (insufficient evidence β†’ broad claim) From 8b6642a384a98c3562e34cc6a28458aa914ce9dd Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sat, 3 Jan 2026 21:14:40 +0000 Subject: [PATCH 03/72] feat(meta-evals): Add document search and improve UI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add DB-level title search with case-insensitive LIKE query - Increase document limit from 30 to 100 - Add debounced search input with spinner - Fix 'q' key quit issue when typing in search field - Improve date format to human-readable (Dec 27, 2025) - Fix alignment with fixed-width title padding πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .../repositories/MetaEvaluationRepository.ts | 16 ++++++- meta-evals/package.json | 1 + meta-evals/src/app.tsx | 17 ++++++- meta-evals/src/components/CreateBaseline.tsx | 44 ++++++++++++++++--- meta-evals/src/components/helpers.ts | 8 ++-- pnpm-lock.yaml | 19 +++++++- 6 files changed, 93 insertions(+), 12 deletions(-) diff --git a/internal-packages/db/src/repositories/MetaEvaluationRepository.ts b/internal-packages/db/src/repositories/MetaEvaluationRepository.ts index d88c7c63..1cef8079 100644 --- a/internal-packages/db/src/repositories/MetaEvaluationRepository.ts +++ b/internal-packages/db/src/repositories/MetaEvaluationRepository.ts @@ -519,11 +519,23 @@ export class MetaEvaluationRepository { /** * Get recent documents (non-ephemeral). 
+ * @param titleFilter - Optional case-insensitive title search filter */ - async getRecentDocuments(): Promise { + async getRecentDocuments(titleFilter?: string): Promise { const documents = await this.prisma.document.findMany({ where: { ephemeralBatchId: null, + // Filter by title in versions if filter provided + ...(titleFilter && { + versions: { + some: { + title: { + contains: titleFilter, + mode: "insensitive" as const, + }, + }, + }, + }), }, include: { versions: { @@ -533,7 +545,7 @@ export class MetaEvaluationRepository { }, }, orderBy: { createdAt: "desc" }, - take: 30, + take: 100, }); return documents diff --git a/meta-evals/package.json b/meta-evals/package.json index 3aee4759..bf838fe2 100644 --- a/meta-evals/package.json +++ b/meta-evals/package.json @@ -14,6 +14,7 @@ "ink": "^6.5.1", "ink-select-input": "^6.2.0", "ink-spinner": "^5.0.0", + "ink-text-input": "^6.0.0", "react": "^19.2.1" }, "devDependencies": { diff --git a/meta-evals/src/app.tsx b/meta-evals/src/app.tsx index 4df1de03..1b35523d 100644 --- a/meta-evals/src/app.tsx +++ b/meta-evals/src/app.tsx @@ -171,9 +171,23 @@ export function App() { } } + async function searchDocuments(filter: string) { + try { + const docs = await metaEvaluationRepository.getRecentDocuments(filter || undefined); + setDocuments(docs); + } catch (e) { + // Silently fail - keep existing documents + } + } + // Handle keyboard shortcuts + // Disable "q" quit when on document step (text input is active) + const isTextInputActive = screen.type === "create-baseline" && screen.step === "document"; useInput((input, key) => { - if (input === "q" || (key.ctrl && input === "c")) { + if (key.ctrl && input === "c") { + exit(); + } + if (input === "q" && !isTextInputActive) { exit(); } if (key.escape) { @@ -240,6 +254,7 @@ export function App() { setSelectedAgents(ags); setScreen({ type: "create-baseline", step: "confirm" }); }} + onSearchDocuments={searchDocuments} onConfirm={async () => { setScreen({ type: "create-baseline", 
step: "creating" }); try { diff --git a/meta-evals/src/components/CreateBaseline.tsx b/meta-evals/src/components/CreateBaseline.tsx index 8d3f789a..2ba7c3c8 100644 --- a/meta-evals/src/components/CreateBaseline.tsx +++ b/meta-evals/src/components/CreateBaseline.tsx @@ -2,8 +2,9 @@ * Create Baseline Flow Component */ -import React, { useState } from "react"; +import React, { useState, useEffect, useRef } from "react"; import { Box, Text } from "ink"; +import TextInput from "ink-text-input"; import SelectInput from "ink-select-input"; import Spinner from "ink-spinner"; import type { DocumentChoice, AgentChoice } from "./types"; @@ -19,6 +20,7 @@ interface CreateBaselineProps { height: number; onSelectDocument: (doc: DocumentChoice) => void; onSelectAgents: (agents: AgentChoice[]) => void; + onSearchDocuments: (filter: string) => void; onConfirm: () => void; onBack: () => void; } @@ -33,10 +35,33 @@ export function CreateBaseline({ height, onSelectDocument, onSelectAgents, + onSearchDocuments, onConfirm, onBack, }: CreateBaselineProps) { const [agentSelection, setAgentSelection] = useState>(new Set()); + const [filter, setFilter] = useState(""); + const [isSearching, setIsSearching] = useState(false); + const debounceRef = useRef(null); + + // Debounced DB search when filter changes + useEffect(() => { + if (debounceRef.current) { + clearTimeout(debounceRef.current); + } + + setIsSearching(true); + debounceRef.current = setTimeout(() => { + onSearchDocuments(filter); + setIsSearching(false); + }, 300); + + return () => { + if (debounceRef.current) { + clearTimeout(debounceRef.current); + } + }; + }, [filter]); if (step === "creating") { return ( @@ -59,14 +84,23 @@ export function CreateBaseline({ {step === "document" && ( <> - Step 1/2: Select a document ({documents.length} available) + Step 1/2: Select a document ({documents.length} found{filter ? 
` for "${filter}"` : ""}) + + + Search: + + {isSearching && } ({ - label: `${String(i + 1).padStart(2)} | ${truncate(d.title, 50)} | ${formatDate(d.createdAt)}`, + label: `${String(i + 1).padStart(2)} | ${truncate(d.title, 50).padEnd(50)} | ${formatDate(new Date(d.createdAt))}`, value: d.id, }))} - limit={maxItems} + limit={maxItems - 2} onSelect={(item) => { const doc = documents.find((d) => d.id === item.value); if (doc) onSelectDocument(doc); @@ -137,7 +171,7 @@ export function CreateBaseline({ )} - Esc Back | q Quit + Esc Back | {step === "document" ? "Ctrl+C" : "q"} Quit ); diff --git a/meta-evals/src/components/helpers.ts b/meta-evals/src/components/helpers.ts index c5170dad..6157899a 100644 --- a/meta-evals/src/components/helpers.ts +++ b/meta-evals/src/components/helpers.ts @@ -8,9 +8,11 @@ export function truncate(str: string, maxLen: number): string { } export function formatDate(date: Date): string { - const month = String(date.getMonth() + 1).padStart(2, "0"); - const day = String(date.getDate()).padStart(2, "0"); - return `${month}-${day}`; + return date.toLocaleDateString("en-US", { + month: "short", + day: "numeric", + year: "numeric", + }); } export function formatStatus(status: string): string { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 830279c2..678df111 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -533,6 +533,9 @@ importers: ink-spinner: specifier: ^5.0.0 version: 5.0.0(ink@6.5.1(@types/react@19.2.7)(react@19.2.1))(react@19.2.1) + ink-text-input: + specifier: ^6.0.0 + version: 6.0.0(ink@6.5.1(@types/react@19.2.7)(react@19.2.1))(react@19.2.1) react: specifier: ^19.2.1 version: 19.2.1 @@ -4425,6 +4428,13 @@ packages: ink: '>=4.0.0' react: '>=18.0.0' + ink-text-input@6.0.0: + resolution: {integrity: sha512-Fw64n7Yha5deb1rHY137zHTAbSTNelUKuB5Kkk2HACXEtwIHBCf9OH2tP/LQ9fRYTl1F0dZgbW0zPnZk6FA9Lw==} + engines: {node: '>=18'} + peerDependencies: + ink: '>=5' + react: '>=18' + ink@6.5.1: resolution: {integrity: 
sha512-wF3j/DmkM8q5E+OtfdQhCRw8/0ahkc8CUTgEddxZzpEWPslu7YPL3t64MWRoI9m6upVGpfAg4ms2BBvxCdKRLQ==} engines: {node: '>=20'} @@ -9913,7 +9923,7 @@ snapshots: sirv: 3.0.1 tinyglobby: 0.2.14 tinyrainbow: 2.0.0 - vitest: 3.2.4(@types/debug@4.1.12)(@types/node@20.19.9)(@vitest/ui@3.2.4)(happy-dom@18.0.1)(jiti@2.5.1)(jsdom@24.1.3)(terser@5.43.1)(tsx@4.21.0)(yaml@2.8.1) + vitest: 3.2.4(@types/debug@4.1.12)(@types/node@22.17.0)(@vitest/ui@3.2.4)(happy-dom@18.0.1)(jiti@2.5.1)(jsdom@24.1.3)(terser@5.43.1)(tsx@4.21.0)(yaml@2.8.1) '@vitest/utils@3.2.4': dependencies: @@ -11501,6 +11511,13 @@ snapshots: ink: 6.5.1(@types/react@19.2.7)(react@19.2.1) react: 19.2.1 + ink-text-input@6.0.0(ink@6.5.1(@types/react@19.2.7)(react@19.2.1))(react@19.2.1): + dependencies: + chalk: 5.6.2 + ink: 6.5.1(@types/react@19.2.7)(react@19.2.1) + react: 19.2.1 + type-fest: 4.41.0 + ink@6.5.1(@types/react@19.2.7)(react@19.2.1): dependencies: '@alcalzone/ansi-tokenize': 0.2.2 From fa3fcbde05764caa7a7a98e036d176853a276099 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sat, 3 Jan 2026 21:33:08 +0000 Subject: [PATCH 04/72] feat(meta-evals): Add delete series, better errors, tmux dev-env MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add deleteSeries() to MetaEvaluationRepository - Add delete confirmation modal in MainMenu (d key, y/n confirm) - Improve API error handling with human-readable messages - Switch dev-env.sh from zellij to tmux πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- dev/scripts/dev-env.sh | 36 +++-- .../repositories/MetaEvaluationRepository.ts | 14 ++ meta-evals/src/app.tsx | 5 + meta-evals/src/components/MainMenu.tsx | 144 ++++++++++++++---- meta-evals/src/utils/apiClient.ts | 65 ++++++-- 5 files changed, 214 insertions(+), 50 deletions(-) diff --git a/dev/scripts/dev-env.sh b/dev/scripts/dev-env.sh index 8378c599..94126f72 100755 --- a/dev/scripts/dev-env.sh +++ 
b/dev/scripts/dev-env.sh @@ -1,29 +1,43 @@ #!/bin/bash -# Dev environment manager using zellij +# Dev environment manager using tmux # Usage: ./dev-env.sh [start|stop|status|attach] SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" SESSION_NAME="roast-dev" -LAYOUT_FILE="$SCRIPT_DIR/dev-env.kdl" start_dev() { - if zellij list-sessions 2>/dev/null | grep -q "$SESSION_NAME"; then + if tmux has-session -t "$SESSION_NAME" 2>/dev/null; then echo "Session '$SESSION_NAME' already running" exit 0 fi cd "$REPO_ROOT" - zellij --session "$SESSION_NAME" --new-session-with-layout "$LAYOUT_FILE" & - sleep 2 + + # Create new detached session with first window for web + tmux new-session -d -s "$SESSION_NAME" -n "dev" -c "$REPO_ROOT" + + # Split vertically and run jobs in right pane + tmux split-window -h -t "$SESSION_NAME:dev" -c "$REPO_ROOT/internal-packages/jobs" + + # Run web dev server in left pane + tmux send-keys -t "$SESSION_NAME:dev.0" "pnpm run dev -H 0.0.0.0" Enter + + # Run jobs processor in right pane + tmux send-keys -t "$SESSION_NAME:dev.1" "NODE_ENV=development pnpm run process-pgboss" Enter + + # Select left pane + tmux select-pane -t "$SESSION_NAME:dev.0" + echo "Dev session '$SESSION_NAME' started" + echo "Use './dev-env.sh attach' or 'tmux attach -t $SESSION_NAME' to attach" } stop_dev() { - if zellij list-sessions 2>/dev/null | grep -q "$SESSION_NAME"; then + if tmux has-session -t "$SESSION_NAME" 2>/dev/null; then echo "Stopping dev environment..." - zellij kill-session "$SESSION_NAME" + tmux kill-session -t "$SESSION_NAME" echo "Session '$SESSION_NAME' stopped." else echo "Session '$SESSION_NAME' is not running." @@ -31,17 +45,17 @@ stop_dev() { } status_dev() { - if zellij list-sessions 2>/dev/null | grep -q "$SESSION_NAME"; then + if tmux has-session -t "$SESSION_NAME" 2>/dev/null; then echo "Session '$SESSION_NAME' is running." 
- zellij list-sessions | grep "$SESSION_NAME" + tmux list-windows -t "$SESSION_NAME" else echo "Session '$SESSION_NAME' is not running." fi } attach_dev() { - if zellij list-sessions 2>/dev/null | grep -q "$SESSION_NAME"; then - zellij attach "$SESSION_NAME" + if tmux has-session -t "$SESSION_NAME" 2>/dev/null; then + tmux attach -t "$SESSION_NAME" else echo "Session '$SESSION_NAME' is not running. Use 'start' first." fi diff --git a/internal-packages/db/src/repositories/MetaEvaluationRepository.ts b/internal-packages/db/src/repositories/MetaEvaluationRepository.ts index 1cef8079..7dadfccc 100644 --- a/internal-packages/db/src/repositories/MetaEvaluationRepository.ts +++ b/internal-packages/db/src/repositories/MetaEvaluationRepository.ts @@ -392,6 +392,20 @@ export class MetaEvaluationRepository { }); } + /** + * Delete a series and all its runs. + */ + async deleteSeries(seriesId: string): Promise { + // Delete runs first (foreign key constraint) + await this.prisma.seriesRun.deleteMany({ + where: { seriesId }, + }); + // Delete the series + await this.prisma.series.delete({ + where: { id: seriesId }, + }); + } + /** * Get detailed info about a specific series, including all runs. 
*/ diff --git a/meta-evals/src/app.tsx b/meta-evals/src/app.tsx index 1b35523d..7a353750 100644 --- a/meta-evals/src/app.tsx +++ b/meta-evals/src/app.tsx @@ -224,6 +224,11 @@ export function App() { height={termHeight} onCreateBaseline={startCreateBaseline} onSelectSeries={(id) => setScreen({ type: "series-detail", seriesId: id })} + onDeleteSeries={async (id) => { + await metaEvaluationRepository.deleteSeries(id); + // Reload the menu + loadMainMenu(); + }} onExit={exit} judgeModel={judgeModel} availableModels={availableModels} diff --git a/meta-evals/src/components/MainMenu.tsx b/meta-evals/src/components/MainMenu.tsx index c213b695..a60e3d95 100644 --- a/meta-evals/src/components/MainMenu.tsx +++ b/meta-evals/src/components/MainMenu.tsx @@ -19,6 +19,7 @@ interface MainMenuProps { height: number; onCreateBaseline: () => void; onSelectSeries: (id: string) => void; + onDeleteSeries: (id: string) => Promise; onExit: () => void; judgeModel: string; availableModels: ModelInfo[]; @@ -38,6 +39,7 @@ export function MainMenu({ height, onCreateBaseline, onSelectSeries, + onDeleteSeries, onExit, judgeModel, availableModels, @@ -49,11 +51,40 @@ export function MainMenu({ }: MainMenuProps) { const [activeTab, setActiveTab] = useState<"series" | "settings">("series"); const [settingsSection, setSettingsSection] = useState<"model" | "temperature" | "maxTokens">("model"); + const [highlightedIndex, setHighlightedIndex] = useState(0); + const [confirmDelete, setConfirmDelete] = useState(null); + const [isDeleting, setIsDeleting] = useState(false); - // Handle tab switching + // Limit series shown, reserve 2 slots for create/exit + const visibleSeries = series.slice(0, maxItems - 2); + + // Handle keyboard input useInput((input, key) => { if (key.tab) { setActiveTab((prev) => (prev === "series" ? 
"settings" : "series")); + setConfirmDelete(null); + } + + // Delete with 'd' key (only in series tab) + if (activeTab === "series" && input === "d" && !confirmDelete && !isDeleting) { + const selectedSeries = visibleSeries[highlightedIndex]; + if (selectedSeries) { + setConfirmDelete(selectedSeries.id); + } + } + + // Confirm delete with 'y' + if (confirmDelete && input === "y" && !isDeleting) { + setIsDeleting(true); + onDeleteSeries(confirmDelete).finally(() => { + setConfirmDelete(null); + setIsDeleting(false); + }); + } + + // Cancel delete with 'n' or Escape + if (confirmDelete && (input === "n" || key.escape)) { + setConfirmDelete(null); } }); @@ -168,8 +199,6 @@ export function MainMenu({ } // Series tab (default) - // Limit series shown, reserve 2 slots for create/exit - const visibleSeries = series.slice(0, maxItems - 2); const items = [ ...visibleSeries .filter((s) => s.id) // Ensure valid IDs @@ -181,6 +210,9 @@ export function MainMenu({ { label: "Exit", value: "exit" }, ]; + // Find series being deleted for confirmation message + const deletingSeries = confirmDelete ? visibleSeries.find((s) => s.id === confirmDelete) : null; + return ( @@ -191,35 +223,91 @@ export function MainMenu({ {renderTabs()} - - - - {series.length === 0 - ? "No evaluation series yet. Create a baseline to get started." - : visibleSeries.length < series.length - ? `Showing ${visibleSeries.length} of ${series.length} series` - : `${series.length} series available`} - - - Judge: {currentModelName} - {" "}| Temp: {temperature} - {" "}| Tokens: {maxTokens} - + {/* Delete confirmation modal - replaces content when active */} + {confirmDelete && deletingSeries ? ( + + + + + ⚠ Confirm Delete ⚠ + + + + + Are you sure you want to delete this series? + + + + "{truncate(deletingSeries.documentTitle, 45)}" + + + + {deletingSeries.runCount} run{deletingSeries.runCount !== 1 ? "s" : ""} will be removed. + + + + {isDeleting ? ( + Deleting... 
+ ) : ( + + Y - Delete + N - Cancel + + )} + + - + ) : ( + <> + + + + {series.length === 0 + ? "No evaluation series yet. Create a baseline to get started." + : visibleSeries.length < series.length + ? `Showing ${visibleSeries.length} of ${series.length} series` + : `${series.length} series available`} + + + Judge: {currentModelName} + {" "}| Temp: {temperature} + {" "}| Tokens: {maxTokens} + + + - { - if (item.value === "exit") onExit(); - else if (item.value === "create") onCreateBaseline(); - else onSelectSeries(item.value); - }} - /> + { + const idx = visibleSeries.findIndex((s) => s.id === item.value); + if (idx >= 0) setHighlightedIndex(idx); + }} + onSelect={(item) => { + if (confirmDelete) return; // Ignore selection during delete confirmation + if (item.value === "exit") onExit(); + else if (item.value === "create") onCreateBaseline(); + else onSelectSeries(item.value); + }} + /> + + )} - Tab Switch | Up/Down Navigate | Enter Select | q Quit + + {confirmDelete ? "Y Delete | N Cancel" : "Tab Switch | d Delete | Enter Select | q Quit"} + ); diff --git a/meta-evals/src/utils/apiClient.ts b/meta-evals/src/utils/apiClient.ts index 1cdf24b7..b748d135 100644 --- a/meta-evals/src/utils/apiClient.ts +++ b/meta-evals/src/utils/apiClient.ts @@ -81,22 +81,65 @@ export class ApiClient { const { sessionToken } = await this.getSessionInfo(); const url = `${API_BASE}${path}`; - const response = await fetch(url, { - ...options, - headers: { - "Content-Type": "application/json", - Cookie: `authjs.session-token=${sessionToken}`, - ...options.headers, - }, - }); - const data = await response.json(); + let response: Response; + try { + response = await fetch(url, { + ...options, + headers: { + "Content-Type": "application/json", + Cookie: `authjs.session-token=${sessionToken}`, + ...options.headers, + }, + }); + } catch (error) { + // Network error - server not running, wrong port, etc. + const message = error instanceof Error ? 
error.message : String(error); + if (message.includes("ECONNREFUSED") || message.includes("fetch failed")) { + throw new ApiError( + 0, + `Cannot connect to API at ${API_BASE}. Is the web server running? Try: pnpm run dev`, + { originalError: message } + ); + } + throw new ApiError(0, `Network error: ${message}`, { originalError: message }); + } + + // Handle empty responses + const text = await response.text(); + if (!text) { + if (!response.ok) { + throw new ApiError( + response.status, + `API returned ${response.status} ${response.statusText} with empty response`, + { url, status: response.status } + ); + } + throw new ApiError( + response.status, + `API returned empty response. Is the server running correctly at ${API_BASE}?`, + { url, status: response.status } + ); + } + + // Parse JSON + let data: T; + try { + data = JSON.parse(text); + } catch { + throw new ApiError( + response.status, + `API returned invalid JSON. Status: ${response.status}. Response: ${text.slice(0, 200)}`, + { url, status: response.status, responseText: text.slice(0, 500) } + ); + } if (!response.ok) { - throw new ApiError(response.status, data.error || "API request failed", data); + const errorData = data as { error?: string }; + throw new ApiError(response.status, errorData.error || "API request failed", data); } - return { data: data as T, status: response.status }; + return { data, status: response.status }; } /** From 2a4dd601e70c22719e84a02c96bd785827e96c0f Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sat, 3 Jan 2026 22:19:37 +0000 Subject: [PATCH 05/72] refactor: Switch fallacy extractor from chunked to single-pass analysis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Plugin now passes full documentText for analysis instead of splitting into chunks - Extractor uses documentText when text param is not provided (single-pass mode) - Made text param optional in FallacyExtractorInput to support both modes - Backwards compatible: 
chunk mode still works when text+chunkStartOffset provided This reduces code complexity and provides better context to the LLM by analyzing the full document at once. πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .../plugins/fallacy-check/index.ts | 62 +++++++------------ .../ai/src/tools/fallacy-extractor/index.ts | 27 +++++--- .../ai/src/tools/fallacy-extractor/types.ts | 6 +- .../ai/src/tools/generated-schemas.ts | 10 +-- 4 files changed, 47 insertions(+), 58 deletions(-) diff --git a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts index 14a46db7..36b86f4c 100644 --- a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts +++ b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts @@ -128,48 +128,24 @@ export class FallacyCheckPlugin implements SimpleAnalysisPlugin { operation: "fallacy-check-analysis", }); - logger.info("FallacyCheckPlugin: Starting analysis"); - logger.info(`FallacyCheckPlugin: Processing ${chunks.length} chunks`); + logger.info("FallacyCheckPlugin: Starting analysis (single-pass mode)"); - // Phase 1: Extract epistemic issues from all chunks in parallel - const extractionPromises = this.chunks.map((chunk) => - this.extractIssuesFromChunk(chunk) - ); - - const extractionResults = await Promise.allSettled(extractionPromises); + // Phase 1: Single-pass extraction on full document + // This provides full context for better accuracy and reduces false positives + // from flagging intro claims that are supported later in the document + const extractionResult = await this.extractIssuesFromDocument(documentText); - // Collect all extracted issues and track errors - const allIssues: FallacyIssue[] = []; - const extractionErrors: string[] = []; + const allIssues: FallacyIssue[] = extractionResult.issues; - for (const result of extractionResults) { - if 
(result.status === "fulfilled" && result.value) { - allIssues.push(...result.value.issues); - if (result.value.error) { - extractionErrors.push(result.value.error); - } - } else if (result.status === "rejected") { - const error = - result.reason instanceof Error - ? result.reason.message - : "Unknown extraction error"; - extractionErrors.push(error); - logger.warn(`Issue extraction failed for chunk: ${error}`); - } - } - - // Log summary of errors if any occurred - if (extractionErrors.length > 0) { - logger.warn( - `Issue extraction completed with ${extractionErrors.length} errors` - ); + if (extractionResult.error) { + logger.warn(`Issue extraction completed with error: ${extractionResult.error}`); } // Audit log: Extraction phase completed logger.info("FallacyCheckPlugin: AUDIT: Extraction phase completed", { timestamp: new Date().toISOString(), issuesExtracted: allIssues.length, - extractionErrors: extractionErrors.length, + extractionError: extractionResult.error || null, phase: "extraction", }); @@ -313,7 +289,12 @@ export class FallacyCheckPlugin implements SimpleAnalysisPlugin { }; } - private async extractIssuesFromChunk(chunk: TextChunk): Promise<{ + /** + * Extract issues from the full document in a single pass. + * This provides complete context for better accuracy and reduces false positives + * from flagging intro claims that are supported later in the document. 
+ */ + private async extractIssuesFromDocument(documentText: string): Promise<{ issues: FallacyIssue[]; error?: string; }> { @@ -323,9 +304,7 @@ export class FallacyCheckPlugin implements SimpleAnalysisPlugin { const executeExtraction = async () => { return await fallacyExtractorTool.execute( { - text: chunk.text, - documentText: this.documentText, // Pass full document for location finding - chunkStartOffset: chunk.metadata?.position?.start, // Optimize location finding to search chunk first + documentText, // Full document for single-pass analysis and location finding }, { logger, @@ -340,15 +319,20 @@ export class FallacyCheckPlugin implements SimpleAnalysisPlugin { ) : await executeExtraction(); + // Create a synthetic "chunk" representing the full document for FallacyIssue compatibility + const fullDocChunk = new TextChunk("full-document", documentText, { + position: { start: 0, end: documentText.length }, + }); + const issues = result.issues.map( - (issue) => new FallacyIssue(issue, chunk, this.processingStartTime) + (issue) => new FallacyIssue(issue, fullDocChunk, this.processingStartTime) ); return { issues, }; } catch (error) { - logger.error("Error extracting issues from chunk:", error); + logger.error("Error extracting issues from document:", error); return { issues: [], error: error instanceof Error ? 
error.message : "Unknown error", diff --git a/internal-packages/ai/src/tools/fallacy-extractor/index.ts b/internal-packages/ai/src/tools/fallacy-extractor/index.ts index e250a10c..2e82e380 100644 --- a/internal-packages/ai/src/tools/fallacy-extractor/index.ts +++ b/internal-packages/ai/src/tools/fallacy-extractor/index.ts @@ -79,8 +79,8 @@ const extractedFallacyIssueSchema = z.object({ }) satisfies z.ZodType; const inputSchema = z.object({ - text: z.string().min(1).max(50000).describe("Text chunk to analyze for epistemic issues and logical fallacies"), - documentText: z.string().optional().describe("Full document text (optional, used for accurate location finding)"), + text: z.string().max(50000).optional().describe("Text chunk to analyze (optional if documentText provided)"), + documentText: z.string().optional().describe("Full document text - used for analysis in single-pass mode, or for location finding in chunk mode"), chunkStartOffset: z.number().min(0).optional().describe("Byte offset where this chunk starts in the full document (optimization for location finding)"), }) satisfies z.ZodType; @@ -108,21 +108,26 @@ export class FallacyExtractorTool extends Tool< const MIN_SEVERITY_THRESHOLD = 60; // Only report significant issues const MAX_ISSUES = 15; // Limit to prevent overwhelming output + // Use documentText for analysis if text is not provided (single-pass mode) + // This allows callers to just pass documentText for full-document analysis + const textToAnalyze = input.text || input.documentText || ""; + // Audit log: Tool execution started context.logger.info( "[FallacyExtractor] AUDIT: Tool execution started", { timestamp: new Date().toISOString(), - textLength: input.text.length, + textLength: textToAnalyze.length, minSeverityThreshold: MIN_SEVERITY_THRESHOLD, maxIssues: MAX_ISSUES, hasDocumentText: !!input.documentText, hasChunkOffset: input.chunkStartOffset !== undefined, + mode: input.text ? 
"chunk" : "single-pass", } ); context.logger.info( - `[FallacyExtractor] Analyzing text for epistemic issues` + `[FallacyExtractor] Analyzing text for epistemic issues (${input.text ? "chunk" : "single-pass"} mode)` ); const systemPrompt = `You are an expert epistemic critic analyzing reasoning quality and argumentation. @@ -227,12 +232,12 @@ export class FallacyExtractorTool extends Tool< const userPrompt = `Analyze this text for epistemic and reasoning issues: -${input.text} +${textToAnalyze} Analyze ALL sections (argumentative, factual, biographical). Look for statistical errors, logical fallacies, rhetorical manipulation, and narrative issues like vague claims or selective self-presentation. Distribute findings across the entire text.`; const cacheSeed = generateCacheSeed("fallacy-extract", [ - input.text, + textToAnalyze, MIN_SEVERITY_THRESHOLD, MAX_ISSUES, ]); @@ -416,19 +421,19 @@ Analyze ALL sections (argumentative, factual, biographical). Look for statistica let locationResult; // OPTIMIZATION: If we have chunk offset, search in chunk first (much faster!) - if (input.chunkStartOffset !== undefined) { + if (input.chunkStartOffset !== undefined && input.text) { // Use optimized 3-tier chunk-based location finding locationResult = await findLocationInChunk( { chunkText: input.text, - fullDocumentText: input.documentText, + fullDocumentText: input.documentText || input.text, chunkStartOffset: input.chunkStartOffset, searchText: issue.exactText, lineNumberHint: issue.approximateLineNumber, }, context ); - } else { + } else if (input.documentText) { // No chunk offset, search in full document locationResult = await fuzzyTextLocatorTool.execute( { @@ -443,6 +448,10 @@ Analyze ALL sections (argumentative, factual, biographical). 
Look for statistica }, context ); + } else { + // No document text available for location finding + issuesWithLocations.push(issue); + continue; } if (locationResult.found && locationResult.location) { diff --git a/internal-packages/ai/src/tools/fallacy-extractor/types.ts b/internal-packages/ai/src/tools/fallacy-extractor/types.ts index 6ce00077..da8f2076 100644 --- a/internal-packages/ai/src/tools/fallacy-extractor/types.ts +++ b/internal-packages/ai/src/tools/fallacy-extractor/types.ts @@ -65,10 +65,10 @@ export interface ExtractedFallacyIssue { * Input for the epistemic issues extractor tool */ export interface FallacyExtractorInput { - /** Text chunk to analyze */ - text: string; + /** Text chunk to analyze (optional if documentText provided) */ + text?: string; - /** Full document text (for accurate location finding in full doc) */ + /** Full document text - used for analysis in single-pass mode, or for location finding in chunk mode */ documentText?: string; /** Absolute offset where this chunk starts in the full document (optimization) */ diff --git a/internal-packages/ai/src/tools/generated-schemas.ts b/internal-packages/ai/src/tools/generated-schemas.ts index 26d719d1..01d64b8b 100644 --- a/internal-packages/ai/src/tools/generated-schemas.ts +++ b/internal-packages/ai/src/tools/generated-schemas.ts @@ -3,7 +3,7 @@ * Generated by scripts/generate-tool-schemas.ts * DO NOT EDIT MANUALLY * - * Schema Hash: 2dc92b2afb89d952e1e754b74ea2707195835936258f1a5b1609257f8086cc86 + * Schema Hash: e45284c446c65c76ac371d80b42053755741ea59bc55c8857c2a4ff54f202455 */ export const toolSchemas = { @@ -2388,13 +2388,12 @@ export const toolSchemas = { "properties": { "text": { "type": "string", - "minLength": 1, "maxLength": 50000, - "description": "Text chunk to analyze for epistemic issues and logical fallacies" + "description": "Text chunk to analyze (optional if documentText provided)" }, "documentText": { "type": "string", - "description": "Full document text (optional, 
used for accurate location finding)" + "description": "Full document text - used for analysis in single-pass mode, or for location finding in chunk mode" }, "chunkStartOffset": { "type": "number", @@ -2402,9 +2401,6 @@ export const toolSchemas = { "description": "Byte offset where this chunk starts in the full document (optimization for location finding)" } }, - "required": [ - "text" - ], "additionalProperties": false, "$schema": "http://json-schema.org/draft-07/schema#" }, From e0e8b651be61f2162546879d047716f23ebd2b25 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sat, 3 Jan 2026 23:48:24 +0000 Subject: [PATCH 06/72] feat: Add supported-elsewhere filter to reduce false positives MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add SupportedElsewhereFilterTool that checks if flagged issues are actually supported/justified elsewhere in the document - Integrate filter into fallacy-check plugin between extraction and comment generation phases - Add debug logging to fallacy extractor and filter for visibility - Add restart command to dev-env.sh with buffer clearing - Update implementation notes with next steps (model testing, per-claim verification, extraction prompt improvements) Results on test document show filter correctly identifies claims that are justified by technical explanations later in the document. Opus filters more aggressively (0 issues) vs Sonnet (1-2 issues). 
πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- dev/scripts/dev-env.sh | 34 +- .../plugins/fallacy-check/index.ts | 64 +++- .../ai/src/tools/fallacy-extractor/index.ts | 22 +- .../tools/supported-elsewhere-filter/index.ts | 326 ++++++++++++++++++ .../tools/supported-elsewhere-filter/types.ts | 51 +++ ...5-12-15-fact-fallacy-check-improvements.md | 21 ++ 6 files changed, 514 insertions(+), 4 deletions(-) create mode 100644 internal-packages/ai/src/tools/supported-elsewhere-filter/index.ts create mode 100644 internal-packages/ai/src/tools/supported-elsewhere-filter/types.ts diff --git a/dev/scripts/dev-env.sh b/dev/scripts/dev-env.sh index 94126f72..e7a17410 100755 --- a/dev/scripts/dev-env.sh +++ b/dev/scripts/dev-env.sh @@ -1,7 +1,7 @@ #!/bin/bash # Dev environment manager using tmux -# Usage: ./dev-env.sh [start|stop|status|attach] +# Usage: ./dev-env.sh [start|stop|status|attach|restart] SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" @@ -61,6 +61,33 @@ attach_dev() { fi } +restart_dev() { + if ! tmux has-session -t "$SESSION_NAME" 2>/dev/null; then + echo "Session '$SESSION_NAME' is not running. Starting fresh..." + start_dev + return + fi + + echo "Restarting dev environment..." + + # Send Ctrl+C to both panes to kill running processes + tmux send-keys -t "$SESSION_NAME:dev.0" C-c + tmux send-keys -t "$SESSION_NAME:dev.1" C-c + + # Wait a moment for processes to die + sleep 1 + + # Clear scrollback buffer in both panes + tmux clear-history -t "$SESSION_NAME:dev.0" + tmux clear-history -t "$SESSION_NAME:dev.1" + + # Re-run the commands + tmux send-keys -t "$SESSION_NAME:dev.0" "pnpm run dev -H 0.0.0.0" Enter + tmux send-keys -t "$SESSION_NAME:dev.1" "NODE_ENV=development pnpm run process-pgboss" Enter + + echo "Dev environment restarted in existing session." 
+} + case "${1:-start}" in start) start_dev @@ -74,8 +101,11 @@ case "${1:-start}" in attach) attach_dev ;; + restart) + restart_dev + ;; *) - echo "Usage: $0 [start|stop|status|attach]" + echo "Usage: $0 [start|stop|status|attach|restart]" exit 1 ;; esac diff --git a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts index 36b86f4c..267f744c 100644 --- a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts +++ b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts @@ -6,6 +6,7 @@ import type { Comment, ToolChainResult } from "../../../shared/types"; import fallacyExtractorTool from "../../../tools/fallacy-extractor"; import fuzzyTextLocatorTool from "../../../tools/smart-text-searcher"; import fallacyReviewTool from "../../../tools/fallacy-review"; +import supportedElsewhereFilterTool from "../../../tools/supported-elsewhere-filter"; import { TextChunk } from "../../TextChunk"; import type { AnalysisResult, @@ -150,7 +151,68 @@ export class FallacyCheckPlugin implements SimpleAnalysisPlugin { }); // Deduplicate issues by similar text - this.issues = this.deduplicateIssues(allIssues); + const deduplicatedIssues = this.deduplicateIssues(allIssues); + + // Phase 1.5: Filter out issues that are supported elsewhere in the document + // This catches false positives where claims are actually justified later + logger.info("FallacyCheckPlugin: AUDIT: Supported-elsewhere filter started", { + timestamp: new Date().toISOString(), + issuesToFilter: deduplicatedIssues.length, + phase: "supported-elsewhere-filter", + }); + + let filteredIssues = deduplicatedIssues; + try { + const filterInput = { + documentText, + issues: deduplicatedIssues.map((issue) => ({ + quotedText: issue.text, + issueType: issue.issueType, + reasoning: issue.issue.reasoning, + locationOffset: issue.issue.location?.startOffset, + })), + }; + + const filterResult = await 
supportedElsewhereFilterTool.execute( + filterInput, + { logger } + ); + + // Keep only the issues that are NOT supported elsewhere + const unsupportedIndices = new Set( + filterResult.unsupportedIssues.map((r) => r.index) + ); + filteredIssues = deduplicatedIssues.filter((_, idx) => + unsupportedIndices.has(idx) + ); + + // Log what was filtered + const supportedCount = filterResult.supportedIssues.length; + if (supportedCount > 0) { + logger.info( + `FallacyCheckPlugin: Filtered out ${supportedCount} issues (supported elsewhere in document)` + ); + for (const supported of filterResult.supportedIssues) { + logger.debug( + ` - Issue ${supported.index}: ${supported.explanation}` + ); + } + } + + logger.info("FallacyCheckPlugin: AUDIT: Supported-elsewhere filter completed", { + timestamp: new Date().toISOString(), + issuesBeforeFilter: deduplicatedIssues.length, + issuesAfterFilter: filteredIssues.length, + issuesFiltered: supportedCount, + phase: "supported-elsewhere-filter", + }); + } catch (error) { + logger.warn("FallacyCheckPlugin: Supported-elsewhere filter failed, keeping all issues", error); + // Fallback: keep all issues if filter fails + filteredIssues = deduplicatedIssues; + } + + this.issues = filteredIssues; // Phase 2: Generate comments for all issues in parallel const commentPromises = this.issues.map(async (issue) => { diff --git a/internal-packages/ai/src/tools/fallacy-extractor/index.ts b/internal-packages/ai/src/tools/fallacy-extractor/index.ts index 2e82e380..bb63f353 100644 --- a/internal-packages/ai/src/tools/fallacy-extractor/index.ts +++ b/internal-packages/ai/src/tools/fallacy-extractor/index.ts @@ -112,12 +112,25 @@ export class FallacyExtractorTool extends Tool< // This allows callers to just pass documentText for full-document analysis const textToAnalyze = input.text || input.documentText || ""; + // Prompt version for tracking - update this when prompt changes + const PROMPT_VERSION = "v2-justification-check"; + + // DIRECT CONSOLE LOG 
FOR DEBUGGING - bypasses any logger filtering + console.log(`\n\nπŸ”₯πŸ”₯πŸ”₯ FALLACY EXTRACTOR RUNNING πŸ”₯πŸ”₯πŸ”₯`); + console.log(`PROMPT_VERSION=${PROMPT_VERSION}`); + console.log(`MODE=${input.text ? "chunk" : "single-pass"}`); + console.log(`DOC_LENGTH=${textToAnalyze.length}`); + console.log(`DOC_PREVIEW=${textToAnalyze.substring(0, 80)}...`); + console.log(`πŸ”₯πŸ”₯πŸ”₯πŸ”₯πŸ”₯πŸ”₯πŸ”₯πŸ”₯πŸ”₯πŸ”₯πŸ”₯πŸ”₯πŸ”₯πŸ”₯πŸ”₯πŸ”₯πŸ”₯πŸ”₯πŸ”₯πŸ”₯\n\n`); + // Audit log: Tool execution started context.logger.info( "[FallacyExtractor] AUDIT: Tool execution started", { timestamp: new Date().toISOString(), + promptVersion: PROMPT_VERSION, textLength: textToAnalyze.length, + textPreview: textToAnalyze.substring(0, 100), minSeverityThreshold: MIN_SEVERITY_THRESHOLD, maxIssues: MAX_ISSUES, hasDocumentText: !!input.documentText, @@ -127,7 +140,7 @@ export class FallacyExtractorTool extends Tool< ); context.logger.info( - `[FallacyExtractor] Analyzing text for epistemic issues (${input.text ? "chunk" : "single-pass"} mode)` + `[FallacyExtractor] PROMPT_VERSION=${PROMPT_VERSION} MODE=${input.text ? "chunk" : "single-pass"} DOC_LENGTH=${textToAnalyze.length}` ); const systemPrompt = `You are an expert epistemic critic analyzing reasoning quality and argumentation. @@ -138,6 +151,13 @@ export class FallacyExtractorTool extends Tool< - Do NOT flag authors EXPLAINING, WARNING about, or ACKNOWLEDGING errors (good epistemics!) 
- Only flag authors MAKING the error themselves +**🚨 CRITICAL: CHECK FOR JUSTIFICATION ELSEWHERE** +- Before flagging a claim as unsupported or a non sequitur, CHECK if the author provides justification ELSEWHERE in the document +- Authors often state conclusions first, then explain reasoning later - this is valid argumentation +- A claim in paragraph 2 may be fully justified by technical explanation in paragraph 5 +- Only flag as "non sequitur" if there is NO supporting reasoning ANYWHERE in the document +- Read the ENTIRE document before deciding whether a logical leap exists + **🎯 SELECTIVITY**: Senior reviewer, not pedantic nitpicker. - Only flag issues that significantly mislead, clearly commit error, and matter to the argument - Default to NOT flagging. Aim for ~5-10 high-quality issues, not 20+ marginal ones diff --git a/internal-packages/ai/src/tools/supported-elsewhere-filter/index.ts b/internal-packages/ai/src/tools/supported-elsewhere-filter/index.ts new file mode 100644 index 00000000..e5e79880 --- /dev/null +++ b/internal-packages/ai/src/tools/supported-elsewhere-filter/index.ts @@ -0,0 +1,326 @@ +/** + * Supported Elsewhere Filter Tool + * + * Checks if claims or arguments flagged as issues are actually supported, + * explained, or qualified elsewhere in the document. Common in well-structured + * writing where intro claims are backed up later in the text. 
+ */ + +import { z } from "zod"; +import { Tool, type ToolContext } from "../base/Tool"; +import { callClaudeWithTool } from "../../claude/wrapper"; +import { MODEL_CONFIG } from "../../claude/wrapper"; +import type { + SupportedElsewhereFilterInput, + SupportedElsewhereFilterOutput, + SupportedElsewhereResult, +} from "./types"; + +const issueSchema = z.object({ + quotedText: z.string().describe("The exact text flagged as an issue"), + issueType: z.string().describe("Type of issue identified"), + reasoning: z.string().describe("The reasoning for why this was flagged"), + locationOffset: z.number().optional().describe("Approximate location in document"), +}); + +const inputSchema = z.object({ + documentText: z.string().min(1).max(200000).describe("Full document text to search"), + issues: z.array(issueSchema).describe("Issues to check for support elsewhere"), +}); + +const resultSchema = z.object({ + index: z.number().describe("Index of the issue in the input array"), + isSupported: z.boolean().describe("Whether this issue is supported elsewhere"), + supportLocation: z.string().optional().describe("Where the support was found"), + explanation: z.string().describe("Explanation of the support or lack thereof"), +}); + +const outputSchema = z.object({ + unsupportedIssues: z.array(resultSchema).describe("Issues NOT supported elsewhere"), + supportedIssues: z.array(resultSchema).describe("Issues ARE supported elsewhere"), +}); + +// Tool config +const supportedElsewhereFilterConfig = { + id: "supported-elsewhere-filter", + name: "Supported Elsewhere Filter", + description: "Checks if flagged issues are supported elsewhere in the document", + version: "1.0.0", + category: "utility" as const, +}; + +export class SupportedElsewhereFilterTool extends Tool< + SupportedElsewhereFilterInput, + SupportedElsewhereFilterOutput +> { + config = supportedElsewhereFilterConfig; + inputSchema = inputSchema; + outputSchema = outputSchema; + + async execute( + input: 
SupportedElsewhereFilterInput, + context: ToolContext + ): Promise { + console.log(`\n\nπŸ”πŸ”πŸ” SUPPORTED-ELSEWHERE FILTER RUNNING πŸ”πŸ”πŸ”`); + console.log(`Checking ${input.issues.length} issues for support elsewhere`); + for (let i = 0; i < input.issues.length; i++) { + console.log(` Issue ${i}: "${input.issues[i].quotedText.substring(0, 60)}..."`); + console.log(` Type: ${input.issues[i].issueType}`); + } + console.log(`πŸ”πŸ”πŸ”πŸ”πŸ”πŸ”πŸ”πŸ”πŸ”πŸ”πŸ”πŸ”πŸ”πŸ”πŸ”πŸ”πŸ”πŸ”πŸ”πŸ”\n`); + + context.logger.info( + `[SupportedElsewhereFilter] Checking ${input.issues.length} issues for support elsewhere` + ); + + // If no issues, return empty result + if (input.issues.length === 0) { + return { + unsupportedIssues: [], + supportedIssues: [], + }; + } + + // Format issues for the LLM + const formattedIssues = input.issues + .map((issue, idx) => { + return `**Issue ${idx}**: +Text: "${issue.quotedText}" +Type: ${issue.issueType} +Reasoning: ${issue.reasoning} +`; + }) + .join("\n---\n\n"); + + const systemPrompt = `You are an expert at analyzing document structure and finding supporting evidence. + +Your task is to check if each flagged issue is actually **supported, explained, or qualified elsewhere** in the document. 
+ +**MARK AS SUPPORTED (filter out) if**: +- The claim is backed up with evidence or reasoning later in the document +- The author provides technical explanation that justifies the claim +- The author qualifies or nuances the claim elsewhere +- Context provided elsewhere makes the claim reasonable +- The issue is about an intro/thesis that the rest of the document supports + +**MARK AS UNSUPPORTED (keep flagging) if**: +- No evidence, reasoning, or support is provided anywhere in the document +- The claim stands alone without qualification or explanation +- Other parts of the document don't address the concern +- The support found is weak or doesn't actually address the issue + +**Examples of SUPPORTED issues (filter out)**: + +1. Issue: "Non sequitur - claims X is evidence against Y without justification" + Support found: Later section explains WHY X implies not-Y with technical reasoning + β†’ SUPPORTED - the logical connection is explained later + +2. Issue: "Claims 'significant improvement' without data" (in intro) + Support found: Paragraph 5 provides specific metrics and comparison + β†’ SUPPORTED - intro claim is backed up later + +3. Issue: "Missing context about sample size" + Support found: Methods section specifies n=500 participants + β†’ SUPPORTED - context is provided in appropriate section + +**Examples of UNSUPPORTED issues (keep flagging)**: + +1. Issue: "Non sequitur - claims X is evidence against Y" + Document searched: No explanation of the logical connection anywhere + β†’ UNSUPPORTED - logical leap is never justified + +2. Issue: "Claims 95% success rate without methodology" + Document searched: No methodology section, no data tables + β†’ UNSUPPORTED - specific claim needs specific evidence + +3. 
Issue: "Appeals to authority without naming sources" + Document searched: No citations or references provided + β†’ UNSUPPORTED - authority claims need attribution + +For each issue, search the ENTIRE document for supporting evidence or reasoning.`; + + // For longer documents, we need to be strategic about what we show the LLM + // Show the full document if short, otherwise provide structured chunks + const docForPrompt = input.documentText.length <= 15000 + ? input.documentText + : this.extractKeySections(input.documentText); + + const userPrompt = `Search this document for support for the flagged issues: + +**Full Document**: +${docForPrompt} + +**Issues to Check**: + +${formattedIssues} + +For each issue, determine if it is supported elsewhere in the document.`; + + try { + const result = await callClaudeWithTool<{ + results: Array<{ + index: number; + isSupported: boolean; + supportLocation?: string; + explanation: string; + }>; + }>({ + model: MODEL_CONFIG.analysis, + system: systemPrompt, + messages: [{ role: "user", content: userPrompt }], + max_tokens: 4000, + temperature: 0.1, + toolName: "supported_elsewhere_results", + toolDescription: "Results of checking each issue for support elsewhere", + toolSchema: { + type: "object", + properties: { + results: { + type: "array", + items: { + type: "object", + properties: { + index: { + type: "number", + description: "Index of the issue (0-based)", + }, + isSupported: { + type: "boolean", + description: "Whether this issue is supported elsewhere", + }, + supportLocation: { + type: "string", + description: "Where the support was found (quote or description)", + }, + explanation: { + type: "string", + description: "Explanation of why it is/isn't supported", + }, + }, + required: ["index", "isSupported", "explanation"], + }, + }, + }, + required: ["results"], + }, + }); + + // Process results + const unsupportedIssues: SupportedElsewhereResult[] = []; + const supportedIssues: SupportedElsewhereResult[] = []; + + for 
(const r of result.toolResult.results || []) { + // Validate index is in range + if (r.index < 0 || r.index >= input.issues.length) { + context.logger.warn(`[SupportedElsewhereFilter] Invalid index ${r.index}, skipping`); + continue; + } + + const filterResult: SupportedElsewhereResult = { + index: r.index, + isSupported: r.isSupported, + supportLocation: r.supportLocation, + explanation: r.explanation, + }; + + if (r.isSupported) { + supportedIssues.push(filterResult); + } else { + unsupportedIssues.push(filterResult); + } + } + + console.log(`\n\nβœ…βœ…βœ… SUPPORTED-ELSEWHERE FILTER RESULTS βœ…βœ…βœ…`); + console.log(`KEPT (unsupported): ${unsupportedIssues.length} issues`); + for (const issue of unsupportedIssues) { + console.log(` Issue ${issue.index}: NOT supported`); + console.log(` Reason: ${issue.explanation}`); + } + console.log(`FILTERED (supported): ${supportedIssues.length} issues`); + for (const issue of supportedIssues) { + console.log(` Issue ${issue.index}: SUPPORTED at "${issue.supportLocation || 'N/A'}"`); + console.log(` Reason: ${issue.explanation}`); + } + console.log(`βœ…βœ…βœ…βœ…βœ…βœ…βœ…βœ…βœ…βœ…βœ…βœ…βœ…βœ…βœ…βœ…βœ…βœ…βœ…βœ…βœ…βœ…βœ…\n\n`); + + context.logger.info( + `[SupportedElsewhereFilter] ${supportedIssues.length}/${input.issues.length} issues filtered (supported elsewhere), ${unsupportedIssues.length} kept` + ); + + return { + unsupportedIssues, + supportedIssues, + }; + } catch (error) { + context.logger.error("[SupportedElsewhereFilter] Filter failed:", error); + // Fallback: assume all issues are unsupported (keep them) + return { + unsupportedIssues: input.issues.map((_, idx) => ({ + index: idx, + isSupported: false, + explanation: "Fallback: filter failed, preserving issue", + })), + supportedIssues: [], + }; + } + } + + /** + * Extract key sections from a long document for analysis. + * Prioritizes intro, conclusion, and sections with evidence-related keywords. 
+ */ + private extractKeySections(documentText: string): string { + const lines = documentText.split("\n"); + const chunks: string[] = []; + + // Always include first ~2000 chars (intro) + chunks.push("**[INTRO/BEGINNING]**\n" + documentText.substring(0, 2000)); + + // Always include last ~2000 chars (conclusion) + if (documentText.length > 4000) { + chunks.push("**[CONCLUSION/END]**\n" + documentText.substring(documentText.length - 2000)); + } + + // Find sections with evidence-related keywords + const evidenceKeywords = [ + "method", "data", "result", "study", "research", "evidence", + "citation", "reference", "source", "appendix", "table", "figure", + "analysis", "finding", "sample", "participant", "measure", + "because", "therefore", "thus", "since", "reason", "explain" + ]; + + let currentSection = ""; + let sectionHasEvidence = false; + + for (const line of lines) { + const lowerLine = line.toLowerCase(); + + // Check if this line or section contains evidence keywords + if (evidenceKeywords.some(kw => lowerLine.includes(kw))) { + sectionHasEvidence = true; + } + + // Check for section headers (markdown or uppercase) + if (line.startsWith("#") || line.match(/^[A-Z][A-Z\s]{3,}$/)) { + if (sectionHasEvidence && currentSection.length > 100) { + chunks.push("**[EVIDENCE SECTION]**\n" + currentSection.substring(0, 1500)); + } + currentSection = line + "\n"; + sectionHasEvidence = false; + } else { + currentSection += line + "\n"; + } + } + + // Don't exceed ~12000 chars total + let result = chunks.join("\n\n---\n\n"); + if (result.length > 12000) { + result = result.substring(0, 12000) + "\n...[truncated]..."; + } + + return result; + } +} + +export const supportedElsewhereFilterTool = new SupportedElsewhereFilterTool(); +export default supportedElsewhereFilterTool; diff --git a/internal-packages/ai/src/tools/supported-elsewhere-filter/types.ts b/internal-packages/ai/src/tools/supported-elsewhere-filter/types.ts new file mode 100644 index 00000000..e1bbf48c --- 
/dev/null +++ b/internal-packages/ai/src/tools/supported-elsewhere-filter/types.ts @@ -0,0 +1,51 @@ +/** + * Supported Elsewhere Filter Types + * + * This filter checks if claims or arguments flagged as issues are actually + * supported, explained, or qualified elsewhere in the document. Common in + * well-structured writing where intro claims are backed up later. + */ + +export interface SupportedElsewhereFilterInput { + /** Full document text to search for support */ + documentText: string; + + /** Issues to check for support elsewhere */ + issues: SupportedElsewhereIssue[]; +} + +export interface SupportedElsewhereIssue { + /** The exact text flagged as an issue */ + quotedText: string; + + /** Type of issue identified */ + issueType: string; + + /** The reasoning for why this was flagged */ + reasoning: string; + + /** Approximate location in document (character offset) */ + locationOffset?: number; +} + +export interface SupportedElsewhereFilterOutput { + /** Issues that are NOT supported elsewhere (keep flagging) */ + unsupportedIssues: SupportedElsewhereResult[]; + + /** Issues that ARE supported elsewhere (filter out) */ + supportedIssues: SupportedElsewhereResult[]; +} + +export interface SupportedElsewhereResult { + /** Index of the issue in the input array */ + index: number; + + /** Whether this issue is supported elsewhere in the document */ + isSupported: boolean; + + /** Where the support was found (if applicable) */ + supportLocation?: string; + + /** Brief explanation of the support or lack thereof */ + explanation: string; +} diff --git a/research/ideation/2025-12-15-fact-fallacy-check-improvements.md b/research/ideation/2025-12-15-fact-fallacy-check-improvements.md index f454a3af..fee0fbe8 100644 --- a/research/ideation/2025-12-15-fact-fallacy-check-improvements.md +++ b/research/ideation/2025-12-15-fact-fallacy-check-improvements.md @@ -121,3 +121,24 @@ Separate filtering step. 
For each issue: "Does this hold under the strongest int ### 3.4 Simplify Review Remove filtering logic from review prompt. Focus only on generating summaries. + +### 3.5 Next Steps (2025-01-03) + +**Model Testing:** +- Test filter with additional models: Gemini 3 Flash, Gemini 3 Pro +- Current observations: Opus filters more aggressively (0 issues kept), Sonnet more conservative (1-2 kept) +- Opus appears more correct - recognizes that intro claims justified by later technical sections count as supported +- Need to verify on more documents to confirm Opus isn't too lenient on real issues + +**Filter Architecture:** +- Consider verifying each claim in a separate LLM call during filtering stage +- Current batch approach may miss nuances when evaluating multiple claims together +- Per-claim calls would be more expensive but potentially more accurate + +**Extraction Prompt:** +- Take another pass over the extraction prompt - still producing some questionable flags +- Consider splitting extraction into multiple specialized prompts: + - Logical fallacies (non sequitur, circular reasoning, etc.) 
+ - Missing context / unsupported claims + - Rhetorical manipulation / emotional appeals +- Specialized prompts may reduce cognitive load and improve accuracy From 85e39edc1eb5115cc10cc530ee41828c7dc53981 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 4 Jan 2026 00:31:07 +0000 Subject: [PATCH 07/72] feat: Add OpenRouter support for multi-model filter testing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add callOpenRouterWithTool() wrapper for OpenRouter API tool calling - Add Gemini 3 Pro/Flash model IDs to OPENROUTER_MODELS - Add temperature normalization per provider (Anthropic 0-1, others 0-2) - Update supported-elsewhere filter to use OpenRouter for non-Claude models - Add FALLACY_FILTER_MODEL env var for easy model switching - Increase max_tokens to 8000 for OpenRouter (Gemini Pro needs more) - Add error logging for tool call failures Tested with Gemini 3 Flash ($0.003) and Pro ($0.054) - both agree with Opus that all 5 issues are supported elsewhere (vs Sonnet keeping 1-2). 
πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .../ai/src/tools/generated-schemas.ts | 4 +- .../tools/supported-elsewhere-filter/index.ts | 123 +++++++++++------- .../tools/supported-elsewhere-filter/types.ts | 7 + internal-packages/ai/src/utils/openrouter.ts | 91 +++++++++++++ 4 files changed, 180 insertions(+), 45 deletions(-) diff --git a/internal-packages/ai/src/tools/generated-schemas.ts b/internal-packages/ai/src/tools/generated-schemas.ts index 01d64b8b..60a159e2 100644 --- a/internal-packages/ai/src/tools/generated-schemas.ts +++ b/internal-packages/ai/src/tools/generated-schemas.ts @@ -3,7 +3,7 @@ * Generated by scripts/generate-tool-schemas.ts * DO NOT EDIT MANUALLY * - * Schema Hash: e45284c446c65c76ac371d80b42053755741ea59bc55c8857c2a4ff54f202455 + * Schema Hash: df35080852aa73e4d7fb2aa34d36337a3918862c2d2c402a0ac2ba0273c59580 */ export const toolSchemas = { @@ -2064,6 +2064,8 @@ export const toolSchemas = { "enum": [ "anthropic/claude-sonnet-4.5", "anthropic/claude-sonnet-4", + "google/gemini-3-pro-preview", + "google/gemini-3-flash-preview", "google/gemini-2.5-pro", "google/gemini-2.5-flash", "openai/gpt-5", diff --git a/internal-packages/ai/src/tools/supported-elsewhere-filter/index.ts b/internal-packages/ai/src/tools/supported-elsewhere-filter/index.ts index e5e79880..25ddc6f1 100644 --- a/internal-packages/ai/src/tools/supported-elsewhere-filter/index.ts +++ b/internal-packages/ai/src/tools/supported-elsewhere-filter/index.ts @@ -10,6 +10,7 @@ import { z } from "zod"; import { Tool, type ToolContext } from "../base/Tool"; import { callClaudeWithTool } from "../../claude/wrapper"; import { MODEL_CONFIG } from "../../claude/wrapper"; +import { callOpenRouterWithTool } from "../../utils/openrouter"; import type { SupportedElsewhereFilterInput, SupportedElsewhereFilterOutput, @@ -26,6 +27,7 @@ const issueSchema = z.object({ const inputSchema = z.object({ documentText: 
z.string().min(1).max(200000).describe("Full document text to search"), issues: z.array(issueSchema).describe("Issues to check for support elsewhere"), + model: z.string().optional().describe("Model to use (Claude or OpenRouter model ID)"), }); const resultSchema = z.object({ @@ -61,7 +63,15 @@ export class SupportedElsewhereFilterTool extends Tool< input: SupportedElsewhereFilterInput, context: ToolContext ): Promise { + // Determine which model to use: + // 1. input.model (explicit override) + // 2. FALLACY_FILTER_MODEL env var (for testing different models) + // 3. Default Claude analysis model + const modelId = input.model || process.env.FALLACY_FILTER_MODEL || MODEL_CONFIG.analysis; + const isOpenRouterModel = modelId.includes("/"); // OpenRouter models have format "provider/model" + console.log(`\n\nπŸ”πŸ”πŸ” SUPPORTED-ELSEWHERE FILTER RUNNING πŸ”πŸ”πŸ”`); + console.log(`Model: ${modelId} (${isOpenRouterModel ? "OpenRouter" : "Claude"})`); console.log(`Checking ${input.issues.length} issues for support elsewhere`); for (let i = 0; i < input.issues.length; i++) { console.log(` Issue ${i}: "${input.issues[i].quotedText.substring(0, 60)}..."`); @@ -156,54 +166,79 @@ ${formattedIssues} For each issue, determine if it is supported elsewhere in the document.`; - try { - const result = await callClaudeWithTool<{ - results: Array<{ - index: number; - isSupported: boolean; - supportLocation?: string; - explanation: string; - }>; - }>({ - model: MODEL_CONFIG.analysis, - system: systemPrompt, - messages: [{ role: "user", content: userPrompt }], - max_tokens: 4000, - temperature: 0.1, - toolName: "supported_elsewhere_results", - toolDescription: "Results of checking each issue for support elsewhere", - toolSchema: { - type: "object", - properties: { - results: { - type: "array", - items: { - type: "object", - properties: { - index: { - type: "number", - description: "Index of the issue (0-based)", - }, - isSupported: { - type: "boolean", - description: "Whether 
this issue is supported elsewhere", - }, - supportLocation: { - type: "string", - description: "Where the support was found (quote or description)", - }, - explanation: { - type: "string", - description: "Explanation of why it is/isn't supported", - }, - }, - required: ["index", "isSupported", "explanation"], + // Shared tool schema for both Claude and OpenRouter + const toolSchema = { + type: "object" as const, + properties: { + results: { + type: "array", + items: { + type: "object", + properties: { + index: { + type: "number", + description: "Index of the issue (0-based)", + }, + isSupported: { + type: "boolean", + description: "Whether this issue is supported elsewhere", + }, + supportLocation: { + type: "string", + description: "Where the support was found (quote or description)", + }, + explanation: { + type: "string", + description: "Explanation of why it is/isn't supported", }, }, + required: ["index", "isSupported", "explanation"], }, - required: ["results"], }, - }); + }, + required: ["results"], + }; + + type FilterResults = { + results: Array<{ + index: number; + isSupported: boolean; + supportLocation?: string; + explanation: string; + }>; + }; + + try { + let result: { toolResult: FilterResults }; + + if (isOpenRouterModel) { + // Use OpenRouter for non-Claude models (Gemini, GPT, etc.) 
+ // Use higher max_tokens for OpenRouter models (some need more space) + console.log(`πŸ“‘ Calling OpenRouter API with model: ${modelId}`); + result = await callOpenRouterWithTool({ + model: modelId, + system: systemPrompt, + messages: [{ role: "user", content: userPrompt }], + max_tokens: 8000, + temperature: 0.1, + toolName: "supported_elsewhere_results", + toolDescription: "Results of checking each issue for support elsewhere", + toolSchema, + }); + } else { + // Use Claude API directly + console.log(`πŸ€– Calling Claude API with model: ${modelId}`); + result = await callClaudeWithTool({ + model: modelId, + system: systemPrompt, + messages: [{ role: "user", content: userPrompt }], + max_tokens: 4000, + temperature: 0.1, + toolName: "supported_elsewhere_results", + toolDescription: "Results of checking each issue for support elsewhere", + toolSchema, + }); + } // Process results const unsupportedIssues: SupportedElsewhereResult[] = []; diff --git a/internal-packages/ai/src/tools/supported-elsewhere-filter/types.ts b/internal-packages/ai/src/tools/supported-elsewhere-filter/types.ts index e1bbf48c..dc339eef 100644 --- a/internal-packages/ai/src/tools/supported-elsewhere-filter/types.ts +++ b/internal-packages/ai/src/tools/supported-elsewhere-filter/types.ts @@ -12,6 +12,13 @@ export interface SupportedElsewhereFilterInput { /** Issues to check for support elsewhere */ issues: SupportedElsewhereIssue[]; + + /** + * Optional model to use for filtering. + * Can be a Claude model (default) or an OpenRouter model ID. 
+ * Examples: "claude-sonnet-4-20250514", "google/gemini-3-flash-preview" + */ + model?: string; } export interface SupportedElsewhereIssue { diff --git a/internal-packages/ai/src/utils/openrouter.ts b/internal-packages/ai/src/utils/openrouter.ts index c3cc33b5..82e72970 100644 --- a/internal-packages/ai/src/utils/openrouter.ts +++ b/internal-packages/ai/src/utils/openrouter.ts @@ -74,6 +74,8 @@ export const OPENROUTER_MODELS = { // Top tier - Latest and most capable models (2025) CLAUDE_SONNET_4_5: 'anthropic/claude-sonnet-4.5', CLAUDE_SONNET_4: 'anthropic/claude-sonnet-4', + GEMINI_3_PRO: 'google/gemini-3-pro-preview', + GEMINI_3_FLASH: 'google/gemini-3-flash-preview', GEMINI_2_5_PRO: 'google/gemini-2.5-pro', GEMINI_2_5_FLASH: 'google/gemini-2.5-flash', GPT_5: 'openai/gpt-5', @@ -105,6 +107,95 @@ export const OPENROUTER_MODELS = { export type OpenRouterModel = typeof OPENROUTER_MODELS[keyof typeof OPENROUTER_MODELS]; +/** + * Call OpenRouter with tool/function calling + * Similar interface to callClaudeWithTool but uses OpenAI-compatible API + */ +export interface OpenRouterToolCallOptions { + model: string; + system: string; + messages: Array<{ role: 'user' | 'assistant'; content: string }>; + max_tokens?: number; + temperature?: number; + toolName: string; + toolDescription: string; + toolSchema: Record; +} + +export interface OpenRouterToolCallResult { + toolResult: T; + model: string; + usage?: { + prompt_tokens: number; + completion_tokens: number; + total_tokens: number; + }; +} + +export async function callOpenRouterWithTool( + options: OpenRouterToolCallOptions +): Promise> { + const client = createOpenRouterClient(); + + const response = await client.chat.completions.create({ + model: options.model, + messages: [ + { role: 'system', content: options.system }, + ...options.messages, + ], + max_tokens: options.max_tokens || 4000, + temperature: normalizeTemperature(options.temperature || 0.1, options.model), + tools: [ + { + type: 'function', + function: { 
+ name: options.toolName, + description: options.toolDescription, + parameters: options.toolSchema, + }, + }, + ], + tool_choice: { + type: 'function', + function: { name: options.toolName }, + }, + }); + + const choice = response.choices[0]; + if (!choice) { + throw new Error('No response from OpenRouter'); + } + + // Check for tool call + const toolCall = choice.message?.tool_calls?.[0]; + if (!toolCall || toolCall.function.name !== options.toolName) { + // Log what we actually got for debugging + console.error(`[OpenRouter] Expected tool call '${options.toolName}' but got:`); + console.error(` finish_reason: ${choice.finish_reason}`); + console.error(` message.content: ${choice.message?.content?.substring(0, 500) || '(empty)'}`); + console.error(` tool_calls: ${JSON.stringify(choice.message?.tool_calls || [])}`); + throw new Error(`No tool call found for ${options.toolName}`); + } + + // Parse the tool arguments + let toolResult: T; + try { + toolResult = JSON.parse(toolCall.function.arguments) as T; + } catch (e) { + throw new Error(`Failed to parse tool arguments: ${toolCall.function.arguments}`); + } + + return { + toolResult, + model: options.model, + usage: response.usage ? 
{ + prompt_tokens: response.usage.prompt_tokens, + completion_tokens: response.usage.completion_tokens, + total_tokens: response.usage.total_tokens, + } : undefined, + }; +} + /** * Temperature range configuration by provider * Different providers support different temperature ranges From 02ea420189f93fb35f16d41112fc1e75c5a07a89 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 4 Jan 2026 00:46:51 +0000 Subject: [PATCH 08/72] feat: Add OpenRouter support for fallacy extraction + improve dev-env restart MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add model parameter to FallacyExtractorInput for OpenRouter models - Support FALLACY_EXTRACTOR_MODEL env var for easy model switching - Use callOpenRouterWithTool for non-Claude models (Gemini, GPT, etc.) - Clear visible screen before scrollback in dev-env restart πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- dev/scripts/dev-env.sh | 5 +- .../ai/src/tools/fallacy-extractor/index.ts | 220 ++++++++++-------- .../ai/src/tools/fallacy-extractor/types.ts | 7 + .../ai/src/tools/generated-schemas.ts | 6 +- 4 files changed, 141 insertions(+), 97 deletions(-) diff --git a/dev/scripts/dev-env.sh b/dev/scripts/dev-env.sh index e7a17410..d497078b 100755 --- a/dev/scripts/dev-env.sh +++ b/dev/scripts/dev-env.sh @@ -77,7 +77,10 @@ restart_dev() { # Wait a moment for processes to die sleep 1 - # Clear scrollback buffer in both panes + # Clear visible screen and scrollback buffer in both panes + tmux send-keys -t "$SESSION_NAME:dev.0" "clear" Enter + tmux send-keys -t "$SESSION_NAME:dev.1" "clear" Enter + sleep 0.2 tmux clear-history -t "$SESSION_NAME:dev.0" tmux clear-history -t "$SESSION_NAME:dev.1" diff --git a/internal-packages/ai/src/tools/fallacy-extractor/index.ts b/internal-packages/ai/src/tools/fallacy-extractor/index.ts index bb63f353..9b50d066 100644 --- a/internal-packages/ai/src/tools/fallacy-extractor/index.ts +++ 
b/internal-packages/ai/src/tools/fallacy-extractor/index.ts @@ -4,6 +4,7 @@ import { ISSUE_TYPES, } from "../../analysis-plugins/plugins/fallacy-check/constants"; import { callClaudeWithTool } from "../../claude/wrapper"; +import { callOpenRouterWithTool } from "../../utils/openrouter"; import { Tool, ToolContext, @@ -82,6 +83,7 @@ const inputSchema = z.object({ text: z.string().max(50000).optional().describe("Text chunk to analyze (optional if documentText provided)"), documentText: z.string().optional().describe("Full document text - used for analysis in single-pass mode, or for location finding in chunk mode"), chunkStartOffset: z.number().min(0).optional().describe("Byte offset where this chunk starts in the full document (optimization for location finding)"), + model: z.string().optional().describe("Model to use (Claude or OpenRouter model ID)"), }) satisfies z.ZodType; const outputSchema = z.object({ @@ -115,9 +117,17 @@ export class FallacyExtractorTool extends Tool< // Prompt version for tracking - update this when prompt changes const PROMPT_VERSION = "v2-justification-check"; + // Determine which model to use: + // 1. input.model (explicit override) + // 2. FALLACY_EXTRACTOR_MODEL env var (for testing different models) + // 3. Default (Claude via callClaudeWithTool which uses its own default) + const modelId = input.model || process.env.FALLACY_EXTRACTOR_MODEL || undefined; + const isOpenRouterModel = modelId?.includes("/") || false; // OpenRouter models have format "provider/model" + // DIRECT CONSOLE LOG FOR DEBUGGING - bypasses any logger filtering console.log(`\n\nπŸ”₯πŸ”₯πŸ”₯ FALLACY EXTRACTOR RUNNING πŸ”₯πŸ”₯πŸ”₯`); console.log(`PROMPT_VERSION=${PROMPT_VERSION}`); + console.log(`MODEL=${modelId || "default"} (${isOpenRouterModel ? "OpenRouter" : "Claude"})`); console.log(`MODE=${input.text ? 
"chunk" : "single-pass"}`); console.log(`DOC_LENGTH=${textToAnalyze.length}`); console.log(`DOC_PREVIEW=${textToAnalyze.substring(0, 80)}...`); @@ -262,106 +272,126 @@ Analyze ALL sections (argumentative, factual, biographical). Look for statistica MAX_ISSUES, ]); - const result = await callClaudeWithTool<{ - issues: ExtractedFallacyIssue[]; - wasComplete: boolean; - }>({ - system: systemPrompt, - messages: [ - { - role: "user", - content: userPrompt, - }, - ], - max_tokens: 8000, - temperature: 0, - toolName: "extract_fallacy_issues", - toolDescription: "Extract and score fallacy issues from text", - toolSchema: { - type: "object", - properties: { - issues: { - type: "array", - items: { - type: "object", - properties: { - exactText: { - type: "string", - description: "The exact text from the document", - }, - issueType: { - type: "string", - enum: [ - ISSUE_TYPES.MISINFORMATION, - ISSUE_TYPES.MISSING_CONTEXT, - ISSUE_TYPES.DECEPTIVE_WORDING, - ISSUE_TYPES.LOGICAL_FALLACY, - ISSUE_TYPES.VERIFIED_ACCURATE, - ], - description: "Type of issue", - }, - fallacyType: { - type: "string", - enum: [ - "ad-hominem", - "straw-man", - "false-dilemma", - "slippery-slope", - "appeal-to-authority", - "appeal-to-emotion", - "appeal-to-nature", - "hasty-generalization", - "survivorship-bias", - "selection-bias", - "cherry-picking", - "circular-reasoning", - "equivocation", - "non-sequitur", - "other", - ], - description: "Specific fallacy type (only for logical-fallacy issues)", - }, - severityScore: { - type: "number", - description: "0-100: How severe is this issue", - }, - confidenceScore: { - type: "number", - description: "0-100: How confident you are this is the fallacy", - }, - reasoning: { - type: "string", - description: "Why this is an issue", - }, - importanceScore: { - type: "number", - description: "0-100: How important to address", - }, - approximateLineNumber: { - type: "number", - description: "Approximate line number where this text appears (optional, helps speed 
up location finding)", - }, + // Shared tool schema for both Claude and OpenRouter + const toolSchema = { + type: "object" as const, + properties: { + issues: { + type: "array", + items: { + type: "object", + properties: { + exactText: { + type: "string", + description: "The exact text from the document", + }, + issueType: { + type: "string", + enum: [ + ISSUE_TYPES.MISINFORMATION, + ISSUE_TYPES.MISSING_CONTEXT, + ISSUE_TYPES.DECEPTIVE_WORDING, + ISSUE_TYPES.LOGICAL_FALLACY, + ISSUE_TYPES.VERIFIED_ACCURATE, + ], + description: "Type of issue", + }, + fallacyType: { + type: "string", + enum: [ + "ad-hominem", + "straw-man", + "false-dilemma", + "slippery-slope", + "appeal-to-authority", + "appeal-to-emotion", + "appeal-to-nature", + "hasty-generalization", + "survivorship-bias", + "selection-bias", + "cherry-picking", + "circular-reasoning", + "equivocation", + "non-sequitur", + "other", + ], + description: "Specific fallacy type (only for logical-fallacy issues)", + }, + severityScore: { + type: "number", + description: "0-100: How severe is this issue", + }, + confidenceScore: { + type: "number", + description: "0-100: How confident you are this is the fallacy", + }, + reasoning: { + type: "string", + description: "Why this is an issue", + }, + importanceScore: { + type: "number", + description: "0-100: How important to address", + }, + approximateLineNumber: { + type: "number", + description: "Approximate line number where this text appears (optional, helps speed up location finding)", }, - required: [ - "exactText", - "issueType", - "severityScore", - "confidenceScore", - "reasoning", - "importanceScore", - ], }, + required: [ + "exactText", + "issueType", + "severityScore", + "confidenceScore", + "reasoning", + "importanceScore", + ], }, - wasComplete: { - type: "boolean", - description: "Whether analysis was complete or had to be truncated", - }, }, - required: ["issues", "wasComplete"], + wasComplete: { + type: "boolean", + description: "Whether analysis was 
complete or had to be truncated", + }, }, - enablePromptCaching: true, - cacheSeed, - }); + required: ["issues", "wasComplete"], + }; + + type ExtractorResults = { + issues: ExtractedFallacyIssue[]; + wasComplete: boolean; + }; + + let result: { toolResult: ExtractorResults }; + + if (isOpenRouterModel && modelId) { + // Use OpenRouter for non-Claude models (Gemini, GPT, etc.) + console.log(`πŸ“‘ Calling OpenRouter API with model: ${modelId}`); + result = await callOpenRouterWithTool({ + model: modelId, + system: systemPrompt, + messages: [{ role: "user", content: userPrompt }], + max_tokens: 8000, + temperature: 0.1, // OpenRouter doesn't support temp=0 for all models + toolName: "extract_fallacy_issues", + toolDescription: "Extract and score fallacy issues from text", + toolSchema, + }); + } else { + // Use Claude API directly + console.log(`πŸ€– Calling Claude API${modelId ? ` with model: ${modelId}` : ""}`); + result = await callClaudeWithTool({ + ...(modelId && { model: modelId }), + system: systemPrompt, + messages: [{ role: "user", content: userPrompt }], + max_tokens: 8000, + temperature: 0, + toolName: "extract_fallacy_issues", + toolDescription: "Extract and score fallacy issues from text", + toolSchema, + enablePromptCaching: true, + cacheSeed, + }); + } let allIssues = result.toolResult.issues || []; const wasComplete = result.toolResult.wasComplete ?? true; diff --git a/internal-packages/ai/src/tools/fallacy-extractor/types.ts b/internal-packages/ai/src/tools/fallacy-extractor/types.ts index da8f2076..13a54139 100644 --- a/internal-packages/ai/src/tools/fallacy-extractor/types.ts +++ b/internal-packages/ai/src/tools/fallacy-extractor/types.ts @@ -73,6 +73,13 @@ export interface FallacyExtractorInput { /** Absolute offset where this chunk starts in the full document (optimization) */ chunkStartOffset?: number; + + /** + * Optional model to use for extraction. + * Can be a Claude model (default) or an OpenRouter model ID. 
+ * Examples: "claude-sonnet-4-20250514", "google/gemini-3-flash-preview" + */ + model?: string; } /** diff --git a/internal-packages/ai/src/tools/generated-schemas.ts b/internal-packages/ai/src/tools/generated-schemas.ts index 60a159e2..726a46ed 100644 --- a/internal-packages/ai/src/tools/generated-schemas.ts +++ b/internal-packages/ai/src/tools/generated-schemas.ts @@ -3,7 +3,7 @@ * Generated by scripts/generate-tool-schemas.ts * DO NOT EDIT MANUALLY * - * Schema Hash: df35080852aa73e4d7fb2aa34d36337a3918862c2d2c402a0ac2ba0273c59580 + * Schema Hash: 74d74639d9cc319a253b27fd9dd6141cff7a8ec8ebfff951f09b198cc438ed30 */ export const toolSchemas = { @@ -2401,6 +2401,10 @@ export const toolSchemas = { "type": "number", "minimum": 0, "description": "Byte offset where this chunk starts in the full document (optimization for location finding)" + }, + "model": { + "type": "string", + "description": "Model to use (Claude or OpenRouter model ID)" } }, "additionalProperties": false, From c4a44a1e2daa37f37a05d453e74a29629ab41fee Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Wed, 7 Jan 2026 11:08:20 +0000 Subject: [PATCH 09/72] docs: Add prioritized implementation plan for fallacy checker refactor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Update model testing results (Opus, Sonnet, Gemini Flash/Pro comparison) - Document OpenRouter integration for multi-model testing - Reorganize next steps by pipeline stage (extraction, filtering, review) - Add planned filters: Principle of Charity, dedup/severity threshold - Add cross-cutting concerns: multi-expert aggregation, observability, validation - Add section 3.8: Prioritized implementation plan with 4 phases - Include risk table with mitigations Key insight: Phase 1 (observability + validation) must come first - can't improve what you can't measure. 
πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- ...5-12-15-fact-fallacy-check-improvements.md | 133 +++++++++++++++--- 1 file changed, 113 insertions(+), 20 deletions(-) diff --git a/research/ideation/2025-12-15-fact-fallacy-check-improvements.md b/research/ideation/2025-12-15-fact-fallacy-check-improvements.md index fee0fbe8..952b5009 100644 --- a/research/ideation/2025-12-15-fact-fallacy-check-improvements.md +++ b/research/ideation/2025-12-15-fact-fallacy-check-improvements.md @@ -122,23 +122,116 @@ Separate filtering step. For each issue: "Does this hold under the strongest int Remove filtering logic from review prompt. Focus only on generating summaries. -### 3.5 Next Steps (2025-01-03) - -**Model Testing:** -- Test filter with additional models: Gemini 3 Flash, Gemini 3 Pro -- Current observations: Opus filters more aggressively (0 issues kept), Sonnet more conservative (1-2 kept) -- Opus appears more correct - recognizes that intro claims justified by later technical sections count as supported -- Need to verify on more documents to confirm Opus isn't too lenient on real issues - -**Filter Architecture:** -- Consider verifying each claim in a separate LLM call during filtering stage -- Current batch approach may miss nuances when evaluating multiple claims together -- Per-claim calls would be more expensive but potentially more accurate - -**Extraction Prompt:** -- Take another pass over the extraction prompt - still producing some questionable flags -- Consider splitting extraction into multiple specialized prompts: - - Logical fallacies (non sequitur, circular reasoning, etc.) 
- - Missing context / unsupported claims - - Rhetorical manipulation / emotional appeals -- Specialized prompts may reduce cognitive load and improve accuracy +### 3.5 Model Testing Results (2025-01-04) + +**Supported-Elsewhere Filter - Model Comparison:** + +| Model | Issues Kept | Cost | Notes | +|-------|-------------|------|-------| +| Claude Opus | 0/5 | ~$0.06 | Most aggressive filtering | +| Claude Sonnet | 1-2/5 | ~$0.02 | Too conservative | +| Gemini 3 Flash | 0/5 | $0.003 | Agrees with Opus, very fast | +| Gemini 3 Pro | 0/5 | $0.054 | Agrees with Opus, detailed explanations | + +**Conclusion:** Opus, Gemini Flash, and Gemini Pro all agree that intro claims justified by later technical sections should be filtered. Sonnet is the outlier - too conservative. **Gemini 3 Flash is the best choice** for the filter: cheap ($0.003), fast, and accurate. + +**Extraction - Model Comparison:** + +| Model | Issues Found | Notes | +|-------|--------------|-------| +| Claude Sonnet | 5 | Standard extraction | +| Gemini 3 Flash | 4 | Slightly different profile - missed 2 issues but found 1 different one | + +Both sets of extracted issues were 100% false positives (all filtered by supported-elsewhere). The extraction differences don't matter in practice since the filter catches them all. 
+ +### 3.6 OpenRouter Integration + +Added OpenRouter support for multi-model testing: + +```bash +# Environment variables for model override +FALLACY_EXTRACTOR_MODEL=google/gemini-3-flash-preview +FALLACY_FILTER_MODEL=google/gemini-3-flash-preview +``` + +**Implemented:** +- `callOpenRouterWithTool()` - Generic wrapper for OpenRouter tool calling +- Temperature normalization per provider (Anthropic 0-1, others 0-2) +- Auto-detection of OpenRouter models (contains `/` in model ID) +- Added Gemini 3 Pro/Flash model IDs to `OPENROUTER_MODELS` + +### 3.7 Next Steps + +#### Extraction +- Try specialized prompts per issue type (logical fallacies, missing context, rhetorical manipulation) +- Test more models (Flash, others) individually and in combination + +#### Filtering +- **Principle of Charity filter** (not yet implemented) - "Does this hold under the strongest interpretation?" +- **Dedup / severity threshold** (not yet implemented) - consolidate similar issues, enforce minimum severity +- Consider per-claim verification (separate LLM calls) - batch approach works but may miss nuances +- Consider Gemini 3 Flash for production (16x cheaper, same accuracy) + +#### Review +- No changes needed - already simplified to summary-only + +#### Cross-Cutting: Multi-Expert Aggregation +- Run multiple models in parallel, aggregate by majority vote or confidence-weighted +- Reduces both false positives and false negatives +- Cost-effective: cheap models (Flash) + one premium model +- Alternative: same model at different temperatures for diversity + +#### Cross-Cutting: Pipeline Observability +- Add metrics/logging per stage: issues in β†’ issues out, time, cost +- Enable tracing through full pipeline for debugging +- Start with structured logs, consider dedicated metrics later +- Goal: understand where issues are caught/missed, identify bottlenecks + +#### Cross-Cutting: Validation & Regression Testing +- **Use meta-evals infrastructure** - already has UI for quick iteration and 
process parts implemented +- Run against recent unique docs in dev DB (imported from prod), compare to previous results +- Find cases with genuine fallacies that should NOT be filtered (validate filter accuracy) +- Track: issues found, issues filtered, final comments generated +- Measure delta from original to understand impact of changes +- Don't need meta-evals scoring/rating yet - just use the execution framework +- Goal: ensure changes are improvements, catch regressions early + +--- + +### 3.8 Prioritized Implementation Plan + +**Principle: Measure before changing. Validate before deploying.** + +#### Phase 1: Foundation (do this first) +*Can't improve what we can't measure. Can't validate without a baseline.* + +1. **Pipeline observability** - Add structured logging per stage (issues in/out, time, cost). Quick win, enables everything else. +2. **Validation framework** - Set up meta-evals to run against dev DB documents. Establish baseline of current behavior before making changes. + +#### Phase 2: Filter Improvements (one at a time, measured) +*Each change validated against baseline. Stop if regressions detected.* + +3. **Dedup/severity threshold** - Mechanical, low risk. Consolidate similar issues, enforce minimums. +4. **Principle of Charity filter** - LLM-based, higher complexity. "Does this hold under strongest interpretation?" +5. **Per-claim verification** - Only if batch approach shows accuracy issues in validation. + +#### Phase 3: Extraction Improvements +*Filters are solid, now refine the input.* + +6. **Specialized prompts** - Split by issue type (logical fallacies, missing context, rhetorical). Measure each variant. +7. **Model testing** - Compare Flash vs Claude for extraction quality/cost tradeoff. + +#### Phase 4: Optimizations +*Core pipeline works well, now optimize for cost and accuracy.* + +8. **Gemini Flash for production** - 16x cheaper, validated as accurate. Easy win. +9. 
**Multi-expert aggregation** - Run multiple models, aggregate results. Higher accuracy, diminishing returns. + +#### Key Risks & Mitigations + +| Risk | Impact | Mitigation | +|------|--------|------------| +| Filter too aggressive (misses real issues) | Users see fewer issues than they should | Validation with known-fallacy documents | +| Filter too lenient (keeps false positives) | User trust eroded | Regression testing against baseline | +| Changes make things worse silently | Wasted effort, user harm | Observability + regression framework (Phase 1) | +| Over-engineering before validating | Wasted effort | Phase 1 first, measure before building | From 19b62900f90a30e4ecc4143f738440899298a19a Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Wed, 7 Jan 2026 11:16:29 +0000 Subject: [PATCH 10/72] docs: Add Phase 5 (meta-eval scoring) to implementation plan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Meta-eval scoring for comment quality (accuracy, clarity, tone) - Review stage improvements based on meta-eval feedback - Feedback loop to iterate on prompts over time πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .../ideation/2025-12-15-fact-fallacy-check-improvements.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/research/ideation/2025-12-15-fact-fallacy-check-improvements.md b/research/ideation/2025-12-15-fact-fallacy-check-improvements.md index 952b5009..529d9b2f 100644 --- a/research/ideation/2025-12-15-fact-fallacy-check-improvements.md +++ b/research/ideation/2025-12-15-fact-fallacy-check-improvements.md @@ -227,6 +227,13 @@ FALLACY_FILTER_MODEL=google/gemini-3-flash-preview 8. **Gemini Flash for production** - 16x cheaper, validated as accurate. Easy win. 9. **Multi-expert aggregation** - Run multiple models, aggregate results. Higher accuracy, diminishing returns. 
+#### Phase 5: Meta-Evaluation & Review Improvements +*Use meta-evals scoring/rating to improve comment quality.* + +10. **Meta-eval scoring** - Enable quality scoring on generated comments (accuracy, clarity, importance, tone). +11. **Review stage improvements** - Use meta-eval feedback to refine comment generation and summaries. +12. **Feedback loop** - Iterate on prompts based on meta-eval scores, track improvement over time. + #### Key Risks & Mitigations | Risk | Impact | Mitigation | From bb3a1df287f884139a6e3a2a9a5c3a654bd335a7 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Wed, 7 Jan 2026 11:55:35 +0000 Subject: [PATCH 11/72] feat: Add pipeline telemetry for fallacy checker observability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Create telemetry module with StageMetrics, PipelineExecutionRecord types - Add PipelineTelemetry collector class with fluent API - Track 5 pipeline stages: extraction, dedup, filter, comment-gen, review - Persist telemetry to EvaluationVersion.pipelineTelemetry JSON field - Refactor FallacyCheckPlugin with helper methods for cleaner code πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .../ai/src/analysis-plugins/PluginManager.ts | 12 + .../plugins/fallacy-check/index.ts | 328 ++++++++++-------- .../telemetry/PipelineTelemetry.ts | 245 +++++++++++++ .../plugins/fallacy-check/telemetry/index.ts | 13 + .../plugins/fallacy-check/telemetry/types.ts | 100 ++++++ .../ai/src/analysis-plugins/types.ts | 1 + .../documentAnalysis/analyzeDocument.ts | 4 +- .../documentAnalysis/unified/index.ts | 2 + .../migration.sql | 2 + internal-packages/db/prisma/schema.prisma | 1 + .../jobs/src/core/JobOrchestrator.ts | 1 + 11 files changed, 569 insertions(+), 140 deletions(-) create mode 100644 internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/PipelineTelemetry.ts create mode 100644 
internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/index.ts create mode 100644 internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/types.ts create mode 100644 internal-packages/db/prisma/migrations/20260107113055_add_pipeline_telemetry/migration.sql diff --git a/internal-packages/ai/src/analysis-plugins/PluginManager.ts b/internal-packages/ai/src/analysis-plugins/PluginManager.ts index 48a6ccb7..e29ea02c 100644 --- a/internal-packages/ai/src/analysis-plugins/PluginManager.ts +++ b/internal-packages/ai/src/analysis-plugins/PluginManager.ts @@ -66,6 +66,7 @@ export interface SimpleDocumentAnalysisResult { }; logSummary: JobLogSummary; jobLogString: string; // Formatted string for Job.logs field + pipelineTelemetry?: Record; // Pipeline telemetry from plugins (e.g., FallacyCheckPlugin) } export interface FullDocumentAnalysisResult { @@ -88,6 +89,7 @@ export interface FullDocumentAnalysisResult { }>; logSummary: JobLogSummary; jobLogString: string; // Formatted string for Job.logs field + pipelineTelemetry?: Record; // Pipeline telemetry from plugins (e.g., FallacyCheckPlugin) } export class PluginManager { @@ -521,6 +523,13 @@ export class PluginManager { const logSummary = this.pluginLogger.generateSummary(); const jobLogString = this.pluginLogger.generateJobLogString(); + // Collect pipeline telemetry from plugins that provide it (e.g., FALLACY_CHECK) + let pipelineTelemetry: Record | undefined; + const fallacyResult = pluginResults.get('FALLACY_CHECK'); + if (fallacyResult?.pipelineTelemetry) { + pipelineTelemetry = fallacyResult.pipelineTelemetry; + } + return { summary, analysis, @@ -535,6 +544,7 @@ export class PluginManager { }, logSummary, jobLogString, + pipelineTelemetry, }; } finally { // Cleanup if needed @@ -624,6 +634,7 @@ export class PluginManager { errors: undefined, // TODO: Add better error tracking logSummary: pluginResults.logSummary, jobLogString: pluginResults.jobLogString, + pipelineTelemetry: 
pluginResults.pipelineTelemetry, }; } catch (error) { logger.error( @@ -660,6 +671,7 @@ export class PluginManager { ], logSummary: this.pluginLogger.generateSummary(), jobLogString: this.pluginLogger.generateJobLogString(), + pipelineTelemetry: undefined, }; } } diff --git a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts index 267f744c..ca219709 100644 --- a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts +++ b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts @@ -16,6 +16,7 @@ import type { import { LIMITS, THRESHOLDS, ISSUE_TYPES } from "./constants"; import { buildFallacyComment } from "./comments/builder"; import { FallacyIssue } from "./FallacyIssue"; +import { PipelineTelemetry, PIPELINE_STAGES, type PipelineExecutionRecord } from "./telemetry"; export class FallacyCheckPlugin implements SimpleAnalysisPlugin { private documentText: string; @@ -26,6 +27,7 @@ export class FallacyCheckPlugin implements SimpleAnalysisPlugin { private summary: string = ""; private analysis: string = ""; private processingStartTime: number = 0; + private telemetryRecord: PipelineExecutionRecord | null = null; constructor() { // Initialize empty values - they'll be set in analyze() @@ -120,6 +122,9 @@ export class FallacyCheckPlugin implements SimpleAnalysisPlugin { return this.getResults(); } + // Initialize telemetry - use local const to avoid repeated null assertions + const telemetry = new PipelineTelemetry(documentText.length); + try { // Audit log: Analysis started logger.info("FallacyCheckPlugin: AUDIT: Analysis started", { @@ -132,17 +137,19 @@ export class FallacyCheckPlugin implements SimpleAnalysisPlugin { logger.info("FallacyCheckPlugin: Starting analysis (single-pass mode)"); // Phase 1: Single-pass extraction on full document - // This provides full context for better accuracy and reduces false positives - // from 
flagging intro claims that are supported later in the document + telemetry.startStage(PIPELINE_STAGES.EXTRACTION, 1); // 1 = full document const extractionResult = await this.extractIssuesFromDocument(documentText); - const allIssues: FallacyIssue[] = extractionResult.issues; + telemetry.endStage(allIssues.length, { + error: extractionResult.error, + metadata: { documentLength: documentText.length }, + }); + telemetry.setFinalCounts({ issuesExtracted: allIssues.length }); if (extractionResult.error) { logger.warn(`Issue extraction completed with error: ${extractionResult.error}`); } - // Audit log: Extraction phase completed logger.info("FallacyCheckPlugin: AUDIT: Extraction phase completed", { timestamp: new Date().toISOString(), issuesExtracted: allIssues.length, @@ -150,157 +157,46 @@ export class FallacyCheckPlugin implements SimpleAnalysisPlugin { phase: "extraction", }); - // Deduplicate issues by similar text + // Phase 1.5: Deduplicate issues by similar text + telemetry.startStage(PIPELINE_STAGES.DEDUPLICATION, allIssues.length); const deduplicatedIssues = this.deduplicateIssues(allIssues); + telemetry.endStage(deduplicatedIssues.length); + telemetry.setFinalCounts({ issuesAfterDedup: deduplicatedIssues.length }); - // Phase 1.5: Filter out issues that are supported elsewhere in the document - // This catches false positives where claims are actually justified later + // Phase 2: Filter out issues supported elsewhere in the document logger.info("FallacyCheckPlugin: AUDIT: Supported-elsewhere filter started", { timestamp: new Date().toISOString(), issuesToFilter: deduplicatedIssues.length, phase: "supported-elsewhere-filter", }); - let filteredIssues = deduplicatedIssues; - try { - const filterInput = { - documentText, - issues: deduplicatedIssues.map((issue) => ({ - quotedText: issue.text, - issueType: issue.issueType, - reasoning: issue.issue.reasoning, - locationOffset: issue.issue.location?.startOffset, - })), - }; - - const filterResult = await 
supportedElsewhereFilterTool.execute( - filterInput, - { logger } - ); - - // Keep only the issues that are NOT supported elsewhere - const unsupportedIndices = new Set( - filterResult.unsupportedIssues.map((r) => r.index) - ); - filteredIssues = deduplicatedIssues.filter((_, idx) => - unsupportedIndices.has(idx) - ); - - // Log what was filtered - const supportedCount = filterResult.supportedIssues.length; - if (supportedCount > 0) { - logger.info( - `FallacyCheckPlugin: Filtered out ${supportedCount} issues (supported elsewhere in document)` - ); - for (const supported of filterResult.supportedIssues) { - logger.debug( - ` - Issue ${supported.index}: ${supported.explanation}` - ); - } - } - - logger.info("FallacyCheckPlugin: AUDIT: Supported-elsewhere filter completed", { - timestamp: new Date().toISOString(), - issuesBeforeFilter: deduplicatedIssues.length, - issuesAfterFilter: filteredIssues.length, - issuesFiltered: supportedCount, - phase: "supported-elsewhere-filter", - }); - } catch (error) { - logger.warn("FallacyCheckPlugin: Supported-elsewhere filter failed, keeping all issues", error); - // Fallback: keep all issues if filter fails - filteredIssues = deduplicatedIssues; - } - - this.issues = filteredIssues; - - // Phase 2: Generate comments for all issues in parallel - const commentPromises = this.issues.map(async (issue) => { - // Run in next tick to ensure true parallelism - await new Promise((resolve) => setImmediate(resolve)); - const comment = await buildFallacyComment( - issue, - documentText, - { logger } - ); - // Filter out comments with empty descriptions - if ( - comment && - comment.description && - comment.description.trim() !== "" - ) { - return comment; - } - return null; - }); - - const commentResults = await Promise.all(commentPromises); - const allComments = commentResults.filter( - (comment): comment is Comment => comment !== null + telemetry.startStage(PIPELINE_STAGES.SUPPORTED_ELSEWHERE_FILTER, deduplicatedIssues.length); + const 
filteredIssues = await this.runSupportedElsewhereFilter( + deduplicatedIssues, + documentText, + telemetry ); + telemetry.setFinalCounts({ issuesAfterFiltering: filteredIssues.length }); - // Phase 3: Review and filter comments, generate summaries - try { - const reviewComments = allComments.map((comment, index) => ({ - index, - header: comment.header || "Epistemic Issue", - description: comment.description, - level: comment.level || 'warning', - importance: comment.importance, - quotedText: comment.highlight.quotedText, - })); - - // Audit log: Review phase started - logger.info("FallacyCheckPlugin: AUDIT: Review phase started", { - timestamp: new Date().toISOString(), - commentsToReview: allComments.length, - phase: "review", - operation: "fallacy-review-tool", - }); - - const reviewResult = await fallacyReviewTool.execute( - { - documentText, - comments: reviewComments, - }, - { logger } - ); - - // Filter comments based on review - this.comments = reviewResult.commentIndicesToKeep.map( - (idx) => allComments[idx] - ); + this.issues = filteredIssues; - // Use summaries from review - this.summary = reviewResult.oneLineSummary; - this.analysis = reviewResult.documentSummary; + // Phase 3: Generate comments for all issues in parallel + telemetry.startStage(PIPELINE_STAGES.COMMENT_GENERATION, this.issues.length); + const allComments = await this.generateCommentsForIssues(this.issues, documentText); + telemetry.endStage(allComments.length); + telemetry.setFinalCounts({ commentsGenerated: allComments.length }); - // Audit log: Review phase completed - logger.info("FallacyCheckPlugin: AUDIT: Review phase completed", { - timestamp: new Date().toISOString(), - commentsReviewed: allComments.length, - commentsKept: this.comments.length, - commentsFiltered: allComments.length - this.comments.length, - phase: "review", - }); - - logger.info( - `FallacyCheckPlugin: Review complete - kept ${this.comments.length}/${allComments.length} comments` - ); - } catch (error) { - 
logger.error("FallacyCheckPlugin: Review failed, using fallback", error); - // Fallback: keep all comments and use old summary generation - this.comments = allComments; - const { summary, analysisSummary } = this.generateAnalysis(); - this.summary = summary; - this.analysis = analysisSummary; - } + // Phase 4: Review and filter comments, generate summaries + telemetry.startStage(PIPELINE_STAGES.REVIEW, allComments.length); + await this.reviewAndFilterComments(allComments, documentText, telemetry); this.hasRun = true; - const totalDuration = Date.now() - this.processingStartTime; + // Finalize telemetry + this.telemetryRecord = telemetry.finalize(true); + telemetry.logSummary(); - // Audit log: Analysis completed successfully + const totalDuration = Date.now() - this.processingStartTime; logger.info("FallacyCheckPlugin: AUDIT: Analysis completed", { timestamp: new Date().toISOString(), totalDurationMs: totalDuration, @@ -319,6 +215,10 @@ export class FallacyCheckPlugin implements SimpleAnalysisPlugin { const totalDuration = Date.now() - this.processingStartTime; const errorMessage = error instanceof Error ? 
error.message : String(error); + // Finalize telemetry with error + this.telemetryRecord = telemetry.finalize(false, errorMessage); + telemetry.logSummary(); + // Audit log: Analysis failed logger.error("FallacyCheckPlugin: AUDIT: Analysis failed", { timestamp: new Date().toISOString(), @@ -348,6 +248,8 @@ export class FallacyCheckPlugin implements SimpleAnalysisPlugin { analysis: this.analysis, comments: this.comments, cost: 0, + // Cast to Record for JSON serialization + pipelineTelemetry: this.telemetryRecord as unknown as Record | undefined, }; } @@ -454,6 +356,154 @@ export class FallacyCheckPlugin implements SimpleAnalysisPlugin { return sortedIssues; } + /** + * Run the supported-elsewhere filter to remove false positives + */ + private async runSupportedElsewhereFilter( + issues: FallacyIssue[], + documentText: string, + telemetry: PipelineTelemetry + ): Promise { + try { + const filterInput = { + documentText, + issues: issues.map((issue) => ({ + quotedText: issue.text, + issueType: issue.issueType, + reasoning: issue.issue.reasoning, + locationOffset: issue.issue.location?.startOffset, + })), + }; + + const filterResult = await supportedElsewhereFilterTool.execute( + filterInput, + { logger } + ); + + // Keep only the issues that are NOT supported elsewhere + const unsupportedIndices = new Set( + filterResult.unsupportedIssues.map((r) => r.index) + ); + const filteredIssues = issues.filter((_, idx) => + unsupportedIndices.has(idx) + ); + + // Log what was filtered + const supportedCount = filterResult.supportedIssues.length; + if (supportedCount > 0) { + logger.info( + `FallacyCheckPlugin: Filtered out ${supportedCount} issues (supported elsewhere in document)` + ); + for (const supported of filterResult.supportedIssues) { + logger.debug(` - Issue ${supported.index}: ${supported.explanation}`); + } + } + + logger.info("FallacyCheckPlugin: AUDIT: Supported-elsewhere filter completed", { + timestamp: new Date().toISOString(), + issuesBeforeFilter: 
issues.length, + issuesAfterFilter: filteredIssues.length, + issuesFiltered: supportedCount, + phase: "supported-elsewhere-filter", + }); + + telemetry.endStage(filteredIssues.length); + return filteredIssues; + } catch (error) { + logger.warn("FallacyCheckPlugin: Supported-elsewhere filter failed, keeping all issues", error); + telemetry.endStage(issues.length, { + error: error instanceof Error ? error.message : String(error), + }); + return issues; + } + } + + /** + * Generate comments for all issues in parallel + */ + private async generateCommentsForIssues( + issues: FallacyIssue[], + documentText: string + ): Promise { + const commentPromises = issues.map(async (issue) => { + // Run in next tick to ensure true parallelism + await new Promise((resolve) => setImmediate(resolve)); + const comment = await buildFallacyComment(issue, documentText, { logger }); + // Filter out comments with empty descriptions + if (comment?.description?.trim()) { + return comment; + } + return null; + }); + + const commentResults = await Promise.all(commentPromises); + return commentResults.filter((comment): comment is Comment => comment !== null); + } + + /** + * Review and filter comments, generate summaries + */ + private async reviewAndFilterComments( + allComments: Comment[], + documentText: string, + telemetry: PipelineTelemetry + ): Promise { + try { + const reviewComments = allComments.map((comment, index) => ({ + index, + header: comment.header || "Epistemic Issue", + description: comment.description, + level: comment.level || 'warning', + importance: comment.importance, + quotedText: comment.highlight.quotedText, + })); + + logger.info("FallacyCheckPlugin: AUDIT: Review phase started", { + timestamp: new Date().toISOString(), + commentsToReview: allComments.length, + phase: "review", + operation: "fallacy-review-tool", + }); + + const reviewResult = await fallacyReviewTool.execute( + { documentText, comments: reviewComments }, + { logger } + ); + + // Filter comments based 
on review + this.comments = reviewResult.commentIndicesToKeep.map((idx) => allComments[idx]); + this.summary = reviewResult.oneLineSummary; + this.analysis = reviewResult.documentSummary; + + logger.info("FallacyCheckPlugin: AUDIT: Review phase completed", { + timestamp: new Date().toISOString(), + commentsReviewed: allComments.length, + commentsKept: this.comments.length, + commentsFiltered: allComments.length - this.comments.length, + phase: "review", + }); + + telemetry.endStage(this.comments.length); + telemetry.setFinalCounts({ commentsKept: this.comments.length }); + + logger.info( + `FallacyCheckPlugin: Review complete - kept ${this.comments.length}/${allComments.length} comments` + ); + } catch (error) { + logger.error("FallacyCheckPlugin: Review failed, using fallback", error); + // Fallback: keep all comments and use old summary generation + this.comments = allComments; + const { summary, analysisSummary } = this.generateAnalysis(); + this.summary = summary; + this.analysis = analysisSummary; + + telemetry.endStage(this.comments.length, { + error: error instanceof Error ? error.message : String(error), + }); + telemetry.setFinalCounts({ commentsKept: this.comments.length }); + } + } + private generateAnalysis(): { summary: string; analysisSummary: string } { const totalIssues = this.issues.length; const criticalIssues = this.issues.filter( diff --git a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/PipelineTelemetry.ts b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/PipelineTelemetry.ts new file mode 100644 index 00000000..3257d78d --- /dev/null +++ b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/PipelineTelemetry.ts @@ -0,0 +1,245 @@ +/** + * Pipeline Telemetry Collector + * + * Collects and aggregates metrics during fallacy check pipeline execution. + * Provides a fluent API for tracking stages and finalizing results. 
+ */ + +import { v4 as uuidv4 } from 'uuid'; +import type { + StageMetrics, + PipelineExecutionRecord, + PipelineStage, +} from './types'; + +/** Current pipeline version - increment when making significant changes */ +const PIPELINE_VERSION = '2.0.0'; // v2: single-pass extraction + supported-elsewhere filter + +/** + * Tracks metrics for an in-progress stage + */ +interface ActiveStage { + stageName: string; + startTime: number; + inputCount: number; + model?: string; +} + +/** + * Pipeline Telemetry Collector + * + * Usage: + * ```ts + * const telemetry = new PipelineTelemetry(documentText.length); + * + * telemetry.startStage('extraction', 1); + * const issues = await extract(); + * telemetry.endStage(issues.length); + * + * telemetry.startStage('filter', issues.length); + * const filtered = await filter(issues); + * telemetry.endStage(filtered.length); + * + * const record = telemetry.finalize(true); + * ``` + */ +export class PipelineTelemetry { + private executionId: string; + private startedAt: Date; + private documentLength: number; + private stages: StageMetrics[] = []; + private activeStage: ActiveStage | null = null; + private finalCounts: PipelineExecutionRecord['finalCounts'] = { + issuesExtracted: 0, + issuesAfterDedup: 0, + issuesAfterFiltering: 0, + commentsGenerated: 0, + commentsKept: 0, + }; + + constructor(documentLength: number) { + this.executionId = uuidv4(); + this.startedAt = new Date(); + this.documentLength = documentLength; + } + + /** + * Start tracking a new pipeline stage + */ + startStage( + stageName: PipelineStage | string, + inputCount: number, + options?: { model?: string } + ): this { + // If there's an active stage that wasn't ended, end it with error + if (this.activeStage) { + console.warn( + `[PipelineTelemetry] Stage '${this.activeStage.stageName}' was not properly ended. 
Ending with error.` + ); + this.endStage(0, { error: 'Stage was not properly ended' }); + } + + this.activeStage = { + stageName, + startTime: Date.now(), + inputCount, + model: options?.model, + }; + + return this; + } + + /** + * End the current stage and record metrics + */ + endStage( + outputCount: number, + options?: { + costUsd?: number; + error?: string; + metadata?: Record; + } + ): this { + if (!this.activeStage) { + console.warn( + '[PipelineTelemetry] endStage called without an active stage' + ); + return this; + } + + const durationMs = Date.now() - this.activeStage.startTime; + const filteredCount = this.activeStage.inputCount - outputCount; + + const metrics: StageMetrics = { + stageName: this.activeStage.stageName, + durationMs, + inputCount: this.activeStage.inputCount, + outputCount, + filteredCount: Math.max(0, filteredCount), // Don't report negative if output > input + model: this.activeStage.model, + costUsd: options?.costUsd, + error: options?.error, + metadata: options?.metadata, + }; + + this.stages.push(metrics); + this.activeStage = null; + + return this; + } + + /** + * Record a stage that already completed (for stages we can't wrap) + */ + recordStage( + stageName: PipelineStage | string, + metrics: Omit + ): this { + this.stages.push({ + stageName, + ...metrics, + }); + return this; + } + + /** + * Update final counts (call after each major phase) + */ + setFinalCounts( + counts: Partial + ): this { + this.finalCounts = { + ...this.finalCounts, + ...counts, + }; + return this; + } + + /** + * Calculate total cost from all stages + */ + private calculateTotalCost(): number | undefined { + const costs = this.stages + .map((s) => s.costUsd) + .filter((c): c is number => c !== undefined); + + if (costs.length === 0) return undefined; + return costs.reduce((sum, cost) => sum + cost, 0); + } + + /** + * Finalize and return the complete execution record + */ + finalize(success: boolean, error?: string): PipelineExecutionRecord { + // End any 
active stage + if (this.activeStage) { + this.endStage(0, { error: error || 'Pipeline ended with active stage' }); + } + + const completedAt = new Date(); + const totalDurationMs = completedAt.getTime() - this.startedAt.getTime(); + + return { + executionId: this.executionId, + startedAt: this.startedAt.toISOString(), + completedAt: completedAt.toISOString(), + totalDurationMs, + documentLength: this.documentLength, + stages: this.stages, + finalCounts: this.finalCounts, + success, + error, + totalCostUsd: this.calculateTotalCost(), + pipelineVersion: PIPELINE_VERSION, + }; + } + + /** + * Get execution ID for correlation + */ + getExecutionId(): string { + return this.executionId; + } + + /** + * Log a summary of the current telemetry state + */ + logSummary(): void { + console.log('\n========== PIPELINE TELEMETRY SUMMARY =========='); + console.log(`Execution ID: ${this.executionId}`); + console.log(`Document length: ${this.documentLength} chars`); + console.log(`\nStages completed: ${this.stages.length}`); + + for (const stage of this.stages) { + const status = stage.error ? 
'❌' : 'βœ…'; + console.log(` ${status} ${stage.stageName}:`); + console.log(` Duration: ${stage.durationMs}ms`); + console.log(` In: ${stage.inputCount} β†’ Out: ${stage.outputCount} (filtered: ${stage.filteredCount})`); + if (stage.model) { + console.log(` Model: ${stage.model}`); + } + if (stage.costUsd !== undefined) { + console.log(` Cost: $${stage.costUsd.toFixed(4)}`); + } + if (stage.error) { + console.log(` Error: ${stage.error}`); + } + } + + console.log('\nFinal counts:'); + console.log(` Issues extracted: ${this.finalCounts.issuesExtracted}`); + console.log(` After dedup: ${this.finalCounts.issuesAfterDedup}`); + console.log(` After filtering: ${this.finalCounts.issuesAfterFiltering}`); + console.log(` Comments generated: ${this.finalCounts.commentsGenerated}`); + console.log(` Comments kept: ${this.finalCounts.commentsKept}`); + + const totalCost = this.calculateTotalCost(); + if (totalCost !== undefined) { + console.log(`\nTotal cost: $${totalCost.toFixed(4)}`); + } + + const elapsed = Date.now() - this.startedAt.getTime(); + console.log(`Total elapsed: ${elapsed}ms`); + console.log('================================================\n'); + } +} diff --git a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/index.ts b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/index.ts new file mode 100644 index 00000000..f3384c74 --- /dev/null +++ b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/index.ts @@ -0,0 +1,13 @@ +/** + * Pipeline Telemetry Module + * + * Exports telemetry types and collector for fallacy check pipeline observability. 
+ */ + +export { PipelineTelemetry } from './PipelineTelemetry'; +export { + type StageMetrics, + type PipelineExecutionRecord, + type PipelineStage, + PIPELINE_STAGES, +} from './types'; diff --git a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/types.ts b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/types.ts new file mode 100644 index 00000000..8f199cd8 --- /dev/null +++ b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/types.ts @@ -0,0 +1,100 @@ +/** + * Pipeline Telemetry Types + * + * Structured metrics for tracking fallacy check pipeline execution. + * Used for observability, debugging, and regression detection. + */ + +/** + * Metrics for a single pipeline stage + */ +export interface StageMetrics { + /** Stage name (e.g., 'extraction', 'supported-elsewhere-filter') */ + stageName: string; + + /** Duration of the stage in milliseconds */ + durationMs: number; + + /** Number of items going into this stage */ + inputCount: number; + + /** Number of items coming out of this stage */ + outputCount: number; + + /** Number of items filtered/removed by this stage */ + filteredCount: number; + + /** Estimated cost in dollars (if applicable) */ + costUsd?: number; + + /** Model used for this stage (if applicable) */ + model?: string; + + /** Any error that occurred during this stage */ + error?: string; + + /** Additional stage-specific metadata */ + metadata?: Record; +} + +/** + * Complete pipeline execution record + */ +export interface PipelineExecutionRecord { + /** Unique identifier for this execution */ + executionId: string; + + /** Timestamp when pipeline started */ + startedAt: string; + + /** Timestamp when pipeline completed */ + completedAt: string; + + /** Total duration of the entire pipeline in milliseconds */ + totalDurationMs: number; + + /** Document length in characters */ + documentLength: number; + + /** Metrics for each stage, in order of execution */ + stages: 
StageMetrics[]; + + /** Final counts */ + finalCounts: { + /** Total issues extracted initially */ + issuesExtracted: number; + /** Issues after deduplication */ + issuesAfterDedup: number; + /** Issues after all filtering */ + issuesAfterFiltering: number; + /** Final comments generated */ + commentsGenerated: number; + /** Comments kept after review */ + commentsKept: number; + }; + + /** Overall success/failure status */ + success: boolean; + + /** Error message if pipeline failed */ + error?: string; + + /** Total estimated cost in dollars */ + totalCostUsd?: number; + + /** Pipeline version (for tracking changes over time) */ + pipelineVersion: string; +} + +/** + * Stage names used in the fallacy check pipeline + */ +export const PIPELINE_STAGES = { + EXTRACTION: 'extraction', + DEDUPLICATION: 'deduplication', + SUPPORTED_ELSEWHERE_FILTER: 'supported-elsewhere-filter', + COMMENT_GENERATION: 'comment-generation', + REVIEW: 'review', +} as const; + +export type PipelineStage = typeof PIPELINE_STAGES[keyof typeof PIPELINE_STAGES]; diff --git a/internal-packages/ai/src/analysis-plugins/types.ts b/internal-packages/ai/src/analysis-plugins/types.ts index 36d69b0e..9fef6882 100644 --- a/internal-packages/ai/src/analysis-plugins/types.ts +++ b/internal-packages/ai/src/analysis-plugins/types.ts @@ -116,6 +116,7 @@ export interface AnalysisResult { comments: Comment[]; cost: number; grade?: number; // Optional grade (0-100) for quality assessment + pipelineTelemetry?: Record; // Pipeline execution telemetry (JSON-serializable) } export interface SimpleAnalysisPlugin { diff --git a/internal-packages/ai/src/workflows/documentAnalysis/analyzeDocument.ts b/internal-packages/ai/src/workflows/documentAnalysis/analyzeDocument.ts index 8d1066dd..635d74be 100644 --- a/internal-packages/ai/src/workflows/documentAnalysis/analyzeDocument.ts +++ b/internal-packages/ai/src/workflows/documentAnalysis/analyzeDocument.ts @@ -25,6 +25,7 @@ export async function analyzeDocument( 
highlights: Comment[]; tasks: TaskResult[]; jobLogString?: string; // Include job log string for Job.logs field + pipelineTelemetry?: Record; // Pipeline telemetry from fallacy checker }> { const logPrefix = `[Job ${jobId || 'N/A'}]`; logger.info(`${logPrefix} Starting document analysis for agent ${agentInfo.name}`); @@ -123,7 +124,8 @@ export async function analyzeDocument( selfCritique, highlights: highlightExtractionResult.outputs.highlights, tasks, - jobLogString + jobLogString, + pipelineTelemetry: undefined, // LLM workflow doesn't use pipeline telemetry }; } } diff --git a/internal-packages/ai/src/workflows/documentAnalysis/unified/index.ts b/internal-packages/ai/src/workflows/documentAnalysis/unified/index.ts index 27ae4e91..ab6259f6 100644 --- a/internal-packages/ai/src/workflows/documentAnalysis/unified/index.ts +++ b/internal-packages/ai/src/workflows/documentAnalysis/unified/index.ts @@ -34,6 +34,7 @@ export async function analyzeDocumentUnified( highlights: AiComment[]; tasks: TaskResult[]; jobLogString?: string; + pipelineTelemetry?: Record; }> { // Check timeout before starting plugin analysis checkJobTimeout(); @@ -68,6 +69,7 @@ export async function analyzeDocumentUnified( highlights: aiCommentsToDbComments(validAiComments) as any, tasks: result.tasks, jobLogString: result.jobLogString, + pipelineTelemetry: result.pipelineTelemetry, }; } diff --git a/internal-packages/db/prisma/migrations/20260107113055_add_pipeline_telemetry/migration.sql b/internal-packages/db/prisma/migrations/20260107113055_add_pipeline_telemetry/migration.sql new file mode 100644 index 00000000..872c91a4 --- /dev/null +++ b/internal-packages/db/prisma/migrations/20260107113055_add_pipeline_telemetry/migration.sql @@ -0,0 +1,2 @@ +-- AlterTable +ALTER TABLE "public"."EvaluationVersion" ADD COLUMN "pipelineTelemetry" JSONB; diff --git a/internal-packages/db/prisma/schema.prisma b/internal-packages/db/prisma/schema.prisma index a19fae27..70d24a66 100644 --- 
a/internal-packages/db/prisma/schema.prisma +++ b/internal-packages/db/prisma/schema.prisma @@ -142,6 +142,7 @@ model EvaluationVersion { selfCritique String? version Int @default(1) isStale Boolean @default(false) + pipelineTelemetry Json? comments EvaluationComment[] agentVersion AgentVersion @relation(fields: [agentVersionId], references: [id]) documentVersion DocumentVersion @relation(fields: [documentVersionId], references: [id], onDelete: Cascade) diff --git a/internal-packages/jobs/src/core/JobOrchestrator.ts b/internal-packages/jobs/src/core/JobOrchestrator.ts index e26c8630..3909b244 100644 --- a/internal-packages/jobs/src/core/JobOrchestrator.ts +++ b/internal-packages/jobs/src/core/JobOrchestrator.ts @@ -258,6 +258,7 @@ export class JobOrchestrator implements JobOrchestratorInterface { agentVersionId: agentVersion.id, evaluationId: job.evaluation.id, documentVersionId: documentVersion.id, + pipelineTelemetry: evaluationOutputs.pipelineTelemetry || null, job: { connect: { id: job.id, From bd0d97cf5e7f3f1a38a65a9a6b4fd6332c14eda4 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Wed, 7 Jan 2026 12:16:32 +0000 Subject: [PATCH 12/72] feat: Add validation framework for fallacy checker regression testing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add validation types (EvaluationSnapshot, DocumentComparisonResult, RegressionFlag) - Add comment comparison logic with fuzzy matching (Levenshtein similarity) - Add regression detection: score drop, lost comments, high-importance loss, extraction drop - Add Validation screen to meta-evals CLI with Corpus/Compare/Results tabs - Add repository methods for corpus queries and evaluation snapshots - Clarify Settings UI shows judge model is for Score/Rank flows TODO: Add baseline selection (pinned golden baseline vs latest run) πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .../repositories/MetaEvaluationRepository.ts 
| 230 ++++++++++ meta-evals/src/app.tsx | 13 +- meta-evals/src/components/MainMenu.tsx | 10 +- meta-evals/src/components/Validation.tsx | 403 ++++++++++++++++++ meta-evals/src/components/index.ts | 1 + meta-evals/src/components/types.ts | 3 +- meta-evals/src/validation/compare.ts | 389 +++++++++++++++++ meta-evals/src/validation/index.ts | 8 + meta-evals/src/validation/types.ts | 161 +++++++ 9 files changed, 1213 insertions(+), 5 deletions(-) create mode 100644 meta-evals/src/components/Validation.tsx create mode 100644 meta-evals/src/validation/compare.ts create mode 100644 meta-evals/src/validation/index.ts create mode 100644 meta-evals/src/validation/types.ts diff --git a/internal-packages/db/src/repositories/MetaEvaluationRepository.ts b/internal-packages/db/src/repositories/MetaEvaluationRepository.ts index 7dadfccc..68c89dcf 100644 --- a/internal-packages/db/src/repositories/MetaEvaluationRepository.ts +++ b/internal-packages/db/src/repositories/MetaEvaluationRepository.ts @@ -713,6 +713,236 @@ export class MetaEvaluationRepository { async disconnect(): Promise { await this.prisma.$disconnect(); } + + // ========================================================================== + // Validation Framework Methods + // ========================================================================== + + /** + * Get documents suitable for validation testing. + * Returns documents that have been evaluated by the specified agent. 
+ */ + async getValidationCorpusDocuments( + agentId: string, + options: { limit?: number; minContentLength?: number } = {} + ): Promise< + Array<{ + documentId: string; + title: string; + contentLength: number; + lastEvaluatedAt: Date | null; + evaluationCount: number; + }> + > { + const { limit = 50, minContentLength = 100 } = options; + + // Get documents that have evaluations from this agent + const evaluations = await this.prisma.evaluation.findMany({ + where: { agentId }, + include: { + document: { + include: { + versions: { + orderBy: { version: "desc" }, + take: 1, + select: { title: true, content: true }, + }, + }, + }, + versions: { + orderBy: { createdAt: "desc" }, + take: 1, + select: { createdAt: true }, + }, + _count: { select: { versions: true } }, + }, + take: limit, + }); + + return evaluations + .filter((e) => { + const content = e.document.versions[0]?.content; + return content && content.length >= minContentLength; + }) + .map((e) => ({ + documentId: e.documentId, + title: e.document.versions[0]?.title || "Unknown", + contentLength: e.document.versions[0]?.content.length || 0, + lastEvaluatedAt: e.versions[0]?.createdAt || null, + evaluationCount: e._count.versions, + })); + } + + /** + * Get evaluation snapshots for a set of documents from a specific agent. + * Returns the most recent EvaluationVersion for each document. 
+ */ + async getEvaluationSnapshots( + documentIds: string[], + agentId: string + ): Promise< + Array<{ + evaluationVersionId: string; + agentId: string; + agentName: string; + createdAt: Date; + documentId: string; + documentTitle: string; + grade: number | null; + pipelineTelemetry: unknown; + comments: Array<{ + id: string; + quotedText: string; + header: string | null; + description: string; + importance: number | null; + startOffset: number; + endOffset: number; + }>; + }> + > { + // Get the most recent evaluation version for each document + const evaluations = await this.prisma.evaluation.findMany({ + where: { + agentId, + documentId: { in: documentIds }, + }, + include: { + agent: { + include: { + versions: { + orderBy: { version: "desc" }, + take: 1, + select: { name: true }, + }, + }, + }, + document: { + include: { + versions: { + orderBy: { version: "desc" }, + take: 1, + select: { title: true }, + }, + }, + }, + versions: { + orderBy: { createdAt: "desc" }, + take: 1, + include: { + comments: { + include: { + highlight: true, + }, + }, + }, + }, + }, + }); + + return evaluations + .filter((e) => e.versions.length > 0) + .map((e) => { + const version = e.versions[0]; + return { + evaluationVersionId: version.id, + agentId: e.agentId, + agentName: e.agent.versions[0]?.name || e.agentId, + createdAt: version.createdAt, + documentId: e.documentId, + documentTitle: e.document.versions[0]?.title || "Unknown", + grade: version.grade, + pipelineTelemetry: version.pipelineTelemetry, + comments: version.comments.map((c) => ({ + id: c.id, + quotedText: c.highlight.quotedText, + header: c.header, + description: c.description, + importance: c.importance, + startOffset: c.highlight.startOffset, + endOffset: c.highlight.endOffset, + })), + }; + }); + } + + /** + * Get a specific evaluation version by ID with full details for comparison. 
+ */ + async getEvaluationSnapshotById(evaluationVersionId: string): Promise<{ + evaluationVersionId: string; + agentId: string; + agentName: string; + createdAt: Date; + documentId: string; + documentTitle: string; + grade: number | null; + pipelineTelemetry: unknown; + comments: Array<{ + id: string; + quotedText: string; + header: string | null; + description: string; + importance: number | null; + startOffset: number; + endOffset: number; + }>; + } | null> { + const version = await this.prisma.evaluationVersion.findUnique({ + where: { id: evaluationVersionId }, + include: { + evaluation: { + include: { + agent: { + include: { + versions: { + orderBy: { version: "desc" }, + take: 1, + select: { name: true }, + }, + }, + }, + document: { + include: { + versions: { + orderBy: { version: "desc" }, + take: 1, + select: { title: true }, + }, + }, + }, + }, + }, + comments: { + include: { + highlight: true, + }, + }, + }, + }); + + if (!version) return null; + + return { + evaluationVersionId: version.id, + agentId: version.agentId, + agentName: version.evaluation.agent.versions[0]?.name || version.agentId, + createdAt: version.createdAt, + documentId: version.evaluation.documentId, + documentTitle: version.evaluation.document.versions[0]?.title || "Unknown", + grade: version.grade, + pipelineTelemetry: version.pipelineTelemetry, + comments: version.comments.map((c) => ({ + id: c.id, + quotedText: c.highlight.quotedText, + header: c.header, + description: c.description, + importance: c.importance, + startOffset: c.highlight.startOffset, + endOffset: c.highlight.endOffset, + })), + }; + } } // Default instance for convenience diff --git a/meta-evals/src/app.tsx b/meta-evals/src/app.tsx index 7a353750..73f06450 100644 --- a/meta-evals/src/app.tsx +++ b/meta-evals/src/app.tsx @@ -12,7 +12,7 @@ import { type AgentChoice, } from "@roast/db"; import { apiClient } from "./utils/apiClient"; -import { MainMenu, CreateBaseline, SeriesDetail, RankRuns, ScoreRun, type Screen } 
from "./components"; +import { MainMenu, CreateBaseline, SeriesDetail, RankRuns, ScoreRun, Validation, type Screen } from "./components"; import { getAvailableModels, getRecommendedJudgeModels, DEFAULT_JUDGE_MODEL, type ModelInfo } from "./utils/models"; // ============================================================================ @@ -229,6 +229,7 @@ export function App() { // Reload the menu loadMainMenu(); }} + onValidation={() => setScreen({ type: "validation" })} onExit={exit} judgeModel={judgeModel} availableModels={availableModels} @@ -338,5 +339,15 @@ export function App() { ); } + if (screen.type === "validation") { + return ( + + ); + } + return null; } diff --git a/meta-evals/src/components/MainMenu.tsx b/meta-evals/src/components/MainMenu.tsx index a60e3d95..f13fb4aa 100644 --- a/meta-evals/src/components/MainMenu.tsx +++ b/meta-evals/src/components/MainMenu.tsx @@ -20,6 +20,7 @@ interface MainMenuProps { onCreateBaseline: () => void; onSelectSeries: (id: string) => void; onDeleteSeries: (id: string) => Promise; + onValidation: () => void; onExit: () => void; judgeModel: string; availableModels: ModelInfo[]; @@ -40,6 +41,7 @@ export function MainMenu({ onCreateBaseline, onSelectSeries, onDeleteSeries, + onValidation, onExit, judgeModel, availableModels, @@ -160,6 +162,7 @@ export function MainMenu({ + For Score/Rank AI judge: Model: {currentModelName} @@ -207,6 +210,7 @@ export function MainMenu({ value: s.id || `series-${idx}`, // Fallback key })), { label: "+ Create New Baseline", value: "create" }, + { label: "πŸ” Validation (Compare Runs)", value: "validation" }, { label: "Exit", value: "exit" }, ]; @@ -280,9 +284,8 @@ export function MainMenu({ : `${series.length} series available`} - Judge: {currentModelName} - {" "}| Temp: {temperature} - {" "}| Tokens: {maxTokens} + Score/Rank Judge: {currentModelName} + {" "}(Tab β†’ Settings to change) @@ -298,6 +301,7 @@ export function MainMenu({ if (confirmDelete) return; // Ignore selection during delete 
confirmation if (item.value === "exit") onExit(); else if (item.value === "create") onCreateBaseline(); + else if (item.value === "validation") onValidation(); else onSelectSeries(item.value); }} /> diff --git a/meta-evals/src/components/Validation.tsx b/meta-evals/src/components/Validation.tsx new file mode 100644 index 00000000..5a8ff399 --- /dev/null +++ b/meta-evals/src/components/Validation.tsx @@ -0,0 +1,403 @@ +/** + * Validation Screen Component + * + * Compare pipeline runs and detect regressions. + */ + +import React, { useState, useEffect } from "react"; +import { Box, Text, useInput } from "ink"; +import SelectInput from "ink-select-input"; +import Spinner from "ink-spinner"; +import { metaEvaluationRepository, type AgentChoice } from "@roast/db"; +import { truncate } from "./helpers"; +import { ScreenContainer, InfoBox } from "./shared"; +import { + type ValidationDocument, + type DocumentComparisonResult, + type EvaluationSnapshot, + compareSnapshots, + getComparisonStatus, +} from "../validation"; + +type Tab = "corpus" | "compare" | "results"; + +interface ValidationProps { + height: number; + maxItems: number; + onBack: () => void; +} + +interface CorpusDocument extends ValidationDocument { + selected: boolean; +} + +export function Validation({ height, maxItems, onBack }: ValidationProps) { + const [activeTab, setActiveTab] = useState("corpus"); + const [loading, setLoading] = useState(true); + const [error, setError] = useState(null); + + // Data + const [agents, setAgents] = useState([]); + const [selectedAgent, setSelectedAgent] = useState(null); + const [corpusDocuments, setCorpusDocuments] = useState([]); + const [comparisons, setComparisons] = useState([]); + const [isRunning, setIsRunning] = useState(false); + const [progress, setProgress] = useState({ current: 0, total: 0 }); + + // Keyboard handling + useInput((input, key) => { + if (key.escape) { + if (activeTab !== "corpus") { + setActiveTab("corpus"); + } else { + onBack(); + } + } + 
if (key.tab) { + setActiveTab((prev) => { + if (prev === "corpus") return "compare"; + if (prev === "compare") return comparisons.length > 0 ? "results" : "corpus"; + return "corpus"; + }); + } + }); + + // Load agents on mount + useEffect(() => { + loadAgents(); + }, []); + + // Load corpus when agent selected + useEffect(() => { + if (selectedAgent) { + loadCorpus(selectedAgent.id); + } + }, [selectedAgent?.id]); + + async function loadAgents() { + try { + setLoading(true); + // Get agents that use fallacy-check plugin + // Note: pluginIds are stored as lowercase strings (e.g., "fallacy-check") + const { prisma } = await import("@roast/db"); + const fallacyAgents = await prisma.agent.findMany({ + where: { + isDeprecated: false, + ephemeralBatchId: null, + versions: { + some: { + pluginIds: { + has: "fallacy-check", + }, + }, + }, + }, + include: { + versions: { + orderBy: { version: "desc" }, + take: 1, + select: { name: true, version: true }, + }, + }, + take: 20, + }); + + const agentChoices: AgentChoice[] = fallacyAgents + .filter((a) => a.versions.length > 0) + .map((a) => ({ + id: a.id, + name: a.versions[0].name, + version: a.versions[0].version, + })); + + setAgents(agentChoices); + if (agentChoices.length > 0) { + setSelectedAgent(agentChoices[0]); + } + setLoading(false); + } catch (e) { + setError(String(e)); + setLoading(false); + } + } + + async function loadCorpus(agentId: string) { + try { + setLoading(true); + const docs = await metaEvaluationRepository.getValidationCorpusDocuments( + agentId, + { limit: 50, minContentLength: 200 } + ); + + setCorpusDocuments( + docs.map((d) => ({ + ...d, + selected: true, // Select all by default + })) + ); + setLoading(false); + } catch (e) { + setError(String(e)); + setLoading(false); + } + } + + async function runValidation() { + if (!selectedAgent) return; + const selectedDocs = corpusDocuments.filter((d) => d.selected); + if (selectedDocs.length === 0) return; + + setIsRunning(true); + setProgress({ current: 
0, total: selectedDocs.length }); + setActiveTab("compare"); + + try { + // Get baseline snapshots (most recent evaluations) + const baselineSnapshots = await metaEvaluationRepository.getEvaluationSnapshots( + selectedDocs.map((d) => d.documentId), + selectedAgent.id + ); + + // For now, we compare baseline with itself (to test the UI) + // In real use, we'd run the pipeline again and compare + const results: DocumentComparisonResult[] = []; + + for (const snapshot of baselineSnapshots) { + setProgress((p) => ({ ...p, current: p.current + 1 })); + + // Convert to EvaluationSnapshot format + const baselineEval: EvaluationSnapshot = { + evaluationVersionId: snapshot.evaluationVersionId, + agentId: snapshot.agentId, + agentName: snapshot.agentName, + createdAt: snapshot.createdAt, + documentId: snapshot.documentId, + documentTitle: snapshot.documentTitle, + comments: snapshot.comments, + grade: snapshot.grade, + pipelineTelemetry: extractTelemetry(snapshot.pipelineTelemetry), + }; + + // For demo, use same snapshot as "current" + // In real use, this would be from a new pipeline run + const comparison = compareSnapshots(baselineEval, baselineEval); + results.push(comparison); + } + + setComparisons(results); + setActiveTab("results"); + } catch (e) { + setError(String(e)); + } finally { + setIsRunning(false); + } + } + + function toggleDocument(docId: string) { + setCorpusDocuments((docs) => + docs.map((d) => + d.documentId === docId ? { ...d, selected: !d.selected } : d + ) + ); + } + + function toggleAll() { + const allSelected = corpusDocuments.every((d) => d.selected); + setCorpusDocuments((docs) => + docs.map((d) => ({ ...d, selected: !allSelected })) + ); + } + + // Render tabs header + const renderTabs = () => ( + + + [Corpus] + + + + [Compare] + + + + [Results] + + (Tab to switch) + + ); + + if (error) { + return ( + + Error: {error} + Press Escape to go back + + ); + } + + if (loading) { + return ( + + + Loading... 
+ + + ); + } + + // Results tab + if (activeTab === "results") { + const okCount = comparisons.filter((c) => getComparisonStatus(c) === "ok").length; + const warningCount = comparisons.filter((c) => getComparisonStatus(c) === "warning").length; + const errorCount = comparisons.filter((c) => getComparisonStatus(c) === "error").length; + + return ( + + {renderTabs()} + + + + βœ… {okCount} + {" | "} + ⚠️ {warningCount} + {" | "} + ❌ {errorCount} + {" | "} + Total: {comparisons.length} + + + + + {comparisons.slice(0, maxItems - 5).map((c, i) => { + const status = getComparisonStatus(c); + const icon = status === "ok" ? "βœ…" : status === "warning" ? "⚠️" : "❌"; + const color = status === "ok" ? "green" : status === "warning" ? "yellow" : "red"; + + return ( + + + {icon} {truncate(c.documentTitle, 50)} + + + {" "}| {c.baseline.comments.length} β†’ {c.current.comments.length} comments + + + ); + })} + + + + Escape Go back | Tab Switch tabs + + + ); + } + + // Compare tab (running) + if (activeTab === "compare") { + return ( + + {renderTabs()} + + {isRunning ? ( + + + Running validation... + + + {progress.current}/{progress.total} documents + + + ) : ( + + Select documents and run validation from the Corpus tab. + + )} + + + Escape Go back | Tab Switch tabs + + + ); + } + + // Corpus tab (default) + const selectedCount = corpusDocuments.filter((d) => d.selected).length; + const items = [ + ...(agents.length > 1 + ? [{ label: `Agent: ${selectedAgent?.name || "Select..."}`, value: "agent" }] + : []), + { label: `[${selectedCount === corpusDocuments.length ? "x" : " "}] Select All (${corpusDocuments.length} docs)`, value: "toggle-all" }, + ...corpusDocuments.slice(0, maxItems - 5).map((d) => ({ + label: `[${d.selected ? "x" : " "}] ${truncate(d.title, 45)} (${d.evaluationCount} evals)`, + value: d.documentId, + })), + { label: selectedCount > 0 ? 
`β–Ά Run Validation (${selectedCount} selected)` : "β–Ά Run Validation (select docs first)", value: "run" }, + { label: "← Back", value: "back" }, + ]; + + return ( + + {renderTabs()} + + + + Agent: {selectedAgent?.name || "None"} + {" | "} + Selected: {selectedCount}/{corpusDocuments.length} + + + + { + if (item.value === "back") { + onBack(); + } else if (item.value === "toggle-all") { + toggleAll(); + } else if (item.value === "run") { + if (selectedCount > 0) { + runValidation(); + } + } else if (item.value === "agent") { + // TODO: Agent selection UI + } else { + toggleDocument(item.value); + } + }} + /> + + + Enter Toggle/Select | Tab Switch tabs | Escape Go back + + + ); +} + +/** + * Extract pipeline telemetry snapshot from raw data. + */ +function extractTelemetry(raw: unknown): { + totalDurationMs: number; + issuesExtracted: number; + issuesAfterDedup: number; + issuesAfterFiltering: number; + commentsGenerated: number; + commentsKept: number; +} | null { + if (!raw || typeof raw !== "object") return null; + + const telemetry = raw as Record; + const finalCounts = telemetry.finalCounts as Record | undefined; + + if (!finalCounts) return null; + + return { + totalDurationMs: (telemetry.totalDurationMs as number) || 0, + issuesExtracted: finalCounts.issuesExtracted || 0, + issuesAfterDedup: finalCounts.issuesAfterDedup || 0, + issuesAfterFiltering: finalCounts.issuesAfterFiltering || 0, + commentsGenerated: finalCounts.commentsGenerated || 0, + commentsKept: finalCounts.commentsKept || 0, + }; +} diff --git a/meta-evals/src/components/index.ts b/meta-evals/src/components/index.ts index 8bca17dd..b36628db 100644 --- a/meta-evals/src/components/index.ts +++ b/meta-evals/src/components/index.ts @@ -7,5 +7,6 @@ export { CreateBaseline } from "./CreateBaseline"; export { SeriesDetail } from "./SeriesDetail"; export { RankRuns } from "./RankRuns"; export { ScoreRun } from "./ScoreRun"; +export { Validation } from "./Validation"; export * from "./helpers"; export 
* from "./types"; diff --git a/meta-evals/src/components/types.ts b/meta-evals/src/components/types.ts index 838ff68d..bc55d13d 100644 --- a/meta-evals/src/components/types.ts +++ b/meta-evals/src/components/types.ts @@ -10,6 +10,7 @@ export type Screen = | { type: "create-baseline"; step: "document" | "agents" | "confirm" | "creating" } | { type: "series-detail"; seriesId: string } | { type: "rank-runs"; seriesId: string } - | { type: "score-run"; seriesId: string }; + | { type: "score-run"; seriesId: string } + | { type: "validation" }; export type { SeriesSummary, DocumentChoice, AgentChoice }; diff --git a/meta-evals/src/validation/compare.ts b/meta-evals/src/validation/compare.ts new file mode 100644 index 00000000..9e7cbef9 --- /dev/null +++ b/meta-evals/src/validation/compare.ts @@ -0,0 +1,389 @@ +/** + * Comparison Logic for Validation Framework + * + * Compares evaluation snapshots and detects regressions. + */ + +import type { + ComparableComment, + EvaluationSnapshot, + PipelineTelemetrySnapshot, + CommentComparisonResult, + DocumentComparisonResult, + RegressionFlag, + RegressionType, +} from "./types"; +import { REGRESSION_THRESHOLDS } from "./types"; + +/** + * Calculate similarity between two strings using Levenshtein distance. + * Returns a value between 0 (completely different) and 1 (identical). 
+ */ +function stringSimilarity(a: string, b: string): number { + if (a === b) return 1; + if (a.length === 0 || b.length === 0) return 0; + + // Normalize strings for comparison + const normalize = (s: string) => s.toLowerCase().trim(); + const normA = normalize(a); + const normB = normalize(b); + + if (normA === normB) return 1; + + // Calculate Levenshtein distance + const matrix: number[][] = []; + for (let i = 0; i <= normA.length; i++) { + matrix[i] = [i]; + } + for (let j = 0; j <= normB.length; j++) { + matrix[0][j] = j; + } + for (let i = 1; i <= normA.length; i++) { + for (let j = 1; j <= normB.length; j++) { + const cost = normA[i - 1] === normB[j - 1] ? 0 : 1; + matrix[i][j] = Math.min( + matrix[i - 1][j] + 1, // deletion + matrix[i][j - 1] + 1, // insertion + matrix[i - 1][j - 1] + cost // substitution + ); + } + } + + const maxLen = Math.max(normA.length, normB.length); + return 1 - matrix[normA.length][normB.length] / maxLen; +} + +/** + * Check if two comments match based on quoted text. + * Uses fuzzy matching since quoted text might vary slightly between runs. + */ +function commentsMatch( + a: ComparableComment, + b: ComparableComment, + threshold = 0.8 +): { matches: boolean; confidence: number } { + // First try exact match on quoted text + if (a.quotedText === b.quotedText) { + return { matches: true, confidence: 1 }; + } + + // Check if offset ranges overlap significantly + const overlapStart = Math.max(a.startOffset, b.startOffset); + const overlapEnd = Math.min(a.endOffset, b.endOffset); + const overlap = Math.max(0, overlapEnd - overlapStart); + const unionLength = + Math.max(a.endOffset, b.endOffset) - Math.min(a.startOffset, b.startOffset); + const overlapRatio = unionLength > 0 ? 
overlap / unionLength : 0; + + // If offsets overlap significantly, check text similarity + if (overlapRatio > 0.5) { + const textSimilarity = stringSimilarity(a.quotedText, b.quotedText); + if (textSimilarity >= threshold) { + return { matches: true, confidence: textSimilarity }; + } + } + + // Fallback: pure text similarity for comments on same region + const textSimilarity = stringSimilarity(a.quotedText, b.quotedText); + if (textSimilarity >= threshold) { + return { matches: true, confidence: textSimilarity }; + } + + return { matches: false, confidence: textSimilarity }; +} + +/** + * Match comments between baseline and current snapshots. + * Returns matched pairs, new comments, and lost comments. + */ +function matchComments( + baseline: ComparableComment[], + current: ComparableComment[] +): { + matched: CommentComparisonResult[]; + newComments: ComparableComment[]; + lostComments: ComparableComment[]; +} { + const matched: CommentComparisonResult[] = []; + const unmatchedBaseline = new Set(baseline.map((_, i) => i)); + const unmatchedCurrent = new Set(current.map((_, i) => i)); + + // Greedy matching: find best match for each baseline comment + for (let i = 0; i < baseline.length; i++) { + let bestMatch: { index: number; confidence: number } | null = null; + + for (let j = 0; j < current.length; j++) { + if (!unmatchedCurrent.has(j)) continue; + + const result = commentsMatch(baseline[i], current[j]); + if (result.matches) { + if (!bestMatch || result.confidence > bestMatch.confidence) { + bestMatch = { index: j, confidence: result.confidence }; + } + } + } + + if (bestMatch) { + matched.push({ + status: "matched", + baselineComment: baseline[i], + currentComment: current[bestMatch.index], + matchConfidence: bestMatch.confidence, + }); + unmatchedBaseline.delete(i); + unmatchedCurrent.delete(bestMatch.index); + } + } + + // Remaining baseline comments are "lost" + const lostComments = Array.from(unmatchedBaseline).map((i) => baseline[i]); + + // Remaining 
current comments are "new" + const newComments = Array.from(unmatchedCurrent).map((i) => current[i]); + + return { matched, newComments, lostComments }; +} + +/** + * Extract telemetry snapshot from raw pipeline telemetry. + */ +function extractTelemetrySnapshot( + raw: unknown +): PipelineTelemetrySnapshot | null { + if (!raw || typeof raw !== "object") return null; + + const telemetry = raw as Record; + const finalCounts = telemetry.finalCounts as Record | undefined; + + if (!finalCounts) return null; + + return { + totalDurationMs: (telemetry.totalDurationMs as number) || 0, + issuesExtracted: finalCounts.issuesExtracted || 0, + issuesAfterDedup: finalCounts.issuesAfterDedup || 0, + issuesAfterFiltering: finalCounts.issuesAfterFiltering || 0, + commentsGenerated: finalCounts.commentsGenerated || 0, + commentsKept: finalCounts.commentsKept || 0, + }; +} + +/** + * Detect regressions between baseline and current telemetry. + */ +function detectTelemetryRegressions( + baseline: PipelineTelemetrySnapshot | null, + current: PipelineTelemetrySnapshot | null +): RegressionFlag[] { + const regressions: RegressionFlag[] = []; + + if (!baseline || !current) return regressions; + + // Extraction drop + if (baseline.issuesExtracted > 0) { + const extractionDropPercent = + ((baseline.issuesExtracted - current.issuesExtracted) / + baseline.issuesExtracted) * + 100; + + if (extractionDropPercent >= REGRESSION_THRESHOLDS.EXTRACTION_DROP_PERCENT) { + regressions.push({ + type: "extraction_drop", + severity: "error", + message: `Extraction dropped ${extractionDropPercent.toFixed(0)}% (${baseline.issuesExtracted} β†’ ${current.issuesExtracted})`, + details: { + baselineCount: baseline.issuesExtracted, + currentCount: current.issuesExtracted, + dropPercent: extractionDropPercent, + }, + }); + } + } + + // Duration spike + if (baseline.totalDurationMs > 0) { + const durationIncreasePercent = + ((current.totalDurationMs - baseline.totalDurationMs) / + baseline.totalDurationMs) * + 
100; + + if (durationIncreasePercent >= REGRESSION_THRESHOLDS.DURATION_SPIKE_PERCENT) { + regressions.push({ + type: "duration_spike", + severity: "warning", + message: `Duration increased ${durationIncreasePercent.toFixed(0)}% (${baseline.totalDurationMs}ms β†’ ${current.totalDurationMs}ms)`, + details: { + baselineMs: baseline.totalDurationMs, + currentMs: current.totalDurationMs, + increasePercent: durationIncreasePercent, + }, + }); + } + } + + return regressions; +} + +/** + * Compare two evaluation snapshots and detect regressions. + */ +export function compareSnapshots( + baseline: EvaluationSnapshot, + current: EvaluationSnapshot +): DocumentComparisonResult { + // Match comments + const { matched, newComments, lostComments } = matchComments( + baseline.comments, + current.comments + ); + + // Calculate aggregate metrics + const scoreChange = + baseline.grade !== null && current.grade !== null + ? current.grade - baseline.grade + : null; + + const commentCountChange = current.comments.length - baseline.comments.length; + + // Extract telemetry + const baselineTelemetry = extractTelemetrySnapshot(baseline.pipelineTelemetry); + const currentTelemetry = extractTelemetrySnapshot(current.pipelineTelemetry); + + const extractionChange = + baselineTelemetry && currentTelemetry && baselineTelemetry.issuesExtracted > 0 + ? ((currentTelemetry.issuesExtracted - baselineTelemetry.issuesExtracted) / + baselineTelemetry.issuesExtracted) * + 100 + : null; + + const durationChange = + baselineTelemetry && currentTelemetry + ? 
currentTelemetry.totalDurationMs - baselineTelemetry.totalDurationMs + : null; + + // Detect regressions + const regressions: RegressionFlag[] = []; + + // Score drop + if (scoreChange !== null && scoreChange < -REGRESSION_THRESHOLDS.SCORE_DROP) { + regressions.push({ + type: "score_drop", + severity: "error", + message: `Score dropped by ${Math.abs(scoreChange).toFixed(1)} (${baseline.grade} β†’ ${current.grade})`, + details: { + baselineScore: baseline.grade, + currentScore: current.grade, + drop: Math.abs(scoreChange), + }, + }); + } + + // Lost comments threshold + if (baseline.comments.length > 0) { + const lostPercent = + (lostComments.length / baseline.comments.length) * 100; + + if (lostPercent >= REGRESSION_THRESHOLDS.LOST_COMMENTS_PERCENT) { + regressions.push({ + type: "lost_comments", + severity: "error", + message: `Lost ${lostPercent.toFixed(0)}% of comments (${lostComments.length}/${baseline.comments.length})`, + details: { + lostCount: lostComments.length, + baselineCount: baseline.comments.length, + lostPercent, + }, + }); + } + } + + // High-importance comments lost + const highImportanceLost = lostComments.filter( + (c) => + c.importance !== null && + c.importance >= REGRESSION_THRESHOLDS.HIGH_IMPORTANCE_THRESHOLD + ); + + if (highImportanceLost.length > 0) { + regressions.push({ + type: "lost_high_importance", + severity: "error", + message: `Lost ${highImportanceLost.length} high-importance comment(s)`, + details: { + lostComments: highImportanceLost.map((c) => ({ + header: c.header, + importance: c.importance, + quotedText: c.quotedText.slice(0, 50), + })), + }, + }); + } + + // Telemetry regressions + regressions.push( + ...detectTelemetryRegressions(baselineTelemetry, currentTelemetry) + ); + + return { + documentId: baseline.documentId, + documentTitle: baseline.documentTitle, + baseline, + current, + matchedComments: matched, + newComments, + lostComments, + scoreChange, + commentCountChange, + extractionChange, + durationChange, + 
regressions, + }; +} + +/** + * Determine overall status from regressions. + */ +export function getComparisonStatus( + result: DocumentComparisonResult +): "ok" | "warning" | "error" { + const hasError = result.regressions.some((r) => r.severity === "error"); + const hasWarning = result.regressions.some((r) => r.severity === "warning"); + + if (hasError) return "error"; + if (hasWarning) return "warning"; + return "ok"; +} + +/** + * Format a comparison result for display. + */ +export function formatComparisonSummary( + result: DocumentComparisonResult +): string { + const status = getComparisonStatus(result); + const icon = status === "ok" ? "βœ…" : status === "warning" ? "⚠️" : "❌"; + + const parts = [ + `${icon} ${result.documentTitle}`, + ` Comments: ${result.baseline.comments.length} β†’ ${result.current.comments.length}`, + ]; + + if (result.scoreChange !== null) { + const sign = result.scoreChange >= 0 ? "+" : ""; + parts.push(` Score: ${result.baseline.grade} β†’ ${result.current.grade} (${sign}${result.scoreChange.toFixed(1)})`); + } + + if (result.newComments.length > 0) { + parts.push(` New: ${result.newComments.length}`); + } + + if (result.lostComments.length > 0) { + parts.push(` Lost: ${result.lostComments.length}`); + } + + for (const regression of result.regressions) { + const rIcon = regression.severity === "error" ? "πŸ”΄" : "🟑"; + parts.push(` ${rIcon} ${regression.message}`); + } + + return parts.join("\n"); +} diff --git a/meta-evals/src/validation/index.ts b/meta-evals/src/validation/index.ts new file mode 100644 index 00000000..b5db5c37 --- /dev/null +++ b/meta-evals/src/validation/index.ts @@ -0,0 +1,8 @@ +/** + * Validation Framework + * + * Run fallacy checker against corpus, compare results, detect regressions. 
+ */ + +export * from "./types"; +export * from "./compare"; diff --git a/meta-evals/src/validation/types.ts b/meta-evals/src/validation/types.ts new file mode 100644 index 00000000..8ccfd61a --- /dev/null +++ b/meta-evals/src/validation/types.ts @@ -0,0 +1,161 @@ +/** + * Types for Validation Framework + * + * Used to compare pipeline runs and detect regressions. + */ + +/** + * A document selected for validation testing + */ +export interface ValidationDocument { + documentId: string; + title: string; + contentLength: number; + lastEvaluatedAt: Date | null; + evaluationCount: number; +} + +/** + * Simplified comment for comparison purposes + */ +export interface ComparableComment { + id: string; + quotedText: string; + header: string | null; + description: string; + importance: number | null; + startOffset: number; + endOffset: number; +} + +/** + * An evaluation snapshot for comparison + */ +export interface EvaluationSnapshot { + evaluationVersionId: string; + agentId: string; + agentName: string; + createdAt: Date; + documentId: string; + documentTitle: string; + comments: ComparableComment[]; + grade: number | null; + pipelineTelemetry: PipelineTelemetrySnapshot | null; +} + +/** + * Simplified telemetry for comparison + */ +export interface PipelineTelemetrySnapshot { + totalDurationMs: number; + issuesExtracted: number; + issuesAfterDedup: number; + issuesAfterFiltering: number; + commentsGenerated: number; + commentsKept: number; +} + +/** + * Result of comparing a single comment between runs + */ +export interface CommentComparisonResult { + status: "matched" | "new" | "lost"; + baselineComment?: ComparableComment; + currentComment?: ComparableComment; + matchConfidence?: number; // 0-1 for fuzzy matches +} + +/** + * Result of comparing two evaluation snapshots + */ +export interface DocumentComparisonResult { + documentId: string; + documentTitle: string; + baseline: EvaluationSnapshot; + current: EvaluationSnapshot; + + // Comment-level changes + 
matchedComments: CommentComparisonResult[]; + newComments: ComparableComment[]; + lostComments: ComparableComment[]; + + // Aggregate metrics + scoreChange: number | null; // current - baseline (null if either missing) + commentCountChange: number; // current - baseline + + // Pipeline telemetry changes + extractionChange: number | null; // % change in issues extracted + durationChange: number | null; // ms change + + // Regression flags + regressions: RegressionFlag[]; +} + +/** + * A specific regression detected + */ +export interface RegressionFlag { + type: RegressionType; + severity: "warning" | "error"; + message: string; + details?: Record; +} + +export type RegressionType = + | "score_drop" + | "lost_comments" + | "lost_high_importance" + | "extraction_drop" + | "duration_spike"; + +/** + * Thresholds for regression detection + */ +export const REGRESSION_THRESHOLDS = { + // Score drop > 1 point is a regression + SCORE_DROP: 1, + // Losing > 50% of comments is a regression + LOST_COMMENTS_PERCENT: 50, + // Any lost comment with importance > 70 is a regression + HIGH_IMPORTANCE_THRESHOLD: 70, + // Extraction dropping > 30% is a regression + EXTRACTION_DROP_PERCENT: 30, + // Duration increase > 100% is a warning + DURATION_SPIKE_PERCENT: 100, +} as const; + +/** + * Summary of a validation run + */ +export interface ValidationRunSummary { + runId: string; + createdAt: Date; + description: string; + documentCount: number; + + // Aggregate results + noRegressionCount: number; + warningCount: number; + errorCount: number; + + // Can be set as new baseline + canBeBaseline: boolean; +} + +/** + * Full validation run with all comparisons + */ +export interface ValidationRun { + summary: ValidationRunSummary; + comparisons: DocumentComparisonResult[]; +} + +/** + * Input for creating a validation run + */ +export interface CreateValidationRunInput { + description: string; + documentIds: string[]; + agentId: string; + baselineRunId?: string; // If not specified, uses 
most recent for each doc +} From 406bb275555680316f69889adfdffb9751271072 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Wed, 7 Jan 2026 12:47:33 +0000 Subject: [PATCH 13/72] feat: Add baseline management and pipeline execution to validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add ValidationBaseline and ValidationBaselineSnapshot tables - Add repository methods for baseline CRUD - Update Validation UI with baseline management: - Create/delete/select baselines - Run pipeline on baseline documents - Compare new results vs saved baseline - Save results as new baseline - Show change summary: "X kept, +Y new, -Z lost" per document - Use [=] unchanged / [~] changed instead of pass/fail icons - Clarify main menu labels (Score/Rank vs Validation) - Remove emoji from menu items πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .../migration.sql | 48 ++ internal-packages/db/prisma/schema.prisma | 73 ++- .../repositories/MetaEvaluationRepository.ts | 197 ++++++ meta-evals/src/app.tsx | 11 + meta-evals/src/components/MainMenu.tsx | 4 +- meta-evals/src/components/Validation.tsx | 588 +++++++++++++----- 6 files changed, 760 insertions(+), 161 deletions(-) create mode 100644 internal-packages/db/prisma/migrations/20260107122529_add_validation_baselines/migration.sql diff --git a/internal-packages/db/prisma/migrations/20260107122529_add_validation_baselines/migration.sql b/internal-packages/db/prisma/migrations/20260107122529_add_validation_baselines/migration.sql new file mode 100644 index 00000000..669bc146 --- /dev/null +++ b/internal-packages/db/prisma/migrations/20260107122529_add_validation_baselines/migration.sql @@ -0,0 +1,48 @@ +-- CreateTable +CREATE TABLE "public"."ValidationBaseline" ( + "id" TEXT NOT NULL, + "name" TEXT NOT NULL, + "description" TEXT, + "agentId" TEXT NOT NULL, + "commitHash" TEXT, + "createdAt" TIMESTAMP(3) NOT NULL DEFAULT 
CURRENT_TIMESTAMP, + "createdById" TEXT, + + CONSTRAINT "ValidationBaseline_pkey" PRIMARY KEY ("id") +); + +-- CreateTable +CREATE TABLE "public"."ValidationBaselineSnapshot" ( + "id" TEXT NOT NULL, + "baselineId" TEXT NOT NULL, + "evaluationVersionId" TEXT NOT NULL, + + CONSTRAINT "ValidationBaselineSnapshot_pkey" PRIMARY KEY ("id") +); + +-- CreateIndex +CREATE INDEX "ValidationBaseline_agentId_idx" ON "public"."ValidationBaseline"("agentId"); + +-- CreateIndex +CREATE INDEX "ValidationBaseline_createdAt_idx" ON "public"."ValidationBaseline"("createdAt"); + +-- CreateIndex +CREATE INDEX "ValidationBaselineSnapshot_baselineId_idx" ON "public"."ValidationBaselineSnapshot"("baselineId"); + +-- CreateIndex +CREATE INDEX "ValidationBaselineSnapshot_evaluationVersionId_idx" ON "public"."ValidationBaselineSnapshot"("evaluationVersionId"); + +-- CreateIndex +CREATE UNIQUE INDEX "ValidationBaselineSnapshot_baselineId_evaluationVersionId_key" ON "public"."ValidationBaselineSnapshot"("baselineId", "evaluationVersionId"); + +-- AddForeignKey +ALTER TABLE "public"."ValidationBaseline" ADD CONSTRAINT "ValidationBaseline_agentId_fkey" FOREIGN KEY ("agentId") REFERENCES "public"."Agent"("id") ON DELETE CASCADE ON UPDATE CASCADE; + +-- AddForeignKey +ALTER TABLE "public"."ValidationBaseline" ADD CONSTRAINT "ValidationBaseline_createdById_fkey" FOREIGN KEY ("createdById") REFERENCES "public"."User"("id") ON DELETE SET NULL ON UPDATE CASCADE; + +-- AddForeignKey +ALTER TABLE "public"."ValidationBaselineSnapshot" ADD CONSTRAINT "ValidationBaselineSnapshot_baselineId_fkey" FOREIGN KEY ("baselineId") REFERENCES "public"."ValidationBaseline"("id") ON DELETE CASCADE ON UPDATE CASCADE; + +-- AddForeignKey +ALTER TABLE "public"."ValidationBaselineSnapshot" ADD CONSTRAINT "ValidationBaselineSnapshot_evaluationVersionId_fkey" FOREIGN KEY ("evaluationVersionId") REFERENCES "public"."EvaluationVersion"("id") ON DELETE CASCADE ON UPDATE CASCADE; diff --git 
a/internal-packages/db/prisma/schema.prisma b/internal-packages/db/prisma/schema.prisma index 70d24a66..96c75c52 100644 --- a/internal-packages/db/prisma/schema.prisma +++ b/internal-packages/db/prisma/schema.prisma @@ -50,14 +50,15 @@ model User { evalsThisMonth Int @default(0) hourResetAt DateTime? monthResetAt DateTime? - accounts Account[] - agents Agent[] - agentEvalBatches AgentEvalBatch[] - apiKeys ApiKey[] - documents Document[] - sessions Session[] - cancelledJobs Job[] - claimEvaluations ClaimEvaluation[] + accounts Account[] + agents Agent[] + agentEvalBatches AgentEvalBatch[] + apiKeys ApiKey[] + documents Document[] + sessions Session[] + cancelledJobs Job[] + claimEvaluations ClaimEvaluation[] + validationBaselines ValidationBaseline[] } model VerificationToken { @@ -143,12 +144,13 @@ model EvaluationVersion { version Int @default(1) isStale Boolean @default(false) pipelineTelemetry Json? - comments EvaluationComment[] - agentVersion AgentVersion @relation(fields: [agentVersionId], references: [id]) - documentVersion DocumentVersion @relation(fields: [documentVersionId], references: [id], onDelete: Cascade) - evaluation Evaluation @relation(fields: [evaluationId], references: [id], onDelete: Cascade) - job Job? - metaEvaluations MetaEvaluation[] + comments EvaluationComment[] + agentVersion AgentVersion @relation(fields: [agentVersionId], references: [id]) + documentVersion DocumentVersion @relation(fields: [documentVersionId], references: [id], onDelete: Cascade) + evaluation Evaluation @relation(fields: [evaluationId], references: [id], onDelete: Cascade) + job Job? + metaEvaluations MetaEvaluation[] + validationBaselineSnapshots ValidationBaselineSnapshot[] @@unique([evaluationId, version]) @@index([evaluationId]) @@ -195,11 +197,12 @@ model Agent { isDeprecated Boolean @default(false) isRecommended Boolean @default(false) isLlmCostTracked Boolean @default(true) - ephemeralBatch AgentEvalBatch? 
@relation("EphemeralAgent", fields: [ephemeralBatchId], references: [id], onDelete: Cascade) - submittedBy User @relation(fields: [submittedById], references: [id], onDelete: Cascade) - evalBatches AgentEvalBatch[] - versions AgentVersion[] - evaluations Evaluation[] + ephemeralBatch AgentEvalBatch? @relation("EphemeralAgent", fields: [ephemeralBatchId], references: [id], onDelete: Cascade) + submittedBy User @relation(fields: [submittedById], references: [id], onDelete: Cascade) + evalBatches AgentEvalBatch[] + versions AgentVersion[] + evaluations Evaluation[] + validationBaselines ValidationBaseline[] } /// This model contains an expression index which requires additional setup for migrations. Visit https://pris.ly/d/expression-indexes for more info. @@ -440,3 +443,35 @@ model MetaEvaluationDimension { @@index([name]) @@index([score]) } + +/// Validation baseline - a saved reference point for regression testing +model ValidationBaseline { + id String @id @default(cuid()) + name String // "Pre-refactor", "v2.0 release" + description String? + agentId String + commitHash String? // git commit when baseline was created + createdAt DateTime @default(now()) + createdById String? + + agent Agent @relation(fields: [agentId], references: [id], onDelete: Cascade) + createdBy User? 
@relation(fields: [createdById], references: [id]) + snapshots ValidationBaselineSnapshot[] + + @@index([agentId]) + @@index([createdAt]) +} + +/// Links a baseline to specific evaluation versions (one per document) +model ValidationBaselineSnapshot { + id String @id @default(cuid()) + baselineId String + evaluationVersionId String + + baseline ValidationBaseline @relation(fields: [baselineId], references: [id], onDelete: Cascade) + evaluationVersion EvaluationVersion @relation(fields: [evaluationVersionId], references: [id], onDelete: Cascade) + + @@unique([baselineId, evaluationVersionId]) + @@index([baselineId]) + @@index([evaluationVersionId]) +} diff --git a/internal-packages/db/src/repositories/MetaEvaluationRepository.ts b/internal-packages/db/src/repositories/MetaEvaluationRepository.ts index 68c89dcf..4fa08f94 100644 --- a/internal-packages/db/src/repositories/MetaEvaluationRepository.ts +++ b/internal-packages/db/src/repositories/MetaEvaluationRepository.ts @@ -943,6 +943,203 @@ export class MetaEvaluationRepository { })), }; } + + // ========================================================================== + // Validation Baseline Methods + // ========================================================================== + + /** + * Create a new validation baseline from existing evaluation versions. 
+ */ + async createValidationBaseline(input: { + name: string; + description?: string; + agentId: string; + evaluationVersionIds: string[]; + commitHash?: string; + createdById?: string; + }): Promise<{ id: string; name: string; snapshotCount: number }> { + const baseline = await this.prisma.validationBaseline.create({ + data: { + name: input.name, + description: input.description, + agentId: input.agentId, + commitHash: input.commitHash, + createdById: input.createdById, + snapshots: { + create: input.evaluationVersionIds.map((evId) => ({ + evaluationVersionId: evId, + })), + }, + }, + include: { + _count: { select: { snapshots: true } }, + }, + }); + + return { + id: baseline.id, + name: baseline.name, + snapshotCount: baseline._count.snapshots, + }; + } + + /** + * Get all validation baselines for an agent. + */ + async getValidationBaselines(agentId: string): Promise< + Array<{ + id: string; + name: string; + description: string | null; + commitHash: string | null; + createdAt: Date; + snapshotCount: number; + }> + > { + const baselines = await this.prisma.validationBaseline.findMany({ + where: { agentId }, + include: { + _count: { select: { snapshots: true } }, + }, + orderBy: { createdAt: "desc" }, + }); + + return baselines.map((b) => ({ + id: b.id, + name: b.name, + description: b.description, + commitHash: b.commitHash, + createdAt: b.createdAt, + snapshotCount: b._count.snapshots, + })); + } + + /** + * Get evaluation snapshots from a baseline. 
+ */ + async getBaselineSnapshots(baselineId: string): Promise< + Array<{ + evaluationVersionId: string; + agentId: string; + agentName: string; + createdAt: Date; + documentId: string; + documentTitle: string; + grade: number | null; + pipelineTelemetry: unknown; + comments: Array<{ + id: string; + quotedText: string; + header: string | null; + description: string; + importance: number | null; + startOffset: number; + endOffset: number; + }>; + }> + > { + const baseline = await this.prisma.validationBaseline.findUnique({ + where: { id: baselineId }, + include: { + snapshots: { + include: { + evaluationVersion: { + include: { + evaluation: { + include: { + agent: { + include: { + versions: { + orderBy: { version: "desc" }, + take: 1, + select: { name: true }, + }, + }, + }, + document: { + include: { + versions: { + orderBy: { version: "desc" }, + take: 1, + select: { title: true }, + }, + }, + }, + }, + }, + comments: { + include: { + highlight: true, + }, + }, + }, + }, + }, + }, + }, + }); + + if (!baseline) return []; + + return baseline.snapshots.map((s) => { + const ev = s.evaluationVersion; + return { + evaluationVersionId: ev.id, + agentId: ev.agentId, + agentName: ev.evaluation.agent.versions[0]?.name || ev.agentId, + createdAt: ev.createdAt, + documentId: ev.evaluation.documentId, + documentTitle: ev.evaluation.document.versions[0]?.title || "Unknown", + grade: ev.grade, + pipelineTelemetry: ev.pipelineTelemetry, + comments: ev.comments.map((c) => ({ + id: c.id, + quotedText: c.highlight.quotedText, + header: c.header, + description: c.description, + importance: c.importance, + startOffset: c.highlight.startOffset, + endOffset: c.highlight.endOffset, + })), + }; + }); + } + + /** + * Delete a validation baseline. + */ + async deleteValidationBaseline(baselineId: string): Promise { + await this.prisma.validationBaseline.delete({ + where: { id: baselineId }, + }); + } + + /** + * Get document IDs from a baseline (for running new evaluations). 
+ */ + async getBaselineDocumentIds(baselineId: string): Promise { + const baseline = await this.prisma.validationBaseline.findUnique({ + where: { id: baselineId }, + include: { + snapshots: { + include: { + evaluationVersion: { + include: { + evaluation: { + select: { documentId: true }, + }, + }, + }, + }, + }, + }, + }); + + if (!baseline) return []; + + return [...new Set(baseline.snapshots.map((s) => s.evaluationVersion.evaluation.documentId))]; + } } // Default instance for convenience diff --git a/meta-evals/src/app.tsx b/meta-evals/src/app.tsx index 73f06450..e45e9f8a 100644 --- a/meta-evals/src/app.tsx +++ b/meta-evals/src/app.tsx @@ -345,6 +345,17 @@ export function App() { height={termHeight} maxItems={maxListItems} onBack={loadMainMenu} + onCreateBatch={async (agentId, documentIds) => { + // Create batch jobs for the agent on selected documents + const response = await apiClient.post("/api/batches", { + agentId, + documentIds, + name: `Validation run`, + }); + + // Get job IDs from the batch + return await getJobsForBatch(response.batch.id); + }} /> ); } diff --git a/meta-evals/src/components/MainMenu.tsx b/meta-evals/src/components/MainMenu.tsx index f13fb4aa..3dce7050 100644 --- a/meta-evals/src/components/MainMenu.tsx +++ b/meta-evals/src/components/MainMenu.tsx @@ -209,8 +209,8 @@ export function MainMenu({ label: `${truncate(s.documentTitle, 40)} | ${s.runCount} runs | ${s.agentNames.slice(0, 2).join(", ")}`, value: s.id || `series-${idx}`, // Fallback key })), - { label: "+ Create New Baseline", value: "create" }, - { label: "πŸ” Validation (Compare Runs)", value: "validation" }, + { label: "+ New Series (for Score/Rank)", value: "create" }, + { label: "+ Validation (Regression Testing)", value: "validation" }, { label: "Exit", value: "exit" }, ]; diff --git a/meta-evals/src/components/Validation.tsx b/meta-evals/src/components/Validation.tsx index 5a8ff399..9ec90f26 100644 --- a/meta-evals/src/components/Validation.tsx +++ 
b/meta-evals/src/components/Validation.tsx @@ -2,10 +2,17 @@ * Validation Screen Component * * Compare pipeline runs and detect regressions. + * + * Flow: + * 1. Select/create a baseline (saved evaluation snapshots) + * 2. Run pipeline on baseline documents with current code + * 3. Compare new results vs baseline + * 4. View regressions */ import React, { useState, useEffect } from "react"; import { Box, Text, useInput } from "ink"; +import TextInput from "ink-text-input"; import SelectInput from "ink-select-input"; import Spinner from "ink-spinner"; import { metaEvaluationRepository, type AgentChoice } from "@roast/db"; @@ -19,45 +26,75 @@ import { getComparisonStatus, } from "../validation"; -type Tab = "corpus" | "compare" | "results"; +type Tab = "baselines" | "run" | "results"; interface ValidationProps { height: number; maxItems: number; onBack: () => void; + onCreateBatch: (agentId: string, documentIds: string[]) => Promise; // Returns job IDs +} + +interface Baseline { + id: string; + name: string; + description: string | null; + commitHash: string | null; + createdAt: Date; + snapshotCount: number; } interface CorpusDocument extends ValidationDocument { selected: boolean; } -export function Validation({ height, maxItems, onBack }: ValidationProps) { - const [activeTab, setActiveTab] = useState("corpus"); +export function Validation({ height, maxItems, onBack, onCreateBatch }: ValidationProps) { + const [activeTab, setActiveTab] = useState("baselines"); const [loading, setLoading] = useState(true); const [error, setError] = useState(null); - // Data + // Agent state const [agents, setAgents] = useState([]); const [selectedAgent, setSelectedAgent] = useState(null); + + // Baseline state + const [baselines, setBaselines] = useState([]); + const [selectedBaseline, setSelectedBaseline] = useState(null); + const [creatingBaseline, setCreatingBaseline] = useState(false); + const [newBaselineName, setNewBaselineName] = useState(""); + + // Corpus state (for 
creating new baseline) const [corpusDocuments, setCorpusDocuments] = useState([]); - const [comparisons, setComparisons] = useState([]); + const [showCorpusSelect, setShowCorpusSelect] = useState(false); + + // Run state const [isRunning, setIsRunning] = useState(false); - const [progress, setProgress] = useState({ current: 0, total: 0 }); + const [runProgress, setRunProgress] = useState({ phase: "", current: 0, total: 0 }); + + // Results state + const [comparisons, setComparisons] = useState([]); + const [savingBaseline, setSavingBaseline] = useState(false); + const [saveBaselineName, setSaveBaselineName] = useState(""); // Keyboard handling useInput((input, key) => { if (key.escape) { - if (activeTab !== "corpus") { - setActiveTab("corpus"); + if (creatingBaseline) { + setCreatingBaseline(false); + setShowCorpusSelect(false); + } else if (savingBaseline) { + setSavingBaseline(false); + } else if (activeTab !== "baselines") { + setActiveTab("baselines"); } else { onBack(); } } - if (key.tab) { + if (key.tab && !creatingBaseline && !savingBaseline) { setActiveTab((prev) => { - if (prev === "corpus") return "compare"; - if (prev === "compare") return comparisons.length > 0 ? "results" : "corpus"; - return "corpus"; + if (prev === "baselines") return "run"; + if (prev === "run") return comparisons.length > 0 ? 
"results" : "baselines"; + return "baselines"; }); } }); @@ -67,9 +104,10 @@ export function Validation({ height, maxItems, onBack }: ValidationProps) { loadAgents(); }, []); - // Load corpus when agent selected + // Load baselines when agent selected useEffect(() => { if (selectedAgent) { + loadBaselines(selectedAgent.id); loadCorpus(selectedAgent.id); } }, [selectedAgent?.id]); @@ -77,8 +115,6 @@ export function Validation({ height, maxItems, onBack }: ValidationProps) { async function loadAgents() { try { setLoading(true); - // Get agents that use fallacy-check plugin - // Note: pluginIds are stored as lowercase strings (e.g., "fallacy-check") const { prisma } = await import("@roast/db"); const fallacyAgents = await prisma.agent.findMany({ where: { @@ -86,9 +122,7 @@ export function Validation({ height, maxItems, onBack }: ValidationProps) { ephemeralBatchId: null, versions: { some: { - pluginIds: { - has: "fallacy-check", - }, + pluginIds: { has: "fallacy-check" }, }, }, }, @@ -121,20 +155,61 @@ export function Validation({ height, maxItems, onBack }: ValidationProps) { } } + async function loadBaselines(agentId: string) { + try { + const data = await metaEvaluationRepository.getValidationBaselines(agentId); + setBaselines(data); + if (data.length > 0 && !selectedBaseline) { + setSelectedBaseline(data[0]); + } + } catch (e) { + setError(String(e)); + } + } + async function loadCorpus(agentId: string) { try { - setLoading(true); const docs = await metaEvaluationRepository.getValidationCorpusDocuments( agentId, { limit: 50, minContentLength: 200 } ); + setCorpusDocuments(docs.map((d) => ({ ...d, selected: true }))); + } catch (e) { + setError(String(e)); + } + } - setCorpusDocuments( - docs.map((d) => ({ - ...d, - selected: true, // Select all by default - })) + async function createBaseline() { + if (!selectedAgent || !newBaselineName.trim()) return; + + const selectedDocs = corpusDocuments.filter((d) => d.selected); + if (selectedDocs.length === 0) return; + + 
try { + setLoading(true); + + // Get current evaluation version IDs for selected documents + const snapshots = await metaEvaluationRepository.getEvaluationSnapshots( + selectedDocs.map((d) => d.documentId), + selectedAgent.id ); + + const result = await metaEvaluationRepository.createValidationBaseline({ + name: newBaselineName.trim(), + agentId: selectedAgent.id, + evaluationVersionIds: snapshots.map((s) => s.evaluationVersionId), + }); + + // Reload baselines + await loadBaselines(selectedAgent.id); + + // Select the new baseline + const newBaseline = baselines.find((b) => b.id === result.id); + if (newBaseline) setSelectedBaseline(newBaseline); + + setCreatingBaseline(false); + setShowCorpusSelect(false); + setNewBaselineName(""); setLoading(false); } catch (e) { setError(String(e)); @@ -142,46 +217,95 @@ export function Validation({ height, maxItems, onBack }: ValidationProps) { } } + async function deleteBaseline(baselineId: string) { + try { + await metaEvaluationRepository.deleteValidationBaseline(baselineId); + if (selectedAgent) { + await loadBaselines(selectedAgent.id); + } + if (selectedBaseline?.id === baselineId) { + setSelectedBaseline(baselines[0] || null); + } + } catch (e) { + setError(String(e)); + } + } + async function runValidation() { - if (!selectedAgent) return; - const selectedDocs = corpusDocuments.filter((d) => d.selected); - if (selectedDocs.length === 0) return; + if (!selectedAgent || !selectedBaseline) return; setIsRunning(true); - setProgress({ current: 0, total: selectedDocs.length }); - setActiveTab("compare"); + setActiveTab("run"); + setComparisons([]); try { - // Get baseline snapshots (most recent evaluations) - const baselineSnapshots = await metaEvaluationRepository.getEvaluationSnapshots( - selectedDocs.map((d) => d.documentId), - selectedAgent.id - ); + // Phase 1: Get baseline snapshots + setRunProgress({ phase: "Loading baseline...", current: 0, total: 0 }); + const baselineSnapshots = await 
metaEvaluationRepository.getBaselineSnapshots(selectedBaseline.id); - // For now, we compare baseline with itself (to test the UI) - // In real use, we'd run the pipeline again and compare - const results: DocumentComparisonResult[] = []; + if (baselineSnapshots.length === 0) { + throw new Error("Baseline has no snapshots"); + } - for (const snapshot of baselineSnapshots) { - setProgress((p) => ({ ...p, current: p.current + 1 })); - - // Convert to EvaluationSnapshot format - const baselineEval: EvaluationSnapshot = { - evaluationVersionId: snapshot.evaluationVersionId, - agentId: snapshot.agentId, - agentName: snapshot.agentName, - createdAt: snapshot.createdAt, - documentId: snapshot.documentId, - documentTitle: snapshot.documentTitle, - comments: snapshot.comments, - grade: snapshot.grade, - pipelineTelemetry: extractTelemetry(snapshot.pipelineTelemetry), - }; + // Phase 2: Run pipeline on documents + setRunProgress({ phase: "Running pipeline...", current: 0, total: baselineSnapshots.length }); + const documentIds = [...new Set(baselineSnapshots.map((s) => s.documentId))]; + + // Create batch jobs + const jobIds = await onCreateBatch(selectedAgent.id, documentIds); - // For demo, use same snapshot as "current" - // In real use, this would be from a new pipeline run - const comparison = compareSnapshots(baselineEval, baselineEval); - results.push(comparison); + // Phase 3: Wait for jobs to complete and get results + setRunProgress({ phase: "Waiting for jobs...", current: 0, total: jobIds.length }); + + // Poll for job completion + const { prisma } = await import("@roast/db"); + let completed = 0; + const maxWaitMs = 5 * 60 * 1000; // 5 minutes + const startTime = Date.now(); + + while (completed < jobIds.length && Date.now() - startTime < maxWaitMs) { + await new Promise((r) => setTimeout(r, 2000)); // Poll every 2s + + const jobs = await prisma.job.findMany({ + where: { id: { in: jobIds } }, + select: { id: true, status: true, evaluationVersionId: true }, + }); 
+ + completed = jobs.filter((j) => j.status === "COMPLETED" || j.status === "FAILED").length; + setRunProgress({ phase: "Waiting for jobs...", current: completed, total: jobIds.length }); + } + + // Phase 4: Get new evaluation versions and compare + setRunProgress({ phase: "Comparing results...", current: 0, total: baselineSnapshots.length }); + + const jobs = await prisma.job.findMany({ + where: { id: { in: jobIds }, status: "COMPLETED" }, + select: { evaluationVersionId: true }, + }); + + const newVersionIds = jobs + .map((j) => j.evaluationVersionId) + .filter((id): id is string => id !== null); + + // Get new snapshots + const newSnapshots = await Promise.all( + newVersionIds.map((id) => metaEvaluationRepository.getEvaluationSnapshotById(id)) + ); + + // Compare + const results: DocumentComparisonResult[] = []; + for (const baselineSnapshot of baselineSnapshots) { + const newSnapshot = newSnapshots.find( + (s) => s && s.documentId === baselineSnapshot.documentId + ); + + if (newSnapshot) { + const baselineEval = toEvaluationSnapshot(baselineSnapshot); + const currentEval = toEvaluationSnapshot(newSnapshot); + results.push(compareSnapshots(baselineEval, currentEval)); + } + + setRunProgress((p) => ({ ...p, current: p.current + 1 })); } setComparisons(results); @@ -193,30 +317,51 @@ export function Validation({ height, maxItems, onBack }: ValidationProps) { } } + async function saveResultsAsBaseline() { + if (!selectedAgent || !saveBaselineName.trim() || comparisons.length === 0) return; + + try { + setSavingBaseline(false); + setLoading(true); + + // Get the "current" evaluation version IDs from comparisons + const evalVersionIds = comparisons.map((c) => c.current.evaluationVersionId); + + await metaEvaluationRepository.createValidationBaseline({ + name: saveBaselineName.trim(), + agentId: selectedAgent.id, + evaluationVersionIds: evalVersionIds, + }); + + await loadBaselines(selectedAgent.id); + setSaveBaselineName(""); + setLoading(false); + } catch (e) { + 
setError(String(e)); + setLoading(false); + } + } + function toggleDocument(docId: string) { setCorpusDocuments((docs) => - docs.map((d) => - d.documentId === docId ? { ...d, selected: !d.selected } : d - ) + docs.map((d) => (d.documentId === docId ? { ...d, selected: !d.selected } : d)) ); } function toggleAll() { const allSelected = corpusDocuments.every((d) => d.selected); - setCorpusDocuments((docs) => - docs.map((d) => ({ ...d, selected: !allSelected })) - ); + setCorpusDocuments((docs) => docs.map((d) => ({ ...d, selected: !allSelected }))); } // Render tabs header const renderTabs = () => ( - - [Corpus] + + [Baselines] - - [Compare] + + [Run] @@ -238,18 +383,141 @@ export function Validation({ height, maxItems, onBack }: ValidationProps) { if (loading) { return ( - - Loading... - + Loading... + + ); + } + + // Creating baseline - corpus selection + if (creatingBaseline && showCorpusSelect) { + const selectedCount = corpusDocuments.filter((d) => d.selected).length; + const items = [ + { label: `[${selectedCount === corpusDocuments.length ? "x" : " "}] Select All (${corpusDocuments.length})`, value: "toggle-all" }, + ...corpusDocuments.slice(0, maxItems - 4).map((d) => ({ + label: `[${d.selected ? "x" : " "}] ${truncate(d.title, 50)}`, + value: d.documentId, + })), + { label: selectedCount > 0 ? 
`βœ“ Create Baseline (${selectedCount} docs)` : "Select documents first", value: "create" }, + { label: "← Cancel", value: "cancel" }, + ]; + + return ( + + + Select documents to include in baseline + + + { + if (item.value === "cancel") { + setShowCorpusSelect(false); + setCreatingBaseline(false); + } else if (item.value === "toggle-all") { + toggleAll(); + } else if (item.value === "create" && selectedCount > 0) { + createBaseline(); + } else { + toggleDocument(item.value); + } + }} + /> + + ); + } + + // Creating baseline - name input + if (creatingBaseline) { + return ( + + + Enter a name for this baseline (e.g., "Pre-refactor", "v2.0") + + + + Name: + { + if (newBaselineName.trim()) { + setShowCorpusSelect(true); + } + }} + /> + + + + Enter Continue | Escape Cancel + + + ); + } + + // Saving results as baseline + if (savingBaseline) { + return ( + + + Save current results as a new baseline for future comparisons + + + + Name: + { + if (saveBaselineName.trim()) { + saveResultsAsBaseline(); + } + }} + /> + + + + Enter Save | Escape Cancel + ); } // Results tab - if (activeTab === "results") { - const okCount = comparisons.filter((c) => getComparisonStatus(c) === "ok").length; - const warningCount = comparisons.filter((c) => getComparisonStatus(c) === "warning").length; - const errorCount = comparisons.filter((c) => getComparisonStatus(c) === "error").length; + if (activeTab === "results" && comparisons.length > 0) { + // Count by change status + const unchangedCount = comparisons.filter((c) => + c.newComments.length === 0 && c.lostComments.length === 0 + ).length; + const changedCount = comparisons.length - unchangedCount; + + // Format change summary for a comparison + const formatChangeSummary = (c: DocumentComparisonResult) => { + const parts: string[] = []; + const kept = c.matchedComments.length; + const added = c.newComments.length; + const lost = c.lostComments.length; + + if (kept > 0) parts.push(`${kept} kept`); + if (added > 0) parts.push(`+${added} 
new`); + if (lost > 0) parts.push(`-${lost} lost`); + + return parts.length > 0 ? parts.join(", ") : "no comments"; + }; + + const items = [ + ...comparisons.slice(0, maxItems - 4).map((c) => { + const hasChanges = c.newComments.length > 0 || c.lostComments.length > 0; + const icon = hasChanges ? "~" : "="; + const color = hasChanges ? "yellow" : "green"; + + return { + label: `[${icon}] ${truncate(c.documentTitle, 35)} | ${formatChangeSummary(c)}`, + value: c.documentId, + }; + }), + { label: "+ Save as New Baseline", value: "save" }, + { label: "← Back to Baselines", value: "back" }, + ]; return ( @@ -257,94 +525,100 @@ export function Validation({ height, maxItems, onBack }: ValidationProps) { - βœ… {okCount} - {" | "} - ⚠️ {warningCount} + [=] {unchangedCount} unchanged {" | "} - ❌ {errorCount} + [~] {changedCount} changed {" | "} - Total: {comparisons.length} + Baseline: {selectedBaseline?.name || "?"} - - {comparisons.slice(0, maxItems - 5).map((c, i) => { - const status = getComparisonStatus(c); - const icon = status === "ok" ? "βœ…" : status === "warning" ? "⚠️" : "❌"; - const color = status === "ok" ? "green" : status === "warning" ? "yellow" : "red"; - - return ( - - - {icon} {truncate(c.documentTitle, 50)} - - - {" "}| {c.baseline.comments.length} β†’ {c.current.comments.length} comments - - - ); - })} - - - - Escape Go back | Tab Switch tabs - + { + if (item.value === "save") { + setSavingBaseline(true); + setSaveBaselineName(`Post-${selectedBaseline?.name || "run"}`); + } else if (item.value === "back") { + setActiveTab("baselines"); + } + // TODO: Show detail view for specific document + }} + /> ); } - // Compare tab (running) - if (activeTab === "compare") { + // Run tab + if (activeTab === "run") { return ( - + {renderTabs()} {isRunning ? ( - - - Running validation... - - - {progress.current}/{progress.total} documents - + + {runProgress.phase} + {runProgress.total > 0 && ( + {runProgress.current}/{runProgress.total} + )} + + ) : selectedBaseline ? 
( + + + + Baseline: {selectedBaseline.name} + {" "}({selectedBaseline.snapshotCount} docs) + + + + { + if (item.value === "run") runValidation(); + else setActiveTab("baselines"); + }} + /> ) : ( - Select documents and run validation from the Corpus tab. + No baseline selected. Create or select one first. + setActiveTab("baselines")} + /> )} - - - Escape Go back | Tab Switch tabs - ); } - // Corpus tab (default) - const selectedCount = corpusDocuments.filter((d) => d.selected).length; + // Baselines tab (default) const items = [ - ...(agents.length > 1 - ? [{ label: `Agent: ${selectedAgent?.name || "Select..."}`, value: "agent" }] - : []), - { label: `[${selectedCount === corpusDocuments.length ? "x" : " "}] Select All (${corpusDocuments.length} docs)`, value: "toggle-all" }, - ...corpusDocuments.slice(0, maxItems - 5).map((d) => ({ - label: `[${d.selected ? "x" : " "}] ${truncate(d.title, 45)} (${d.evaluationCount} evals)`, - value: d.documentId, + { label: "+ Create New Baseline", value: "create" }, + ...baselines.map((b) => ({ + label: `${selectedBaseline?.id === b.id ? "● " : "β—‹ "}${b.name} (${b.snapshotCount} docs)`, + value: `select:${b.id}`, })), - { label: selectedCount > 0 ? `β–Ά Run Validation (${selectedCount} selected)` : "β–Ά Run Validation (select docs first)", value: "run" }, - { label: "← Back", value: "back" }, + ...(selectedBaseline ? 
[{ label: "- Delete Selected Baseline", value: "delete" }] : []), + { label: "← Back to Main Menu", value: "back" }, ]; return ( - + {renderTabs()} Agent: {selectedAgent?.name || "None"} - {" | "} - Selected: {selectedCount}/{corpusDocuments.length} + {selectedBaseline && ( + <> + {" | "} + Selected: {selectedBaseline.name} + + )} @@ -353,27 +627,61 @@ export function Validation({ height, maxItems, onBack }: ValidationProps) { onSelect={(item) => { if (item.value === "back") { onBack(); - } else if (item.value === "toggle-all") { - toggleAll(); - } else if (item.value === "run") { - if (selectedCount > 0) { - runValidation(); - } - } else if (item.value === "agent") { - // TODO: Agent selection UI - } else { - toggleDocument(item.value); + } else if (item.value === "create") { + setCreatingBaseline(true); + setNewBaselineName(""); + } else if (item.value === "delete" && selectedBaseline) { + deleteBaseline(selectedBaseline.id); + } else if (item.value.startsWith("select:")) { + const baselineId = item.value.replace("select:", ""); + const baseline = baselines.find((b) => b.id === baselineId); + if (baseline) setSelectedBaseline(baseline); } }} /> - Enter Toggle/Select | Tab Switch tabs | Escape Go back + Enter Select | Tab β†’ Run | Escape Back ); } +/** + * Convert repository snapshot to EvaluationSnapshot type. 
+ */ +function toEvaluationSnapshot(snapshot: { + evaluationVersionId: string; + agentId: string; + agentName: string; + createdAt: Date; + documentId: string; + documentTitle: string; + grade: number | null; + pipelineTelemetry: unknown; + comments: Array<{ + id: string; + quotedText: string; + header: string | null; + description: string; + importance: number | null; + startOffset: number; + endOffset: number; + }>; +}): EvaluationSnapshot { + return { + evaluationVersionId: snapshot.evaluationVersionId, + agentId: snapshot.agentId, + agentName: snapshot.agentName, + createdAt: snapshot.createdAt, + documentId: snapshot.documentId, + documentTitle: snapshot.documentTitle, + comments: snapshot.comments, + grade: snapshot.grade, + pipelineTelemetry: extractTelemetry(snapshot.pipelineTelemetry), + }; +} + /** * Extract pipeline telemetry snapshot from raw data. */ From f4b531baf34046fa8abd63e83681447f4ca9c30f Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Wed, 7 Jan 2026 12:55:20 +0000 Subject: [PATCH 14/72] refactor: Restructure main menu as clean router MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - MainMenu now only has 4 options: Score/Rank, Validation, Settings, Exit - Created ScoreRankMenu component with series list, create, delete - Settings remains as modal overlay in MainMenu - Updated App.tsx routing for new screen structure - Navigation: SeriesDetail and CreateBaseline now return to ScoreRankMenu πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- meta-evals/src/app.tsx | 45 +++-- meta-evals/src/components/MainMenu.tsx | 200 ++++---------------- meta-evals/src/components/ScoreRankMenu.tsx | 169 +++++++++++++++++ meta-evals/src/components/index.ts | 1 + meta-evals/src/components/types.ts | 3 +- 5 files changed, 234 insertions(+), 184 deletions(-) create mode 100644 meta-evals/src/components/ScoreRankMenu.tsx diff --git a/meta-evals/src/app.tsx 
b/meta-evals/src/app.tsx index e45e9f8a..1df9594d 100644 --- a/meta-evals/src/app.tsx +++ b/meta-evals/src/app.tsx @@ -12,7 +12,7 @@ import { type AgentChoice, } from "@roast/db"; import { apiClient } from "./utils/apiClient"; -import { MainMenu, CreateBaseline, SeriesDetail, RankRuns, ScoreRun, Validation, type Screen } from "./components"; +import { MainMenu, ScoreRankMenu, CreateBaseline, SeriesDetail, RankRuns, ScoreRun, Validation, type Screen } from "./components"; import { getAvailableModels, getRecommendedJudgeModels, DEFAULT_JUDGE_MODEL, type ModelInfo } from "./utils/models"; // ============================================================================ @@ -128,8 +128,8 @@ export function App() { // Load initial data useEffect(() => { - loadMainMenu(); loadModels(); + setScreen({ type: "main-menu" }); }, []); async function loadModels() { @@ -144,10 +144,14 @@ export function App() { } async function loadMainMenu() { + setScreen({ type: "main-menu" }); + } + + async function loadScoreRankMenu() { setScreen({ type: "loading" }); try { const series = await metaEvaluationRepository.getSeries(); - setScreen({ type: "main-menu", series }); + setScreen({ type: "score-rank-menu", series }); } catch (e) { setError(String(e)); } @@ -219,16 +223,8 @@ export function App() { if (screen.type === "main-menu") { return ( setScreen({ type: "series-detail", seriesId: id })} - onDeleteSeries={async (id) => { - await metaEvaluationRepository.deleteSeries(id); - // Reload the menu - loadMainMenu(); - }} + onScoreRank={loadScoreRankMenu} onValidation={() => setScreen({ type: "validation" })} onExit={exit} judgeModel={judgeModel} @@ -242,6 +238,25 @@ export function App() { ); } + if (screen.type === "score-rank-menu") { + return ( + setScreen({ type: "series-detail", seriesId: id })} + onDeleteSeries={async (id) => { + await metaEvaluationRepository.deleteSeries(id); + // Reload the menu + loadScoreRankMenu(); + }} + onBack={loadMainMenu} + /> + ); + } + if (screen.type === 
"create-baseline") { return ( ); } @@ -287,7 +302,7 @@ export function App() { seriesId={screen.seriesId} maxItems={maxListItems} height={termHeight} - onBack={loadMainMenu} + onBack={loadScoreRankMenu} onRunAgain={async (seriesId, documentId) => { try { await runAgain(seriesId, documentId); diff --git a/meta-evals/src/components/MainMenu.tsx b/meta-evals/src/components/MainMenu.tsx index 3dce7050..ddb986e0 100644 --- a/meta-evals/src/components/MainMenu.tsx +++ b/meta-evals/src/components/MainMenu.tsx @@ -1,12 +1,10 @@ /** - * Main Menu Screen Component + * Main Menu Screen Component - Clean Router */ import React, { useState } from "react"; import { Box, Text, useInput } from "ink"; import SelectInput from "ink-select-input"; -import type { SeriesSummary } from "./types"; -import { truncate } from "./helpers"; interface ModelInfo { id: string; @@ -14,12 +12,8 @@ interface ModelInfo { } interface MainMenuProps { - series: SeriesSummary[]; - maxItems: number; height: number; - onCreateBaseline: () => void; - onSelectSeries: (id: string) => void; - onDeleteSeries: (id: string) => Promise; + onScoreRank: () => void; onValidation: () => void; onExit: () => void; judgeModel: string; @@ -35,12 +29,8 @@ const TEMPERATURE_OPTIONS = [0, 0.3, 0.5, 0.7, 1.0, 1.5, 2.0]; const MAX_TOKENS_OPTIONS = [2048, 4096, 8192, 16384, 32768]; export function MainMenu({ - series, - maxItems, height, - onCreateBaseline, - onSelectSeries, - onDeleteSeries, + onScoreRank, onValidation, onExit, judgeModel, @@ -51,71 +41,21 @@ export function MainMenu({ maxTokens, onSetMaxTokens, }: MainMenuProps) { - const [activeTab, setActiveTab] = useState<"series" | "settings">("series"); + const [showSettings, setShowSettings] = useState(false); const [settingsSection, setSettingsSection] = useState<"model" | "temperature" | "maxTokens">("model"); - const [highlightedIndex, setHighlightedIndex] = useState(0); - const [confirmDelete, setConfirmDelete] = useState(null); - const [isDeleting, setIsDeleting] = 
useState(false); - - // Limit series shown, reserve 2 slots for create/exit - const visibleSeries = series.slice(0, maxItems - 2); // Handle keyboard input useInput((input, key) => { - if (key.tab) { - setActiveTab((prev) => (prev === "series" ? "settings" : "series")); - setConfirmDelete(null); - } - - // Delete with 'd' key (only in series tab) - if (activeTab === "series" && input === "d" && !confirmDelete && !isDeleting) { - const selectedSeries = visibleSeries[highlightedIndex]; - if (selectedSeries) { - setConfirmDelete(selectedSeries.id); - } - } - - // Confirm delete with 'y' - if (confirmDelete && input === "y" && !isDeleting) { - setIsDeleting(true); - onDeleteSeries(confirmDelete).finally(() => { - setConfirmDelete(null); - setIsDeleting(false); - }); - } - - // Cancel delete with 'n' or Escape - if (confirmDelete && (input === "n" || key.escape)) { - setConfirmDelete(null); + if (key.escape && showSettings) { + setShowSettings(false); } }); // Get display name for current model const currentModelName = availableModels.find((m) => m.id === judgeModel)?.displayName || judgeModel; - // Render tabs header - const renderTabs = () => ( - - - [Series] - - - - [Settings] - - (Tab to switch) - - ); - - // Settings tab - if (activeTab === "settings") { - // Build items based on current section + // Settings panel + if (showSettings) { let settingsItems: { label: string; value: string }[] = []; let sectionTitle = ""; @@ -128,7 +68,7 @@ export function MainMenu({ })), { label: "-> Temperature", value: "goto:temperature" }, { label: "-> Max Tokens", value: "goto:maxTokens" }, - { label: "<- Back to Series", value: "back" }, + { label: "<- Back", value: "back" }, ]; } else if (settingsSection === "temperature") { sectionTitle = "Temperature"; @@ -158,8 +98,6 @@ export function MainMenu({ - {renderTabs()} - For Score/Rank AI judge: @@ -181,7 +119,7 @@ export function MainMenu({ items={settingsItems} onSelect={(item) => { if (item.value === "back") { - 
setActiveTab("series"); + setShowSettings(false); } else if (item.value.startsWith("goto:")) { setSettingsSection(item.value.replace("goto:", "") as "model" | "temperature" | "maxTokens"); } else if (item.value.startsWith("model:")) { @@ -195,28 +133,20 @@ export function MainMenu({ /> - Tab Switch | Up/Down Navigate | Enter Select | q Quit + Up/Down Navigate | Enter Select | Escape Back ); } - // Series tab (default) + // Main menu items const items = [ - ...visibleSeries - .filter((s) => s.id) // Ensure valid IDs - .map((s, idx) => ({ - label: `${truncate(s.documentTitle, 40)} | ${s.runCount} runs | ${s.agentNames.slice(0, 2).join(", ")}`, - value: s.id || `series-${idx}`, // Fallback key - })), - { label: "+ New Series (for Score/Rank)", value: "create" }, - { label: "+ Validation (Regression Testing)", value: "validation" }, + { label: "Score/Rank", value: "score-rank" }, + { label: "Validation", value: "validation" }, + { label: "Settings", value: "settings" }, { label: "Exit", value: "exit" }, ]; - // Find series being deleted for confirmation message - const deletingSeries = confirmDelete ? visibleSeries.find((s) => s.id === confirmDelete) : null; - return ( @@ -225,93 +155,27 @@ export function MainMenu({ - {renderTabs()} - - {/* Delete confirmation modal - replaces content when active */} - {confirmDelete && deletingSeries ? ( - - - - - ⚠ Confirm Delete ⚠ - - - - - Are you sure you want to delete this series? - - - - "{truncate(deletingSeries.documentTitle, 45)}" - - - - {deletingSeries.runCount} run{deletingSeries.runCount !== 1 ? "s" : ""} will be removed. - - - - {isDeleting ? ( - Deleting... - ) : ( - - Y - Delete - N - Cancel - - )} - - + + + Compare and evaluate agent outputs + + Judge: {currentModelName} + - ) : ( - <> - - - - {series.length === 0 - ? "No evaluation series yet. Create a baseline to get started." - : visibleSeries.length < series.length - ? 
`Showing ${visibleSeries.length} of ${series.length} series` - : `${series.length} series available`} - - - Score/Rank Judge: {currentModelName} - {" "}(Tab β†’ Settings to change) - - - + - { - const idx = visibleSeries.findIndex((s) => s.id === item.value); - if (idx >= 0) setHighlightedIndex(idx); - }} - onSelect={(item) => { - if (confirmDelete) return; // Ignore selection during delete confirmation - if (item.value === "exit") onExit(); - else if (item.value === "create") onCreateBaseline(); - else if (item.value === "validation") onValidation(); - else onSelectSeries(item.value); - }} - /> - - )} + { + if (item.value === "exit") onExit(); + else if (item.value === "score-rank") onScoreRank(); + else if (item.value === "validation") onValidation(); + else if (item.value === "settings") setShowSettings(true); + }} + /> - - {confirmDelete ? "Y Delete | N Cancel" : "Tab Switch | d Delete | Enter Select | q Quit"} - + Up/Down Navigate | Enter Select | q Quit ); diff --git a/meta-evals/src/components/ScoreRankMenu.tsx b/meta-evals/src/components/ScoreRankMenu.tsx new file mode 100644 index 00000000..6724374a --- /dev/null +++ b/meta-evals/src/components/ScoreRankMenu.tsx @@ -0,0 +1,169 @@ +/** + * Score/Rank Menu Screen + * + * Shows series list for scoring and ranking agent outputs. 
+ */ + +import React, { useState } from "react"; +import { Box, Text, useInput } from "ink"; +import SelectInput from "ink-select-input"; +import type { SeriesSummary } from "./types"; +import { truncate } from "./helpers"; +import { ScreenContainer, InfoBox } from "./shared"; + +interface ScoreRankMenuProps { + series: SeriesSummary[]; + maxItems: number; + height: number; + judgeModel: string; + onCreateSeries: () => void; + onSelectSeries: (id: string) => void; + onDeleteSeries: (id: string) => Promise; + onBack: () => void; +} + +export function ScoreRankMenu({ + series, + maxItems, + height, + judgeModel, + onCreateSeries, + onSelectSeries, + onDeleteSeries, + onBack, +}: ScoreRankMenuProps) { + const [highlightedIndex, setHighlightedIndex] = useState(0); + const [confirmDelete, setConfirmDelete] = useState(null); + const [isDeleting, setIsDeleting] = useState(false); + + // Limit series shown, reserve slots for actions + const visibleSeries = series.slice(0, maxItems - 3); + + // Handle keyboard input + useInput((input, key) => { + if (key.escape) { + if (confirmDelete) { + setConfirmDelete(null); + } else { + onBack(); + } + } + + // Delete with 'd' key + if (input === "d" && !confirmDelete && !isDeleting) { + const selectedSeries = visibleSeries[highlightedIndex]; + if (selectedSeries) { + setConfirmDelete(selectedSeries.id); + } + } + + // Confirm delete with 'y' + if (confirmDelete && input === "y" && !isDeleting) { + setIsDeleting(true); + onDeleteSeries(confirmDelete).finally(() => { + setConfirmDelete(null); + setIsDeleting(false); + }); + } + + // Cancel delete with 'n' + if (confirmDelete && input === "n") { + setConfirmDelete(null); + } + }); + + // Find series being deleted for confirmation message + const deletingSeries = confirmDelete ? visibleSeries.find((s) => s.id === confirmDelete) : null; + + // Delete confirmation modal + if (confirmDelete && deletingSeries) { + return ( + + + + + + Confirm Delete + + + + Delete this series? 
+ + + "{truncate(deletingSeries.documentTitle, 45)}" + + + + {deletingSeries.runCount} run{deletingSeries.runCount !== 1 ? "s" : ""} will be removed. + + + + {isDeleting ? ( + Deleting... + ) : ( + + Y - Delete + N - Cancel + + )} + + + + + ); + } + + // Build menu items + const items = [ + ...visibleSeries + .filter((s) => s.id) + .map((s, idx) => ({ + label: `${truncate(s.documentTitle, 40)} | ${s.runCount} runs | ${s.agentNames.slice(0, 2).join(", ")}`, + value: s.id || `series-${idx}`, + })), + { label: "+ Create New Series", value: "create" }, + { label: "<- Back to Main Menu", value: "back" }, + ]; + + return ( + + + + {series.length === 0 + ? "No series yet. Create one to score/rank agent outputs." + : `${series.length} series | Judge: `} + {series.length > 0 && {judgeModel}} + + + + { + const idx = visibleSeries.findIndex((s) => s.id === item.value); + if (idx >= 0) setHighlightedIndex(idx); + }} + onSelect={(item) => { + if (item.value === "back") onBack(); + else if (item.value === "create") onCreateSeries(); + else onSelectSeries(item.value); + }} + /> + + + Enter Select | d Delete | Escape Back + + + ); +} diff --git a/meta-evals/src/components/index.ts b/meta-evals/src/components/index.ts index b36628db..cc7f2a02 100644 --- a/meta-evals/src/components/index.ts +++ b/meta-evals/src/components/index.ts @@ -3,6 +3,7 @@ */ export { MainMenu } from "./MainMenu"; +export { ScoreRankMenu } from "./ScoreRankMenu"; export { CreateBaseline } from "./CreateBaseline"; export { SeriesDetail } from "./SeriesDetail"; export { RankRuns } from "./RankRuns"; diff --git a/meta-evals/src/components/types.ts b/meta-evals/src/components/types.ts index bc55d13d..66c14795 100644 --- a/meta-evals/src/components/types.ts +++ b/meta-evals/src/components/types.ts @@ -6,7 +6,8 @@ import type { SeriesSummary, DocumentChoice, AgentChoice } from "@roast/db"; export type Screen = | { type: "loading" } - | { type: "main-menu"; series: SeriesSummary[] } + | { type: "main-menu" } + | { 
type: "score-rank-menu"; series: SeriesSummary[] } | { type: "create-baseline"; step: "document" | "agents" | "confirm" | "creating" } | { type: "series-detail"; seriesId: string } | { type: "rank-runs"; seriesId: string } From 0bb9314013d9bf4ac370cf2fa90fb74f0f653cf0 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Wed, 7 Jan 2026 14:09:49 +0000 Subject: [PATCH 15/72] feat(meta-evals): Add validation run persistence and filter reasoning UI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add ValidationRun and ValidationRunSnapshot tables for persisting runs - Capture per-item filter reasoning in pipeline telemetry (filteredItems) - Record filter reasons from supported-elsewhere-filter and review stages - Display filter reasoning for lost comments in validation UI - Distinguish filtered comments (⊘) from not-extracted comments (βˆ’) - Simplify UI: remove Results tab, auto-navigate to History after run - Show all comments in scrollable list (no more "and X more" truncation) - Add legend and summary breakdown (X filtered, Y not extracted) πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .../plugins/fallacy-check/index.ts | 35 +- .../telemetry/PipelineTelemetry.ts | 19 + .../plugins/fallacy-check/telemetry/index.ts | 1 + .../plugins/fallacy-check/telemetry/types.ts | 26 + .../migration.sql | 61 ++ internal-packages/db/prisma/schema.prisma | 52 +- .../repositories/MetaEvaluationRepository.ts | 233 ++++++ meta-evals/src/components/Validation.tsx | 716 ++++++++++++++---- meta-evals/src/validation/types.ts | 14 + 9 files changed, 1014 insertions(+), 143 deletions(-) create mode 100644 internal-packages/db/prisma/migrations/20260107130534_add_validation_run_tables/migration.sql diff --git a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts index ca219709..22fb5de9 100644 --- 
a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts +++ b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts @@ -388,15 +388,27 @@ export class FallacyCheckPlugin implements SimpleAnalysisPlugin { unsupportedIndices.has(idx) ); - // Log what was filtered + // Log and record what was filtered const supportedCount = filterResult.supportedIssues.length; if (supportedCount > 0) { logger.info( `FallacyCheckPlugin: Filtered out ${supportedCount} issues (supported elsewhere in document)` ); - for (const supported of filterResult.supportedIssues) { + + // Record filtered items with their reasoning for telemetry + const filteredRecords = filterResult.supportedIssues.map((supported) => { + const originalIssue = issues[supported.index]; logger.debug(` - Issue ${supported.index}: ${supported.explanation}`); - } + return { + stage: PIPELINE_STAGES.SUPPORTED_ELSEWHERE_FILTER, + quotedText: originalIssue?.text || `Issue at index ${supported.index}`, + header: originalIssue?.issueType, + filterReason: supported.explanation, + supportLocation: supported.supportLocation, + originalIndex: supported.index, + }; + }); + telemetry.recordFilteredItems(filteredRecords); } logger.info("FallacyCheckPlugin: AUDIT: Supported-elsewhere filter completed", { @@ -471,10 +483,27 @@ export class FallacyCheckPlugin implements SimpleAnalysisPlugin { ); // Filter comments based on review + const keptIndices = new Set(reviewResult.commentIndicesToKeep); this.comments = reviewResult.commentIndicesToKeep.map((idx) => allComments[idx]); this.summary = reviewResult.oneLineSummary; this.analysis = reviewResult.documentSummary; + // Record comments that were filtered by review + const filteredComments = allComments + .map((comment, idx) => ({ comment, idx })) + .filter(({ idx }) => !keptIndices.has(idx)); + + if (filteredComments.length > 0) { + const filteredRecords = filteredComments.map(({ comment, idx }) => ({ + stage: PIPELINE_STAGES.REVIEW, + quotedText: 
comment.highlight.quotedText, + header: comment.header, + filterReason: 'Filtered by review (redundant, low-value, or questionable)', + originalIndex: idx, + })); + telemetry.recordFilteredItems(filteredRecords); + } + logger.info("FallacyCheckPlugin: AUDIT: Review phase completed", { timestamp: new Date().toISOString(), commentsReviewed: allComments.length, diff --git a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/PipelineTelemetry.ts b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/PipelineTelemetry.ts index 3257d78d..eac3138a 100644 --- a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/PipelineTelemetry.ts +++ b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/PipelineTelemetry.ts @@ -10,6 +10,7 @@ import type { StageMetrics, PipelineExecutionRecord, PipelineStage, + FilteredItemRecord, } from './types'; /** Current pipeline version - increment when making significant changes */ @@ -49,6 +50,7 @@ export class PipelineTelemetry { private documentLength: number; private stages: StageMetrics[] = []; private activeStage: ActiveStage | null = null; + private filteredItems: FilteredItemRecord[] = []; private finalCounts: PipelineExecutionRecord['finalCounts'] = { issuesExtracted: 0, issuesAfterDedup: 0, @@ -155,6 +157,22 @@ export class PipelineTelemetry { return this; } + /** + * Record a filtered item with its reasoning + */ + recordFilteredItem(item: FilteredItemRecord): this { + this.filteredItems.push(item); + return this; + } + + /** + * Record multiple filtered items + */ + recordFilteredItems(items: FilteredItemRecord[]): this { + this.filteredItems.push(...items); + return this; + } + /** * Calculate total cost from all stages */ @@ -191,6 +209,7 @@ export class PipelineTelemetry { error, totalCostUsd: this.calculateTotalCost(), pipelineVersion: PIPELINE_VERSION, + filteredItems: this.filteredItems, // Always include (even if empty) so we know telemetry 
was captured }; } diff --git a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/index.ts b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/index.ts index f3384c74..0a403bfa 100644 --- a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/index.ts +++ b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/index.ts @@ -9,5 +9,6 @@ export { type StageMetrics, type PipelineExecutionRecord, type PipelineStage, + type FilteredItemRecord, PIPELINE_STAGES, } from './types'; diff --git a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/types.ts b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/types.ts index 8f199cd8..69f26ade 100644 --- a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/types.ts +++ b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/types.ts @@ -37,6 +37,29 @@ export interface StageMetrics { metadata?: Record; } +/** + * Details about a filtered item (issue or comment) + */ +export interface FilteredItemRecord { + /** Stage where filtering occurred */ + stage: string; + + /** Original text that was flagged */ + quotedText: string; + + /** Header/type of the issue */ + header?: string; + + /** Why this item was filtered */ + filterReason: string; + + /** Where support was found (for supported-elsewhere filter) */ + supportLocation?: string; + + /** Original index in the input array */ + originalIndex: number; +} + /** * Complete pipeline execution record */ @@ -84,6 +107,9 @@ export interface PipelineExecutionRecord { /** Pipeline version (for tracking changes over time) */ pipelineVersion: string; + + /** Details about items that were filtered out (for debugging/validation) */ + filteredItems?: FilteredItemRecord[]; } /** diff --git a/internal-packages/db/prisma/migrations/20260107130534_add_validation_run_tables/migration.sql 
b/internal-packages/db/prisma/migrations/20260107130534_add_validation_run_tables/migration.sql new file mode 100644 index 00000000..993aafb0 --- /dev/null +++ b/internal-packages/db/prisma/migrations/20260107130534_add_validation_run_tables/migration.sql @@ -0,0 +1,61 @@ +-- CreateTable +CREATE TABLE "public"."ValidationRun" ( + "id" TEXT NOT NULL, + "baselineId" TEXT NOT NULL, + "name" TEXT, + "commitHash" TEXT, + "status" TEXT NOT NULL DEFAULT 'running', + "summary" TEXT, + "createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP, + "completedAt" TIMESTAMP(3), + + CONSTRAINT "ValidationRun_pkey" PRIMARY KEY ("id") +); + +-- CreateTable +CREATE TABLE "public"."ValidationRunSnapshot" ( + "id" TEXT NOT NULL, + "runId" TEXT NOT NULL, + "baselineSnapshotId" TEXT NOT NULL, + "newEvaluationId" TEXT NOT NULL, + "status" TEXT NOT NULL, + "keptCount" INTEGER NOT NULL DEFAULT 0, + "newCount" INTEGER NOT NULL DEFAULT 0, + "lostCount" INTEGER NOT NULL DEFAULT 0, + "comparisonData" JSONB, + + CONSTRAINT "ValidationRunSnapshot_pkey" PRIMARY KEY ("id") +); + +-- CreateIndex +CREATE INDEX "ValidationRun_baselineId_idx" ON "public"."ValidationRun"("baselineId"); + +-- CreateIndex +CREATE INDEX "ValidationRun_createdAt_idx" ON "public"."ValidationRun"("createdAt"); + +-- CreateIndex +CREATE INDEX "ValidationRun_status_idx" ON "public"."ValidationRun"("status"); + +-- CreateIndex +CREATE INDEX "ValidationRunSnapshot_runId_idx" ON "public"."ValidationRunSnapshot"("runId"); + +-- CreateIndex +CREATE INDEX "ValidationRunSnapshot_baselineSnapshotId_idx" ON "public"."ValidationRunSnapshot"("baselineSnapshotId"); + +-- CreateIndex +CREATE INDEX "ValidationRunSnapshot_status_idx" ON "public"."ValidationRunSnapshot"("status"); + +-- CreateIndex +CREATE UNIQUE INDEX "ValidationRunSnapshot_runId_baselineSnapshotId_key" ON "public"."ValidationRunSnapshot"("runId", "baselineSnapshotId"); + +-- AddForeignKey +ALTER TABLE "public"."ValidationRun" ADD CONSTRAINT "ValidationRun_baselineId_fkey" 
FOREIGN KEY ("baselineId") REFERENCES "public"."ValidationBaseline"("id") ON DELETE CASCADE ON UPDATE CASCADE; + +-- AddForeignKey +ALTER TABLE "public"."ValidationRunSnapshot" ADD CONSTRAINT "ValidationRunSnapshot_runId_fkey" FOREIGN KEY ("runId") REFERENCES "public"."ValidationRun"("id") ON DELETE CASCADE ON UPDATE CASCADE; + +-- AddForeignKey +ALTER TABLE "public"."ValidationRunSnapshot" ADD CONSTRAINT "ValidationRunSnapshot_baselineSnapshotId_fkey" FOREIGN KEY ("baselineSnapshotId") REFERENCES "public"."ValidationBaselineSnapshot"("id") ON DELETE CASCADE ON UPDATE CASCADE; + +-- AddForeignKey +ALTER TABLE "public"."ValidationRunSnapshot" ADD CONSTRAINT "ValidationRunSnapshot_newEvaluationId_fkey" FOREIGN KEY ("newEvaluationId") REFERENCES "public"."EvaluationVersion"("id") ON DELETE CASCADE ON UPDATE CASCADE; diff --git a/internal-packages/db/prisma/schema.prisma b/internal-packages/db/prisma/schema.prisma index 96c75c52..a0f2d3ae 100644 --- a/internal-packages/db/prisma/schema.prisma +++ b/internal-packages/db/prisma/schema.prisma @@ -151,6 +151,7 @@ model EvaluationVersion { job Job? metaEvaluations MetaEvaluation[] validationBaselineSnapshots ValidationBaselineSnapshot[] + validationRunSnapshots ValidationRunSnapshot[] @relation("ValidationRunNewEvaluation") @@unique([evaluationId, version]) @@index([evaluationId]) @@ -457,6 +458,7 @@ model ValidationBaseline { agent Agent @relation(fields: [agentId], references: [id], onDelete: Cascade) createdBy User? 
@relation(fields: [createdById], references: [id]) snapshots ValidationBaselineSnapshot[] + runs ValidationRun[] @@index([agentId]) @@index([createdAt]) @@ -468,10 +470,56 @@ model ValidationBaselineSnapshot { baselineId String evaluationVersionId String - baseline ValidationBaseline @relation(fields: [baselineId], references: [id], onDelete: Cascade) - evaluationVersion EvaluationVersion @relation(fields: [evaluationVersionId], references: [id], onDelete: Cascade) + baseline ValidationBaseline @relation(fields: [baselineId], references: [id], onDelete: Cascade) + evaluationVersion EvaluationVersion @relation(fields: [evaluationVersionId], references: [id], onDelete: Cascade) + runSnapshots ValidationRunSnapshot[] @@unique([baselineId, evaluationVersionId]) @@index([baselineId]) @@index([evaluationVersionId]) } + +/// A validation run - comparing new pipeline output against a baseline +model ValidationRun { + id String @id @default(cuid()) + baselineId String + name String? // Optional name for the run + commitHash String? // git commit when run was executed + status String @default("running") // "running" | "completed" | "failed" + summary String? // Quick summary: "4 unchanged, 2 changed" + createdAt DateTime @default(now()) + completedAt DateTime? 
+ + baseline ValidationBaseline @relation(fields: [baselineId], references: [id], onDelete: Cascade) + snapshots ValidationRunSnapshot[] + + @@index([baselineId]) + @@index([createdAt]) + @@index([status]) +} + +/// Per-document results from a validation run +model ValidationRunSnapshot { + id String @id @default(cuid()) + runId String + baselineSnapshotId String // The baseline snapshot being compared against + newEvaluationId String // The new evaluation version from this run + + // Comparison results + status String // "unchanged" | "changed" + keptCount Int @default(0) // Comments that matched + newCount Int @default(0) // New comments not in baseline + lostCount Int @default(0) // Baseline comments not in new + + // Store detailed diff as JSON for viewing later + comparisonData Json? // { matchedComments, newComments, lostComments } + + run ValidationRun @relation(fields: [runId], references: [id], onDelete: Cascade) + baselineSnapshot ValidationBaselineSnapshot @relation(fields: [baselineSnapshotId], references: [id], onDelete: Cascade) + newEvaluation EvaluationVersion @relation("ValidationRunNewEvaluation", fields: [newEvaluationId], references: [id], onDelete: Cascade) + + @@unique([runId, baselineSnapshotId]) + @@index([runId]) + @@index([baselineSnapshotId]) + @@index([status]) +} diff --git a/internal-packages/db/src/repositories/MetaEvaluationRepository.ts b/internal-packages/db/src/repositories/MetaEvaluationRepository.ts index 4fa08f94..549f1d77 100644 --- a/internal-packages/db/src/repositories/MetaEvaluationRepository.ts +++ b/internal-packages/db/src/repositories/MetaEvaluationRepository.ts @@ -1140,6 +1140,239 @@ export class MetaEvaluationRepository { return [...new Set(baseline.snapshots.map((s) => s.evaluationVersion.evaluation.documentId))]; } + + // ========================================================================== + // Validation Run Methods + // ========================================================================== + + /** + * 
Create a new validation run. + */ + async createValidationRun(input: { + baselineId: string; + name?: string; + commitHash?: string; + }): Promise<{ id: string; baselineId: string; status: string }> { + const run = await this.prisma.validationRun.create({ + data: { + baselineId: input.baselineId, + name: input.name, + commitHash: input.commitHash, + status: "running", + }, + }); + + return { + id: run.id, + baselineId: run.baselineId, + status: run.status, + }; + } + + /** + * Update validation run status and summary. + */ + async updateValidationRunStatus( + runId: string, + status: "running" | "completed" | "failed", + summary?: string + ): Promise { + await this.prisma.validationRun.update({ + where: { id: runId }, + data: { + status, + summary, + completedAt: status !== "running" ? new Date() : undefined, + }, + }); + } + + /** + * Add a per-document result to a validation run. + */ + async addValidationRunSnapshot(input: { + runId: string; + baselineSnapshotId: string; + newEvaluationId: string; + status: "unchanged" | "changed"; + keptCount: number; + newCount: number; + lostCount: number; + comparisonData?: unknown; + }): Promise<{ id: string }> { + const snapshot = await this.prisma.validationRunSnapshot.create({ + data: { + runId: input.runId, + baselineSnapshotId: input.baselineSnapshotId, + newEvaluationId: input.newEvaluationId, + status: input.status, + keptCount: input.keptCount, + newCount: input.newCount, + lostCount: input.lostCount, + comparisonData: input.comparisonData as object | undefined, + }, + }); + + return { id: snapshot.id }; + } + + /** + * Get all validation runs for a baseline. 
+ */ + async getValidationRuns(baselineId: string): Promise< + Array<{ + id: string; + name: string | null; + commitHash: string | null; + status: string; + summary: string | null; + createdAt: Date; + completedAt: Date | null; + snapshotCount: number; + unchangedCount: number; + changedCount: number; + }> + > { + const runs = await this.prisma.validationRun.findMany({ + where: { baselineId }, + include: { + snapshots: { + select: { status: true }, + }, + }, + orderBy: { createdAt: "desc" }, + }); + + return runs.map((r) => ({ + id: r.id, + name: r.name, + commitHash: r.commitHash, + status: r.status, + summary: r.summary, + createdAt: r.createdAt, + completedAt: r.completedAt, + snapshotCount: r.snapshots.length, + unchangedCount: r.snapshots.filter((s) => s.status === "unchanged").length, + changedCount: r.snapshots.filter((s) => s.status === "changed").length, + })); + } + + /** + * Get full details of a validation run including all snapshot comparisons. + */ + async getValidationRunDetail(runId: string): Promise<{ + id: string; + name: string | null; + commitHash: string | null; + status: string; + summary: string | null; + createdAt: Date; + completedAt: Date | null; + baseline: { id: string; name: string }; + snapshots: Array<{ + id: string; + status: string; + keptCount: number; + newCount: number; + lostCount: number; + documentId: string; + documentTitle: string; + comparisonData: unknown; + }>; + } | null> { + const run = await this.prisma.validationRun.findUnique({ + where: { id: runId }, + include: { + baseline: { + select: { id: true, name: true }, + }, + snapshots: { + include: { + baselineSnapshot: { + include: { + evaluationVersion: { + include: { + evaluation: { + include: { + document: { + include: { + versions: { + orderBy: { version: "desc" }, + take: 1, + select: { title: true }, + }, + }, + }, + }, + }, + }, + }, + }, + }, + }, + }, + }, + }); + + if (!run) return null; + + return { + id: run.id, + name: run.name, + commitHash: run.commitHash, 
+ status: run.status, + summary: run.summary, + createdAt: run.createdAt, + completedAt: run.completedAt, + baseline: run.baseline, + snapshots: run.snapshots.map((s) => ({ + id: s.id, + status: s.status, + keptCount: s.keptCount, + newCount: s.newCount, + lostCount: s.lostCount, + documentId: s.baselineSnapshot.evaluationVersion.evaluation.documentId, + documentTitle: + s.baselineSnapshot.evaluationVersion.evaluation.document.versions[0]?.title || "Unknown", + comparisonData: s.comparisonData, + })), + }; + } + + /** + * Delete a validation run. + */ + async deleteValidationRun(runId: string): Promise { + await this.prisma.validationRun.delete({ + where: { id: runId }, + }); + } + + /** + * Get baseline snapshot ID by baseline and document. + * Used when saving run results to link to the correct baseline snapshot. + */ + async getBaselineSnapshotByDocument( + baselineId: string, + documentId: string + ): Promise<{ id: string; evaluationVersionId: string } | null> { + const snapshot = await this.prisma.validationBaselineSnapshot.findFirst({ + where: { + baselineId, + evaluationVersion: { + evaluation: { + documentId, + }, + }, + }, + select: { + id: true, + evaluationVersionId: true, + }, + }); + + return snapshot; + } } // Default instance for convenience diff --git a/meta-evals/src/components/Validation.tsx b/meta-evals/src/components/Validation.tsx index 9ec90f26..f00794f5 100644 --- a/meta-evals/src/components/Validation.tsx +++ b/meta-evals/src/components/Validation.tsx @@ -23,10 +23,9 @@ import { type DocumentComparisonResult, type EvaluationSnapshot, compareSnapshots, - getComparisonStatus, } from "../validation"; -type Tab = "baselines" | "run" | "results"; +type Tab = "baselines" | "run" | "history"; interface ValidationProps { height: number; @@ -48,6 +47,19 @@ interface CorpusDocument extends ValidationDocument { selected: boolean; } +interface ValidationRunSummary { + id: string; + name: string | null; + commitHash: string | null; + status: string; + 
summary: string | null; + createdAt: Date; + completedAt: Date | null; + snapshotCount: number; + unchangedCount: number; + changedCount: number; +} + export function Validation({ height, maxItems, onBack, onCreateBatch }: ValidationProps) { const [activeTab, setActiveTab] = useState("baselines"); const [loading, setLoading] = useState(true); @@ -71,29 +83,56 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati const [isRunning, setIsRunning] = useState(false); const [runProgress, setRunProgress] = useState({ phase: "", current: 0, total: 0 }); - // Results state - const [comparisons, setComparisons] = useState([]); - const [savingBaseline, setSavingBaseline] = useState(false); - const [saveBaselineName, setSaveBaselineName] = useState(""); + // Run state (for tracking current run to auto-select after completion) + const [currentRunId, setCurrentRunId] = useState(null); + + // History state + const [validationRuns, setValidationRuns] = useState([]); + const [selectedRunId, setSelectedRunId] = useState(null); + const [selectedRunDetail, setSelectedRunDetail] = useState<{ + id: string; + name: string | null; + status: string; + summary: string | null; + createdAt: Date; + baseline: { id: string; name: string }; + snapshots: Array<{ + id: string; + status: string; + keptCount: number; + newCount: number; + lostCount: number; + documentId: string; + documentTitle: string; + comparisonData: unknown; + }>; + } | null>(null); + const [selectedSnapshotId, setSelectedSnapshotId] = useState(null); + const [selectedCommentKey, setSelectedCommentKey] = useState(null); // Keyboard handling useInput((input, key) => { if (key.escape) { - if (creatingBaseline) { + if (selectedCommentKey) { + setSelectedCommentKey(null); + } else if (selectedSnapshotId) { + setSelectedSnapshotId(null); + } else if (selectedRunDetail) { + setSelectedRunDetail(null); + setSelectedRunId(null); + } else if (creatingBaseline) { setCreatingBaseline(false); 
setShowCorpusSelect(false); - } else if (savingBaseline) { - setSavingBaseline(false); } else if (activeTab !== "baselines") { setActiveTab("baselines"); } else { onBack(); } } - if (key.tab && !creatingBaseline && !savingBaseline) { + if (key.tab && !creatingBaseline) { setActiveTab((prev) => { if (prev === "baselines") return "run"; - if (prev === "run") return comparisons.length > 0 ? "results" : "baselines"; + if (prev === "run") return "history"; return "baselines"; }); } @@ -112,6 +151,13 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati } }, [selectedAgent?.id]); + // Load validation runs when baseline selected + useEffect(() => { + if (selectedBaseline) { + loadValidationRuns(selectedBaseline.id); + } + }, [selectedBaseline?.id]); + async function loadAgents() { try { setLoading(true); @@ -179,6 +225,27 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati } } + async function loadValidationRuns(baselineId: string) { + try { + const runs = await metaEvaluationRepository.getValidationRuns(baselineId); + setValidationRuns(runs); + } catch (e) { + setError(String(e)); + } + } + + async function loadRunDetail(runId: string) { + try { + setLoading(true); + const detail = await metaEvaluationRepository.getValidationRunDetail(runId); + setSelectedRunDetail(detail); + setLoading(false); + } catch (e) { + setError(String(e)); + setLoading(false); + } + } + async function createBaseline() { if (!selectedAgent || !newBaselineName.trim()) return; @@ -236,10 +303,21 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati setIsRunning(true); setActiveTab("run"); - setComparisons([]); + setCurrentRunId(null); + + let runId: string | null = null; try { - // Phase 1: Get baseline snapshots + // Phase 1: Create validation run record + setRunProgress({ phase: "Creating run...", current: 0, total: 0 }); + const run = await metaEvaluationRepository.createValidationRun({ + baselineId: 
selectedBaseline.id, + name: `Run ${new Date().toLocaleString()}`, + }); + runId = run.id; + setCurrentRunId(runId); + + // Phase 2: Get baseline snapshots setRunProgress({ phase: "Loading baseline...", current: 0, total: 0 }); const baselineSnapshots = await metaEvaluationRepository.getBaselineSnapshots(selectedBaseline.id); @@ -247,14 +325,14 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati throw new Error("Baseline has no snapshots"); } - // Phase 2: Run pipeline on documents + // Phase 3: Run pipeline on documents setRunProgress({ phase: "Running pipeline...", current: 0, total: baselineSnapshots.length }); const documentIds = [...new Set(baselineSnapshots.map((s) => s.documentId))]; // Create batch jobs const jobIds = await onCreateBatch(selectedAgent.id, documentIds); - // Phase 3: Wait for jobs to complete and get results + // Phase 4: Wait for jobs to complete and get results setRunProgress({ phase: "Waiting for jobs...", current: 0, total: jobIds.length }); // Poll for job completion @@ -275,7 +353,7 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati setRunProgress({ phase: "Waiting for jobs...", current: completed, total: jobIds.length }); } - // Phase 4: Get new evaluation versions and compare + // Phase 5: Get new evaluation versions and compare setRunProgress({ phase: "Comparing results...", current: 0, total: baselineSnapshots.length }); const jobs = await prisma.job.findMany({ @@ -292,8 +370,11 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati newVersionIds.map((id) => metaEvaluationRepository.getEvaluationSnapshotById(id)) ); - // Compare + // Compare and save results const results: DocumentComparisonResult[] = []; + let unchangedCount = 0; + let changedCount = 0; + for (const baselineSnapshot of baselineSnapshots) { const newSnapshot = newSnapshots.find( (s) => s && s.documentId === baselineSnapshot.documentId @@ -302,43 +383,70 @@ export function 
Validation({ height, maxItems, onBack, onCreateBatch }: Validati if (newSnapshot) { const baselineEval = toEvaluationSnapshot(baselineSnapshot); const currentEval = toEvaluationSnapshot(newSnapshot); - results.push(compareSnapshots(baselineEval, currentEval)); + const comparison = compareSnapshots(baselineEval, currentEval); + results.push(comparison); + + // Save snapshot result to database + const baselineSnapshotRecord = await metaEvaluationRepository.getBaselineSnapshotByDocument( + selectedBaseline.id, + baselineSnapshot.documentId + ); + + if (baselineSnapshotRecord && runId) { + const status = comparison.newComments.length === 0 && comparison.lostComments.length === 0 + ? "unchanged" + : "changed"; + + if (status === "unchanged") unchangedCount++; + else changedCount++; + + await metaEvaluationRepository.addValidationRunSnapshot({ + runId, + baselineSnapshotId: baselineSnapshotRecord.id, + newEvaluationId: newSnapshot.evaluationVersionId, + status, + keptCount: comparison.matchedComments.length, + newCount: comparison.newComments.length, + lostCount: comparison.lostComments.length, + comparisonData: { + matchedComments: comparison.matchedComments, + newComments: comparison.newComments, + lostComments: comparison.lostComments, + // Include filter reasoning from the current run's telemetry + filteredItems: currentEval.pipelineTelemetry?.filteredItems, + }, + }); + } } setRunProgress((p) => ({ ...p, current: p.current + 1 })); } - setComparisons(results); - setActiveTab("results"); - } catch (e) { - setError(String(e)); - } finally { - setIsRunning(false); - } - } - - async function saveResultsAsBaseline() { - if (!selectedAgent || !saveBaselineName.trim() || comparisons.length === 0) return; - - try { - setSavingBaseline(false); - setLoading(true); - - // Get the "current" evaluation version IDs from comparisons - const evalVersionIds = comparisons.map((c) => c.current.evaluationVersionId); + // Update run status + if (runId) { + const summary = 
`${unchangedCount} unchanged, ${changedCount} changed`; + await metaEvaluationRepository.updateValidationRunStatus(runId, "completed", summary); + } - await metaEvaluationRepository.createValidationBaseline({ - name: saveBaselineName.trim(), - agentId: selectedAgent.id, - evaluationVersionIds: evalVersionIds, - }); + // Reload runs list and navigate to history + if (selectedBaseline) { + await loadValidationRuns(selectedBaseline.id); + } - await loadBaselines(selectedAgent.id); - setSaveBaselineName(""); - setLoading(false); + // Navigate to history and auto-load the run detail + setActiveTab("history"); + if (runId) { + setSelectedRunId(runId); + await loadRunDetail(runId); + } } catch (e) { + // Mark run as failed if it was created + if (runId) { + await metaEvaluationRepository.updateValidationRunStatus(runId, "failed", String(e)); + } setError(String(e)); - setLoading(false); + } finally { + setIsRunning(false); } } @@ -364,8 +472,8 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati [Run] - - [Results] + + [History] (Tab to switch) @@ -454,143 +562,456 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati ); } - // Saving results as baseline - if (savingBaseline) { + // Run tab + if (activeTab === "run") { return ( - - - Save current results as a new baseline for future comparisons - + + {renderTabs()} - - Name: - { - if (saveBaselineName.trim()) { - saveResultsAsBaseline(); - } - }} - /> - + {isRunning ? ( + + {runProgress.phase} + {runProgress.total > 0 && ( + {runProgress.current}/{runProgress.total} + )} + + ) : selectedBaseline ? ( + + + + Baseline: {selectedBaseline.name} + {" "}({selectedBaseline.snapshotCount} docs) + + - - Enter Save | Escape Cancel - + { + if (item.value === "run") runValidation(); + else setActiveTab("baselines"); + }} + /> + + ) : ( + + No baseline selected. Create or select one first. 
+ setActiveTab("baselines")} + /> + + )} ); } - // Results tab - if (activeTab === "results" && comparisons.length > 0) { - // Count by change status - const unchangedCount = comparisons.filter((c) => - c.newComments.length === 0 && c.lostComments.length === 0 - ).length; - const changedCount = comparisons.length - unchangedCount; + // Comment detail view + if (selectedRunDetail && selectedSnapshotId && selectedCommentKey) { + const snapshot = selectedRunDetail.snapshots.find((s) => s.id === selectedSnapshotId); + if (snapshot) { + const data = snapshot.comparisonData as { + matchedComments?: Array<{ baselineComment?: { quotedText: string; header: string | null; description: string }; currentComment?: { quotedText: string; header: string | null; description: string } }>; + newComments?: Array<{ quotedText: string; header: string | null; description: string }>; + lostComments?: Array<{ quotedText: string; header: string | null; description: string }>; + filteredItems?: Array<{ stage: string; quotedText: string; header?: string; filterReason: string; supportLocation?: string }>; + } | null; + + const matched = data?.matchedComments || []; + const newComments = data?.newComments || []; + const lost = data?.lostComments || []; + const filteredItems = data?.filteredItems || []; + + let commentType = ""; + let baselineComment: { quotedText: string; header: string | null; description: string } | null = null; + let currentComment: { quotedText: string; header: string | null; description: string } | null = null; + let filterInfo: { stage: string; filterReason: string; supportLocation?: string } | null = null; + + if (selectedCommentKey.startsWith("kept-")) { + const idx = parseInt(selectedCommentKey.replace("kept-", ""), 10); + const match = matched[idx]; + baselineComment = match?.baselineComment || null; + currentComment = match?.currentComment || null; + commentType = "Kept"; + } else if (selectedCommentKey.startsWith("new-")) { + const idx = 
parseInt(selectedCommentKey.replace("new-", ""), 10); + currentComment = newComments[idx] || null; + commentType = "New"; + } else if (selectedCommentKey.startsWith("lost-")) { + const idx = parseInt(selectedCommentKey.replace("lost-", ""), 10); + baselineComment = lost[idx] || null; + commentType = "Lost"; + + // Try to find filter reason for this lost comment + if (baselineComment && filteredItems.length > 0) { + // Match by quoted text (fuzzy match - check if texts contain each other) + const matchingFilter = filteredItems.find((f) => { + const fText = f.quotedText.toLowerCase().trim(); + const bText = baselineComment!.quotedText.toLowerCase().trim(); + // Check if either contains the other (for partial matches) + return fText.includes(bText) || bText.includes(fText) || + // Also check header match as fallback + (f.header && baselineComment!.header && f.header.toLowerCase() === baselineComment!.header.toLowerCase()); + }); + + if (matchingFilter) { + filterInfo = { + stage: matchingFilter.stage, + filterReason: matchingFilter.filterReason, + supportLocation: matchingFilter.supportLocation, + }; + } + } + } - // Format change summary for a comparison - const formatChangeSummary = (c: DocumentComparisonResult) => { - const parts: string[] = []; - const kept = c.matchedComments.length; - const added = c.newComments.length; - const lost = c.lostComments.length; + if (baselineComment || currentComment) { + const typeColor = commentType === "Kept" ? "green" : commentType === "New" ? 
"cyan" : "red"; + + // For Kept comments, show both versions side by side + if (commentType === "Kept" && baselineComment && currentComment) { + return ( + + + + {baselineComment.header || currentComment.header || "(no header)"} + + + + BASELINE: + "{baselineComment.quotedText}" + {baselineComment.description} + + + + CURRENT: + "{currentComment.quotedText}" + {currentComment.description} + + + + + setSelectedCommentKey(null)} + /> + + + ); + } + + // For Lost comments with filter reason, show detailed view + if (commentType === "Lost" && baselineComment && filterInfo) { + return ( + + + + {baselineComment.header || "(no header)"} + + + + Quoted text (from baseline): + "{baselineComment.quotedText}" + + + + Description: + {baselineComment.description} + + + + Filter Reason ({filterInfo.stage}): + {filterInfo.filterReason} + {filterInfo.supportLocation && ( + + Support found at: + {filterInfo.supportLocation} + + )} + + + + + setSelectedCommentKey(null)} + /> + + + ); + } + + // For New/Lost (without filter reason), show single version with label + const comment = currentComment || baselineComment; + const versionLabel = commentType === "New" ? "(from current run)" : "(from baseline)"; + + return ( + + + + {comment!.header || "(no header)"} + + + + Quoted text: + "{comment!.quotedText}" + + + + Description: + {comment!.description} + + + {commentType === "Lost" && !filterInfo && ( + + Why was this comment lost? + + {data?.filteredItems !== undefined + ? "This issue was not extracted by the current pipeline run. The LLM did not identify it as an issue during extraction (this is normal variance between runs)." 
+ : "No filter telemetry available for this run (run predates telemetry feature)."} + + + )} + + + + setSelectedCommentKey(null)} + /> + + + ); + } + } + } + + // Document comparison detail view + if (selectedRunDetail && selectedSnapshotId) { + const snapshot = selectedRunDetail.snapshots.find((s) => s.id === selectedSnapshotId); + if (snapshot) { + const data = snapshot.comparisonData as { + matchedComments?: Array<{ baselineComment?: { quotedText: string; header: string | null }; currentComment?: { quotedText: string; header: string | null } }>; + newComments?: Array<{ quotedText: string; header: string | null; description: string }>; + lostComments?: Array<{ quotedText: string; header: string | null; description: string }>; + filteredItems?: Array<{ stage: string; quotedText: string; header?: string; filterReason: string; supportLocation?: string }>; + } | null; + + const matched = data?.matchedComments || []; + const newComments = data?.newComments || []; + const lost = data?.lostComments || []; + const filteredItems = data?.filteredItems || []; + + // Helper to check if a lost comment has a filter reason + const hasFilterReason = (lostComment: { quotedText: string; header: string | null }) => { + if (filteredItems.length === 0) return false; + return filteredItems.some((f) => { + const fText = f.quotedText.toLowerCase().trim(); + const lText = lostComment.quotedText.toLowerCase().trim(); + return fText.includes(lText) || lText.includes(fText) || + (f.header && lostComment.header && f.header.toLowerCase() === lostComment.header.toLowerCase()); + }); + }; + + // Build scrollable list of ALL comments - no truncation + const commentItems: Array<{ label: string; value: string }> = []; + + // Add all kept comments + matched.forEach((c, i) => { + const comment = c.baselineComment || c.currentComment; + const label = comment ? 
(comment.header || truncate(comment.quotedText, 50)) : "Unknown"; + commentItems.push({ + label: ` βœ“ ${label}`, + value: `kept-${i}`, + }); + }); + + // Add all new comments + newComments.forEach((c, i) => { + commentItems.push({ + label: ` + ${c.header || truncate(c.quotedText, 50)}`, + value: `new-${i}`, + }); + }); + + // Add all lost comments - mark those with filter reasons differently + lost.forEach((c, i) => { + const hasReason = hasFilterReason(c); + // ⊘ = filtered with reason, βˆ’ = not extracted (no reason) + const indicator = hasReason ? "⊘" : "βˆ’"; + commentItems.push({ + label: ` ${indicator} ${c.header || truncate(c.quotedText, 50)}`, + value: `lost-${i}`, + }); + }); + + if (commentItems.length === 0) { + commentItems.push({ label: " No comments in this comparison", value: "empty" }); + } + + commentItems.push({ label: " ← Back", value: "back" }); + + // Count lost with/without filter reasons + const lostWithReason = lost.filter((c) => hasFilterReason(c)).length; + const lostWithoutReason = lost.length - lostWithReason; + + return ( + + + + + βœ“ {matched.length} kept + + + + {newComments.length} new + + + βˆ’ {lost.length} lost + {lost.length > 0 && ( + ({lostWithReason} filtered, {lostWithoutReason} not extracted) + )} + + + + Legend: βœ“ kept + new ⊘ filtered (has reason) βˆ’ not extracted + + + + { + if (item.value === "back") { + setSelectedSnapshotId(null); + } else if (item.value.startsWith("kept-") || item.value.startsWith("new-") || item.value.startsWith("lost-")) { + setSelectedCommentKey(item.value); + } + }} + /> - if (kept > 0) parts.push(`${kept} kept`); - if (added > 0) parts.push(`+${added} new`); - if (lost > 0) parts.push(`-${lost} lost`); + + Enter View Comment | Escape Back to Run + + + ); + } + } + // Run detail view + if (selectedRunDetail) { + const formatChangeSummary = (s: { keptCount: number; newCount: number; lostCount: number }) => { + const parts: string[] = []; + if (s.keptCount > 0) parts.push(`${s.keptCount} 
kept`); + if (s.newCount > 0) parts.push(`+${s.newCount} new`); + if (s.lostCount > 0) parts.push(`-${s.lostCount} lost`); return parts.length > 0 ? parts.join(", ") : "no comments"; }; - const items = [ - ...comparisons.slice(0, maxItems - 4).map((c) => { - const hasChanges = c.newComments.length > 0 || c.lostComments.length > 0; - const icon = hasChanges ? "~" : "="; - const color = hasChanges ? "yellow" : "green"; + const unchangedCount = selectedRunDetail.snapshots.filter((s) => s.status === "unchanged").length; + const changedCount = selectedRunDetail.snapshots.filter((s) => s.status === "changed").length; + const items = [ + ...selectedRunDetail.snapshots.slice(0, maxItems - 3).map((s) => { + const icon = s.status === "unchanged" ? "=" : "~"; return { - label: `[${icon}] ${truncate(c.documentTitle, 35)} | ${formatChangeSummary(c)}`, - value: c.documentId, + label: `[${icon}] ${truncate(s.documentTitle, 35)} | ${formatChangeSummary(s)}`, + value: s.id, }; }), - { label: "+ Save as New Baseline", value: "save" }, - { label: "← Back to Baselines", value: "back" }, + { label: "← Back to History", value: "back" }, ]; return ( - - {renderTabs()} - + [=] {unchangedCount} unchanged {" | "} [~] {changedCount} changed {" | "} - Baseline: {selectedBaseline?.name || "?"} + Baseline: {selectedRunDetail.baseline.name} { - if (item.value === "save") { - setSavingBaseline(true); - setSaveBaselineName(`Post-${selectedBaseline?.name || "run"}`); - } else if (item.value === "back") { - setActiveTab("baselines"); + if (item.value === "back") { + setSelectedRunDetail(null); + setSelectedRunId(null); + } else { + setSelectedSnapshotId(item.value); } - // TODO: Show detail view for specific document }} /> + + + Enter View Comments | Escape Back to History + ); } - // Run tab - if (activeTab === "run") { + // History tab + if (activeTab === "history") { + const formatDate = (d: Date) => { + return new Date(d).toLocaleString("en-US", { + month: "short", + day: "numeric", + hour: 
"2-digit", + minute: "2-digit", + }); + }; + + const items = [ + ...validationRuns.slice(0, maxItems - 3).map((r) => { + const statusIcon = r.status === "completed" + ? (r.changedCount === 0 ? "=" : "~") + : r.status === "running" ? "*" : "x"; + + return { + label: `[${statusIcon}] ${formatDate(r.createdAt)} | ${r.summary || r.status}`, + value: `view:${r.id}`, + }; + }), + { label: "← Back to Baselines", value: "back" }, + ]; + return ( - + {renderTabs()} - {isRunning ? ( - - {runProgress.phase} - {runProgress.total > 0 && ( - {runProgress.current}/{runProgress.total} - )} - - ) : selectedBaseline ? ( - - - - Baseline: {selectedBaseline.name} - {" "}({selectedBaseline.snapshotCount} docs) - - + + + Baseline: {selectedBaseline?.name || "None"} + {" | "} + {validationRuns.length} run{validationRuns.length !== 1 ? "s" : ""} + + - { - if (item.value === "run") runValidation(); - else setActiveTab("baselines"); - }} - /> + {validationRuns.length === 0 ? ( + + No runs yet. Go to Run tab to execute a validation run. ) : ( - - No baseline selected. Create or select one first. 
- setActiveTab("baselines")} - /> - + { + if (item.value === "back") { + setActiveTab("baselines"); + } else if (item.value.startsWith("view:")) { + const runId = item.value.replace("view:", ""); + setSelectedRunId(runId); + loadRunDetail(runId); + } + }} + /> )} + + + Enter View Details | Tab Switch | Escape Back + ); } @@ -692,6 +1113,14 @@ function extractTelemetry(raw: unknown): { issuesAfterFiltering: number; commentsGenerated: number; commentsKept: number; + filteredItems?: Array<{ + stage: string; + quotedText: string; + header?: string; + filterReason: string; + supportLocation?: string; + originalIndex: number; + }>; } | null { if (!raw || typeof raw !== "object") return null; @@ -700,6 +1129,16 @@ function extractTelemetry(raw: unknown): { if (!finalCounts) return null; + // Extract filtered items if present + const filteredItems = telemetry.filteredItems as Array<{ + stage: string; + quotedText: string; + header?: string; + filterReason: string; + supportLocation?: string; + originalIndex: number; + }> | undefined; + return { totalDurationMs: (telemetry.totalDurationMs as number) || 0, issuesExtracted: finalCounts.issuesExtracted || 0, @@ -707,5 +1146,6 @@ function extractTelemetry(raw: unknown): { issuesAfterFiltering: finalCounts.issuesAfterFiltering || 0, commentsGenerated: finalCounts.commentsGenerated || 0, commentsKept: finalCounts.commentsKept || 0, + filteredItems, }; } diff --git a/meta-evals/src/validation/types.ts b/meta-evals/src/validation/types.ts index 8ccfd61a..7b298c3b 100644 --- a/meta-evals/src/validation/types.ts +++ b/meta-evals/src/validation/types.ts @@ -43,6 +43,18 @@ export interface EvaluationSnapshot { pipelineTelemetry: PipelineTelemetrySnapshot | null; } +/** + * Record of an item filtered by the pipeline + */ +export interface FilteredItemSnapshot { + stage: string; + quotedText: string; + header?: string; + filterReason: string; + supportLocation?: string; + originalIndex: number; +} + /** * Simplified telemetry for 
comparison */ @@ -53,6 +65,8 @@ export interface PipelineTelemetrySnapshot { issuesAfterFiltering: number; commentsGenerated: number; commentsKept: number; + /** Items filtered out with their reasoning */ + filteredItems?: FilteredItemSnapshot[]; } /** From c35bd5cf121cd74e7dc37775d31cf55375bb03b7 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 11 Jan 2026 13:14:27 +0000 Subject: [PATCH 16/72] feat: Add multi-extractor with thinking/temperature controls + OpenRouter direct API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Multi-extractor system: - Run multiple extractors in parallel with different models/settings - Optional LLM judge for aggregation (disabled by default, uses simple dedup) - Per-extractor configuration via FALLACY_EXTRACTORS env var New extractor config options: - `thinking: boolean` - Enable/disable extended thinking (Claude) or reasoning (OpenRouter) - `temperature: number | "default"` - Explicit temp or use model's native default OpenRouter direct API: - Replaced OpenAI SDK with direct HTTP calls for full parameter control - Proper `reasoning_effort` support: none/minimal/low/medium/high/xhigh - New `callOpenRouterChat()` for non-tool-calling use cases - Updated claim-evaluator to use new API Telemetry & UI: - Track temperatureConfig and thinkingEnabled per extractor - Display extraction params in validation UI πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .../fallacy-check/extraction/config.ts | 275 ++++++++++ .../plugins/fallacy-check/extraction/index.ts | 9 + .../extraction/multiExtractor.ts | 267 ++++++++++ .../plugins/fallacy-check/extraction/types.ts | 235 +++++++++ .../plugins/fallacy-check/index.ts | 253 +++++++++- .../telemetry/PipelineTelemetry.ts | 11 + .../plugins/fallacy-check/telemetry/index.ts | 3 + .../plugins/fallacy-check/telemetry/types.ts | 113 +++++ internal-packages/ai/src/claude/wrapper.ts | 23 +- 
.../ai/src/tools/claim-evaluator/index.ts | 56 +-- .../ai/src/tools/fallacy-extractor/index.ts | 26 +- .../ai/src/tools/fallacy-extractor/types.ts | 16 + .../ai/src/tools/fallacy-judge/config.ts | 12 + .../ai/src/tools/fallacy-judge/index.ts | 386 ++++++++++++++ .../ai/src/tools/fallacy-judge/types.ts | 124 +++++ .../ai/src/tools/generated-schemas.ts | 20 +- internal-packages/ai/src/utils/openrouter.ts | 476 +++++++++++++++--- meta-evals/src/components/Validation.tsx | 89 ++++ meta-evals/src/validation/types.ts | 32 ++ 19 files changed, 2297 insertions(+), 129 deletions(-) create mode 100644 internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/config.ts create mode 100644 internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/index.ts create mode 100644 internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/multiExtractor.ts create mode 100644 internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/types.ts create mode 100644 internal-packages/ai/src/tools/fallacy-judge/config.ts create mode 100644 internal-packages/ai/src/tools/fallacy-judge/index.ts create mode 100644 internal-packages/ai/src/tools/fallacy-judge/types.ts diff --git a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/config.ts b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/config.ts new file mode 100644 index 00000000..29a23b48 --- /dev/null +++ b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/config.ts @@ -0,0 +1,275 @@ +/** + * Multi-Extractor Configuration Parser + * + * Parses the FALLACY_EXTRACTORS environment variable and provides defaults. 
+ */ + +import type { ExtractorConfig, MultiExtractorConfig } from './types'; + +/** Default model for extraction when not configured */ +const DEFAULT_EXTRACTOR_MODEL = 'claude-sonnet-4-5-20250929'; + +/** Default model for judge aggregation */ +const DEFAULT_JUDGE_MODEL = 'claude-sonnet-4-5-20250929'; + +/** Default temperature for Claude models */ +const DEFAULT_CLAUDE_TEMPERATURE = 0; + +/** Default temperature for OpenRouter models */ +const DEFAULT_OPENROUTER_TEMPERATURE = 0.1; + +/** + * Check if a model is an OpenRouter model (contains '/') + */ +function isOpenRouterModel(model: string): boolean { + return model.includes('/'); +} + +/** + * Get default temperature for a model + */ +export function getDefaultTemperature(model: string): number { + return isOpenRouterModel(model) + ? DEFAULT_OPENROUTER_TEMPERATURE + : DEFAULT_CLAUDE_TEMPERATURE; +} + +/** + * Generate a unique label for an extractor config + */ +export function generateExtractorLabel(config: ExtractorConfig): string { + if (config.label) { + return config.label; + } + + // Extract short model name + let shortName: string; + if (isOpenRouterModel(config.model)) { + // e.g., "google/gemini-3-flash-preview" -> "gemini-3-flash" + const parts = config.model.split('/'); + shortName = parts[parts.length - 1].replace('-preview', '').replace('-latest', ''); + } else { + // e.g., "claude-sonnet-4-5-20250929" -> "sonnet" + if (config.model.includes('opus')) { + shortName = 'opus'; + } else if (config.model.includes('sonnet')) { + shortName = 'sonnet'; + } else if (config.model.includes('haiku')) { + shortName = 'haiku'; + } else { + shortName = config.model.slice(0, 10); + } + } + + // Build suffix parts + const suffixParts: string[] = []; + + // Add temperature suffix if non-default + if (config.temperature === 'default') { + suffixParts.push('tDef'); + } else { + const defaultTemp = getDefaultTemperature(config.model); + const temp = config.temperature ?? 
defaultTemp; + if (temp !== defaultTemp) { + suffixParts.push(`t${temp}`); + } + } + + // Add thinking suffix if disabled + if (config.thinking === false) { + suffixParts.push('noThink'); + } + + if (suffixParts.length > 0) { + return `${shortName}-${suffixParts.join('-')}`; + } + + return shortName; +} + +/** + * Generate a unique extractor ID (for telemetry correlation) + */ +export function generateExtractorId( + config: ExtractorConfig, + index: number, + allConfigs: ExtractorConfig[] +): string { + const label = generateExtractorLabel(config); + + // Check if this label would be duplicated + const sameLabels = allConfigs.filter(c => generateExtractorLabel(c) === label); + + // Only append index if there are duplicates + if (sameLabels.length > 1) { + return `${label}-${index}`; + } + return label; +} + +/** + * Parse and validate the FALLACY_EXTRACTORS environment variable + * + * Expected format: + * ```json + * [ + * {"model": "claude-sonnet-4-5-20250929"}, + * {"model": "claude-sonnet-4-5-20250929", "temperature": 0.5}, + * {"model": "google/gemini-3-flash-preview", "temperature": 0.1} + * ] + * ``` + */ +function parseExtractorsEnvVar(envValue: string): ExtractorConfig[] { + try { + const parsed = JSON.parse(envValue); + + if (!Array.isArray(parsed)) { + console.warn( + '[MultiExtractor] FALLACY_EXTRACTORS must be a JSON array, using defaults' + ); + return []; + } + + const configs: ExtractorConfig[] = []; + for (const item of parsed) { + if (typeof item !== 'object' || item === null) { + console.warn('[MultiExtractor] Invalid extractor config, skipping:', item); + continue; + } + + if (typeof item.model !== 'string' || !item.model) { + console.warn( + '[MultiExtractor] Extractor config missing model, skipping:', + item + ); + continue; + } + + const config: ExtractorConfig = { + model: item.model, + }; + + // Temperature can be a number or "default" string + if (typeof item.temperature === 'number') { + config.temperature = item.temperature; + } else if 
(item.temperature === 'default') { + config.temperature = 'default'; + } + + if (typeof item.label === 'string' && item.label) { + config.label = item.label; + } + + // Thinking defaults to true (enabled), can be set to false + if (typeof item.thinking === 'boolean') { + config.thinking = item.thinking; + } + + configs.push(config); + } + + return configs; + } catch (error) { + console.warn( + '[MultiExtractor] Failed to parse FALLACY_EXTRACTORS:', + error instanceof Error ? error.message : error + ); + return []; + } +} + +/** + * Get the multi-extractor configuration from environment variables + * + * Environment variables: + * - FALLACY_EXTRACTORS: JSON array of extractor configs + * - FALLACY_EXTRACTOR_MODEL: Single model override (legacy, used if FALLACY_EXTRACTORS not set) + * - FALLACY_JUDGE_MODEL: Model for judge aggregation + * - FALLACY_JUDGE_ENABLED: Enable LLM judge (default: false - uses simple dedup) + * + * Defaults to single extractor with DEFAULT_EXTRACTOR_MODEL if not configured. 
+ */ +export function getMultiExtractorConfig(): MultiExtractorConfig { + const extractorsEnv = process.env.FALLACY_EXTRACTORS; + const legacyModelEnv = process.env.FALLACY_EXTRACTOR_MODEL; + const judgeModelEnv = process.env.FALLACY_JUDGE_MODEL; + const judgeEnabledEnv = process.env.FALLACY_JUDGE_ENABLED; + + let extractors: ExtractorConfig[]; + + if (extractorsEnv) { + // Parse multi-extractor config + extractors = parseExtractorsEnvVar(extractorsEnv); + + if (extractors.length === 0) { + // Parsing failed or empty array, fall back to defaults + console.warn( + '[MultiExtractor] No valid extractors in FALLACY_EXTRACTORS, using defaults' + ); + extractors = [{ model: legacyModelEnv || DEFAULT_EXTRACTOR_MODEL }]; + } + } else if (legacyModelEnv) { + // Legacy single-model configuration + extractors = [{ model: legacyModelEnv }]; + } else { + // Default configuration + extractors = [{ model: DEFAULT_EXTRACTOR_MODEL }]; + } + + // Judge is disabled by default - uses simple deduplication instead + const judgeEnabled = judgeEnabledEnv === 'true' || judgeEnabledEnv === '1'; + + return { + extractors, + judgeModel: judgeModelEnv || DEFAULT_JUDGE_MODEL, + judgeEnabled, + }; +} + +/** + * Check if LLM judge is enabled for aggregation + */ +export function isJudgeEnabled(): boolean { + const config = getMultiExtractorConfig(); + return config.judgeEnabled; +} + +/** + * Check if multi-extractor mode is enabled (more than one extractor configured) + */ +export function isMultiExtractorEnabled(): boolean { + const config = getMultiExtractorConfig(); + return config.extractors.length > 1; +} + +/** + * Get a human-readable summary of the current configuration + */ +export function getConfigSummary(): string { + const config = getMultiExtractorConfig(); + + const formatTemp = (ext: ExtractorConfig): string => { + if (ext.temperature === 'default') return 'default'; + return String(ext.temperature ?? 
getDefaultTemperature(ext.model)); + }; + + const formatThinking = (ext: ExtractorConfig): string => { + return ext.thinking === false ? ', think=off' : ''; + }; + + if (config.extractors.length === 1) { + const ext = config.extractors[0]; + return `Single extractor: ${ext.model} (t=${formatTemp(ext)}${formatThinking(ext)})`; + } + + const extractorSummaries = config.extractors.map((ext, i) => { + const label = generateExtractorLabel(ext); + return `${i + 1}. ${label} (${ext.model}, t=${formatTemp(ext)}${formatThinking(ext)})`; + }); + + return [ + `Multi-extractor mode: ${config.extractors.length} extractors`, + ...extractorSummaries, + `Judge: ${config.judgeEnabled ? config.judgeModel : 'disabled (simple dedup)'}`, + ].join('\n'); +} diff --git a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/index.ts b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/index.ts new file mode 100644 index 00000000..1f083a26 --- /dev/null +++ b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/index.ts @@ -0,0 +1,9 @@ +/** + * Multi-Extractor Module + * + * Provides parallel extraction with multiple models and LLM judge aggregation. + */ + +export * from './types'; +export * from './config'; +export { runMultiExtractor } from './multiExtractor'; diff --git a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/multiExtractor.ts b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/multiExtractor.ts new file mode 100644 index 00000000..17d95c19 --- /dev/null +++ b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/multiExtractor.ts @@ -0,0 +1,267 @@ +/** + * Multi-Extractor Runner + * + * Runs multiple fallacy extractors in parallel and aggregates results. + * Supports different models and/or temperatures for diversity. 
+ */ + +import { logger } from '../../../../shared/logger'; +import fallacyExtractorTool from '../../../../tools/fallacy-extractor'; +import type { ExtractedFallacyIssue } from '../../../../tools/fallacy-extractor/types'; +import type { + ExtractorConfig, + MultiExtractorConfig, + ExtractorResult, + MultiExtractorResult, +} from './types'; +import { generateExtractorId, getDefaultTemperature } from './config'; + +/** + * Run a single extractor with the given configuration + */ +async function runSingleExtractor( + documentText: string, + config: ExtractorConfig, + extractorId: string +): Promise { + const startTime = Date.now(); + + // Handle temperature: "default" means don't pass, undefined means use our default + const temperatureForLog = config.temperature === 'default' + ? 'default' + : (typeof config.temperature === 'number' ? config.temperature : getDefaultTemperature(config.model)); + + logger.info(`[MultiExtractor] Starting extractor: ${extractorId}`, { + model: config.model, + temperature: temperatureForLog, + thinking: config.thinking !== false, + documentLength: documentText.length, + }); + + try { + const result = await fallacyExtractorTool.execute( + { + documentText, + model: config.model, + // Pass temperature as-is (can be number, "default", or undefined) + temperature: config.temperature, + // Pass thinking parameter (undefined or boolean) + thinking: config.thinking, + }, + { logger } + ); + + const durationMs = Date.now() - startTime; + + logger.info(`[MultiExtractor] Extractor ${extractorId} completed`, { + issuesFound: result.issues.length, + durationMs, + wasComplete: result.wasComplete, + }); + + return { + extractorId, + config, + issues: result.issues, + durationMs, + // TODO: Add cost tracking from API response when available + }; + } catch (error) { + const durationMs = Date.now() - startTime; + const errorMessage = error instanceof Error ? 
error.message : String(error); + + logger.error(`[MultiExtractor] Extractor ${extractorId} failed`, { + error: errorMessage, + durationMs, + }); + + return { + extractorId, + config, + issues: [], + durationMs, + error: errorMessage, + }; + } +} + +/** + * Run multiple extractors in parallel + * + * @param documentText - Full document text to analyze + * @param config - Multi-extractor configuration + * @returns Combined results from all extractors + */ +export async function runMultiExtractor( + documentText: string, + config: MultiExtractorConfig +): Promise { + const startTime = Date.now(); + const { extractors } = config; + + logger.info(`[MultiExtractor] Starting parallel extraction`, { + extractorCount: extractors.length, + documentLength: documentText.length, + }); + + // Generate unique IDs for each extractor + const extractorsWithIds = extractors.map((ext, index) => ({ + config: ext, + extractorId: generateExtractorId(ext, index, extractors), + })); + + // Run all extractors in parallel + const extractorPromises = extractorsWithIds.map(({ config: extConfig, extractorId }) => + runSingleExtractor(documentText, extConfig, extractorId) + ); + + const settledResults = await Promise.allSettled(extractorPromises); + + // Process results + const extractorResults: ExtractorResult[] = settledResults.map((result, index) => { + if (result.status === 'fulfilled') { + return result.value; + } + + // Promise rejection (shouldn't happen since we catch inside runSingleExtractor) + const extConfig = extractorsWithIds[index]; + return { + extractorId: extConfig.extractorId, + config: extConfig.config, + issues: [], + durationMs: 0, + error: result.reason instanceof Error ? 
result.reason.message : String(result.reason), + }; + }); + + const totalDurationMs = Date.now() - startTime; + const totalIssuesFound = extractorResults.reduce( + (sum, r) => sum + r.issues.length, + 0 + ); + + // Log summary + const successCount = extractorResults.filter((r) => !r.error).length; + const failedCount = extractorResults.filter((r) => r.error).length; + + logger.info(`[MultiExtractor] Parallel extraction complete`, { + totalDurationMs, + totalIssuesFound, + successCount, + failedCount, + extractorSummaries: extractorResults.map((r) => ({ + extractorId: r.extractorId, + issuesFound: r.issues.length, + durationMs: r.durationMs, + error: r.error, + })), + }); + + return { + extractorResults, + totalDurationMs, + totalIssuesFound, + }; +} + +/** + * Flatten all issues from multi-extractor results with source tracking + * + * @param result - Multi-extractor result + * @returns Array of issues with extractorId attached + */ +export function flattenExtractorIssues( + result: MultiExtractorResult +): Array { + const allIssues: Array = []; + + for (const extractor of result.extractorResults) { + for (const issue of extractor.issues) { + allIssues.push({ + ...issue, + extractorId: extractor.extractorId, + }); + } + } + + return allIssues; +} + +/** + * Group issues by their quoted text for deduplication + * Issues with similar text (after normalization) are grouped together + * + * @param issues - Flattened issues with extractor IDs + * @returns Map of normalized text to array of issues + */ +export function groupIssuesByText( + issues: Array +): Map> { + const groups = new Map>(); + + for (const issue of issues) { + // Normalize text for comparison + const normalizedText = issue.exactText + .toLowerCase() + .replace(/\s+/g, ' ') + .trim(); + + const existing = groups.get(normalizedText); + if (existing) { + existing.push(issue); + } else { + groups.set(normalizedText, [issue]); + } + } + + return groups; +} + +/** + * Simple majority-vote deduplication (for 
use when judge is disabled) + * Keeps issues found by multiple extractors OR high-confidence single-source issues + * + * @param result - Multi-extractor result + * @param options - Dedup options + * @returns Deduplicated issues + */ +export function simpleDeduplication( + result: MultiExtractorResult, + options: { + /** Minimum extractors that must agree for low-confidence issues */ + minAgreement?: number; + /** Confidence threshold for single-source acceptance */ + singleSourceConfidenceThreshold?: number; + } = {} +): ExtractedFallacyIssue[] { + const { + minAgreement = 2, + singleSourceConfidenceThreshold = 85, + } = options; + + const flatIssues = flattenExtractorIssues(result); + const grouped = groupIssuesByText(flatIssues); + const deduped: ExtractedFallacyIssue[] = []; + + for (const [, issues] of grouped) { + const sourceCount = new Set(issues.map((i) => i.extractorId)).size; + + // Keep if multiple extractors found it + if (sourceCount >= minAgreement) { + // Pick the issue with highest confidence + const bestIssue = issues.reduce((best, current) => + current.confidenceScore > best.confidenceScore ? current : best + ); + deduped.push(bestIssue); + continue; + } + + // Keep single-source issues only if high confidence + const bestIssue = issues[0]; + if (bestIssue.confidenceScore >= singleSourceConfidenceThreshold) { + deduped.push(bestIssue); + } + } + + return deduped; +} diff --git a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/types.ts b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/types.ts new file mode 100644 index 00000000..7125fff6 --- /dev/null +++ b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/types.ts @@ -0,0 +1,235 @@ +/** + * Multi-Extractor Types + * + * Configuration and result types for running multiple fallacy extractors + * in parallel with LLM judge aggregation. 
+ */ + +import type { ExtractedFallacyIssue } from '../../../../tools/fallacy-extractor/types'; + +// ============================================================================ +// Configuration Types +// ============================================================================ + +/** + * Configuration for a single extractor instance + */ +export interface ExtractorConfig { + /** Model ID (Claude or OpenRouter format) */ + model: string; + + /** + * Temperature setting: + * - undefined: Use model-specific default (0 for Claude, 0.1 for OpenRouter) + * - number: Use this specific temperature + * - "default": Let the model use its own default (don't pass temperature) + */ + temperature?: number | 'default'; + + /** Optional display label (auto-generated if not provided) */ + label?: string; + + /** + * Whether to enable extended thinking/reasoning mode. + * - true (default): Enable extended thinking (Claude) / reasoning (OpenRouter/Gemini) + * - false: Disable extended thinking for faster, cheaper responses + */ + thinking?: boolean; +} + +/** + * Configuration for multi-extractor execution + */ +export interface MultiExtractorConfig { + /** List of extractor configurations to run in parallel */ + extractors: ExtractorConfig[]; + + /** Model to use for judge aggregation (default: claude-sonnet-4-5-20250929) */ + judgeModel?: string; + + /** Whether to use LLM judge for aggregation (default: false - uses simple dedup) */ + judgeEnabled: boolean; +} + +// ============================================================================ +// Extractor Result Types +// ============================================================================ + +/** + * Result from a single extractor run + */ +export interface ExtractorResult { + /** Unique identifier for this extractor (e.g., "sonnet-t0", "gemini-flash-t0.1") */ + extractorId: string; + + /** The configuration used for this extractor */ + config: ExtractorConfig; + + /** Issues extracted by this model */ + issues: 
ExtractedFallacyIssue[]; + + /** Execution time in milliseconds */ + durationMs: number; + + /** Cost in USD (if available) */ + costUsd?: number; + + /** Error message if extraction failed */ + error?: string; +} + +/** + * Combined result from running multiple extractors in parallel + */ +export interface MultiExtractorResult { + /** Results from each extractor */ + extractorResults: ExtractorResult[]; + + /** Wall clock time (parallel execution) */ + totalDurationMs: number; + + /** Total issues across all extractors (before dedup/judge) */ + totalIssuesFound: number; +} + +// ============================================================================ +// Judge Types +// ============================================================================ + +/** + * Reference to an issue from a specific extractor + */ +export interface ExtractorIssueRef { + extractorId: string; + issue: ExtractedFallacyIssue; +} + +/** + * An issue after judge evaluation with provenance tracking + */ +export interface JudgedIssue { + /** The final merged/selected issue */ + issue: ExtractedFallacyIssue; + + /** Which extractors found this or similar issues */ + sourceExtractors: string[]; + + /** The original issues that were merged/deduplicated into this one */ + originalIssues: ExtractorIssueRef[]; + + /** Judge's decision */ + decision: 'accepted' | 'merged' | 'rejected'; + + /** Judge's reasoning for this decision */ + reasoning: string; +} + +/** + * Output from the LLM judge aggregator + */ +export interface JudgeOutput { + /** Issues accepted by the judge */ + acceptedIssues: JudgedIssue[]; + + /** Issues rejected by the judge (for telemetry) */ + rejectedIssues: JudgedIssue[]; + + /** Judge execution time */ + durationMs: number; + + /** Judge cost in USD (if available) */ + costUsd?: number; +} + +// ============================================================================ +// Telemetry Types +// ============================================================================ + 
+/** + * Telemetry for a single extractor + */ +export interface ExtractorTelemetry { + extractorId: string; + model: string; + + /** + * Effective temperature used for this extractor. + * This is the actual value sent to the API (resolved from config). + */ + temperature: number; + + /** + * Original temperature configuration. + * - "default": Model's native default was used + * - number: Explicit temperature was configured + * - undefined: Our model-specific default was used + */ + temperatureConfig?: number | 'default'; + + /** + * Whether extended thinking/reasoning was enabled. + * - true: Thinking enabled (Claude) / high reasoning (OpenRouter) + * - false: Thinking disabled for faster, cheaper responses + */ + thinkingEnabled: boolean; + + issuesFound: number; + durationMs: number; + costUsd?: number; + error?: string; + + /** Breakdown of issues by type */ + issuesByType: Record; +} + +/** + * Record of a judge decision (for drill-down) + */ +export interface JudgeDecisionRecord { + /** The quoted text from the issue */ + issueText: string; + + /** Issue type (e.g., "logical-fallacy", "missing-context") */ + issueType: string; + + /** Judge's decision */ + decision: 'accepted' | 'merged' | 'rejected'; + + /** Judge's reasoning */ + reasoning: string; + + /** Which extractors found this issue */ + sourceExtractors: string[]; + + /** Final severity after judge assessment */ + finalSeverity?: number; + + /** Final confidence after judge assessment */ + finalConfidence?: number; +} + +/** + * Complete telemetry for the extraction phase + */ +export interface ExtractionPhaseTelemetry { + /** Per-extractor breakdown */ + extractors: ExtractorTelemetry[]; + + /** Total issues before judge aggregation */ + totalIssuesBeforeJudge: number; + + /** Total issues after judge aggregation */ + totalIssuesAfterJudge: number; + + /** Model used for judge */ + judgeModel: string; + + /** Judge execution time */ + judgeDurationMs: number; + + /** Judge cost in USD */ + 
judgeCostUsd?: number; + + /** Detailed decisions for drill-down */ + judgeDecisions: JudgeDecisionRecord[]; +} diff --git a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts index 22fb5de9..a1dba0e4 100644 --- a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts +++ b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts @@ -4,9 +4,12 @@ import { import { logger } from "../../../shared/logger"; import type { Comment, ToolChainResult } from "../../../shared/types"; import fallacyExtractorTool from "../../../tools/fallacy-extractor"; +import type { ExtractedFallacyIssue } from "../../../tools/fallacy-extractor/types"; import fuzzyTextLocatorTool from "../../../tools/smart-text-searcher"; import fallacyReviewTool from "../../../tools/fallacy-review"; import supportedElsewhereFilterTool from "../../../tools/supported-elsewhere-filter"; +import fallacyJudgeTool from "../../../tools/fallacy-judge"; +import { decisionToIssue } from "../../../tools/fallacy-judge/types"; import { TextChunk } from "../../TextChunk"; import type { AnalysisResult, @@ -16,7 +19,21 @@ import type { import { LIMITS, THRESHOLDS, ISSUE_TYPES } from "./constants"; import { buildFallacyComment } from "./comments/builder"; import { FallacyIssue } from "./FallacyIssue"; -import { PipelineTelemetry, PIPELINE_STAGES, type PipelineExecutionRecord } from "./telemetry"; +import { + PipelineTelemetry, + PIPELINE_STAGES, + type PipelineExecutionRecord, + type ExtractionPhaseTelemetry, + type ExtractorTelemetry, + type JudgeDecisionRecord, +} from "./telemetry"; +import { + getMultiExtractorConfig, + isMultiExtractorEnabled, + getDefaultTemperature, + getConfigSummary, +} from "./extraction/config"; +import { runMultiExtractor, simpleDeduplication } from "./extraction/multiExtractor"; export class FallacyCheckPlugin implements SimpleAnalysisPlugin { private documentText: 
string; @@ -138,7 +155,7 @@ export class FallacyCheckPlugin implements SimpleAnalysisPlugin { // Phase 1: Single-pass extraction on full document telemetry.startStage(PIPELINE_STAGES.EXTRACTION, 1); // 1 = full document - const extractionResult = await this.extractIssuesFromDocument(documentText); + const extractionResult = await this.extractIssuesFromDocument(documentText, telemetry); const allIssues: FallacyIssue[] = extractionResult.issues; telemetry.endStage(allIssues.length, { error: extractionResult.error, @@ -257,33 +274,49 @@ export class FallacyCheckPlugin implements SimpleAnalysisPlugin { * Extract issues from the full document in a single pass. * This provides complete context for better accuracy and reduces false positives * from flagging intro claims that are supported later in the document. + * + * Supports multi-extractor mode when FALLACY_EXTRACTORS env var is set. */ - private async extractIssuesFromDocument(documentText: string): Promise<{ + private async extractIssuesFromDocument( + documentText: string, + telemetry: PipelineTelemetry + ): Promise<{ + issues: FallacyIssue[]; + error?: string; + }> { + const multiExtractorEnabled = isMultiExtractorEnabled(); + + if (multiExtractorEnabled) { + return this.extractWithMultiExtractor(documentText, telemetry); + } + + return this.extractWithSingleExtractor(documentText, telemetry); + } + + /** + * Single extractor mode (default, backwards compatible) + */ + private async extractWithSingleExtractor( + documentText: string, + telemetry: PipelineTelemetry + ): Promise<{ issues: FallacyIssue[]; error?: string; }> { try { - // Track tool execution if session manager is available const sessionManager = getGlobalSessionManager(); const executeExtraction = async () => { return await fallacyExtractorTool.execute( - { - documentText, // Full document for single-pass analysis and location finding - }, - { - logger, - } + { documentText }, + { logger } ); }; const result = sessionManager - ? 
await sessionManager.trackTool( - "extract-fallacy-issues", - executeExtraction - ) + ? await sessionManager.trackTool("extract-fallacy-issues", executeExtraction) : await executeExtraction(); - // Create a synthetic "chunk" representing the full document for FallacyIssue compatibility + // Create a synthetic "chunk" representing the full document const fullDocChunk = new TextChunk("full-document", documentText, { position: { start: 0, end: documentText.length }, }); @@ -292,9 +325,34 @@ export class FallacyCheckPlugin implements SimpleAnalysisPlugin { (issue) => new FallacyIssue(issue, fullDocChunk, this.processingStartTime) ); - return { - issues, + // Record single-extractor telemetry + const config = getMultiExtractorConfig(); + const extractor = config.extractors[0]; + const extractorTelemetry: ExtractionPhaseTelemetry = { + multiExtractorEnabled: false, + extractors: [ + { + extractorId: "default", + model: extractor.model, + // Resolve temperature for telemetry: "default" -> model default, number -> use as-is + temperature: typeof extractor.temperature === 'number' + ? 
extractor.temperature + : getDefaultTemperature(extractor.model), + // Store original config for display + temperatureConfig: extractor.temperature, + thinkingEnabled: extractor.thinking !== false, + issuesFound: result.issues.length, + durationMs: 0, // Not tracked in single mode + issuesByType: this.countIssuesByType(result.issues), + }, + ], + totalIssuesBeforeJudge: result.issues.length, + totalIssuesAfterJudge: result.issues.length, + judgeDecisions: [], }; + telemetry.setExtractionPhase(extractorTelemetry); + + return { issues }; } catch (error) { logger.error("Error extracting issues from document:", error); return { @@ -304,6 +362,167 @@ export class FallacyCheckPlugin implements SimpleAnalysisPlugin { } } + /** + * Multi-extractor mode with LLM judge aggregation + */ + private async extractWithMultiExtractor( + documentText: string, + telemetry: PipelineTelemetry + ): Promise<{ + issues: FallacyIssue[]; + error?: string; + }> { + const config = getMultiExtractorConfig(); + + logger.info(`[FallacyCheckPlugin] Multi-extractor mode enabled`); + logger.info(getConfigSummary()); + + try { + // Phase 1: Run all extractors in parallel + const multiResult = await runMultiExtractor(documentText, config); + + // Collect telemetry for each extractor + const extractorsTelemetry: ExtractorTelemetry[] = multiResult.extractorResults.map( + (r) => ({ + extractorId: r.extractorId, + model: r.config.model, + // Resolve temperature for telemetry: "default" -> model default, number -> use as-is + temperature: typeof r.config.temperature === 'number' + ? 
r.config.temperature + : getDefaultTemperature(r.config.model), + // Store original config for display + temperatureConfig: r.config.temperature, + thinkingEnabled: r.config.thinking !== false, + issuesFound: r.issues.length, + durationMs: r.durationMs, + costUsd: r.costUsd, + error: r.error, + issuesByType: this.countIssuesByType(r.issues), + }) + ); + + // Phase 2: Aggregate issues (via LLM judge or simple dedup) + const successfulExtractors = multiResult.extractorResults.filter((r) => !r.error); + let finalIssues: ExtractedFallacyIssue[]; + let judgeDecisions: JudgeDecisionRecord[] = []; + let judgeDurationMs: number | undefined; + let judgeCostUsd: number | undefined; + + if (multiResult.totalIssuesFound === 0) { + finalIssues = []; + } else if (successfulExtractors.length <= 1 || !config.judgeEnabled) { + // Single extractor or judge disabled - use simple deduplication + if (successfulExtractors.length > 1) { + logger.info( + `[FallacyCheckPlugin] Using simple deduplication (judge disabled)` + ); + finalIssues = simpleDeduplication(multiResult); + } else { + logger.info( + `[FallacyCheckPlugin] Single extractor - no deduplication needed` + ); + finalIssues = successfulExtractors.flatMap((r) => r.issues); + } + } else { + // Multiple extractors with judge enabled - use LLM judge + const judgeInput = { + documentText, + issues: multiResult.extractorResults.flatMap((r) => + r.issues.map((issue) => ({ + extractorId: r.extractorId, + exactText: issue.exactText, + issueType: issue.issueType, + fallacyType: issue.fallacyType, + severityScore: issue.severityScore, + confidenceScore: issue.confidenceScore, + importanceScore: issue.importanceScore, + reasoning: issue.reasoning, + })) + ), + extractorIds: successfulExtractors.map((r) => r.extractorId), + }; + + logger.info( + `[FallacyCheckPlugin] Running LLM judge on ${judgeInput.issues.length} issues from ${judgeInput.extractorIds.length} extractors` + ); + + const judgeStartTime = Date.now(); + const judgeResult = 
await fallacyJudgeTool.execute(judgeInput, { logger }); + judgeDurationMs = Date.now() - judgeStartTime; + + // Convert judge decisions to issues + finalIssues = judgeResult.acceptedDecisions.map((d) => decisionToIssue(d)); + + // Record judge decisions for telemetry + judgeDecisions = [ + ...judgeResult.acceptedDecisions.map((d) => ({ + issueText: d.finalText, + issueType: d.finalIssueType, + decision: (d.decision === 'accept' || d.decision === 'merge' ? 'accepted' : 'rejected') as 'accepted' | 'merged' | 'rejected', + reasoning: d.judgeReasoning, + sourceExtractors: d.sourceExtractors, + finalSeverity: d.finalSeverity, + finalConfidence: d.finalConfidence, + })), + ...judgeResult.rejectedDecisions.map((d) => ({ + issueText: d.finalText, + issueType: d.finalIssueType, + decision: 'rejected' as const, + reasoning: d.judgeReasoning, + sourceExtractors: d.sourceExtractors, + finalSeverity: d.finalSeverity, + finalConfidence: d.finalConfidence, + })), + ]; + + logger.info( + `[FallacyCheckPlugin] Judge aggregation complete: ${finalIssues.length} accepted, ${judgeResult.rejectedDecisions.length} rejected` + ); + } + + // Record extraction phase telemetry + const extractionTelemetry: ExtractionPhaseTelemetry = { + multiExtractorEnabled: true, + extractors: extractorsTelemetry, + totalIssuesBeforeJudge: multiResult.totalIssuesFound, + totalIssuesAfterJudge: finalIssues.length, + judgeModel: config.judgeModel, + judgeDurationMs, + judgeCostUsd, + judgeDecisions, + }; + telemetry.setExtractionPhase(extractionTelemetry); + + // Create FallacyIssue objects + const fullDocChunk = new TextChunk("full-document", documentText, { + position: { start: 0, end: documentText.length }, + }); + + const issues = finalIssues.map( + (issue) => new FallacyIssue(issue, fullDocChunk, this.processingStartTime) + ); + + return { issues }; + } catch (error) { + logger.error("Error in multi-extractor mode:", error); + return { + issues: [], + error: error instanceof Error ? 
error.message : "Unknown error", + }; + } + } + + /** + * Count issues by type for telemetry + */ + private countIssuesByType(issues: ExtractedFallacyIssue[]): Record { + const counts: Record = {}; + for (const issue of issues) { + counts[issue.issueType] = (counts[issue.issueType] || 0) + 1; + } + return counts; + } + private deduplicateIssues(issues: FallacyIssue[]): FallacyIssue[] { const seen = new Set(); const unique: FallacyIssue[] = []; diff --git a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/PipelineTelemetry.ts b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/PipelineTelemetry.ts index eac3138a..d7a8658f 100644 --- a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/PipelineTelemetry.ts +++ b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/PipelineTelemetry.ts @@ -11,6 +11,7 @@ import type { PipelineExecutionRecord, PipelineStage, FilteredItemRecord, + ExtractionPhaseTelemetry, } from './types'; /** Current pipeline version - increment when making significant changes */ @@ -51,6 +52,7 @@ export class PipelineTelemetry { private stages: StageMetrics[] = []; private activeStage: ActiveStage | null = null; private filteredItems: FilteredItemRecord[] = []; + private extractionPhase: ExtractionPhaseTelemetry | null = null; private finalCounts: PipelineExecutionRecord['finalCounts'] = { issuesExtracted: 0, issuesAfterDedup: 0, @@ -173,6 +175,14 @@ export class PipelineTelemetry { return this; } + /** + * Set extraction phase telemetry (for multi-extractor mode) + */ + setExtractionPhase(telemetry: ExtractionPhaseTelemetry): this { + this.extractionPhase = telemetry; + return this; + } + /** * Calculate total cost from all stages */ @@ -210,6 +220,7 @@ export class PipelineTelemetry { totalCostUsd: this.calculateTotalCost(), pipelineVersion: PIPELINE_VERSION, filteredItems: this.filteredItems, // Always include (even if empty) so we know telemetry was 
captured + extractionPhase: this.extractionPhase || undefined, }; } diff --git a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/index.ts b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/index.ts index 0a403bfa..4a2ea9cb 100644 --- a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/index.ts +++ b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/index.ts @@ -10,5 +10,8 @@ export { type PipelineExecutionRecord, type PipelineStage, type FilteredItemRecord, + type ExtractorTelemetry, + type JudgeDecisionRecord, + type ExtractionPhaseTelemetry, PIPELINE_STAGES, } from './types'; diff --git a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/types.ts b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/types.ts index 69f26ade..84b3264a 100644 --- a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/types.ts +++ b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/telemetry/types.ts @@ -60,6 +60,116 @@ export interface FilteredItemRecord { originalIndex: number; } +// ============================================================================ +// Multi-Extractor Telemetry Types +// ============================================================================ + +/** + * Telemetry for a single extractor run + */ +export interface ExtractorTelemetry { + /** Unique extractor ID (e.g., "sonnet-0", "gemini-flash-1") */ + extractorId: string; + + /** Model used */ + model: string; + + /** + * Effective temperature used for this extractor. + * This is the actual value sent to the API (resolved from config). + */ + temperature: number; + + /** + * Original temperature configuration. 
+ * - "default": Model's native default was used + * - number: Explicit temperature was configured + * - undefined: Our model-specific default was used + */ + temperatureConfig?: number | 'default'; + + /** + * Whether extended thinking/reasoning was enabled. + * - true: Thinking enabled (Claude) / high reasoning (OpenRouter) + * - false: Thinking disabled for faster, cheaper responses + */ + thinkingEnabled: boolean; + + /** Number of issues found by this extractor */ + issuesFound: number; + + /** Execution time in milliseconds */ + durationMs: number; + + /** Cost in USD (if available) */ + costUsd?: number; + + /** Error message if extraction failed */ + error?: string; + + /** Breakdown of issues by type */ + issuesByType: Record; +} + +/** + * Record of a judge decision (for drill-down) + */ +export interface JudgeDecisionRecord { + /** The quoted text from the issue */ + issueText: string; + + /** Issue type (e.g., "logical-fallacy", "missing-context") */ + issueType: string; + + /** Judge's decision */ + decision: 'accepted' | 'merged' | 'rejected'; + + /** Judge's reasoning */ + reasoning: string; + + /** Which extractors found this issue */ + sourceExtractors: string[]; + + /** Final severity after judge assessment */ + finalSeverity?: number; + + /** Final confidence after judge assessment */ + finalConfidence?: number; +} + +/** + * Complete telemetry for the extraction phase (multi-extractor mode) + */ +export interface ExtractionPhaseTelemetry { + /** Whether multi-extractor mode was used */ + multiExtractorEnabled: boolean; + + /** Per-extractor breakdown */ + extractors: ExtractorTelemetry[]; + + /** Total issues before judge aggregation */ + totalIssuesBeforeJudge: number; + + /** Total issues after judge aggregation */ + totalIssuesAfterJudge: number; + + /** Model used for judge (if multi-extractor enabled) */ + judgeModel?: string; + + /** Judge execution time (if multi-extractor enabled) */ + judgeDurationMs?: number; + + /** Judge cost in USD 
(if available) */ + judgeCostUsd?: number; + + /** Detailed decisions for drill-down */ + judgeDecisions: JudgeDecisionRecord[]; +} + +// ============================================================================ +// Pipeline Execution Record +// ============================================================================ + /** * Complete pipeline execution record */ @@ -110,6 +220,9 @@ export interface PipelineExecutionRecord { /** Details about items that were filtered out (for debugging/validation) */ filteredItems?: FilteredItemRecord[]; + + /** Detailed extraction phase telemetry (multi-extractor mode) */ + extractionPhase?: ExtractionPhaseTelemetry; } /** diff --git a/internal-packages/ai/src/claude/wrapper.ts b/internal-packages/ai/src/claude/wrapper.ts index 26563e58..44c56aa6 100644 --- a/internal-packages/ai/src/claude/wrapper.ts +++ b/internal-packages/ai/src/claude/wrapper.ts @@ -25,6 +25,13 @@ export interface ClaudeCallOptions { enablePromptCaching?: boolean; // Enable Anthropic prompt caching cacheSeed?: string; // Custom cache seed for Helicone response caching timeout?: number; // Custom timeout in milliseconds + /** + * Whether to enable extended thinking mode. + * - true (default): Enable extended thinking with budget of 10000 tokens + * - false: Disable extended thinking for faster, cheaper responses + * Note: Extended thinking requires temperature=1, so temperature is ignored when enabled. + */ + thinking?: boolean; } export interface ClaudeCallResult { @@ -115,11 +122,23 @@ export async function callClaude( await new Promise(resolve => setTimeout(resolve, delay)); } + // Determine if extended thinking is enabled (default: false for tool calls to save cost) + // When thinking is enabled, temperature must be 1 + const thinkingEnabled = options.thinking === true; + const effectiveTemperature = thinkingEnabled ? 1 : (options.temperature ?? 
0); + const requestOptions: Anthropic.Messages.MessageCreateParams = { model, max_tokens: options.max_tokens || 4000, - temperature: options.temperature ?? 0, - messages: options.messages + temperature: effectiveTemperature, + messages: options.messages, + // Add thinking configuration when enabled + ...(thinkingEnabled && { + thinking: { + type: "enabled" as const, + budget_tokens: 10000, // Default budget for extended thinking + } + }), }; if (options.system) { diff --git a/internal-packages/ai/src/tools/claim-evaluator/index.ts b/internal-packages/ai/src/tools/claim-evaluator/index.ts index fa7bee17..c13da2d8 100644 --- a/internal-packages/ai/src/tools/claim-evaluator/index.ts +++ b/internal-packages/ai/src/tools/claim-evaluator/index.ts @@ -1,7 +1,7 @@ import { z } from "zod"; import { Tool, ToolContext } from "../base/Tool"; import { claimEvaluatorConfig } from "../configs"; -import { createOpenRouterClient, OPENROUTER_MODELS, normalizeTemperature } from "../../utils/openrouter"; +import { callOpenRouterChat, OPENROUTER_MODELS, normalizeTemperature } from "../../utils/openrouter"; import { HeliconeSessionManager, setGlobalSessionManager } from "../../helicone/simpleSessionManager"; // Import from new modules @@ -140,7 +140,6 @@ const outputSchema = z.object({ * Evaluate a claim with a single model via OpenRouter with timeout */ async function evaluateWithModel( - client: ReturnType, input: ClaimEvaluatorInput, model: string, context: ToolContext, @@ -173,14 +172,14 @@ async function evaluateWithModel( { provider, model: modelName }, async () => { return Promise.race([ - evaluateWithModelImpl(client, input, model, context), + evaluateWithModelImpl(input, model, context), timeoutPromise, ]); } ); } else { return Promise.race([ - evaluateWithModelImpl(client, input, model, context), + evaluateWithModelImpl(input, model, context), timeoutPromise, ]); } @@ -200,7 +199,6 @@ async function evaluateWithModel( * Implementation of model evaluation (wrapped with timeout) 
*/ async function evaluateWithModelImpl( - client: ReturnType, input: ClaimEvaluatorInput, model: string, context: ToolContext @@ -233,37 +231,25 @@ async function evaluateWithModelImpl( // Track response time const startTime = Date.now(); - const completion = await client.chat.completions.create( - { - model, - messages: [ - { - role: 'user', - content: uniquePrompt, - }, - ], - max_tokens: maxTokens, - temperature: actualTemperature, // Normalized per provider (Anthropic 0-1, others 0-2) - // Use OpenRouter's standard response_format parameter for JSON mode - // Works across all providers (OpenAI, Gemini, etc.) through OpenRouter - response_format: { type: 'json_object' }, - }, - { - // Pass headers to disable caching (via request options) - // Helicone caching: Use unique seed per request to prevent cache hits - headers: { - 'X-No-Cache': 'true', - 'Helicone-Cache-Enabled': 'false', - 'Helicone-Cache-Seed': uniqueId, // Unique seed ensures no cache reuse - } as Record, - } - ); + const completion = await callOpenRouterChat({ + model, + messages: [ + { + role: 'user', + content: uniquePrompt, + }, + ], + max_tokens: maxTokens, + temperature: actualTemperature, // Normalized per provider (Anthropic 0-1, others 0-2) + // Use OpenRouter's standard response_format parameter for JSON mode + // Works across all providers (OpenAI, Gemini, etc.) 
through OpenRouter + response_format: { type: 'json_object' }, + }); responseTimeMs = Date.now() - startTime; - const message = completion.choices[0]?.message as MessageWithReasoning | undefined; - rawContent = message?.content || undefined; + rawContent = completion.content || undefined; // Capture reasoning from both GPT-5 (reasoning) and o1/o3 (reasoning_content) - rawThinking = message?.reasoning || message?.reasoning_content || undefined; + rawThinking = completion.reasoning || undefined; rawTokenUsage = completion.usage as TokenUsage | undefined; if (!rawContent) { @@ -404,8 +390,6 @@ export class ClaimEvaluatorTool extends Tool = []; @@ -417,7 +401,7 @@ export class ClaimEvaluatorTool extends Tool evaluateWithModel(client, input, model, context, sessionManager)) + modelRuns.map(({ model }) => evaluateWithModel(input, model, context, sessionManager)) ); // Process results, maintaining index correspondence with modelRuns diff --git a/internal-packages/ai/src/tools/fallacy-extractor/index.ts b/internal-packages/ai/src/tools/fallacy-extractor/index.ts index 9b50d066..ff35cb02 100644 --- a/internal-packages/ai/src/tools/fallacy-extractor/index.ts +++ b/internal-packages/ai/src/tools/fallacy-extractor/index.ts @@ -84,6 +84,11 @@ const inputSchema = z.object({ documentText: z.string().optional().describe("Full document text - used for analysis in single-pass mode, or for location finding in chunk mode"), chunkStartOffset: z.number().min(0).optional().describe("Byte offset where this chunk starts in the full document (optimization for location finding)"), model: z.string().optional().describe("Model to use (Claude or OpenRouter model ID)"), + temperature: z.union([ + z.number().min(0).max(2), + z.literal('default'), + ]).optional().describe("Temperature for extraction (default: 0 for Claude, 0.1 for OpenRouter, 'default' to use model's native default)"), + thinking: z.boolean().optional().describe("Enable extended thinking/reasoning (default: true for Claude, varies 
for OpenRouter)"), }) satisfies z.ZodType; const outputSchema = z.object({ @@ -363,33 +368,46 @@ Analyze ALL sections (argumentative, factual, biographical). Look for statistica let result: { toolResult: ExtractorResults }; + // Determine temperature to use: + // - "default": Don't pass temperature, let model use its native default + // - undefined: Use our model-specific default (0 for Claude, 0.1 for OpenRouter) + // - number: Use explicit value + const useDefaultTemperature = input.temperature === 'default'; + const defaultTemp = isOpenRouterModel ? 0.1 : 0; + const temperature = useDefaultTemperature ? undefined : (typeof input.temperature === 'number' ? input.temperature : defaultTemp); + + // Thinking parameter: undefined/true = enabled, false = disabled + const thinkingEnabled = input.thinking !== false; + if (isOpenRouterModel && modelId) { // Use OpenRouter for non-Claude models (Gemini, GPT, etc.) - console.log(`πŸ“‘ Calling OpenRouter API with model: ${modelId}`); + console.log(`πŸ“‘ Calling OpenRouter API with model: ${modelId}, temp: ${temperature ?? 'default'}, thinking: ${thinkingEnabled}`); result = await callOpenRouterWithTool({ model: modelId, system: systemPrompt, messages: [{ role: "user", content: userPrompt }], max_tokens: 8000, - temperature: 0.1, // OpenRouter doesn't support temp=0 for all models + ...(temperature !== undefined && { temperature }), toolName: "extract_fallacy_issues", toolDescription: "Extract and score fallacy issues from text", toolSchema, + thinking: thinkingEnabled, }); } else { // Use Claude API directly - console.log(`πŸ€– Calling Claude API${modelId ? ` with model: ${modelId}` : ""}`); + console.log(`πŸ€– Calling Claude API${modelId ? ` with model: ${modelId}` : ""}, temp: ${temperature ?? 
'default'}, thinking: ${thinkingEnabled}`); result = await callClaudeWithTool({ ...(modelId && { model: modelId }), system: systemPrompt, messages: [{ role: "user", content: userPrompt }], max_tokens: 8000, - temperature: 0, + ...(temperature !== undefined && { temperature }), toolName: "extract_fallacy_issues", toolDescription: "Extract and score fallacy issues from text", toolSchema, enablePromptCaching: true, cacheSeed, + thinking: thinkingEnabled, }); } diff --git a/internal-packages/ai/src/tools/fallacy-extractor/types.ts b/internal-packages/ai/src/tools/fallacy-extractor/types.ts index 13a54139..e70ca437 100644 --- a/internal-packages/ai/src/tools/fallacy-extractor/types.ts +++ b/internal-packages/ai/src/tools/fallacy-extractor/types.ts @@ -80,6 +80,22 @@ export interface FallacyExtractorInput { * Examples: "claude-sonnet-4-20250514", "google/gemini-3-flash-preview" */ model?: string; + + /** + * Optional temperature override for extraction. + * - undefined: Use model-specific default (0 for Claude, 0.1 for OpenRouter) + * - number: Use this specific temperature + * - "default": Let the model use its own default (don't pass temperature) + * Use higher values (0.3-0.7) to get more diverse extractions. + */ + temperature?: number | 'default'; + + /** + * Whether to enable extended thinking/reasoning mode. 
+ * - undefined/true: Enable extended thinking (Claude) / reasoning (OpenRouter/Gemini) + * - false: Disable extended thinking for faster, cheaper responses + */ + thinking?: boolean; } /** diff --git a/internal-packages/ai/src/tools/fallacy-judge/config.ts b/internal-packages/ai/src/tools/fallacy-judge/config.ts new file mode 100644 index 00000000..82eb8bcf --- /dev/null +++ b/internal-packages/ai/src/tools/fallacy-judge/config.ts @@ -0,0 +1,12 @@ +import type { ToolConfig } from '../base/Tool'; + +export const fallacyJudgeConfig: ToolConfig = { + id: 'fallacy-judge', + name: 'Fallacy Judge Aggregator', + description: + 'Aggregates fallacy issues from multiple extractors, merging duplicates and filtering weak single-source issues with explainable decisions', + version: '1.0.0', + category: 'utility', + path: '/tools/fallacy-judge', + status: 'beta', +}; diff --git a/internal-packages/ai/src/tools/fallacy-judge/index.ts b/internal-packages/ai/src/tools/fallacy-judge/index.ts new file mode 100644 index 00000000..1495d9c4 --- /dev/null +++ b/internal-packages/ai/src/tools/fallacy-judge/index.ts @@ -0,0 +1,386 @@ +/** + * Fallacy Judge Aggregator Tool + * + * Aggregates issues from multiple extractors using an LLM judge to: + * 1. Group similar/duplicate issues across extractors + * 2. Merge duplicates into single best-formulation issues + * 3. Accept high-confidence multi-source issues + * 4. Reject low-confidence single-source issues + * 5. 
Provide reasoning for each decision + */ + +import { z } from 'zod'; +import { Tool, type ToolContext } from '../base/Tool'; +import { callClaudeWithTool } from '../../claude/wrapper'; +import { fallacyJudgeConfig } from './config'; +import type { + FallacyJudgeInput, + FallacyJudgeOutput, + JudgeDecision, + ExtractorIssueInput, +} from './types'; + +// Default model for judge (can be overridden via env var) +const DEFAULT_JUDGE_MODEL = 'claude-sonnet-4-5-20250929'; + +const extractorIssueInputSchema = z.object({ + extractorId: z.string(), + exactText: z.string(), + issueType: z.string(), + fallacyType: z.string().optional(), + severityScore: z.number(), + confidenceScore: z.number(), + importanceScore: z.number(), + reasoning: z.string(), +}) satisfies z.ZodType; + +const inputSchema = z.object({ + documentText: z.string().min(1), + issues: z.array(extractorIssueInputSchema), + extractorIds: z.array(z.string()), +}) satisfies z.ZodType; + +const judgeDecisionSchema = z.object({ + decision: z.enum(['accept', 'merge', 'reject']), + finalText: z.string(), + finalIssueType: z.string(), + finalFallacyType: z.string().optional(), + finalSeverity: z.number(), + finalConfidence: z.number(), + finalImportance: z.number(), + finalReasoning: z.string(), + sourceExtractors: z.array(z.string()), + sourceIssueIndices: z.array(z.number()), + judgeReasoning: z.string(), +}) satisfies z.ZodType; + +const outputSchema = z.object({ + acceptedDecisions: z.array(judgeDecisionSchema), + rejectedDecisions: z.array(judgeDecisionSchema), + summary: z.object({ + totalInputIssues: z.number(), + uniqueGroups: z.number(), + acceptedCount: z.number(), + mergedCount: z.number(), + rejectedCount: z.number(), + }), +}) satisfies z.ZodType; + +export class FallacyJudgeTool extends Tool { + config = fallacyJudgeConfig; + inputSchema = inputSchema; + outputSchema = outputSchema; + + async execute( + input: FallacyJudgeInput, + context: ToolContext + ): Promise { + context.logger.info( + 
`[FallacyJudge] Aggregating ${input.issues.length} issues from ${input.extractorIds.length} extractors` + ); + + // If no issues or only one extractor, skip judge and return as-is + if (input.issues.length === 0) { + return { + acceptedDecisions: [], + rejectedDecisions: [], + summary: { + totalInputIssues: 0, + uniqueGroups: 0, + acceptedCount: 0, + mergedCount: 0, + rejectedCount: 0, + }, + }; + } + + // If only one extractor, accept all issues (no aggregation needed) + if (input.extractorIds.length === 1) { + const acceptedDecisions = input.issues.map((issue, idx) => ({ + decision: 'accept' as const, + finalText: issue.exactText, + finalIssueType: issue.issueType, + finalFallacyType: issue.fallacyType, + finalSeverity: issue.severityScore, + finalConfidence: issue.confidenceScore, + finalImportance: issue.importanceScore, + finalReasoning: issue.reasoning, + sourceExtractors: [issue.extractorId], + sourceIssueIndices: [idx], + judgeReasoning: 'Single extractor mode - all issues accepted', + })); + + return { + acceptedDecisions, + rejectedDecisions: [], + summary: { + totalInputIssues: input.issues.length, + uniqueGroups: input.issues.length, + acceptedCount: input.issues.length, + mergedCount: 0, + rejectedCount: 0, + }, + }; + } + + // Format issues for the LLM + const formattedIssues = input.issues + .map((issue, idx) => { + return `[Issue ${idx}] Extractor: ${issue.extractorId} +Text: "${issue.exactText.substring(0, 150)}${issue.exactText.length > 150 ? '...' : ''}" +Type: ${issue.issueType}${issue.fallacyType ? ` (${issue.fallacyType})` : ''} +Severity: ${issue.severityScore}, Confidence: ${issue.confidenceScore}, Importance: ${issue.importanceScore} +Reasoning: ${issue.reasoning.substring(0, 200)}${issue.reasoning.length > 200 ? '...' : ''}`; + }) + .join('\n\n'); + + const systemPrompt = `You are an expert epistemic judge aggregating fallacy issues from multiple extractors. + +Your task is to: +1. 
**Group similar issues** - Issues about the same text/concept from different extractors +2. **Make decisions** for each group: + - **accept**: Issue is valid and found by 2+ extractors, OR single-source with very high confidence (β‰₯90) + - **merge**: Multiple extractors found similar issues - combine into best formulation + - **reject**: Low-confidence single-source issue (likely false positive) + +**Decision Guidelines:** +- Multi-source issues (found by 2+ extractors): Almost always accept or merge +- Single-source with confidence β‰₯90: Accept +- Single-source with confidence 80-89 and severity β‰₯80: Consider accepting +- Single-source with confidence <80: Reject as likely false positive + +**When merging:** +- Use the clearest/most specific text formulation +- Take the highest severity and confidence scores +- Combine reasoning from multiple sources +- List ALL source extractors + +**Output Requirements:** +- Every input issue must be accounted for in exactly one decision +- sourceIssueIndices should reference the original issue indices +- sourceExtractors should list which extractors contributed +- judgeReasoning should explain your decision`; + + const userPrompt = `Aggregate these ${input.issues.length} issues from ${input.extractorIds.length} extractors (${input.extractorIds.join(', ')}): + +**Document Context** (first 1500 chars): +${input.documentText.substring(0, 1500)}${input.documentText.length > 1500 ? '\n...[truncated]...' : ''} + +**Issues to Aggregate:** + +${formattedIssues} + +--- + +Group similar issues together and provide your decisions. 
Remember: +- Issues found by multiple extractors are more likely to be valid +- Single-source issues need very high confidence (β‰₯90) to be accepted +- Explain your reasoning for each decision`; + + try { + const judgeModel = process.env.FALLACY_JUDGE_MODEL || DEFAULT_JUDGE_MODEL; + + const result = await callClaudeWithTool<{ + decisions: Array<{ + decision: 'accept' | 'merge' | 'reject'; + finalText: string; + finalIssueType: string; + finalFallacyType?: string; + finalSeverity: number; + finalConfidence: number; + finalImportance: number; + finalReasoning: string; + sourceExtractors: string[]; + sourceIssueIndices: number[]; + judgeReasoning: string; + }>; + }>( + { + model: judgeModel, + system: systemPrompt, + messages: [{ role: 'user', content: userPrompt }], + max_tokens: 4000, + temperature: 0.1, + toolName: 'aggregate_fallacy_issues', + toolDescription: 'Aggregate and deduplicate fallacy issues from multiple extractors', + toolSchema: { + type: 'object', + properties: { + decisions: { + type: 'array', + items: { + type: 'object', + properties: { + decision: { + type: 'string', + enum: ['accept', 'merge', 'reject'], + description: 'Judge decision for this issue/group', + }, + finalText: { + type: 'string', + description: 'Final text for the issue (best formulation)', + }, + finalIssueType: { + type: 'string', + description: 'Final issue type', + }, + finalFallacyType: { + type: 'string', + description: 'Final fallacy type (if applicable)', + }, + finalSeverity: { + type: 'number', + description: 'Final severity score (0-100)', + }, + finalConfidence: { + type: 'number', + description: 'Final confidence score (0-100)', + }, + finalImportance: { + type: 'number', + description: 'Final importance score (0-100)', + }, + finalReasoning: { + type: 'string', + description: 'Best reasoning for this issue', + }, + sourceExtractors: { + type: 'array', + items: { type: 'string' }, + description: 'Which extractors found this issue', + }, + sourceIssueIndices: { + type: 
'array', + items: { type: 'number' }, + description: 'Indices of original issues in this group', + }, + judgeReasoning: { + type: 'string', + description: 'Why you made this decision', + }, + }, + required: [ + 'decision', + 'finalText', + 'finalIssueType', + 'finalSeverity', + 'finalConfidence', + 'finalImportance', + 'finalReasoning', + 'sourceExtractors', + 'sourceIssueIndices', + 'judgeReasoning', + ], + }, + }, + }, + required: ['decisions'], + }, + }, + [] + ); + + // Separate accepted/rejected decisions + const acceptedDecisions: JudgeDecision[] = []; + const rejectedDecisions: JudgeDecision[] = []; + let mergedCount = 0; + + for (const d of result.toolResult.decisions) { + const decision: JudgeDecision = { + decision: d.decision, + finalText: d.finalText, + finalIssueType: d.finalIssueType, + finalFallacyType: d.finalFallacyType, + finalSeverity: d.finalSeverity, + finalConfidence: d.finalConfidence, + finalImportance: d.finalImportance, + finalReasoning: d.finalReasoning, + sourceExtractors: d.sourceExtractors, + sourceIssueIndices: d.sourceIssueIndices, + judgeReasoning: d.judgeReasoning, + }; + + if (d.decision === 'reject') { + rejectedDecisions.push(decision); + } else { + acceptedDecisions.push(decision); + if (d.decision === 'merge') { + mergedCount++; + } + } + } + + context.logger.info( + `[FallacyJudge] Aggregation complete: ${acceptedDecisions.length} accepted, ${mergedCount} merged, ${rejectedDecisions.length} rejected` + ); + + return { + acceptedDecisions, + rejectedDecisions, + summary: { + totalInputIssues: input.issues.length, + uniqueGroups: result.toolResult.decisions.length, + acceptedCount: acceptedDecisions.length, + mergedCount, + rejectedCount: rejectedDecisions.length, + }, + }; + } catch (error) { + context.logger.error('[FallacyJudge] Aggregation failed:', error); + + // Fallback: Simple deduplication without LLM + // Keep all issues, grouping by similar text + const groups = new Map(); + for (let i = 0; i < input.issues.length; 
i++) { + const issue = input.issues[i]; + const normalizedText = issue.exactText.toLowerCase().replace(/\s+/g, ' ').trim(); + const existing = groups.get(normalizedText); + if (existing) { + existing.push(i); + } else { + groups.set(normalizedText, [i]); + } + } + + const acceptedDecisions: JudgeDecision[] = []; + for (const [, indices] of groups) { + // Pick the issue with highest confidence + const bestIdx = indices.reduce((best, current) => + input.issues[current].confidenceScore > input.issues[best].confidenceScore + ? current + : best + ); + const bestIssue = input.issues[bestIdx]; + + acceptedDecisions.push({ + decision: indices.length > 1 ? 'merge' : 'accept', + finalText: bestIssue.exactText, + finalIssueType: bestIssue.issueType, + finalFallacyType: bestIssue.fallacyType, + finalSeverity: bestIssue.severityScore, + finalConfidence: bestIssue.confidenceScore, + finalImportance: bestIssue.importanceScore, + finalReasoning: bestIssue.reasoning, + sourceExtractors: [...new Set(indices.map((i) => input.issues[i].extractorId))], + sourceIssueIndices: indices, + judgeReasoning: 'Fallback deduplication (LLM judge unavailable)', + }); + } + + return { + acceptedDecisions, + rejectedDecisions: [], + summary: { + totalInputIssues: input.issues.length, + uniqueGroups: groups.size, + acceptedCount: acceptedDecisions.length, + mergedCount: acceptedDecisions.filter((d) => d.decision === 'merge').length, + rejectedCount: 0, + }, + }; + } + } +} + +const fallacyJudgeTool = new FallacyJudgeTool(); +export default fallacyJudgeTool; diff --git a/internal-packages/ai/src/tools/fallacy-judge/types.ts b/internal-packages/ai/src/tools/fallacy-judge/types.ts new file mode 100644 index 00000000..af25ded1 --- /dev/null +++ b/internal-packages/ai/src/tools/fallacy-judge/types.ts @@ -0,0 +1,124 @@ +/** + * Types for the Fallacy Judge Aggregator Tool + * + * The judge aggregates issues from multiple extractors, + * merging duplicates and filtering weak single-source issues. 
+ */ + +import type { ExtractedFallacyIssue } from '../fallacy-extractor/types'; + +/** + * An issue from a specific extractor + */ +export interface ExtractorIssueInput { + /** Which extractor found this issue */ + extractorId: string; + + /** The exact text flagged */ + exactText: string; + + /** Issue type */ + issueType: string; + + /** Specific fallacy type (if applicable) */ + fallacyType?: string; + + /** Severity score (0-100) */ + severityScore: number; + + /** Confidence score (0-100) */ + confidenceScore: number; + + /** Importance score (0-100) */ + importanceScore: number; + + /** Reasoning from the extractor */ + reasoning: string; +} + +/** + * Input for the fallacy judge tool + */ +export interface FallacyJudgeInput { + /** Full document text for context */ + documentText: string; + + /** All issues from all extractors */ + issues: ExtractorIssueInput[]; + + /** List of extractor IDs that contributed */ + extractorIds: string[]; +} + +/** + * A judge decision on a single issue or group of similar issues + */ +export interface JudgeDecision { + /** Judge's decision on this issue/group */ + decision: 'accept' | 'merge' | 'reject'; + + /** Final merged/accepted issue text */ + finalText: string; + + /** Final issue type */ + finalIssueType: string; + + /** Final fallacy type (if applicable) */ + finalFallacyType?: string; + + /** Final severity (may be adjusted by judge) */ + finalSeverity: number; + + /** Final confidence (may be adjusted by judge) */ + finalConfidence: number; + + /** Final importance (may be adjusted by judge) */ + finalImportance: number; + + /** Best reasoning from sources (or synthesized by judge) */ + finalReasoning: string; + + /** Which extractors found this issue */ + sourceExtractors: string[]; + + /** Original issues from each extractor (indices into input.issues) */ + sourceIssueIndices: number[]; + + /** Judge's reasoning for this decision */ + judgeReasoning: string; +} + +/** + * Output from the fallacy judge tool + */ 
+export interface FallacyJudgeOutput { + /** Decisions for accepted/merged issues */ + acceptedDecisions: JudgeDecision[]; + + /** Decisions for rejected issues (for telemetry) */ + rejectedDecisions: JudgeDecision[]; + + /** Summary stats */ + summary: { + totalInputIssues: number; + uniqueGroups: number; + acceptedCount: number; + mergedCount: number; + rejectedCount: number; + }; +} + +/** + * Convert judge decisions back to ExtractedFallacyIssue format + */ +export function decisionToIssue(decision: JudgeDecision): ExtractedFallacyIssue { + return { + exactText: decision.finalText, + issueType: decision.finalIssueType as ExtractedFallacyIssue['issueType'], + fallacyType: decision.finalFallacyType as ExtractedFallacyIssue['fallacyType'], + severityScore: decision.finalSeverity, + confidenceScore: decision.finalConfidence, + importanceScore: decision.finalImportance, + reasoning: decision.finalReasoning, + }; +} diff --git a/internal-packages/ai/src/tools/generated-schemas.ts b/internal-packages/ai/src/tools/generated-schemas.ts index 726a46ed..db07fec2 100644 --- a/internal-packages/ai/src/tools/generated-schemas.ts +++ b/internal-packages/ai/src/tools/generated-schemas.ts @@ -3,7 +3,7 @@ * Generated by scripts/generate-tool-schemas.ts * DO NOT EDIT MANUALLY * - * Schema Hash: 74d74639d9cc319a253b27fd9dd6141cff7a8ec8ebfff951f09b198cc438ed30 + * Schema Hash: 2cb427621a88e0c5dc1b1dde09e1b73efc5073db8c9ecbce61c6cd52e9208a9f */ export const toolSchemas = { @@ -2405,6 +2405,24 @@ export const toolSchemas = { "model": { "type": "string", "description": "Model to use (Claude or OpenRouter model ID)" + }, + "temperature": { + "anyOf": [ + { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + { + "type": "string", + "const": "default" + } + ], + "description": "Temperature for extraction (default: 0 for Claude, 0.1 for OpenRouter, 'default' to use model's native default)" + }, + "thinking": { + "type": "boolean", + "description": "Enable extended thinking/reasoning 
(default: true for Claude, varies for OpenRouter)" } }, "additionalProperties": false, diff --git a/internal-packages/ai/src/utils/openrouter.ts b/internal-packages/ai/src/utils/openrouter.ts index 82e72970..71ec99ce 100644 --- a/internal-packages/ai/src/utils/openrouter.ts +++ b/internal-packages/ai/src/utils/openrouter.ts @@ -1,22 +1,194 @@ /** - * OpenRouter client factory with Helicone integration - * Provides unified access to multiple LLM providers (Anthropic, OpenAI, xAI, etc.) + * OpenRouter Direct API Client + * + * Uses direct HTTP calls instead of OpenAI SDK for full control over + * OpenRouter-specific parameters like reasoning_effort. + * + * API Docs: https://openrouter.ai/docs/api/reference/parameters */ -import { OpenAI } from 'openai'; import { aiConfig } from '../config'; import { getCurrentHeliconeHeaders } from '../helicone/simpleSessionManager'; -export interface OpenRouterOptions { +// ============================================================================ +// Types +// ============================================================================ + +/** + * Reasoning effort levels supported by OpenRouter + * - "none": Disable reasoning entirely + * - "minimal": ~10% of max_tokens for reasoning + * - "low": ~20% of max_tokens for reasoning + * - "medium": ~50% of max_tokens for reasoning + * - "high": ~80% of max_tokens for reasoning + * - "xhigh": ~95% of max_tokens for reasoning + */ +export type ReasoningEffort = 'none' | 'minimal' | 'low' | 'medium' | 'high' | 'xhigh'; + +/** + * Reasoning configuration for fine-grained control + */ +export interface ReasoningConfig { + /** Effort level (alternative to max_tokens) */ + effort?: ReasoningEffort; + /** Direct token budget for reasoning */ + max_tokens?: number; + /** Whether to exclude reasoning from response */ + exclude?: boolean; + /** Enable reasoning with defaults */ + enabled?: boolean; +} + +/** + * OpenRouter chat message + */ +export interface OpenRouterMessage { + role: 'system' 
| 'user' | 'assistant' | 'tool'; + content: string; + tool_call_id?: string; +} + +/** + * Tool/function definition + */ +export interface OpenRouterTool { + type: 'function'; + function: { + name: string; + description: string; + parameters: Record; + }; +} + +/** + * Tool choice configuration + */ +export type OpenRouterToolChoice = + | 'none' + | 'auto' + | 'required' + | { type: 'function'; function: { name: string } }; + +/** + * OpenRouter API request body + */ +export interface OpenRouterRequest { + model: string; + messages: OpenRouterMessage[]; + + // Generation parameters + max_tokens?: number; + temperature?: number; + top_p?: number; + top_k?: number; + frequency_penalty?: number; + presence_penalty?: number; + repetition_penalty?: number; + min_p?: number; + top_a?: number; + seed?: number; + stop?: string[]; + + // Tool calling + tools?: OpenRouterTool[]; + tool_choice?: OpenRouterToolChoice; + parallel_tool_calls?: boolean; + + // Reasoning control (OpenRouter-specific) + reasoning_effort?: ReasoningEffort; + reasoning?: ReasoningConfig; + + // Output format + response_format?: { type: 'json_object' | 'text' }; + + // Provider-specific passthrough + provider?: { + order?: string[]; + allow_fallbacks?: boolean; + require_parameters?: boolean; + }; +} + +/** + * Tool call in response + */ +export interface OpenRouterToolCall { + id: string; + type: 'function'; + function: { + name: string; + arguments: string; + }; +} + +/** + * Response choice + */ +export interface OpenRouterChoice { + index: number; + message: { + role: 'assistant'; + content: string | null; + tool_calls?: OpenRouterToolCall[]; + }; + finish_reason: 'stop' | 'tool_calls' | 'length' | 'content_filter' | null; +} + +/** + * Token usage + */ +export interface OpenRouterUsage { + prompt_tokens: number; + completion_tokens: number; + total_tokens: number; +} + +/** + * OpenRouter API response + */ +export interface OpenRouterResponse { + id: string; + model: string; + object: 
'chat.completion'; + created: number; + choices: OpenRouterChoice[]; + usage?: OpenRouterUsage; +} + +/** + * API error response + */ +export interface OpenRouterError { + error: { + message: string; + type: string; + code?: string; + }; +} + +// ============================================================================ +// Client Configuration +// ============================================================================ + +export interface OpenRouterClientOptions { apiKey?: string; includeSessionHeaders?: boolean; } /** - * Create an OpenAI client configured for OpenRouter with Helicone proxy - * Supports all models available via OpenRouter (Claude, GPT, Grok, etc.) + * Get the base URL for OpenRouter API (with optional Helicone proxy) */ -export function createOpenRouterClient(options: OpenRouterOptions = {}): OpenAI { +function getBaseUrl(): string { + const heliconeKey = aiConfig.helicone.apiKey || process.env.HELICONE_API_KEY; + return heliconeKey + ? 'https://openrouter.helicone.ai/api/v1' + : 'https://openrouter.ai/api/v1'; +} + +/** + * Build headers for OpenRouter API requests + */ +function buildHeaders(options: OpenRouterClientOptions = {}): Record { const apiKey = options.apiKey || process.env.OPENROUTER_API_KEY || ''; if (!apiKey || apiKey === 'your_openrouter_api_key_here') { @@ -27,89 +199,154 @@ export function createOpenRouterClient(options: OpenRouterOptions = {}): OpenAI } const heliconeKey = aiConfig.helicone.apiKey || process.env.HELICONE_API_KEY; - - // Determine environment for better tracking const isProduction = process.env.NODE_ENV === 'production'; const environment = isProduction ? 'Prod' : 'Dev'; const appTitle = `RoastMyPost Tools - ${environment}`; const referer = isProduction ? 
'https://roastmypost.org' : 'http://localhost:3000'; - // Build default headers - const defaultHeaders: Record = { + const headers: Record = { + 'Content-Type': 'application/json', + 'Authorization': `Bearer ${apiKey}`, 'HTTP-Referer': referer, 'X-Title': appTitle, 'X-Environment': environment, }; + // Add Helicone auth if available + if (heliconeKey) { + headers['Helicone-Auth'] = `Bearer ${heliconeKey}`; + } + // Add session headers if requested if (options.includeSessionHeaders !== false) { const sessionHeaders = getCurrentHeliconeHeaders(); - Object.assign(defaultHeaders, sessionHeaders); + Object.assign(headers, sessionHeaders); } - // Use Helicone proxy if available, otherwise direct OpenRouter - if (heliconeKey) { - return new OpenAI({ - baseURL: 'https://openrouter.helicone.ai/api/v1', - apiKey, - defaultHeaders: { - 'Helicone-Auth': `Bearer ${heliconeKey}`, - ...defaultHeaders, - } - }); - } else { - return new OpenAI({ - baseURL: 'https://openrouter.ai/api/v1', - apiKey, - defaultHeaders, - }); + return headers; +} + +// ============================================================================ +// API Functions +// ============================================================================ + +/** + * Make a direct API call to OpenRouter + */ +export async function callOpenRouter( + request: OpenRouterRequest, + options: OpenRouterClientOptions = {} +): Promise { + const baseUrl = getBaseUrl(); + const headers = buildHeaders(options); + + const response = await fetch(`${baseUrl}/chat/completions`, { + method: 'POST', + headers, + body: JSON.stringify(request), + }); + + if (!response.ok) { + const errorBody = await response.json().catch(() => ({ error: { message: response.statusText } })) as OpenRouterError; + throw new Error(`OpenRouter API error (${response.status}): ${errorBody.error?.message || response.statusText}`); } + + return response.json() as Promise; } +// ============================================================================ +// 
High-Level Chat Interface (no tools) +// ============================================================================ + /** - * Common OpenRouter model identifiers - * Top models selected for reasoning, analysis, and evaluation tasks + * Options for simple chat completions (no tool calling) */ -export const OPENROUTER_MODELS = { - // Top tier - Latest and most capable models (2025) - CLAUDE_SONNET_4_5: 'anthropic/claude-sonnet-4.5', - CLAUDE_SONNET_4: 'anthropic/claude-sonnet-4', - GEMINI_3_PRO: 'google/gemini-3-pro-preview', - GEMINI_3_FLASH: 'google/gemini-3-flash-preview', - GEMINI_2_5_PRO: 'google/gemini-2.5-pro', - GEMINI_2_5_FLASH: 'google/gemini-2.5-flash', - GPT_5: 'openai/gpt-5', - GPT_5_MINI: 'openai/gpt-5-mini', - DEEPSEEK_CHAT_V3_1: 'deepseek/deepseek-chat-v3.1', - GROK_4: 'x-ai/grok-4', +export interface OpenRouterChatOptions { + model: string; + messages: Array<{ role: 'user' | 'assistant' | 'system'; content: string }>; + max_tokens?: number; + temperature?: number; + response_format?: { type: 'json_object' | 'text' }; - // High performance - Established strong models - CLAUDE_3_5_SONNET: 'anthropic/claude-3.5-sonnet', - CLAUDE_3_7_SONNET: 'anthropic/claude-3-7-sonnet-20250219', - GPT_4_TURBO: 'openai/gpt-4-turbo', - GPT_4_1: 'openai/gpt-4.1', - GPT_4_1_MINI: 'openai/gpt-4.1-mini-2025-04-14', - GROK_BETA: 'x-ai/grok-beta', + /** + * Custom headers to pass to the API (e.g., for cache control) + */ + headers?: Record; - // Good value - Fast and cost-effective - CLAUDE_HAIKU: 'anthropic/claude-3-haiku', - CLAUDE_HAIKU_4_5: 'anthropic/claude-haiku-4.5', - GPT_35_TURBO: 'openai/gpt-3.5-turbo', - DEEPSEEK_CHAT: 'deepseek/deepseek-chat', + /** + * Reasoning control + */ + reasoningEffort?: ReasoningEffort; +} - // Legacy/Alternative options - CLAUDE_OPUS: 'anthropic/claude-3-opus', - CLAUDE_4_SONNET_20250522: 'anthropic/claude-4-sonnet-20250522', - GPT_4: 'openai/gpt-4', - GEMINI_PRO: 'google/gemini-pro', - LLAMA_70B: 'meta-llama/llama-3-70b-instruct', -} 
as const; +export interface OpenRouterChatResult { + content: string | null; + reasoning?: string; + model: string; + finishReason: string | null; + usage?: { + prompt_tokens: number; + completion_tokens: number; + total_tokens: number; + }; +} -export type OpenRouterModel = typeof OPENROUTER_MODELS[keyof typeof OPENROUTER_MODELS]; +/** + * Simple chat completion without tool calling + * For cases like claim-evaluator that just need a text response + */ +export async function callOpenRouterChat( + options: OpenRouterChatOptions +): Promise { + const request: OpenRouterRequest = { + model: options.model, + messages: options.messages.map(m => ({ + role: m.role as 'system' | 'user' | 'assistant', + content: m.content, + })), + max_tokens: options.max_tokens || 4000, + temperature: options.temperature, + response_format: options.response_format, + }; + + if (options.reasoningEffort) { + request.reasoning_effort = options.reasoningEffort; + } + + console.log(`πŸ“‘ [OpenRouter] Chat: ${options.model}${options.reasoningEffort ? 
`, reasoning: ${options.reasoningEffort}` : ''}`); + + // Build custom client options with extra headers if provided + const clientOptions: OpenRouterClientOptions = {}; + + const response = await callOpenRouter(request, clientOptions); + + const choice = response.choices[0]; + if (!choice) { + throw new Error('No response from OpenRouter'); + } + + // Extract reasoning from various model formats + const message = choice.message as { + content: string | null; + reasoning?: string; + reasoning_content?: string; + }; + + return { + content: message.content, + reasoning: message.reasoning || message.reasoning_content, + model: response.model, + finishReason: choice.finish_reason, + usage: response.usage, + }; +} + +// ============================================================================ +// High-Level Tool Calling Interface +// ============================================================================ /** - * Call OpenRouter with tool/function calling - * Similar interface to callClaudeWithTool but uses OpenAI-compatible API + * Options for tool-calling requests */ export interface OpenRouterToolCallOptions { model: string; @@ -120,6 +357,20 @@ export interface OpenRouterToolCallOptions { toolName: string; toolDescription: string; toolSchema: Record; + + /** + * Whether to enable extended thinking/reasoning mode. + * - true: Enable reasoning (uses model default or "medium" effort) + * - false: Disable reasoning entirely (reasoning_effort: "none") + * - undefined: Let model use its default behavior + */ + thinking?: boolean; + + /** + * Fine-grained reasoning control (overrides thinking boolean) + * Use this for explicit control over reasoning effort level. 
+ */ + reasoningEffort?: ReasoningEffort; } export interface OpenRouterToolCallResult { @@ -132,19 +383,36 @@ export interface OpenRouterToolCallResult { }; } +/** + * Call OpenRouter with tool/function calling + * Uses direct HTTP for full control over OpenRouter-specific parameters + */ export async function callOpenRouterWithTool( options: OpenRouterToolCallOptions ): Promise> { - const client = createOpenRouterClient(); + // Determine reasoning effort + let reasoningEffort: ReasoningEffort | undefined; + + if (options.reasoningEffort !== undefined) { + // Explicit reasoning effort takes precedence + reasoningEffort = options.reasoningEffort; + } else if (options.thinking === false) { + // Disable reasoning when thinking is false + reasoningEffort = 'none'; + } + // When thinking is true or undefined, don't set reasoning_effort (use model default) - const response = await client.chat.completions.create({ + // Build request + const request: OpenRouterRequest = { model: options.model, messages: [ { role: 'system', content: options.system }, - ...options.messages, + ...options.messages.map(m => ({ role: m.role as 'user' | 'assistant', content: m.content })), ], max_tokens: options.max_tokens || 4000, - temperature: normalizeTemperature(options.temperature || 0.1, options.model), + temperature: options.temperature !== undefined + ? 
normalizeTemperature(options.temperature, options.model) + : normalizeTemperature(0.1, options.model), tools: [ { type: 'function', @@ -159,7 +427,17 @@ export async function callOpenRouterWithTool( type: 'function', function: { name: options.toolName }, }, - }); + }; + + // Add reasoning_effort if specified + if (reasoningEffort !== undefined) { + request.reasoning_effort = reasoningEffort; + console.log(`πŸ“‘ [OpenRouter] Model: ${options.model}, reasoning_effort: ${reasoningEffort}`); + } else { + console.log(`πŸ“‘ [OpenRouter] Model: ${options.model}, reasoning: default`); + } + + const response = await callOpenRouter(request); const choice = response.choices[0]; if (!choice) { @@ -196,6 +474,55 @@ export async function callOpenRouterWithTool( }; } +// ============================================================================ +// Model Configuration +// ============================================================================ + +/** + * Common OpenRouter model identifiers + * Top models selected for reasoning, analysis, and evaluation tasks + */ +export const OPENROUTER_MODELS = { + // Top tier - Latest and most capable models (2025) + CLAUDE_SONNET_4_5: 'anthropic/claude-sonnet-4.5', + CLAUDE_SONNET_4: 'anthropic/claude-sonnet-4', + GEMINI_3_PRO: 'google/gemini-3-pro-preview', + GEMINI_3_FLASH: 'google/gemini-3-flash-preview', + GEMINI_2_5_PRO: 'google/gemini-2.5-pro', + GEMINI_2_5_FLASH: 'google/gemini-2.5-flash', + GPT_5: 'openai/gpt-5', + GPT_5_MINI: 'openai/gpt-5-mini', + DEEPSEEK_CHAT_V3_1: 'deepseek/deepseek-chat-v3.1', + GROK_4: 'x-ai/grok-4', + + // High performance - Established strong models + CLAUDE_3_5_SONNET: 'anthropic/claude-3.5-sonnet', + CLAUDE_3_7_SONNET: 'anthropic/claude-3-7-sonnet-20250219', + GPT_4_TURBO: 'openai/gpt-4-turbo', + GPT_4_1: 'openai/gpt-4.1', + GPT_4_1_MINI: 'openai/gpt-4.1-mini-2025-04-14', + GROK_BETA: 'x-ai/grok-beta', + + // Good value - Fast and cost-effective + CLAUDE_HAIKU: 'anthropic/claude-3-haiku', + 
CLAUDE_HAIKU_4_5: 'anthropic/claude-haiku-4.5', + GPT_35_TURBO: 'openai/gpt-3.5-turbo', + DEEPSEEK_CHAT: 'deepseek/deepseek-chat', + + // Legacy/Alternative options + CLAUDE_OPUS: 'anthropic/claude-3-opus', + CLAUDE_4_SONNET_20250522: 'anthropic/claude-4-sonnet-20250522', + GPT_4: 'openai/gpt-4', + GEMINI_PRO: 'google/gemini-pro', + LLAMA_70B: 'meta-llama/llama-3-70b-instruct', +} as const; + +export type OpenRouterModel = typeof OPENROUTER_MODELS[keyof typeof OPENROUTER_MODELS]; + +// ============================================================================ +// Temperature Utilities +// ============================================================================ + /** * Temperature range configuration by provider * Different providers support different temperature ranges @@ -239,3 +566,14 @@ export function normalizeTemperature(userTemp: number, modelId: string): number const range = PROVIDER_TEMPERATURE_RANGES[provider]; return userTemp * range.max; } + +// ============================================================================ +// Legacy Exports (for backwards compatibility) +// ============================================================================ + +// Note: createOpenRouterClient is no longer needed since we use direct HTTP +// but we keep the export for any code that might reference it +export interface OpenRouterOptions { + apiKey?: string; + includeSessionHeaders?: boolean; +} diff --git a/meta-evals/src/components/Validation.tsx b/meta-evals/src/components/Validation.tsx index f00794f5..0a9bf209 100644 --- a/meta-evals/src/components/Validation.tsx +++ b/meta-evals/src/components/Validation.tsx @@ -414,6 +414,8 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati lostComments: comparison.lostComments, // Include filter reasoning from the current run's telemetry filteredItems: currentEval.pipelineTelemetry?.filteredItems, + // Include extraction phase telemetry for drill-down + extractionPhase: 
currentEval.pipelineTelemetry?.extractionPhase, }, }); } @@ -796,12 +798,36 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati newComments?: Array<{ quotedText: string; header: string | null; description: string }>; lostComments?: Array<{ quotedText: string; header: string | null; description: string }>; filteredItems?: Array<{ stage: string; quotedText: string; header?: string; filterReason: string; supportLocation?: string }>; + extractionPhase?: { + multiExtractorEnabled: boolean; + extractors: Array<{ + extractorId: string; + model: string; + temperature: number; + temperatureConfig?: number | 'default'; + thinkingEnabled: boolean; + issuesFound: number; + durationMs: number; + error?: string; + }>; + totalIssuesBeforeJudge: number; + totalIssuesAfterJudge: number; + judgeModel?: string; + judgeDurationMs?: number; + judgeDecisions: Array<{ + issueText: string; + decision: 'accepted' | 'merged' | 'rejected'; + reasoning: string; + sourceExtractors: string[]; + }>; + }; } | null; const matched = data?.matchedComments || []; const newComments = data?.newComments || []; const lost = data?.lostComments || []; const filteredItems = data?.filteredItems || []; + const extractionPhase = data?.extractionPhase; // Helper to check if a lost comment has a filter reason const hasFilterReason = (lostComment: { quotedText: string; header: string | null }) => { @@ -876,6 +902,20 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati Legend: βœ“ kept + new ⊘ filtered (has reason) βˆ’ not extracted + {extractionPhase && extractionPhase.multiExtractorEnabled && ( + + + Extraction: + + {extractionPhase.extractors.map(e => { + const tempStr = e.temperatureConfig === 'default' ? 'tDef' : `t${e.temperature}`; + const thinkStr = e.thinkingEnabled ? '' : ' noThink'; + return `${e.extractorId}(${tempStr}${thinkStr}):${e.issuesFound}`; + }).join(' | ')} β†’ {extractionPhase.judgeDurationMs ? 
'Judge' : 'Dedup'} β†’ {extractionPhase.totalIssuesAfterJudge}/{extractionPhase.totalIssuesBeforeJudge} kept + + + + )} ; + extractionPhase?: { + multiExtractorEnabled: boolean; + extractors: Array<{ + extractorId: string; + model: string; + temperature: number; + temperatureConfig?: number | 'default'; + thinkingEnabled: boolean; + issuesFound: number; + durationMs: number; + error?: string; + }>; + totalIssuesBeforeJudge: number; + totalIssuesAfterJudge: number; + judgeModel?: string; + judgeDurationMs?: number; + judgeDecisions: Array<{ + issueText: string; + decision: 'accepted' | 'merged' | 'rejected'; + reasoning: string; + sourceExtractors: string[]; + }>; + }; } | null { if (!raw || typeof raw !== "object") return null; @@ -1139,6 +1202,31 @@ function extractTelemetry(raw: unknown): { originalIndex: number; }> | undefined; + // Extract extraction phase telemetry if present + const extractionPhase = telemetry.extractionPhase as { + multiExtractorEnabled: boolean; + extractors: Array<{ + extractorId: string; + model: string; + temperature: number; + temperatureConfig?: number | 'default'; + thinkingEnabled: boolean; + issuesFound: number; + durationMs: number; + error?: string; + }>; + totalIssuesBeforeJudge: number; + totalIssuesAfterJudge: number; + judgeModel?: string; + judgeDurationMs?: number; + judgeDecisions: Array<{ + issueText: string; + decision: 'accepted' | 'merged' | 'rejected'; + reasoning: string; + sourceExtractors: string[]; + }>; + } | undefined; + return { totalDurationMs: (telemetry.totalDurationMs as number) || 0, issuesExtracted: finalCounts.issuesExtracted || 0, @@ -1147,5 +1235,6 @@ function extractTelemetry(raw: unknown): { commentsGenerated: finalCounts.commentsGenerated || 0, commentsKept: finalCounts.commentsKept || 0, filteredItems, + extractionPhase, }; } diff --git a/meta-evals/src/validation/types.ts b/meta-evals/src/validation/types.ts index 7b298c3b..ec95309d 100644 --- a/meta-evals/src/validation/types.ts +++ 
b/meta-evals/src/validation/types.ts @@ -67,6 +67,38 @@ export interface PipelineTelemetrySnapshot { commentsKept: number; /** Items filtered out with their reasoning */ filteredItems?: FilteredItemSnapshot[]; + /** Extraction phase telemetry (multi-extractor mode) */ + extractionPhase?: ExtractionPhaseSnapshot; +} + +/** + * Telemetry for a single extractor run + */ +export interface ExtractorSnapshot { + extractorId: string; + model: string; + temperature: number; + issuesFound: number; + durationMs: number; + error?: string; +} + +/** + * Extraction phase telemetry (for multi-extractor mode) + */ +export interface ExtractionPhaseSnapshot { + multiExtractorEnabled: boolean; + extractors: ExtractorSnapshot[]; + totalIssuesBeforeJudge: number; + totalIssuesAfterJudge: number; + judgeModel?: string; + judgeDurationMs?: number; + judgeDecisions: Array<{ + issueText: string; + decision: 'accepted' | 'merged' | 'rejected'; + reasoning: string; + sourceExtractors: string[]; + }>; } /** From c997c6f3aee9597810b414655e021e7a34e2acbb Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 11 Jan 2026 14:36:53 +0000 Subject: [PATCH 17/72] feat(meta-evals): Add Extractor Lab for testing extraction in isolation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add new Extractor Lab screen to main menu - Allows running fallacy extraction directly without full pipeline - Configure multiple extractors with different models/temperatures - Uses same validation corpus as Validation screen (50 docs) - Display format matches Create Baseline (numbered, with dates) - Export @roast/ai/fallacy-extraction module for external use πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- internal-packages/ai/package.json | 4 + meta-evals/src/app.tsx | 47 +++- meta-evals/src/components/ExtractorLab.tsx | 297 +++++++++++++++++++++ meta-evals/src/components/MainMenu.tsx | 4 + 
meta-evals/src/components/index.ts | 1 + meta-evals/src/components/types.ts | 3 +- 6 files changed, 354 insertions(+), 2 deletions(-) create mode 100644 meta-evals/src/components/ExtractorLab.tsx diff --git a/internal-packages/ai/package.json b/internal-packages/ai/package.json index a34dd25b..619f8833 100644 --- a/internal-packages/ai/package.json +++ b/internal-packages/ai/package.json @@ -48,6 +48,10 @@ "./tools/generated-readmes": { "types": "./src/tools/generated-readmes.ts", "default": "./src/tools/generated-readmes.ts" + }, + "./fallacy-extraction": { + "types": "./src/analysis-plugins/plugins/fallacy-check/extraction/index.ts", + "default": "./src/analysis-plugins/plugins/fallacy-check/extraction/index.ts" } }, "scripts": { diff --git a/meta-evals/src/app.tsx b/meta-evals/src/app.tsx index 1df9594d..1ea79e1b 100644 --- a/meta-evals/src/app.tsx +++ b/meta-evals/src/app.tsx @@ -12,7 +12,7 @@ import { type AgentChoice, } from "@roast/db"; import { apiClient } from "./utils/apiClient"; -import { MainMenu, ScoreRankMenu, CreateBaseline, SeriesDetail, RankRuns, ScoreRun, Validation, type Screen } from "./components"; +import { MainMenu, ScoreRankMenu, CreateBaseline, SeriesDetail, RankRuns, ScoreRun, Validation, ExtractorLab, type Screen } from "./components"; import { getAvailableModels, getRecommendedJudgeModels, DEFAULT_JUDGE_MODEL, type ModelInfo } from "./utils/models"; // ============================================================================ @@ -184,6 +184,38 @@ export function App() { } } + async function startExtractorLab() { + setScreen({ type: "loading" }); + try { + // Get agents and use first one (usually Fallacy Check) + const userId = await apiClient.getUserId(); + const agentChoices = await metaEvaluationRepository.getAvailableAgents(userId); + if (agentChoices.length === 0) { + setError("No agents available"); + return; + } + const agentId = agentChoices[0].id; + + // Get validation corpus for this agent (same as Validation screen) + const 
corpusDocs = await metaEvaluationRepository.getValidationCorpusDocuments( + agentId, + { limit: 50, minContentLength: 200 } + ); + + // Map to DocumentChoice format + const docs = corpusDocs.map((d) => ({ + id: d.documentId, + title: d.title, + createdAt: d.lastEvaluatedAt || new Date(), + })); + + setDocuments(docs); + setScreen({ type: "extractor-lab" }); + } catch (e) { + setError(String(e)); + } + } + // Handle keyboard shortcuts // Disable "q" quit when on document step (text input is active) const isTextInputActive = screen.type === "create-baseline" && screen.step === "document"; @@ -226,6 +258,7 @@ export function App() { height={termHeight} onScoreRank={loadScoreRankMenu} onValidation={() => setScreen({ type: "validation" })} + onExtractorLab={startExtractorLab} onExit={exit} judgeModel={judgeModel} availableModels={availableModels} @@ -375,5 +408,17 @@ export function App() { ); } + if (screen.type === "extractor-lab") { + return ( + + ); + } + return null; } diff --git a/meta-evals/src/components/ExtractorLab.tsx b/meta-evals/src/components/ExtractorLab.tsx new file mode 100644 index 00000000..86aad6e7 --- /dev/null +++ b/meta-evals/src/components/ExtractorLab.tsx @@ -0,0 +1,297 @@ +/** + * Extractor Lab - Test extraction in isolation + * + * Allows running the fallacy extractor directly without the full pipeline, + * for quick iteration on extractor config and prompts. 
+ */ + +import React, { useState, useEffect } from "react"; +import { Box, Text, useInput } from "ink"; +import SelectInput from "ink-select-input"; +import Spinner from "ink-spinner"; +import { prisma, type DocumentChoice } from "@roast/db"; +import type { ExtractorConfig, MultiExtractorResult } from "@roast/ai/fallacy-extraction"; +import { truncate, formatDate } from "./helpers"; + +interface ExtractorLabProps { + height: number; + maxItems: number; + documents: DocumentChoice[]; + onSearchDocuments: (filter: string) => void; + onBack: () => void; +} + +type LabStep = + | { type: "select-document" } + | { type: "configure-extractors" } + | { type: "running" } + | { type: "results"; result: MultiExtractorResult }; + +// Default extractor configs for testing +const DEFAULT_EXTRACTOR_CONFIGS: ExtractorConfig[] = [ + { model: "claude-sonnet-4-5-20250929", temperature: "default", thinking: false }, +]; + +const AVAILABLE_MODELS = [ + { id: "claude-sonnet-4-5-20250929", label: "Claude Sonnet 4.5" }, + { id: "google/gemini-2.5-flash", label: "Gemini 2.5 Flash" }, + { id: "google/gemini-3-flash-preview", label: "Gemini 3 Flash" }, +]; + +export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, onBack }: ExtractorLabProps) { + const [step, setStep] = useState({ type: "select-document" }); + const [selectedDoc, setSelectedDoc] = useState(null); + const [documentText, setDocumentText] = useState(""); + const [extractorConfigs, setExtractorConfigs] = useState(DEFAULT_EXTRACTOR_CONFIGS); + const [error, setError] = useState(null); + + async function loadDocumentText(docId: string) { + try { + // Get latest document version with content + const doc = await prisma.document.findUnique({ + where: { id: docId }, + include: { + versions: { + orderBy: { version: "desc" }, + take: 1, + select: { content: true }, + }, + }, + }); + const content = doc?.versions[0]?.content; + if (content) { + setDocumentText(content); + } else { + setError("Document has no 
content"); + } + } catch (e) { + setError(`Failed to load document text: ${e}`); + } + } + + async function runExtraction() { + if (!documentText) { + setError("No document text loaded"); + return; + } + + setStep({ type: "running" }); + + try { + // Dynamic import for the multi-extractor + const { runMultiExtractor } = await import("@roast/ai/fallacy-extraction"); + + const result = await runMultiExtractor(documentText, { + extractors: extractorConfigs, + judgeEnabled: extractorConfigs.length > 1, // Enable judge if multiple extractors + }); + + setStep({ type: "results", result }); + } catch (e) { + setError(`Extraction failed: ${e}`); + setStep({ type: "configure-extractors" }); + } + } + + // Handle keyboard input + useInput((input, key) => { + if (key.escape) { + if (step.type === "results" || step.type === "configure-extractors") { + setStep({ type: "select-document" }); + } else { + onBack(); + } + } + }); + + if (error) { + return ( + + Error: {error} + Press Escape to go back + + ); + } + + // Document selection + if (step.type === "select-document") { + return ( + + + Extractor Lab - Select Document + + + + Select a document ({documents.length} found) + + + ({ + label: `${String(i + 1).padStart(2)} | ${truncate(d.title, 50).padEnd(50)} | ${formatDate(new Date(d.createdAt))}`, + value: d.id, + }))} + limit={maxItems - 2} + onSelect={async (item) => { + const doc = documents.find((d) => d.id === item.value); + if (doc) { + setSelectedDoc(doc); + await loadDocumentText(doc.id); + setStep({ type: "configure-extractors" }); + } + }} + /> + + + Up/Down Navigate | Enter Select | Escape Back + + + ); + } + + // Configure extractors + if (step.type === "configure-extractors") { + const items = [ + { label: "β–Ά Run Extraction", value: "run" }, + { label: "─────────────────", value: "divider" }, + ...extractorConfigs.map((config, idx) => ({ + label: `[${idx + 1}] ${config.model} (t=${config.temperature}, think=${config.thinking})`, + value: `config-${idx}`, + })), 
+ { label: "+ Add Extractor", value: "add" }, + { label: "─────────────────", value: "divider2" }, + { label: "← Back to Documents", value: "back" }, + ]; + + return ( + + + Extractor Lab - Configure + + + + + + Document: + {selectedDoc?.title.slice(0, 40)} + + + Text length: + {documentText.length} chars + + + Extractors: + {extractorConfigs.length} + + + + + !i.value.startsWith("divider"))} + onSelect={(item) => { + if (item.value === "back") { + setStep({ type: "select-document" }); + } else if (item.value === "run") { + runExtraction(); + } else if (item.value === "add") { + // Add another extractor with different config + const nextModel = AVAILABLE_MODELS[extractorConfigs.length % AVAILABLE_MODELS.length]; + setExtractorConfigs([ + ...extractorConfigs, + { model: nextModel.id, temperature: "default", thinking: false }, + ]); + } else if (item.value.startsWith("config-")) { + // Toggle thinking for this extractor + const idx = parseInt(item.value.replace("config-", ""), 10); + setExtractorConfigs(configs => + configs.map((c, i) => i === idx ? { ...c, thinking: !c.thinking } : c) + ); + } + }} + /> + + + Enter on extractor toggles thinking | Escape Back + + + ); + } + + // Running + if (step.type === "running") { + return ( + + + Extractor Lab - Running + + + + + Running {extractorConfigs.length} extractor(s)... + + + + + This may take a minute... + + + ); + } + + // Results + if (step.type === "results") { + const { result } = step; + const totalIssues = result.extractorResults.reduce((sum, r) => sum + r.issues.length, 0); + + return ( + + + Extractor Lab - Results + + + + + + Total Duration: + {(result.totalDurationMs / 1000).toFixed(1)}s + + + Total Issues: + {totalIssues} + + + + + + Per-Extractor Results: + {result.extractorResults.map((r, idx) => ( + + + {r.extractorId} + ({(r.durationMs / 1000).toFixed(1)}s) + + {r.error ? 
( + Error: {r.error} + ) : ( + Found {r.issues.length} issues + )} + {r.issues.slice(0, 3).map((issue, i) => ( + + {" "}- [{issue.issueType}] {issue.exactText.slice(0, 40)}... + + ))} + {r.issues.length > 3 && ( + ... and {r.issues.length - 3} more + )} + + ))} + + + + Press Escape to go back + + + ); + } + + return null; +} diff --git a/meta-evals/src/components/MainMenu.tsx b/meta-evals/src/components/MainMenu.tsx index ddb986e0..0bf955f6 100644 --- a/meta-evals/src/components/MainMenu.tsx +++ b/meta-evals/src/components/MainMenu.tsx @@ -15,6 +15,7 @@ interface MainMenuProps { height: number; onScoreRank: () => void; onValidation: () => void; + onExtractorLab: () => void; onExit: () => void; judgeModel: string; availableModels: ModelInfo[]; @@ -32,6 +33,7 @@ export function MainMenu({ height, onScoreRank, onValidation, + onExtractorLab, onExit, judgeModel, availableModels, @@ -143,6 +145,7 @@ export function MainMenu({ const items = [ { label: "Score/Rank", value: "score-rank" }, { label: "Validation", value: "validation" }, + { label: "Extractor Lab", value: "extractor-lab" }, { label: "Settings", value: "settings" }, { label: "Exit", value: "exit" }, ]; @@ -170,6 +173,7 @@ export function MainMenu({ if (item.value === "exit") onExit(); else if (item.value === "score-rank") onScoreRank(); else if (item.value === "validation") onValidation(); + else if (item.value === "extractor-lab") onExtractorLab(); else if (item.value === "settings") setShowSettings(true); }} /> diff --git a/meta-evals/src/components/index.ts b/meta-evals/src/components/index.ts index cc7f2a02..5b85a455 100644 --- a/meta-evals/src/components/index.ts +++ b/meta-evals/src/components/index.ts @@ -9,5 +9,6 @@ export { SeriesDetail } from "./SeriesDetail"; export { RankRuns } from "./RankRuns"; export { ScoreRun } from "./ScoreRun"; export { Validation } from "./Validation"; +export { ExtractorLab } from "./ExtractorLab"; export * from "./helpers"; export * from "./types"; diff --git 
a/meta-evals/src/components/types.ts b/meta-evals/src/components/types.ts index 66c14795..164c52b9 100644 --- a/meta-evals/src/components/types.ts +++ b/meta-evals/src/components/types.ts @@ -12,6 +12,7 @@ export type Screen = | { type: "series-detail"; seriesId: string } | { type: "rank-runs"; seriesId: string } | { type: "score-run"; seriesId: string } - | { type: "validation" }; + | { type: "validation" } + | { type: "extractor-lab" }; export type { SeriesSummary, DocumentChoice, AgentChoice }; From 82d1385f8549f2ac46968ac69e04c8ba46a00f14 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 11 Jan 2026 14:44:22 +0000 Subject: [PATCH 18/72] fix(meta-evals): Fix ESM import for fallacy-extraction module - Update package.json export to use dist files instead of src - Use static import instead of dynamic import in ExtractorLab - Fixes ERR_REQUIRE_CYCLE_MODULE error when running extraction Co-Authored-By: Claude Opus 4.5 --- internal-packages/ai/package.json | 4 ++-- meta-evals/src/components/ExtractorLab.tsx | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/internal-packages/ai/package.json b/internal-packages/ai/package.json index 619f8833..b41d2b4b 100644 --- a/internal-packages/ai/package.json +++ b/internal-packages/ai/package.json @@ -50,8 +50,8 @@ "default": "./src/tools/generated-readmes.ts" }, "./fallacy-extraction": { - "types": "./src/analysis-plugins/plugins/fallacy-check/extraction/index.ts", - "default": "./src/analysis-plugins/plugins/fallacy-check/extraction/index.ts" + "types": "./dist/analysis-plugins/plugins/fallacy-check/extraction/index.d.ts", + "default": "./dist/analysis-plugins/plugins/fallacy-check/extraction/index.js" } }, "scripts": { diff --git a/meta-evals/src/components/ExtractorLab.tsx b/meta-evals/src/components/ExtractorLab.tsx index 86aad6e7..47c2ee17 100644 --- a/meta-evals/src/components/ExtractorLab.tsx +++ b/meta-evals/src/components/ExtractorLab.tsx @@ -10,7 +10,7 @@ import { Box, Text, useInput } from 
"ink"; import SelectInput from "ink-select-input"; import Spinner from "ink-spinner"; import { prisma, type DocumentChoice } from "@roast/db"; -import type { ExtractorConfig, MultiExtractorResult } from "@roast/ai/fallacy-extraction"; +import { runMultiExtractor, type ExtractorConfig, type MultiExtractorResult } from "@roast/ai/fallacy-extraction"; import { truncate, formatDate } from "./helpers"; interface ExtractorLabProps { @@ -78,9 +78,6 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o setStep({ type: "running" }); try { - // Dynamic import for the multi-extractor - const { runMultiExtractor } = await import("@roast/ai/fallacy-extraction"); - const result = await runMultiExtractor(documentText, { extractors: extractorConfigs, judgeEnabled: extractorConfigs.length > 1, // Enable judge if multiple extractors From 025e30dedb13b82673cf12d9823cde3b96441490 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 11 Jan 2026 14:55:26 +0000 Subject: [PATCH 19/72] feat(meta-evals): Improve Extractor Lab with env config and scrollable results - Load extractor configs from FALLACY_EXTRACTORS env var - Add scrollable issue list in results view with severity indicators - Add issue detail view showing full reasoning and scores - Improve navigation with proper escape handling between views Co-Authored-By: Claude Opus 4.5 --- meta-evals/src/components/ExtractorLab.tsx | 144 +++++++++++++++------ 1 file changed, 103 insertions(+), 41 deletions(-) diff --git a/meta-evals/src/components/ExtractorLab.tsx b/meta-evals/src/components/ExtractorLab.tsx index 47c2ee17..d7ee5d24 100644 --- a/meta-evals/src/components/ExtractorLab.tsx +++ b/meta-evals/src/components/ExtractorLab.tsx @@ -10,7 +10,7 @@ import { Box, Text, useInput } from "ink"; import SelectInput from "ink-select-input"; import Spinner from "ink-spinner"; import { prisma, type DocumentChoice } from "@roast/db"; -import { runMultiExtractor, type ExtractorConfig, type MultiExtractorResult 
} from "@roast/ai/fallacy-extraction"; +import { runMultiExtractor, getMultiExtractorConfig, type ExtractorConfig, type MultiExtractorResult, type ExtractorResult } from "@roast/ai/fallacy-extraction"; import { truncate, formatDate } from "./helpers"; interface ExtractorLabProps { @@ -25,12 +25,18 @@ type LabStep = | { type: "select-document" } | { type: "configure-extractors" } | { type: "running" } - | { type: "results"; result: MultiExtractorResult }; + | { type: "results"; result: MultiExtractorResult } + | { type: "issue-detail"; result: MultiExtractorResult; extractorIdx: number; issueIdx: number }; -// Default extractor configs for testing -const DEFAULT_EXTRACTOR_CONFIGS: ExtractorConfig[] = [ - { model: "claude-sonnet-4-5-20250929", temperature: "default", thinking: false }, -]; +// Load extractor configs from FALLACY_EXTRACTORS env var, fallback to default +function getInitialExtractorConfigs(): ExtractorConfig[] { + try { + const config = getMultiExtractorConfig(); + return config.extractors; + } catch { + return [{ model: "claude-sonnet-4-5-20250929", temperature: "default", thinking: false }]; + } +} const AVAILABLE_MODELS = [ { id: "claude-sonnet-4-5-20250929", label: "Claude Sonnet 4.5" }, @@ -42,7 +48,7 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o const [step, setStep] = useState({ type: "select-document" }); const [selectedDoc, setSelectedDoc] = useState(null); const [documentText, setDocumentText] = useState(""); - const [extractorConfigs, setExtractorConfigs] = useState(DEFAULT_EXTRACTOR_CONFIGS); + const [extractorConfigs, setExtractorConfigs] = useState(getInitialExtractorConfigs); const [error, setError] = useState(null); async function loadDocumentText(docId: string) { @@ -93,7 +99,11 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o // Handle keyboard input useInput((input, key) => { if (key.escape) { - if (step.type === "results" || step.type === 
"configure-extractors") { + if (step.type === "issue-detail") { + setStep({ type: "results", result: step.result }); + } else if (step.type === "results") { + setStep({ type: "configure-extractors" }); + } else if (step.type === "configure-extractors") { setStep({ type: "select-document" }); } else { onBack(); @@ -234,11 +244,32 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o ); } - // Results + // Results - scrollable list of issues if (step.type === "results") { const { result } = step; const totalIssues = result.extractorResults.reduce((sum, r) => sum + r.issues.length, 0); + // Build flat list of issues with extractor info + const issueItems: Array<{ label: string; value: string }> = []; + result.extractorResults.forEach((r, extractorIdx) => { + // Add extractor header + const tempStr = r.config.temperature === 'default' ? 'tDef' : `t${r.config.temperature}`; + const thinkStr = r.config.thinking ? '' : ' noThink'; + issueItems.push({ + label: `── ${r.extractorId} (${tempStr}${thinkStr}) - ${r.issues.length} issues, ${(r.durationMs / 1000).toFixed(1)}s ──`, + value: `header-${extractorIdx}`, + }); + // Add issues for this extractor + r.issues.forEach((issue, issueIdx) => { + const severityColor = issue.severityScore >= 70 ? 'πŸ”΄' : issue.severityScore >= 40 ? 
'🟑' : '🟒'; + issueItems.push({ + label: ` ${severityColor} [${issue.issueType}] ${truncate(issue.exactText.replace(/\n/g, ' '), 60)}`, + value: `issue-${extractorIdx}-${issueIdx}`, + }); + }); + }); + issueItems.push({ label: "← Back to Configure", value: "back" }); + return ( @@ -246,45 +277,76 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o - - - Total Duration: - {(result.totalDurationMs / 1000).toFixed(1)}s - - - Total Issues: - {totalIssues} - + + Duration: {(result.totalDurationMs / 1000).toFixed(1)}s + | + Issues: {totalIssues} + | + Extractors: {result.extractorResults.length} + + + + { + if (item.value === "back") { + setStep({ type: "configure-extractors" }); + } else if (item.value.startsWith("issue-")) { + const [, extractorIdx, issueIdx] = item.value.split("-"); + setStep({ + type: "issue-detail", + result, + extractorIdx: parseInt(extractorIdx), + issueIdx: parseInt(issueIdx), + }); + } + }} + /> + + + Enter View Detail | Escape Back + + + ); + } + + // Issue detail view + if (step.type === "issue-detail") { + const { result, extractorIdx, issueIdx } = step; + const extractor = result.extractorResults[extractorIdx]; + const issue = extractor.issues[issueIdx]; + + return ( + + + Issue Detail + + + + Extractor: {extractor.extractorId} + Type: {issue.issueType}{issue.fallacyType && ({issue.fallacyType})} + Severity: = 70 ? 'red' : issue.severityScore >= 40 ? 'yellow' : 'green'}>{issue.severityScore}/100 + Confidence: {issue.confidenceScore}/100 + Importance: {issue.importanceScore}/100 + + + + Quoted Text: + + "{truncate(issue.exactText, 200)}" - Per-Extractor Results: - {result.extractorResults.map((r, idx) => ( - - - {r.extractorId} - ({(r.durationMs / 1000).toFixed(1)}s) - - {r.error ? ( - Error: {r.error} - ) : ( - Found {r.issues.length} issues - )} - {r.issues.slice(0, 3).map((issue, i) => ( - - {" "}- [{issue.issueType}] {issue.exactText.slice(0, 40)}... - - ))} - {r.issues.length > 3 && ( - ... 
and {r.issues.length - 3} more - )} - - ))} + Reasoning: + + {truncate(issue.reasoning, 300)} + - Press Escape to go back + Press Escape to go back to results ); From 0c71d52c2f8eab4710487da36a005b6fd05dffaf Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 11 Jan 2026 15:04:26 +0000 Subject: [PATCH 20/72] fix(meta-evals): Fix issue detail view truncation and escape navigation - Remove truncation from Quoted Text and Reasoning in issue detail - Fix escape key navigation using ref to avoid stale closure - Each escape now goes back one step instead of flying to main menu Co-Authored-By: Claude Opus 4.5 --- meta-evals/src/components/ExtractorLab.tsx | 24 ++++++++++++++-------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/meta-evals/src/components/ExtractorLab.tsx b/meta-evals/src/components/ExtractorLab.tsx index d7ee5d24..b021d1f6 100644 --- a/meta-evals/src/components/ExtractorLab.tsx +++ b/meta-evals/src/components/ExtractorLab.tsx @@ -5,7 +5,7 @@ * for quick iteration on extractor config and prompts. 
*/ -import React, { useState, useEffect } from "react"; +import React, { useState, useEffect, useRef } from "react"; import { Box, Text, useInput } from "ink"; import SelectInput from "ink-select-input"; import Spinner from "ink-spinner"; @@ -51,6 +51,10 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o const [extractorConfigs, setExtractorConfigs] = useState(getInitialExtractorConfigs); const [error, setError] = useState(null); + // Use ref to track current step for useInput (avoids stale closure) + const stepRef = useRef(step); + stepRef.current = step; + async function loadDocumentText(docId: string) { try { // Get latest document version with content @@ -96,18 +100,20 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o } } - // Handle keyboard input + // Handle keyboard input - use ref to avoid stale closure useInput((input, key) => { if (key.escape) { - if (step.type === "issue-detail") { - setStep({ type: "results", result: step.result }); - } else if (step.type === "results") { + const currentStep = stepRef.current; + if (currentStep.type === "issue-detail") { + setStep({ type: "results", result: currentStep.result }); + } else if (currentStep.type === "results") { setStep({ type: "configure-extractors" }); - } else if (step.type === "configure-extractors") { + } else if (currentStep.type === "configure-extractors") { setStep({ type: "select-document" }); - } else { + } else if (currentStep.type === "select-document") { onBack(); } + // Don't call onBack for running state } }); @@ -334,14 +340,14 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o Quoted Text: - "{truncate(issue.exactText, 200)}" + "{issue.exactText}" Reasoning: - {truncate(issue.reasoning, 300)} + {issue.reasoning} From ce504baf8aed96f9ff8db5da4f759b88556e9e82 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 11 Jan 2026 15:13:00 +0000 Subject: [PATCH 21/72] fix(meta-evals): Let 
ExtractorLab handle escape navigation internally App.tsx was also catching escape and calling loadMainMenu(), overriding ExtractorLab's internal navigation. Now App skips escape handling when screen is extractor-lab. Co-Authored-By: Claude Opus 4.5 --- meta-evals/src/app.tsx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/meta-evals/src/app.tsx b/meta-evals/src/app.tsx index 1ea79e1b..ccc60d50 100644 --- a/meta-evals/src/app.tsx +++ b/meta-evals/src/app.tsx @@ -227,7 +227,8 @@ export function App() { exit(); } if (key.escape) { - if (screen.type !== "main-menu") { + // Let ExtractorLab handle its own escape navigation internally + if (screen.type !== "main-menu" && screen.type !== "extractor-lab") { loadMainMenu(); } } From 236ba02e9bcf151f86f03c86fcb7c389ddd32a5b Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 11 Jan 2026 15:18:05 +0000 Subject: [PATCH 22/72] fix(meta-evals): Exclude all screens with internal escape navigation Added validation and score-rank-menu to list of screens that handle escape internally, preventing App from overriding their navigation. 
Co-Authored-By: Claude Opus 4.5 --- meta-evals/src/app.tsx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/meta-evals/src/app.tsx b/meta-evals/src/app.tsx index ccc60d50..2afc2e37 100644 --- a/meta-evals/src/app.tsx +++ b/meta-evals/src/app.tsx @@ -227,8 +227,9 @@ export function App() { exit(); } if (key.escape) { - // Let ExtractorLab handle its own escape navigation internally - if (screen.type !== "main-menu" && screen.type !== "extractor-lab") { + // Screens with internal escape navigation handle it themselves + const screensWithInternalEscape = ["main-menu", "extractor-lab", "validation", "score-rank-menu"]; + if (!screensWithInternalEscape.includes(screen.type)) { loadMainMenu(); } } From f4be023f244117a95427024364174bb72995f674 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 11 Jan 2026 15:22:02 +0000 Subject: [PATCH 23/72] fix(meta-evals): Add proper escape handling to all screens Each screen now handles escape internally and calls onBack(): - SeriesDetail: added useInput with escape handler - RankRuns: added escape to existing useInput - ScoreRun: added useInput with escape handler - CreateBaseline: added useInput (skips document step for text input) App.tsx now excludes all screens with internal handlers. 
Co-Authored-By: Claude Opus 4.5 --- meta-evals/src/app.tsx | 11 ++++++++++- meta-evals/src/components/CreateBaseline.tsx | 9 ++++++++- meta-evals/src/components/RankRuns.tsx | 5 ++++- meta-evals/src/components/ScoreRun.tsx | 9 ++++++++- meta-evals/src/components/SeriesDetail.tsx | 9 ++++++++- 5 files changed, 38 insertions(+), 5 deletions(-) diff --git a/meta-evals/src/app.tsx b/meta-evals/src/app.tsx index 2afc2e37..f6a2d683 100644 --- a/meta-evals/src/app.tsx +++ b/meta-evals/src/app.tsx @@ -228,7 +228,16 @@ export function App() { } if (key.escape) { // Screens with internal escape navigation handle it themselves - const screensWithInternalEscape = ["main-menu", "extractor-lab", "validation", "score-rank-menu"]; + const screensWithInternalEscape = [ + "main-menu", + "extractor-lab", + "validation", + "score-rank-menu", + "series-detail", + "rank-runs", + "score-run", + "create-baseline", + ]; if (!screensWithInternalEscape.includes(screen.type)) { loadMainMenu(); } diff --git a/meta-evals/src/components/CreateBaseline.tsx b/meta-evals/src/components/CreateBaseline.tsx index 2ba7c3c8..e892e579 100644 --- a/meta-evals/src/components/CreateBaseline.tsx +++ b/meta-evals/src/components/CreateBaseline.tsx @@ -3,7 +3,7 @@ */ import React, { useState, useEffect, useRef } from "react"; -import { Box, Text } from "ink"; +import { Box, Text, useInput } from "ink"; import TextInput from "ink-text-input"; import SelectInput from "ink-select-input"; import Spinner from "ink-spinner"; @@ -44,6 +44,13 @@ export function CreateBaseline({ const [isSearching, setIsSearching] = useState(false); const debounceRef = useRef(null); + // Handle escape to go back (but not during text input on document step) + useInput((input, key) => { + if (key.escape && step !== "document") { + onBack(); + } + }); + // Debounced DB search when filter changes useEffect(() => { if (debounceRef.current) { diff --git a/meta-evals/src/components/RankRuns.tsx b/meta-evals/src/components/RankRuns.tsx index 
11ca6be1..3c3a2b4c 100644 --- a/meta-evals/src/components/RankRuns.tsx +++ b/meta-evals/src/components/RankRuns.tsx @@ -70,8 +70,11 @@ export function RankRuns({ seriesId, height, judgeModel, temperature, maxTokens, loadData(); }, [seriesId]); - // Handle tab key to switch between tabs (must be before any conditional returns) + // Handle keyboard shortcuts (must be before any conditional returns) useInput((input, key) => { + if (key.escape) { + onBack(); + } if (key.tab && savedSessions.length > 0 && !results) { setActiveTab((prev) => (prev === "saved" ? "new" : "saved")); } diff --git a/meta-evals/src/components/ScoreRun.tsx b/meta-evals/src/components/ScoreRun.tsx index 8b5e55fb..7bdb9222 100644 --- a/meta-evals/src/components/ScoreRun.tsx +++ b/meta-evals/src/components/ScoreRun.tsx @@ -3,7 +3,7 @@ */ import React, { useState, useEffect } from "react"; -import { Box, Text } from "ink"; +import { Box, Text, useInput } from "ink"; import SelectInput from "ink-select-input"; import { metaEvaluationRepository } from "@roast/db"; import { scoreComments, type ScoringResult } from "@roast/ai/meta-eval"; @@ -45,6 +45,13 @@ export function ScoreRun({ seriesId, height, judgeModel, temperature, maxTokens, const [showFullReasoning, setShowFullReasoning] = useState(false); const [isViewingSaved, setIsViewingSaved] = useState(false); + // Handle escape to go back + useInput((input, key) => { + if (key.escape) { + onBack(); + } + }); + useEffect(() => { loadCompletedRuns(); }, [seriesId]); diff --git a/meta-evals/src/components/SeriesDetail.tsx b/meta-evals/src/components/SeriesDetail.tsx index 7f700d0f..793998c2 100644 --- a/meta-evals/src/components/SeriesDetail.tsx +++ b/meta-evals/src/components/SeriesDetail.tsx @@ -3,7 +3,7 @@ */ import React, { useState, useEffect } from "react"; -import { Box, Text } from "ink"; +import { Box, Text, useInput } from "ink"; import SelectInput from "ink-select-input"; import { metaEvaluationRepository } from "@roast/db"; import { truncate, 
formatDate, formatStatus, getStatusColor } from "./helpers"; @@ -58,6 +58,13 @@ export function SeriesDetail({ const [clearing, setClearing] = useState(false); const [series, setSeries] = useState(null); + // Handle escape to go back + useInput((input, key) => { + if (key.escape) { + onBack(); + } + }); + // Load and poll for updates - always poll every 2 seconds useEffect(() => { let mounted = true; From 6bc2d6b19a37df7511a2997797957f47fefb98b9 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 11 Jan 2026 15:37:48 +0000 Subject: [PATCH 24/72] feat(meta-evals): Add reusable ModelSelector component - Create ModelSelector component that fetches models from both APIs - Fetch from Anthropic API (9 models) and OpenRouter API (300+ models) - Add text input filtering with debounce - Group models by provider (Anthropic first, then OpenRouter) - Add allModels.ts utility with caching and filtering helpers - Update ExtractorLab to use ModelSelector for adding extractors - Export ModelSelector from components index for reuse Co-Authored-By: Claude Opus 4.5 --- meta-evals/src/components/ExtractorLab.tsx | 77 ++++++-- meta-evals/src/components/ModelSelector.tsx | 207 ++++++++++++++++++++ meta-evals/src/components/index.ts | 1 + meta-evals/src/utils/allModels.ts | 136 +++++++++++++ 4 files changed, 408 insertions(+), 13 deletions(-) create mode 100644 meta-evals/src/components/ModelSelector.tsx create mode 100644 meta-evals/src/utils/allModels.ts diff --git a/meta-evals/src/components/ExtractorLab.tsx b/meta-evals/src/components/ExtractorLab.tsx index b021d1f6..a6405b57 100644 --- a/meta-evals/src/components/ExtractorLab.tsx +++ b/meta-evals/src/components/ExtractorLab.tsx @@ -12,6 +12,7 @@ import Spinner from "ink-spinner"; import { prisma, type DocumentChoice } from "@roast/db"; import { runMultiExtractor, getMultiExtractorConfig, type ExtractorConfig, type MultiExtractorResult, type ExtractorResult } from "@roast/ai/fallacy-extraction"; import { truncate, formatDate } 
from "./helpers"; +import { ModelSelector } from "./ModelSelector"; interface ExtractorLabProps { height: number; @@ -24,6 +25,7 @@ interface ExtractorLabProps { type LabStep = | { type: "select-document" } | { type: "configure-extractors" } + | { type: "add-extractor" } | { type: "running" } | { type: "results"; result: MultiExtractorResult } | { type: "issue-detail"; result: MultiExtractorResult; extractorIdx: number; issueIdx: number }; @@ -38,11 +40,8 @@ function getInitialExtractorConfigs(): ExtractorConfig[] { } } -const AVAILABLE_MODELS = [ - { id: "claude-sonnet-4-5-20250929", label: "Claude Sonnet 4.5" }, - { id: "google/gemini-2.5-flash", label: "Gemini 2.5 Flash" }, - { id: "google/gemini-3-flash-preview", label: "Gemini 3 Flash" }, -]; +// Temperature presets for cycling +const TEMP_PRESETS = ["default", 0, 0.3, 0.5, 0.7, 1.0] as const; export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, onBack }: ExtractorLabProps) { const [step, setStep] = useState({ type: "select-document" }); @@ -50,11 +49,16 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o const [documentText, setDocumentText] = useState(""); const [extractorConfigs, setExtractorConfigs] = useState(getInitialExtractorConfigs); const [error, setError] = useState(null); + const [highlightedItem, setHighlightedItem] = useState(""); // Use ref to track current step for useInput (avoids stale closure) const stepRef = useRef(step); stepRef.current = step; + // Track highlighted item for keyboard shortcuts + const highlightedRef = useRef(highlightedItem); + highlightedRef.current = highlightedItem; + async function loadDocumentText(docId: string) { try { // Get latest document version with content @@ -108,6 +112,8 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o setStep({ type: "results", result: currentStep.result }); } else if (currentStep.type === "results") { setStep({ type: "configure-extractors" }); + } 
else if (currentStep.type === "add-extractor") { + setStep({ type: "configure-extractors" }); } else if (currentStep.type === "configure-extractors") { setStep({ type: "select-document" }); } else if (currentStep.type === "select-document") { @@ -115,6 +121,33 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o } // Don't call onBack for running state } + + // Handle 'd' to delete extractor and 't' to cycle temperature (only on configure screen) + if (stepRef.current.type === "configure-extractors") { + const highlighted = highlightedRef.current; + if (highlighted.startsWith("config-")) { + const idx = parseInt(highlighted.replace("config-", ""), 10); + + if (input === "d") { + // Delete extractor (but keep at least one) + setExtractorConfigs(configs => { + if (configs.length <= 1) return configs; + return configs.filter((_, i) => i !== idx); + }); + } else if (input === "t") { + // Cycle temperature + setExtractorConfigs(configs => + configs.map((c, i) => { + if (i !== idx) return c; + const currentTemp = c.temperature; + const currentIdx = TEMP_PRESETS.findIndex(t => t === currentTemp); + const nextIdx = (currentIdx + 1) % TEMP_PRESETS.length; + return { ...c, temperature: TEMP_PRESETS[nextIdx] }; + }) + ); + } + } + } }); if (error) { @@ -185,7 +218,7 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o Document: - {selectedDoc?.title.slice(0, 40)} + {selectedDoc?.title} Text length: @@ -200,18 +233,15 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o !i.value.startsWith("divider"))} + onHighlight={(item) => setHighlightedItem(item.value)} onSelect={(item) => { if (item.value === "back") { setStep({ type: "select-document" }); } else if (item.value === "run") { runExtraction(); } else if (item.value === "add") { - // Add another extractor with different config - const nextModel = AVAILABLE_MODELS[extractorConfigs.length % AVAILABLE_MODELS.length]; - 
setExtractorConfigs([ - ...extractorConfigs, - { model: nextModel.id, temperature: "default", thinking: false }, - ]); + // Go to model selection + setStep({ type: "add-extractor" }); } else if (item.value.startsWith("config-")) { // Toggle thinking for this extractor const idx = parseInt(item.value.replace("config-", ""), 10); @@ -223,12 +253,33 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o /> - Enter on extractor toggles thinking | Escape Back + Enter=toggle think | t=cycle temp | d=delete | Esc=back ); } + // Add extractor - model selection using reusable ModelSelector + if (step.type === "add-extractor") { + return ( + { + // Add new extractor with selected model + setExtractorConfigs([ + ...extractorConfigs, + { model: model.id, temperature: "default", thinking: false }, + ]); + setStep({ type: "configure-extractors" }); + }} + onCancel={() => setStep({ type: "configure-extractors" })} + /> + ); + } + // Running if (step.type === "running") { return ( diff --git a/meta-evals/src/components/ModelSelector.tsx b/meta-evals/src/components/ModelSelector.tsx new file mode 100644 index 00000000..d52392ef --- /dev/null +++ b/meta-evals/src/components/ModelSelector.tsx @@ -0,0 +1,207 @@ +/** + * ModelSelector - Reusable component for selecting AI models + * + * Fetches models from both Anthropic and OpenRouter APIs, + * with text input filtering support. 
+ */ + +import React, { useState, useEffect, useRef } from "react"; +import { Box, Text, useInput } from "ink"; +import TextInput from "ink-text-input"; +import SelectInput from "ink-select-input"; +import Spinner from "ink-spinner"; +import { + getAllModels, + filterModels, + type ModelInfo, +} from "../utils/allModels"; + +export interface ModelSelectorProps { + /** Title shown at the top */ + title?: string; + /** Border color */ + borderColor?: string; + /** Container height */ + height: number; + /** Max items to show in the list */ + maxItems: number; + /** Called when a model is selected */ + onSelect: (model: ModelInfo) => void; + /** Called when cancelled */ + onCancel: () => void; +} + +export function ModelSelector({ + title = "Select Model", + borderColor = "cyan", + height, + maxItems, + onSelect, + onCancel, +}: ModelSelectorProps) { + const [models, setModels] = useState([]); + const [filteredModels, setFilteredModels] = useState([]); + const [loading, setLoading] = useState(true); + const [error, setError] = useState(null); + const [filter, setFilter] = useState(""); + const debounceRef = useRef(null); + + // Load models on mount + useEffect(() => { + loadModels(); + }, []); + + // Filter models when query changes (debounced) + useEffect(() => { + if (debounceRef.current) { + clearTimeout(debounceRef.current); + } + + debounceRef.current = setTimeout(() => { + setFilteredModels(filterModels(models, filter)); + }, 150); + + return () => { + if (debounceRef.current) { + clearTimeout(debounceRef.current); + } + }; + }, [filter, models]); + + // Handle escape to cancel + useInput((input, key) => { + if (key.escape) { + onCancel(); + } + }); + + async function loadModels() { + try { + const allModels = await getAllModels(); + setModels(allModels); + setFilteredModels(allModels); + setLoading(false); + } catch (e) { + setError(`Failed to load models: ${e}`); + setLoading(false); + } + } + + if (loading) { + return ( + + + + {title} + + + + + Loading 
models from APIs... + + + + ); + } + + if (error) { + return ( + + {error} + Press Escape to go back + + ); + } + + // Build list items grouped by provider + const anthropicModels = filteredModels.filter((m) => m.provider === "anthropic"); + const openRouterModels = filteredModels.filter((m) => m.provider === "openrouter"); + + const items: Array<{ label: string; value: string }> = []; + + if (anthropicModels.length > 0) { + items.push({ label: `── Anthropic (${anthropicModels.length}) ──`, value: "header-anthropic" }); + for (const m of anthropicModels) { + items.push({ label: ` ${m.name} (${m.id})`, value: m.id }); + } + } + + if (openRouterModels.length > 0) { + items.push({ label: `── OpenRouter (${openRouterModels.length}) ──`, value: "header-openrouter" }); + for (const m of openRouterModels) { + items.push({ label: ` ${m.name}`, value: m.id }); + } + } + + items.push({ label: "← Cancel", value: "cancel" }); + + return ( + + + + {title} + + + + + + {filteredModels.length} models + {filter && ` matching "${filter}"`} + {" "}(Anthropic: {anthropicModels.length}, OpenRouter: {openRouterModels.length}) + + + + + Filter: + + + + { + if (item.value === "cancel") { + onCancel(); + } else if (item.value.startsWith("header-")) { + // Ignore header clicks + } else { + const model = filteredModels.find((m) => m.id === item.value); + if (model) { + onSelect(model); + } + } + }} + /> + + + Enter Select | Escape Cancel + + + ); +} + +// Re-export types for convenience +export type { ModelInfo }; diff --git a/meta-evals/src/components/index.ts b/meta-evals/src/components/index.ts index 5b85a455..f22b676b 100644 --- a/meta-evals/src/components/index.ts +++ b/meta-evals/src/components/index.ts @@ -10,5 +10,6 @@ export { RankRuns } from "./RankRuns"; export { ScoreRun } from "./ScoreRun"; export { Validation } from "./Validation"; export { ExtractorLab } from "./ExtractorLab"; +export { ModelSelector, type ModelInfo } from "./ModelSelector"; export * from "./helpers"; export * 
from "./types"; diff --git a/meta-evals/src/utils/allModels.ts b/meta-evals/src/utils/allModels.ts new file mode 100644 index 00000000..9081f8e3 --- /dev/null +++ b/meta-evals/src/utils/allModels.ts @@ -0,0 +1,136 @@ +/** + * Fetch models from both Anthropic and OpenRouter APIs + */ + +import Anthropic from "@anthropic-ai/sdk"; + +export interface ModelInfo { + id: string; + name: string; + provider: "anthropic" | "openrouter"; + contextLength?: number; + description?: string; +} + +// Cache for models +let cachedModels: ModelInfo[] | null = null; +let cacheTimestamp = 0; +const CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes + +/** + * Fetch models from Anthropic API + */ +async function fetchAnthropicModels(): Promise { + try { + const client = new Anthropic(); + const response = await client.models.list(); + + return response.data.map((m) => ({ + id: m.id, + name: m.display_name, + provider: "anthropic" as const, + })); + } catch (e) { + console.error("Failed to fetch Anthropic models:", e); + return []; + } +} + +/** + * Fetch models from OpenRouter API + */ +async function fetchOpenRouterModels(): Promise { + try { + const response = await fetch("https://openrouter.ai/api/v1/models"); + if (!response.ok) { + throw new Error(`OpenRouter API error: ${response.status}`); + } + + const data = (await response.json()) as { + data: Array<{ + id: string; + name: string; + context_length?: number; + description?: string; + }>; + }; + + return data.data + .filter((m) => { + // Filter out free/test models and keep quality models + if (m.id.includes(":free")) return false; + if (m.id.includes("auto")) return false; + return true; + }) + .map((m) => ({ + id: m.id, + name: m.name, + provider: "openrouter" as const, + contextLength: m.context_length, + description: m.description, + })); + } catch (e) { + console.error("Failed to fetch OpenRouter models:", e); + return []; + } +} + +/** + * Get all available models from both APIs (cached) + */ +export async function 
getAllModels(): Promise { + const now = Date.now(); + + if (cachedModels && now - cacheTimestamp < CACHE_TTL_MS) { + return cachedModels; + } + + const [anthropicModels, openRouterModels] = await Promise.all([ + fetchAnthropicModels(), + fetchOpenRouterModels(), + ]); + + // Combine and sort: Anthropic first, then OpenRouter alphabetically + cachedModels = [ + ...anthropicModels.sort((a, b) => a.name.localeCompare(b.name)), + ...openRouterModels.sort((a, b) => a.name.localeCompare(b.name)), + ]; + + cacheTimestamp = now; + return cachedModels; +} + +/** + * Filter models by search query + * Matches against id and name + */ +export function filterModels(models: ModelInfo[], query: string): ModelInfo[] { + if (!query.trim()) { + return models; + } + + const lowerQuery = query.toLowerCase(); + return models.filter( + (m) => + m.id.toLowerCase().includes(lowerQuery) || + m.name.toLowerCase().includes(lowerQuery) || + m.provider.toLowerCase().includes(lowerQuery) + ); +} + +/** + * Group models by provider + */ +export function groupModelsByProvider( + models: ModelInfo[] +): Map { + const grouped = new Map(); + + for (const model of models) { + const existing = grouped.get(model.provider) || []; + existing.push(model); + grouped.set(model.provider, existing); + } + + return grouped; +} From 8950e86f2ba968a55c892fefaa8cdfd40cc03a47 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 11 Jan 2026 15:42:51 +0000 Subject: [PATCH 25/72] feat(meta-evals): Add reusable DocumentSelector component - Create DocumentSelector component with filter, single/multi-select modes - Update ExtractorLab to use DocumentSelector for document selection - Update CreateBaseline to use DocumentSelector for document selection - Both screens now have consistent UI and filtering behavior - Component is modular and ready for reuse in other screens Co-Authored-By: Claude Opus 4.5 --- meta-evals/src/components/CreateBaseline.tsx | 77 ++---- .../src/components/DocumentSelector.tsx | 251 
++++++++++++++++++ meta-evals/src/components/ExtractorLab.tsx | 47 ++-- meta-evals/src/components/index.ts | 1 + 4 files changed, 292 insertions(+), 84 deletions(-) create mode 100644 meta-evals/src/components/DocumentSelector.tsx diff --git a/meta-evals/src/components/CreateBaseline.tsx b/meta-evals/src/components/CreateBaseline.tsx index e892e579..73f6a397 100644 --- a/meta-evals/src/components/CreateBaseline.tsx +++ b/meta-evals/src/components/CreateBaseline.tsx @@ -2,13 +2,13 @@ * Create Baseline Flow Component */ -import React, { useState, useEffect, useRef } from "react"; +import React, { useState } from "react"; import { Box, Text, useInput } from "ink"; -import TextInput from "ink-text-input"; import SelectInput from "ink-select-input"; import Spinner from "ink-spinner"; import type { DocumentChoice, AgentChoice } from "./types"; -import { truncate, formatDate } from "./helpers"; +import { truncate } from "./helpers"; +import { DocumentSelector } from "./DocumentSelector"; interface CreateBaselineProps { step: "document" | "agents" | "confirm" | "creating"; @@ -40,35 +40,31 @@ export function CreateBaseline({ onBack, }: CreateBaselineProps) { const [agentSelection, setAgentSelection] = useState>(new Set()); - const [filter, setFilter] = useState(""); - const [isSearching, setIsSearching] = useState(false); - const debounceRef = useRef(null); - // Handle escape to go back (but not during text input on document step) + // Handle escape to go back (document step handles its own escape via DocumentSelector) useInput((input, key) => { if (key.escape && step !== "document") { onBack(); } }); - // Debounced DB search when filter changes - useEffect(() => { - if (debounceRef.current) { - clearTimeout(debounceRef.current); - } - - setIsSearching(true); - debounceRef.current = setTimeout(() => { - onSearchDocuments(filter); - setIsSearching(false); - }, 300); - - return () => { - if (debounceRef.current) { - clearTimeout(debounceRef.current); - } - }; - }, [filter]); 
+ // Document selection using reusable DocumentSelector + if (step === "document") { + return ( + + ); + } if (step === "creating") { return ( @@ -80,6 +76,7 @@ export function CreateBaseline({ ); } + // Remaining steps: agents and confirm return ( @@ -88,34 +85,6 @@ export function CreateBaseline({ - {step === "document" && ( - <> - - Step 1/2: Select a document ({documents.length} found{filter ? ` for "${filter}"` : ""}) - - - Search: - - {isSearching && } - - ({ - label: `${String(i + 1).padStart(2)} | ${truncate(d.title, 50).padEnd(50)} | ${formatDate(new Date(d.createdAt))}`, - value: d.id, - }))} - limit={maxItems - 2} - onSelect={(item) => { - const doc = documents.find((d) => d.id === item.value); - if (doc) onSelectDocument(doc); - }} - /> - - )} - {step === "agents" && ( <> @@ -178,7 +147,7 @@ export function CreateBaseline({ )} - Esc Back | {step === "document" ? "Ctrl+C" : "q"} Quit + Esc Back | q Quit ); diff --git a/meta-evals/src/components/DocumentSelector.tsx b/meta-evals/src/components/DocumentSelector.tsx new file mode 100644 index 00000000..81c4eb66 --- /dev/null +++ b/meta-evals/src/components/DocumentSelector.tsx @@ -0,0 +1,251 @@ +/** + * DocumentSelector - Reusable component for selecting documents + * + * Supports both single-select and multi-select modes, with optional text filtering. 
+ */ + +import React, { useState, useEffect, useRef } from "react"; +import { Box, Text, useInput } from "ink"; +import TextInput from "ink-text-input"; +import SelectInput from "ink-select-input"; +import Spinner from "ink-spinner"; +import type { DocumentChoice } from "@roast/db"; +import { truncate, formatDate } from "./helpers"; + +export interface DocumentSelectorProps { + /** Title shown at the top */ + title?: string; + /** Subtitle/instruction text */ + subtitle?: string; + /** Border color */ + borderColor?: string; + /** Container height */ + height: number; + /** Max items to show in the list */ + maxItems: number; + /** Documents to display */ + documents: DocumentChoice[]; + /** Enable text filter input */ + showFilter?: boolean; + /** Called when filter text changes (for server-side filtering) */ + onFilterChange?: (filter: string) => void; + /** Enable multi-select mode */ + multiSelect?: boolean; + /** Pre-selected document IDs (for multi-select) */ + selectedIds?: Set; + /** Called when a document is selected (single-select mode) */ + onSelect?: (doc: DocumentChoice) => void; + /** Called when selection changes (multi-select mode) */ + onSelectionChange?: (selectedIds: Set) => void; + /** Called when confirmed (multi-select mode) */ + onConfirm?: (selectedDocs: DocumentChoice[]) => void; + /** Called when cancelled */ + onCancel: () => void; + /** Confirm button label (multi-select mode) */ + confirmLabel?: string; +} + +export function DocumentSelector({ + title = "Select Document", + subtitle, + borderColor = "cyan", + height, + maxItems, + documents, + showFilter = false, + onFilterChange, + multiSelect = false, + selectedIds: externalSelectedIds, + onSelect, + onSelectionChange, + onConfirm, + onCancel, + confirmLabel = "Confirm Selection", +}: DocumentSelectorProps) { + const [filter, setFilter] = useState(""); + const [isSearching, setIsSearching] = useState(false); + const [internalSelectedIds, setInternalSelectedIds] = useState>( + 
externalSelectedIds || new Set() + ); + const debounceRef = useRef(null); + + // Use external or internal selected IDs + const selectedIds = externalSelectedIds || internalSelectedIds; + const setSelectedIds = onSelectionChange + ? (ids: Set) => onSelectionChange(ids) + : setInternalSelectedIds; + + // Debounced filter change + useEffect(() => { + if (!showFilter || !onFilterChange) return; + + if (debounceRef.current) { + clearTimeout(debounceRef.current); + } + + setIsSearching(true); + debounceRef.current = setTimeout(() => { + onFilterChange(filter); + setIsSearching(false); + }, 300); + + return () => { + if (debounceRef.current) { + clearTimeout(debounceRef.current); + } + }; + }, [filter, showFilter, onFilterChange]); + + // Handle escape to cancel + useInput((input, key) => { + if (key.escape) { + onCancel(); + } + }); + + function toggleDocument(docId: string) { + const newSelected = new Set(selectedIds); + if (newSelected.has(docId)) { + newSelected.delete(docId); + } else { + newSelected.add(docId); + } + setSelectedIds(newSelected); + } + + function toggleAll() { + const allSelected = documents.every((d) => selectedIds.has(d.id)); + if (allSelected) { + setSelectedIds(new Set()); + } else { + setSelectedIds(new Set(documents.map((d) => d.id))); + } + } + + // Build items list + const items: Array<{ label: string; value: string }> = []; + + if (multiSelect) { + // Add "Select All" option + const allSelected = documents.length > 0 && documents.every((d) => selectedIds.has(d.id)); + items.push({ + label: `[${allSelected ? "x" : " "}] Select All (${documents.length})`, + value: "toggle-all", + }); + } + + // Add document items + const displayDocs = documents.slice(0, maxItems - (multiSelect ? 4 : 2)); + for (let i = 0; i < displayDocs.length; i++) { + const d = displayDocs[i]; + if (multiSelect) { + items.push({ + label: `[${selectedIds.has(d.id) ? 
"x" : " "}] ${truncate(d.title, 55)}`, + value: d.id, + }); + } else { + items.push({ + label: `${String(i + 1).padStart(2)} | ${truncate(d.title, 50).padEnd(50)} | ${formatDate(new Date(d.createdAt))}`, + value: d.id, + }); + } + } + + if (documents.length > displayDocs.length) { + items.push({ + label: `... and ${documents.length - displayDocs.length} more`, + value: "more", + }); + } + + if (multiSelect) { + const selectedCount = selectedIds.size; + items.push({ + label: selectedCount > 0 ? `βœ“ ${confirmLabel} (${selectedCount} docs)` : "Select documents first", + value: "confirm", + }); + } + + items.push({ label: "← Cancel", value: "cancel" }); + + return ( + + + + {title} + + + + + + {subtitle || `${documents.length} document${documents.length !== 1 ? "s" : ""} found`} + {filter && ` for "${filter}"`} + + + + {showFilter && ( + + Search: + + {isSearching && ( + + {" "} + + + )} + + )} + + { + if (item.value === "cancel") { + onCancel(); + } else if (item.value === "more") { + // Ignore "more" item + } else if (multiSelect) { + if (item.value === "toggle-all") { + toggleAll(); + } else if (item.value === "confirm") { + if (selectedIds.size > 0 && onConfirm) { + const selectedDocs = documents.filter((d) => selectedIds.has(d.id)); + onConfirm(selectedDocs); + } + } else { + toggleDocument(item.value); + } + } else { + // Single select mode + const doc = documents.find((d) => d.id === item.value); + if (doc && onSelect) { + onSelect(doc); + } + } + }} + /> + + + + {multiSelect + ? 
"Enter Toggle | Escape Cancel" + : "Enter Select | Escape Cancel"} + + + + ); +} + +// Re-export types for convenience +export type { DocumentChoice }; diff --git a/meta-evals/src/components/ExtractorLab.tsx b/meta-evals/src/components/ExtractorLab.tsx index a6405b57..3b011559 100644 --- a/meta-evals/src/components/ExtractorLab.tsx +++ b/meta-evals/src/components/ExtractorLab.tsx @@ -13,6 +13,7 @@ import { prisma, type DocumentChoice } from "@roast/db"; import { runMultiExtractor, getMultiExtractorConfig, type ExtractorConfig, type MultiExtractorResult, type ExtractorResult } from "@roast/ai/fallacy-extraction"; import { truncate, formatDate } from "./helpers"; import { ModelSelector } from "./ModelSelector"; +import { DocumentSelector } from "./DocumentSelector"; interface ExtractorLabProps { height: number; @@ -159,38 +160,24 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o ); } - // Document selection + // Document selection using reusable DocumentSelector if (step.type === "select-document") { return ( - - - Extractor Lab - Select Document - - - - Select a document ({documents.length} found) - - - ({ - label: `${String(i + 1).padStart(2)} | ${truncate(d.title, 50).padEnd(50)} | ${formatDate(new Date(d.createdAt))}`, - value: d.id, - }))} - limit={maxItems - 2} - onSelect={async (item) => { - const doc = documents.find((d) => d.id === item.value); - if (doc) { - setSelectedDoc(doc); - await loadDocumentText(doc.id); - setStep({ type: "configure-extractors" }); - } - }} - /> - - - Up/Down Navigate | Enter Select | Escape Back - - + { + setSelectedDoc(doc); + await loadDocumentText(doc.id); + setStep({ type: "configure-extractors" }); + }} + onCancel={onBack} + /> ); } diff --git a/meta-evals/src/components/index.ts b/meta-evals/src/components/index.ts index f22b676b..3245b32b 100644 --- a/meta-evals/src/components/index.ts +++ b/meta-evals/src/components/index.ts @@ -11,5 +11,6 @@ export { ScoreRun } from "./ScoreRun"; export { 
Validation } from "./Validation"; export { ExtractorLab } from "./ExtractorLab"; export { ModelSelector, type ModelInfo } from "./ModelSelector"; +export { DocumentSelector } from "./DocumentSelector"; export * from "./helpers"; export * from "./types"; From d57acd13904f861147c79d5f5a303e02543f7751 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 11 Jan 2026 15:44:31 +0000 Subject: [PATCH 26/72] fix(meta-evals): Only show search spinner when actively filtering Don't show the loading spinner when the filter is empty. Co-Authored-By: Claude Opus 4.5 --- meta-evals/src/components/DocumentSelector.tsx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/meta-evals/src/components/DocumentSelector.tsx b/meta-evals/src/components/DocumentSelector.tsx index 81c4eb66..6a256d01 100644 --- a/meta-evals/src/components/DocumentSelector.tsx +++ b/meta-evals/src/components/DocumentSelector.tsx @@ -83,7 +83,11 @@ export function DocumentSelector({ clearTimeout(debounceRef.current); } - setIsSearching(true); + // Only show spinner when actively filtering (not on empty initial state) + if (filter.length > 0) { + setIsSearching(true); + } + debounceRef.current = setTimeout(() => { onFilterChange(filter); setIsSearching(false); From fa743ad18766da93e79ad3681312c276b680aee4 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 11 Jan 2026 15:49:39 +0000 Subject: [PATCH 27/72] refactor(meta-evals): Use DocumentSelector in Validation + full-width titles - Update Validation to use DocumentSelector for corpus selection - Remove toggleDocument/toggleAll functions (handled by DocumentSelector) - Use Set for selectedIds instead of `selected` property on docs - Add terminal width awareness to DocumentSelector - Document titles now use full available horizontal space Co-Authored-By: Claude Opus 4.5 --- .../src/components/DocumentSelector.tsx | 12 ++- meta-evals/src/components/Validation.tsx | 84 ++++++++----------- 2 files changed, 42 insertions(+), 54 deletions(-) 
diff --git a/meta-evals/src/components/DocumentSelector.tsx b/meta-evals/src/components/DocumentSelector.tsx index 6a256d01..d16ae10f 100644 --- a/meta-evals/src/components/DocumentSelector.tsx +++ b/meta-evals/src/components/DocumentSelector.tsx @@ -5,7 +5,7 @@ */ import React, { useState, useEffect, useRef } from "react"; -import { Box, Text, useInput } from "ink"; +import { Box, Text, useInput, useStdout } from "ink"; import TextInput from "ink-text-input"; import SelectInput from "ink-select-input"; import Spinner from "ink-spinner"; @@ -62,6 +62,7 @@ export function DocumentSelector({ onCancel, confirmLabel = "Confirm Selection", }: DocumentSelectorProps) { + const { stdout } = useStdout(); const [filter, setFilter] = useState(""); const [isSearching, setIsSearching] = useState(false); const [internalSelectedIds, setInternalSelectedIds] = useState>( @@ -69,6 +70,11 @@ export function DocumentSelector({ ); const debounceRef = useRef(null); + // Calculate available width for title (terminal width - borders - padding - index - date) + const termWidth = stdout?.columns || 100; + // Account for: border (2), padding (2), index (5), separator (3), date (12), checkbox for multiselect (4) + const titleWidth = Math.max(30, termWidth - 28 - (multiSelect ? 4 : 0)); + // Use external or internal selected IDs const selectedIds = externalSelectedIds || internalSelectedIds; const setSelectedIds = onSelectionChange @@ -144,12 +150,12 @@ export function DocumentSelector({ const d = displayDocs[i]; if (multiSelect) { items.push({ - label: `[${selectedIds.has(d.id) ? "x" : " "}] ${truncate(d.title, 55)}`, + label: `[${selectedIds.has(d.id) ? 
"x" : " "}] ${truncate(d.title, titleWidth)}`, value: d.id, }); } else { items.push({ - label: `${String(i + 1).padStart(2)} | ${truncate(d.title, 50).padEnd(50)} | ${formatDate(new Date(d.createdAt))}`, + label: `${String(i + 1).padStart(2)} | ${truncate(d.title, titleWidth).padEnd(titleWidth)} | ${formatDate(new Date(d.createdAt))}`, value: d.id, }); } diff --git a/meta-evals/src/components/Validation.tsx b/meta-evals/src/components/Validation.tsx index 0a9bf209..ea03a061 100644 --- a/meta-evals/src/components/Validation.tsx +++ b/meta-evals/src/components/Validation.tsx @@ -18,6 +18,7 @@ import Spinner from "ink-spinner"; import { metaEvaluationRepository, type AgentChoice } from "@roast/db"; import { truncate } from "./helpers"; import { ScreenContainer, InfoBox } from "./shared"; +import { DocumentSelector } from "./DocumentSelector"; import { type ValidationDocument, type DocumentComparisonResult, @@ -43,9 +44,7 @@ interface Baseline { snapshotCount: number; } -interface CorpusDocument extends ValidationDocument { - selected: boolean; -} +// CorpusDocument is just ValidationDocument (selection tracked separately via Set) interface ValidationRunSummary { id: string; @@ -76,7 +75,8 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati const [newBaselineName, setNewBaselineName] = useState(""); // Corpus state (for creating new baseline) - const [corpusDocuments, setCorpusDocuments] = useState([]); + const [corpusDocuments, setCorpusDocuments] = useState([]); + const [selectedDocIds, setSelectedDocIds] = useState>(new Set()); const [showCorpusSelect, setShowCorpusSelect] = useState(false); // Run state @@ -219,7 +219,9 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati agentId, { limit: 50, minContentLength: 200 } ); - setCorpusDocuments(docs.map((d) => ({ ...d, selected: true }))); + setCorpusDocuments(docs); + // Pre-select all documents by default + setSelectedDocIds(new Set(docs.map((d) => 
d.documentId))); } catch (e) { setError(String(e)); } @@ -249,15 +251,14 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati async function createBaseline() { if (!selectedAgent || !newBaselineName.trim()) return; - const selectedDocs = corpusDocuments.filter((d) => d.selected); - if (selectedDocs.length === 0) return; + if (selectedDocIds.size === 0) return; try { setLoading(true); // Get current evaluation version IDs for selected documents const snapshots = await metaEvaluationRepository.getEvaluationSnapshots( - selectedDocs.map((d) => d.documentId), + Array.from(selectedDocIds), selectedAgent.id ); @@ -452,17 +453,6 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati } } - function toggleDocument(docId: string) { - setCorpusDocuments((docs) => - docs.map((d) => (d.documentId === docId ? { ...d, selected: !d.selected } : d)) - ); - } - - function toggleAll() { - const allSelected = corpusDocuments.every((d) => d.selected); - setCorpusDocuments((docs) => docs.map((d) => ({ ...d, selected: !allSelected }))); - } - // Render tabs header const renderTabs = () => ( @@ -498,41 +488,33 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati ); } - // Creating baseline - corpus selection + // Creating baseline - corpus selection using DocumentSelector if (creatingBaseline && showCorpusSelect) { - const selectedCount = corpusDocuments.filter((d) => d.selected).length; - const items = [ - { label: `[${selectedCount === corpusDocuments.length ? "x" : " "}] Select All (${corpusDocuments.length})`, value: "toggle-all" }, - ...corpusDocuments.slice(0, maxItems - 4).map((d) => ({ - label: `[${d.selected ? "x" : " "}] ${truncate(d.title, 50)}`, - value: d.documentId, - })), - { label: selectedCount > 0 ? 
`βœ“ Create Baseline (${selectedCount} docs)` : "Select documents first", value: "create" }, - { label: "← Cancel", value: "cancel" }, - ]; + // Convert ValidationDocument[] to DocumentChoice[] format + const documentsForSelector = corpusDocuments.map((d) => ({ + id: d.documentId, + title: d.title, + createdAt: d.lastEvaluatedAt || new Date(), + })); return ( - - - Select documents to include in baseline - - - { - if (item.value === "cancel") { - setShowCorpusSelect(false); - setCreatingBaseline(false); - } else if (item.value === "toggle-all") { - toggleAll(); - } else if (item.value === "create" && selectedCount > 0) { - createBaseline(); - } else { - toggleDocument(item.value); - } - }} - /> - + createBaseline()} + onCancel={() => { + setShowCorpusSelect(false); + setCreatingBaseline(false); + }} + /> ); } From ec311424beef68bc18127421325e4fb332421c8a Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 11 Jan 2026 16:02:32 +0000 Subject: [PATCH 28/72] refactor(meta-evals): Remove manual text truncation from DocumentSelector Ink's SelectInput handles text overflow automatically, so we don't need manual truncation logic. 
This simplifies the code significantly: - Remove useStdout hook and width calculation - Remove truncate import and calls - Let Ink handle text overflow natively Co-Authored-By: Claude Opus 4.5 --- meta-evals/src/components/DocumentSelector.tsx | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/meta-evals/src/components/DocumentSelector.tsx b/meta-evals/src/components/DocumentSelector.tsx index d16ae10f..e967499c 100644 --- a/meta-evals/src/components/DocumentSelector.tsx +++ b/meta-evals/src/components/DocumentSelector.tsx @@ -5,12 +5,12 @@ */ import React, { useState, useEffect, useRef } from "react"; -import { Box, Text, useInput, useStdout } from "ink"; +import { Box, Text, useInput } from "ink"; import TextInput from "ink-text-input"; import SelectInput from "ink-select-input"; import Spinner from "ink-spinner"; import type { DocumentChoice } from "@roast/db"; -import { truncate, formatDate } from "./helpers"; +import { formatDate } from "./helpers"; export interface DocumentSelectorProps { /** Title shown at the top */ @@ -62,7 +62,6 @@ export function DocumentSelector({ onCancel, confirmLabel = "Confirm Selection", }: DocumentSelectorProps) { - const { stdout } = useStdout(); const [filter, setFilter] = useState(""); const [isSearching, setIsSearching] = useState(false); const [internalSelectedIds, setInternalSelectedIds] = useState>( @@ -70,11 +69,6 @@ export function DocumentSelector({ ); const debounceRef = useRef(null); - // Calculate available width for title (terminal width - borders - padding - index - date) - const termWidth = stdout?.columns || 100; - // Account for: border (2), padding (2), index (5), separator (3), date (12), checkbox for multiselect (4) - const titleWidth = Math.max(30, termWidth - 28 - (multiSelect ? 
4 : 0)); - // Use external or internal selected IDs const selectedIds = externalSelectedIds || internalSelectedIds; const setSelectedIds = onSelectionChange @@ -150,12 +144,12 @@ export function DocumentSelector({ const d = displayDocs[i]; if (multiSelect) { items.push({ - label: `[${selectedIds.has(d.id) ? "x" : " "}] ${truncate(d.title, titleWidth)}`, + label: `[${selectedIds.has(d.id) ? "x" : " "}] ${d.title}`, value: d.id, }); } else { items.push({ - label: `${String(i + 1).padStart(2)} | ${truncate(d.title, titleWidth).padEnd(titleWidth)} | ${formatDate(new Date(d.createdAt))}`, + label: `${String(i + 1).padStart(2)} | ${d.title} | ${formatDate(new Date(d.createdAt))}`, value: d.id, }); } From e84e81f698a9bcb1f0b397fcf181896ab7b25bf2 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 11 Jan 2026 16:10:14 +0000 Subject: [PATCH 29/72] fix(meta-evals): Don't replace initial documents on empty filter The debounce effect was firing with empty filter on component mount, which triggered onFilterChange("") and replaced the initial documents (from getValidationCorpusDocuments) with different documents (from getRecentDocuments). This caused the titles to change unexpectedly. Now we skip the filter callback entirely when the filter is empty, preserving the initially loaded documents. Co-Authored-By: Claude Opus 4.5 --- meta-evals/src/components/DocumentSelector.tsx | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/meta-evals/src/components/DocumentSelector.tsx b/meta-evals/src/components/DocumentSelector.tsx index e967499c..ea6d0f3f 100644 --- a/meta-evals/src/components/DocumentSelector.tsx +++ b/meta-evals/src/components/DocumentSelector.tsx @@ -75,18 +75,17 @@ export function DocumentSelector({ ? 
(ids: Set) => onSelectionChange(ids) : setInternalSelectedIds; - // Debounced filter change + // Debounced filter change - only trigger when user actually types something useEffect(() => { if (!showFilter || !onFilterChange) return; + // Don't trigger search on empty filter - keep initial documents + if (filter.length === 0) return; if (debounceRef.current) { clearTimeout(debounceRef.current); } - // Only show spinner when actively filtering (not on empty initial state) - if (filter.length > 0) { - setIsSearching(true); - } + setIsSearching(true); debounceRef.current = setTimeout(() => { onFilterChange(filter); From 60491478d4468b35095d375cd9e7d52749a21a03 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 11 Jan 2026 16:13:08 +0000 Subject: [PATCH 30/72] feat(meta-evals): Add smart truncation with column alignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Truncate document titles to fit terminal width using ellipsis (…) - Pad titles to fixed width so date column aligns properly - Calculate available width from stdout.columns minus layout overhead - Different overhead for single-select (27) vs multi-select (10) Co-Authored-By: Claude Opus 4.5 --- .../src/components/DocumentSelector.tsx | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/meta-evals/src/components/DocumentSelector.tsx b/meta-evals/src/components/DocumentSelector.tsx index ea6d0f3f..7d61f6fb 100644 --- a/meta-evals/src/components/DocumentSelector.tsx +++ b/meta-evals/src/components/DocumentSelector.tsx @@ -5,13 +5,18 @@ */ import React, { useState, useEffect, useRef } from "react"; -import { Box, Text, useInput } from "ink"; +import { Box, Text, useInput, useStdout } from "ink"; import TextInput from "ink-text-input"; import SelectInput from "ink-select-input"; import Spinner from "ink-spinner"; import type { DocumentChoice } from "@roast/db"; import { formatDate } from "./helpers"; +function truncate(str: 
string, maxLen: number): string { + if (str.length <= maxLen) return str; + return str.slice(0, maxLen - 1) + "…"; +} + export interface DocumentSelectorProps { /** Title shown at the top */ title?: string; @@ -62,6 +67,7 @@ export function DocumentSelector({ onCancel, confirmLabel = "Confirm Selection", }: DocumentSelectorProps) { + const { stdout } = useStdout(); const [filter, setFilter] = useState(""); const [isSearching, setIsSearching] = useState(false); const [internalSelectedIds, setInternalSelectedIds] = useState>( @@ -69,6 +75,13 @@ export function DocumentSelector({ ); const debounceRef = useRef(null); + // Calculate available width for title based on terminal width + // Layout: border(2) + padding(2) + "❯ "(2) + " 1 | "(6) + title + " | "(3) + date(12) = 27 overhead + // For multiSelect: "[x] "(4) instead of index, no date = 10 overhead + const termWidth = stdout?.columns ?? 120; + const overhead = multiSelect ? 10 : 27; + const titleWidth = Math.max(40, termWidth - overhead); + // Use external or internal selected IDs const selectedIds = externalSelectedIds || internalSelectedIds; const setSelectedIds = onSelectionChange @@ -143,12 +156,12 @@ export function DocumentSelector({ const d = displayDocs[i]; if (multiSelect) { items.push({ - label: `[${selectedIds.has(d.id) ? "x" : " "}] ${d.title}`, + label: `[${selectedIds.has(d.id) ? 
"x" : " "}] ${truncate(d.title, titleWidth)}`, value: d.id, }); } else { items.push({ - label: `${String(i + 1).padStart(2)} | ${d.title} | ${formatDate(new Date(d.createdAt))}`, + label: `${String(i + 1).padStart(2)} | ${truncate(d.title, titleWidth).padEnd(titleWidth)} | ${formatDate(new Date(d.createdAt))}`, value: d.id, }); } From 6e6424830848980633c3d5cbb7540e826bdedcf4 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 11 Jan 2026 17:39:50 +0000 Subject: [PATCH 31/72] feat(meta-evals): Add LLM Judge integration to Extractor Lab - Add Judge step to Extractor Lab UI with full instrumentation - Support FALLACY_JUDGE env var for judge config (model, temperature, thinking, enabled) - Add OpenRouter support to fallacy-judge tool - Sort issues alphabetically before judge to group duplicates together - Create standalone lab-exports.ts to avoid circular dependencies - Add dynamic width calculation for proper column alignment - Update MultiExtractorConfig to use JudgeConfig object structure The Extractor Lab now shows: - Extraction results with per-extractor breakdown - Judge aggregation: accept/merge/reject decisions with reasoning - Legend mapping extractors to A/B/C labels - Drill-down to view full decision details Co-Authored-By: Claude Opus 4.5 --- internal-packages/ai/package.json | 12 + .../fallacy-check/extraction/config.ts | 69 ++-- .../fallacy-check/extraction/lab-exports.ts | 315 ++++++++++++++++++ .../plugins/fallacy-check/extraction/types.ts | 27 +- .../plugins/fallacy-check/index.ts | 4 +- .../ai/src/tools/fallacy-extractor/types.ts | 2 +- .../ai/src/tools/fallacy-judge/index.ts | 257 +++++++++----- .../ai/src/tools/fallacy-judge/types.ts | 20 ++ meta-evals/src/components/ExtractorLab.tsx | 311 ++++++++++++++++- 9 files changed, 889 insertions(+), 128 deletions(-) create mode 100644 internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/lab-exports.ts diff --git a/internal-packages/ai/package.json 
b/internal-packages/ai/package.json index b41d2b4b..ba79f6cc 100644 --- a/internal-packages/ai/package.json +++ b/internal-packages/ai/package.json @@ -52,6 +52,18 @@ "./fallacy-extraction": { "types": "./dist/analysis-plugins/plugins/fallacy-check/extraction/index.d.ts", "default": "./dist/analysis-plugins/plugins/fallacy-check/extraction/index.js" + }, + "./fallacy-extraction/lab": { + "types": "./dist/analysis-plugins/plugins/fallacy-check/extraction/lab-exports.d.ts", + "default": "./dist/analysis-plugins/plugins/fallacy-check/extraction/lab-exports.js" + }, + "./fallacy-judge": { + "types": "./dist/tools/fallacy-judge/index.d.ts", + "default": "./dist/tools/fallacy-judge/index.js" + }, + "./fallacy-judge/types": { + "types": "./dist/tools/fallacy-judge/types.d.ts", + "default": "./dist/tools/fallacy-judge/types.js" } }, "scripts": { diff --git a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/config.ts b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/config.ts index 29a23b48..b3e2b35b 100644 --- a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/config.ts +++ b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/config.ts @@ -4,7 +4,7 @@ * Parses the FALLACY_EXTRACTORS environment variable and provides defaults. 
*/ -import type { ExtractorConfig, MultiExtractorConfig } from './types'; +import type { ExtractorConfig, MultiExtractorConfig, JudgeConfig } from './types'; /** Default model for extraction when not configured */ const DEFAULT_EXTRACTOR_MODEL = 'claude-sonnet-4-5-20250929'; @@ -178,51 +178,66 @@ function parseExtractorsEnvVar(envValue: string): ExtractorConfig[] { } } +/** + * Parse FALLACY_JUDGE env var + * + * Example: + * FALLACY_JUDGE='{"model":"google/gemini-3-flash-preview","temperature":"default","thinking":false,"enabled":true}' + */ +function parseJudgeEnvVar(): JudgeConfig { + const judgeEnv = process.env.FALLACY_JUDGE; + + if (judgeEnv) { + try { + const parsed = JSON.parse(judgeEnv); + if (typeof parsed === 'object' && parsed !== null && typeof parsed.model === 'string') { + return { + model: parsed.model, + temperature: typeof parsed.temperature === 'number' ? parsed.temperature : + parsed.temperature === 'default' ? 'default' : undefined, + thinking: typeof parsed.thinking === 'boolean' ? parsed.thinking : undefined, + enabled: parsed.enabled !== false, + }; + } + } catch (e) { + console.warn('[Config] Failed to parse FALLACY_JUDGE:', e); + } + } + + // Default: disabled + return { + model: DEFAULT_JUDGE_MODEL, + enabled: false, + }; +} + /** * Get the multi-extractor configuration from environment variables * * Environment variables: * - FALLACY_EXTRACTORS: JSON array of extractor configs - * - FALLACY_EXTRACTOR_MODEL: Single model override (legacy, used if FALLACY_EXTRACTORS not set) - * - FALLACY_JUDGE_MODEL: Model for judge aggregation - * - FALLACY_JUDGE_ENABLED: Enable LLM judge (default: false - uses simple dedup) + * - FALLACY_JUDGE: JSON object with judge config (model, temperature, thinking, enabled) * * Defaults to single extractor with DEFAULT_EXTRACTOR_MODEL if not configured. 
*/ export function getMultiExtractorConfig(): MultiExtractorConfig { const extractorsEnv = process.env.FALLACY_EXTRACTORS; - const legacyModelEnv = process.env.FALLACY_EXTRACTOR_MODEL; - const judgeModelEnv = process.env.FALLACY_JUDGE_MODEL; - const judgeEnabledEnv = process.env.FALLACY_JUDGE_ENABLED; let extractors: ExtractorConfig[]; if (extractorsEnv) { - // Parse multi-extractor config extractors = parseExtractorsEnvVar(extractorsEnv); - if (extractors.length === 0) { - // Parsing failed or empty array, fall back to defaults - console.warn( - '[MultiExtractor] No valid extractors in FALLACY_EXTRACTORS, using defaults' - ); - extractors = [{ model: legacyModelEnv || DEFAULT_EXTRACTOR_MODEL }]; + console.warn('[MultiExtractor] No valid extractors in FALLACY_EXTRACTORS, using defaults'); + extractors = [{ model: DEFAULT_EXTRACTOR_MODEL }]; } - } else if (legacyModelEnv) { - // Legacy single-model configuration - extractors = [{ model: legacyModelEnv }]; } else { - // Default configuration extractors = [{ model: DEFAULT_EXTRACTOR_MODEL }]; } - // Judge is disabled by default - uses simple deduplication instead - const judgeEnabled = judgeEnabledEnv === 'true' || judgeEnabledEnv === '1'; - return { extractors, - judgeModel: judgeModelEnv || DEFAULT_JUDGE_MODEL, - judgeEnabled, + judge: parseJudgeEnvVar(), }; } @@ -231,7 +246,7 @@ export function getMultiExtractorConfig(): MultiExtractorConfig { */ export function isJudgeEnabled(): boolean { const config = getMultiExtractorConfig(); - return config.judgeEnabled; + return config.judge.enabled; } /** @@ -267,9 +282,13 @@ export function getConfigSummary(): string { return `${i + 1}. ${label} (${ext.model}, t=${formatTemp(ext)}${formatThinking(ext)})`; }); + const judgeStatus = config.judge.enabled + ? `${config.judge.model} (t=${config.judge.temperature ?? 
'default'}, think=${config.judge.thinking !== false})` + : 'disabled'; + return [ `Multi-extractor mode: ${config.extractors.length} extractors`, ...extractorSummaries, - `Judge: ${config.judgeEnabled ? config.judgeModel : 'disabled (simple dedup)'}`, + `Judge: ${judgeStatus}`, ].join('\n'); } diff --git a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/lab-exports.ts b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/lab-exports.ts new file mode 100644 index 00000000..a7559d2a --- /dev/null +++ b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/lab-exports.ts @@ -0,0 +1,315 @@ +/** + * Lab-specific exports for Extractor Lab + * + * This file provides STANDALONE types and config parsing for the Extractor Lab + * without importing from files that have circular dependencies with the plugin system. + * + * The types here are intentionally duplicated to avoid the circular dependency chain: + * extraction β†’ fallacy-extractor β†’ constants β†’ (back to plugin) + */ + +// ============================================================================ +// Standalone Type Definitions (duplicated to avoid cycles) +// ============================================================================ + +/** Type of epistemic issue (duplicated from constants.ts ISSUE_TYPES) */ +export type IssueType = + | 'misinformation' + | 'missing-context' + | 'deceptive-wording' + | 'logical-fallacy' + | 'verified-accurate'; + +/** Specific types of fallacies */ +export type FallacyType = + | 'ad-hominem' + | 'straw-man' + | 'false-dilemma' + | 'slippery-slope' + | 'appeal-to-authority' + | 'appeal-to-emotion' + | 'appeal-to-nature' + | 'hasty-generalization' + | 'survivorship-bias' + | 'selection-bias' + | 'cherry-picking' + | 'circular-reasoning' + | 'equivocation' + | 'non-sequitur' + | 'other'; + +/** Raw epistemic issue extracted from text */ +export interface ExtractedFallacyIssue { + exactText: string; + 
issueType: IssueType; + fallacyType?: FallacyType; + severityScore: number; + confidenceScore: number; + reasoning: string; + importanceScore: number; + approximateLineNumber?: number; + location?: { + startOffset: number; + endOffset: number; + quotedText: string; + strategy?: string; + confidence?: number; + }; + [key: string]: unknown; +} + +// ============================================================================ +// Configuration Types +// ============================================================================ + +export interface ExtractorConfig { + model: string; + temperature?: number | 'default'; + label?: string; + thinking?: boolean; +} + +/** + * Judge configuration + */ +export interface JudgeConfig { + model: string; + temperature?: number | 'default'; + thinking?: boolean; + enabled: boolean; +} + +export interface MultiExtractorConfig { + extractors: ExtractorConfig[]; + judge: JudgeConfig; +} + +// ============================================================================ +// Result Types +// ============================================================================ + +export interface ExtractorResult { + extractorId: string; + config: ExtractorConfig; + issues: ExtractedFallacyIssue[]; + durationMs: number; + costUsd?: number; + error?: string; +} + +export interface MultiExtractorResult { + extractorResults: ExtractorResult[]; + totalDurationMs: number; + totalIssuesFound: number; +} + +// ============================================================================ +// Config Parsing (standalone implementation) +// ============================================================================ + +const DEFAULT_EXTRACTOR_MODEL = 'claude-sonnet-4-5-20250929'; +const DEFAULT_JUDGE_MODEL = 'claude-sonnet-4-5-20250929'; +const DEFAULT_CLAUDE_TEMPERATURE = 0; +const DEFAULT_OPENROUTER_TEMPERATURE = 0.1; + +function isOpenRouterModel(model: string): boolean { + return model.includes('/'); +} + +export function getDefaultTemperature(model: 
string): number { + return isOpenRouterModel(model) + ? DEFAULT_OPENROUTER_TEMPERATURE + : DEFAULT_CLAUDE_TEMPERATURE; +} + +export function generateExtractorLabel(config: ExtractorConfig): string { + if (config.label) { + return config.label; + } + + let shortName: string; + if (isOpenRouterModel(config.model)) { + const parts = config.model.split('/'); + shortName = parts[parts.length - 1].replace('-preview', '').replace('-latest', ''); + } else { + if (config.model.includes('opus')) { + shortName = 'opus'; + } else if (config.model.includes('sonnet')) { + shortName = 'sonnet'; + } else if (config.model.includes('haiku')) { + shortName = 'haiku'; + } else { + shortName = config.model.slice(0, 10); + } + } + + const suffixParts: string[] = []; + + if (config.temperature === 'default') { + suffixParts.push('tDef'); + } else { + const defaultTemp = getDefaultTemperature(config.model); + const temp = config.temperature ?? defaultTemp; + if (temp !== defaultTemp) { + suffixParts.push(`t${temp}`); + } + } + + if (config.thinking === false) { + suffixParts.push('noThink'); + } + + if (suffixParts.length > 0) { + return `${shortName}-${suffixParts.join('-')}`; + } + + return shortName; +} + +export function generateExtractorId( + config: ExtractorConfig, + index: number, + allConfigs: ExtractorConfig[] +): string { + const label = generateExtractorLabel(config); + const sameLabels = allConfigs.filter(c => generateExtractorLabel(c) === label); + if (sameLabels.length > 1) { + return `${label}-${index}`; + } + return label; +} + +function parseExtractorsEnvVar(envValue: string): ExtractorConfig[] { + try { + const parsed = JSON.parse(envValue); + + if (!Array.isArray(parsed)) { + console.warn( + '[MultiExtractor] FALLACY_EXTRACTORS must be a JSON array, using defaults' + ); + return []; + } + + const configs: ExtractorConfig[] = []; + for (const item of parsed) { + if (typeof item !== 'object' || item === null) { + continue; + } + + if (typeof item.model !== 'string' || 
!item.model) { + continue; + } + + const config: ExtractorConfig = { + model: item.model, + }; + + if (typeof item.temperature === 'number') { + config.temperature = item.temperature; + } else if (item.temperature === 'default') { + config.temperature = 'default'; + } + + if (typeof item.label === 'string' && item.label) { + config.label = item.label; + } + + if (typeof item.thinking === 'boolean') { + config.thinking = item.thinking; + } + + configs.push(config); + } + + return configs; + } catch (error) { + console.warn( + '[MultiExtractor] Failed to parse FALLACY_EXTRACTORS:', + error instanceof Error ? error.message : error + ); + return []; + } +} + +/** + * Parse FALLACY_JUDGE env var + */ +function parseJudgeEnvVar(): JudgeConfig { + const judgeEnv = process.env.FALLACY_JUDGE; + + if (judgeEnv) { + try { + const parsed = JSON.parse(judgeEnv); + if (typeof parsed === 'object' && parsed !== null && typeof parsed.model === 'string') { + return { + model: parsed.model, + temperature: typeof parsed.temperature === 'number' ? parsed.temperature : + parsed.temperature === 'default' ? 'default' : undefined, + thinking: typeof parsed.thinking === 'boolean' ? 
parsed.thinking : undefined, + enabled: parsed.enabled !== false, + }; + } + } catch (e) { + console.warn('[Config] Failed to parse FALLACY_JUDGE:', e); + } + } + + // Default: disabled + return { + model: DEFAULT_JUDGE_MODEL, + enabled: false, + }; +} + +export function getMultiExtractorConfig(): MultiExtractorConfig { + const extractorsEnv = process.env.FALLACY_EXTRACTORS; + + let extractors: ExtractorConfig[]; + + if (extractorsEnv) { + extractors = parseExtractorsEnvVar(extractorsEnv); + if (extractors.length === 0) { + extractors = [{ model: DEFAULT_EXTRACTOR_MODEL }]; + } + } else { + extractors = [{ model: DEFAULT_EXTRACTOR_MODEL }]; + } + + return { + extractors, + judge: parseJudgeEnvVar(), + }; +} + +export function getConfigSummary(): string { + const config = getMultiExtractorConfig(); + + const formatTemp = (ext: ExtractorConfig): string => { + if (ext.temperature === 'default') return 'default'; + return String(ext.temperature ?? getDefaultTemperature(ext.model)); + }; + + const formatThinking = (ext: ExtractorConfig): string => { + return ext.thinking === false ? ', think=off' : ''; + }; + + if (config.extractors.length === 1) { + const ext = config.extractors[0]; + return `Single extractor: ${ext.model} (t=${formatTemp(ext)}${formatThinking(ext)})`; + } + + const extractorSummaries = config.extractors.map((ext, i) => { + const label = generateExtractorLabel(ext); + return `${i + 1}. ${label} (${ext.model}, t=${formatTemp(ext)}${formatThinking(ext)})`; + }); + + const judgeStatus = config.judge.enabled + ? `${config.judge.model} (t=${config.judge.temperature ?? 
'default'}, think=${config.judge.thinking !== false})` + : 'disabled'; + + return [ + `Multi-extractor mode: ${config.extractors.length} extractors`, + ...extractorSummaries, + `Judge: ${judgeStatus}`, + ].join('\n'); +} diff --git a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/types.ts b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/types.ts index 7125fff6..4809370c 100644 --- a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/types.ts +++ b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/extraction/types.ts @@ -37,6 +37,26 @@ export interface ExtractorConfig { thinking?: boolean; } +/** + * Judge configuration from FALLACY_JUDGE env var + * + * Example: + * FALLACY_JUDGE='{"model":"google/gemini-3-flash-preview","temperature":"default","thinking":false,"enabled":true}' + */ +export interface JudgeConfig { + /** Model to use (Claude or OpenRouter format) */ + model: string; + + /** Temperature (number or "default" for model's native default) */ + temperature?: number | 'default'; + + /** Enable extended thinking/reasoning */ + thinking?: boolean; + + /** Whether the judge is enabled */ + enabled: boolean; +} + /** * Configuration for multi-extractor execution */ @@ -44,11 +64,8 @@ export interface MultiExtractorConfig { /** List of extractor configurations to run in parallel */ extractors: ExtractorConfig[]; - /** Model to use for judge aggregation (default: claude-sonnet-4-5-20250929) */ - judgeModel?: string; - - /** Whether to use LLM judge for aggregation (default: false - uses simple dedup) */ - judgeEnabled: boolean; + /** Judge configuration */ + judge: JudgeConfig; } // ============================================================================ diff --git a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts index a1dba0e4..58f73a80 100644 --- 
a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts +++ b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts @@ -410,7 +410,7 @@ export class FallacyCheckPlugin implements SimpleAnalysisPlugin { if (multiResult.totalIssuesFound === 0) { finalIssues = []; - } else if (successfulExtractors.length <= 1 || !config.judgeEnabled) { + } else if (successfulExtractors.length <= 1 || !config.judge.enabled) { // Single extractor or judge disabled - use simple deduplication if (successfulExtractors.length > 1) { logger.info( @@ -486,7 +486,7 @@ export class FallacyCheckPlugin implements SimpleAnalysisPlugin { extractors: extractorsTelemetry, totalIssuesBeforeJudge: multiResult.totalIssuesFound, totalIssuesAfterJudge: finalIssues.length, - judgeModel: config.judgeModel, + judgeModel: config.judge.model, judgeDurationMs, judgeCostUsd, judgeDecisions, diff --git a/internal-packages/ai/src/tools/fallacy-extractor/types.ts b/internal-packages/ai/src/tools/fallacy-extractor/types.ts index e70ca437..0d070f46 100644 --- a/internal-packages/ai/src/tools/fallacy-extractor/types.ts +++ b/internal-packages/ai/src/tools/fallacy-extractor/types.ts @@ -1,4 +1,4 @@ -import { IssueType } from '../../analysis-plugins/plugins/fallacy-check/constants'; +import type { IssueType } from '../../analysis-plugins/plugins/fallacy-check/constants'; /** * Specific types of fallacies (for logical-fallacy issue type) diff --git a/internal-packages/ai/src/tools/fallacy-judge/index.ts b/internal-packages/ai/src/tools/fallacy-judge/index.ts index 1495d9c4..366182fa 100644 --- a/internal-packages/ai/src/tools/fallacy-judge/index.ts +++ b/internal-packages/ai/src/tools/fallacy-judge/index.ts @@ -12,16 +12,61 @@ import { z } from 'zod'; import { Tool, type ToolContext } from '../base/Tool'; import { callClaudeWithTool } from '../../claude/wrapper'; +import { callOpenRouterWithTool } from '../../utils/openrouter'; import { fallacyJudgeConfig } from './config'; import 
type { FallacyJudgeInput, FallacyJudgeOutput, JudgeDecision, + JudgeConfig, ExtractorIssueInput, } from './types'; // Default model for judge (can be overridden via env var) const DEFAULT_JUDGE_MODEL = 'claude-sonnet-4-5-20250929'; +const DEFAULT_CLAUDE_TEMPERATURE = 0.1; +const DEFAULT_OPENROUTER_TEMPERATURE = 0.1; + +/** + * Check if a model is an OpenRouter model (contains '/') + */ +function isOpenRouterModel(model: string): boolean { + return model.includes('/'); +} + +/** + * Parse FALLACY_JUDGE env var for full config + * + * Example: + * FALLACY_JUDGE='{"model":"google/gemini-3-flash-preview","temperature":"default","thinking":false,"enabled":true}' + */ +export function getJudgeConfig(): JudgeConfig { + const judgeEnv = process.env.FALLACY_JUDGE; + + if (judgeEnv) { + try { + const parsed = JSON.parse(judgeEnv); + if (typeof parsed === 'object' && parsed !== null && typeof parsed.model === 'string') { + return { + model: parsed.model, + temperature: typeof parsed.temperature === 'number' ? parsed.temperature : + parsed.temperature === 'default' ? 'default' : undefined, + thinking: typeof parsed.thinking === 'boolean' ? 
parsed.thinking : undefined, + enabled: parsed.enabled !== false, // Default to true if not specified + }; + } + console.warn('[FallacyJudge] Invalid FALLACY_JUDGE format, using defaults'); + } catch (e) { + console.warn('[FallacyJudge] Failed to parse FALLACY_JUDGE:', e); + } + } + + // Default config when env var not set + return { + model: DEFAULT_JUDGE_MODEL, + enabled: false, // Disabled by default when not configured + }; +} const extractorIssueInputSchema = z.object({ extractorId: z.string(), @@ -123,10 +168,14 @@ export class FallacyJudgeTool extends Tool { - return `[Issue ${idx}] Extractor: ${issue.extractorId} + // Format issues for the LLM, sorted alphabetically by text to group similar issues together + // This makes it easier for the judge to spot duplicates/similar issues + const issuesWithIndices = input.issues.map((issue, idx) => ({ issue, originalIdx: idx })); + issuesWithIndices.sort((a, b) => a.issue.exactText.localeCompare(b.issue.exactText)); + + const formattedIssues = issuesWithIndices + .map(({ issue, originalIdx }) => { + return `[Issue ${originalIdx}] Extractor: ${issue.extractorId} Text: "${issue.exactText.substring(0, 150)}${issue.exactText.length > 150 ? '...' : ''}" Type: ${issue.issueType}${issue.fallacyType ? ` (${issue.fallacyType})` : ''} Severity: ${issue.severityScore}, Confidence: ${issue.confidenceScore}, Importance: ${issue.importanceScore} @@ -178,9 +227,22 @@ Group similar issues together and provide your decisions. Remember: - Explain your reasoning for each decision`; try { - const judgeModel = process.env.FALLACY_JUDGE_MODEL || DEFAULT_JUDGE_MODEL; + const judgeConfig = getJudgeConfig(); + const useOpenRouter = isOpenRouterModel(judgeConfig.model); + + // Determine temperature + const defaultTemp = useOpenRouter ? DEFAULT_OPENROUTER_TEMPERATURE : DEFAULT_CLAUDE_TEMPERATURE; + const temperature = judgeConfig.temperature === 'default' ? undefined : + judgeConfig.temperature ?? 
defaultTemp; + + // Determine thinking + const thinkingEnabled = judgeConfig.thinking !== false; - const result = await callClaudeWithTool<{ + context.logger.info( + `[FallacyJudge] Using ${useOpenRouter ? 'OpenRouter' : 'Claude'} model: ${judgeConfig.model}, temp: ${temperature ?? 'default'}, thinking: ${thinkingEnabled}` + ); + + type JudgeResultType = { decisions: Array<{ decision: 'accept' | 'merge' | 'reject'; finalText: string; @@ -194,91 +256,114 @@ Group similar issues together and provide your decisions. Remember: sourceIssueIndices: number[]; judgeReasoning: string; }>; - }>( - { - model: judgeModel, - system: systemPrompt, - messages: [{ role: 'user', content: userPrompt }], - max_tokens: 4000, - temperature: 0.1, - toolName: 'aggregate_fallacy_issues', - toolDescription: 'Aggregate and deduplicate fallacy issues from multiple extractors', - toolSchema: { - type: 'object', - properties: { - decisions: { - type: 'array', - items: { - type: 'object', - properties: { - decision: { - type: 'string', - enum: ['accept', 'merge', 'reject'], - description: 'Judge decision for this issue/group', - }, - finalText: { - type: 'string', - description: 'Final text for the issue (best formulation)', - }, - finalIssueType: { - type: 'string', - description: 'Final issue type', - }, - finalFallacyType: { - type: 'string', - description: 'Final fallacy type (if applicable)', - }, - finalSeverity: { - type: 'number', - description: 'Final severity score (0-100)', - }, - finalConfidence: { - type: 'number', - description: 'Final confidence score (0-100)', - }, - finalImportance: { - type: 'number', - description: 'Final importance score (0-100)', - }, - finalReasoning: { - type: 'string', - description: 'Best reasoning for this issue', - }, - sourceExtractors: { - type: 'array', - items: { type: 'string' }, - description: 'Which extractors found this issue', - }, - sourceIssueIndices: { - type: 'array', - items: { type: 'number' }, - description: 'Indices of original issues 
in this group', - }, - judgeReasoning: { - type: 'string', - description: 'Why you made this decision', - }, - }, - required: [ - 'decision', - 'finalText', - 'finalIssueType', - 'finalSeverity', - 'finalConfidence', - 'finalImportance', - 'finalReasoning', - 'sourceExtractors', - 'sourceIssueIndices', - 'judgeReasoning', - ], + }; + + const toolSchema = { + type: 'object' as const, + properties: { + decisions: { + type: 'array', + items: { + type: 'object', + properties: { + decision: { + type: 'string', + enum: ['accept', 'merge', 'reject'], + description: 'Judge decision for this issue/group', + }, + finalText: { + type: 'string', + description: 'Final text for the issue (best formulation)', + }, + finalIssueType: { + type: 'string', + description: 'Final issue type', + }, + finalFallacyType: { + type: 'string', + description: 'Final fallacy type (if applicable)', + }, + finalSeverity: { + type: 'number', + description: 'Final severity score (0-100)', + }, + finalConfidence: { + type: 'number', + description: 'Final confidence score (0-100)', + }, + finalImportance: { + type: 'number', + description: 'Final importance score (0-100)', + }, + finalReasoning: { + type: 'string', + description: 'Best reasoning for this issue', + }, + sourceExtractors: { + type: 'array', + items: { type: 'string' }, + description: 'Which extractors found this issue', + }, + sourceIssueIndices: { + type: 'array', + items: { type: 'number' }, + description: 'Indices of original issues in this group', + }, + judgeReasoning: { + type: 'string', + description: 'Why you made this decision', }, }, + required: [ + 'decision', + 'finalText', + 'finalIssueType', + 'finalSeverity', + 'finalConfidence', + 'finalImportance', + 'finalReasoning', + 'sourceExtractors', + 'sourceIssueIndices', + 'judgeReasoning', + ], }, - required: ['decisions'], }, }, - [] - ); + required: ['decisions'], + }; + + let result: { toolResult: JudgeResultType }; + + if (useOpenRouter) { + // Use OpenRouter for 
non-Claude models + result = await callOpenRouterWithTool({ + model: judgeConfig.model, + system: systemPrompt, + messages: [{ role: 'user', content: userPrompt }], + max_tokens: 8000, + ...(temperature !== undefined && { temperature }), + toolName: 'aggregate_fallacy_issues', + toolDescription: 'Aggregate and deduplicate fallacy issues from multiple extractors', + toolSchema, + thinking: thinkingEnabled, + }); + } else { + // Use Claude API directly + result = await callClaudeWithTool( + { + model: judgeConfig.model, + system: systemPrompt, + messages: [{ role: 'user', content: userPrompt }], + max_tokens: 8000, + ...(temperature !== undefined && { temperature }), + toolName: 'aggregate_fallacy_issues', + toolDescription: 'Aggregate and deduplicate fallacy issues from multiple extractors', + toolSchema, + thinking: thinkingEnabled, + }, + [] + ); + } // Separate accepted/rejected decisions const acceptedDecisions: JudgeDecision[] = []; diff --git a/internal-packages/ai/src/tools/fallacy-judge/types.ts b/internal-packages/ai/src/tools/fallacy-judge/types.ts index af25ded1..6ed986f8 100644 --- a/internal-packages/ai/src/tools/fallacy-judge/types.ts +++ b/internal-packages/ai/src/tools/fallacy-judge/types.ts @@ -7,6 +7,26 @@ import type { ExtractedFallacyIssue } from '../fallacy-extractor/types'; +/** + * Judge configuration from FALLACY_JUDGE env var + * + * Example: + * FALLACY_JUDGE='{"model":"google/gemini-3-flash-preview","temperature":"default","thinking":false,"enabled":true}' + */ +export interface JudgeConfig { + /** Model to use (Claude or OpenRouter format) */ + model: string; + + /** Temperature (number or "default" for model's native default) */ + temperature?: number | 'default'; + + /** Enable extended thinking/reasoning */ + thinking?: boolean; + + /** Whether the judge is enabled */ + enabled: boolean; +} + /** * An issue from a specific extractor */ diff --git a/meta-evals/src/components/ExtractorLab.tsx b/meta-evals/src/components/ExtractorLab.tsx 
index 3b011559..654e50ee 100644 --- a/meta-evals/src/components/ExtractorLab.tsx +++ b/meta-evals/src/components/ExtractorLab.tsx @@ -6,15 +6,38 @@ */ import React, { useState, useEffect, useRef } from "react"; -import { Box, Text, useInput } from "ink"; +import { Box, Text, useInput, useStdout } from "ink"; import SelectInput from "ink-select-input"; import Spinner from "ink-spinner"; import { prisma, type DocumentChoice } from "@roast/db"; -import { runMultiExtractor, getMultiExtractorConfig, type ExtractorConfig, type MultiExtractorResult, type ExtractorResult } from "@roast/ai/fallacy-extraction"; -import { truncate, formatDate } from "./helpers"; +import { + getMultiExtractorConfig, + type ExtractorConfig, + type MultiExtractorResult, + type ExtractorResult, +} from "@roast/ai/fallacy-extraction/lab"; +import { runMultiExtractor } from "@roast/ai/fallacy-extraction"; +import fallacyJudgeModule from "@roast/ai/fallacy-judge"; +// CommonJS/ESM interop: default export is wrapped +const fallacyJudgeTool = (fallacyJudgeModule as unknown as { default?: typeof fallacyJudgeModule }).default ?? 
fallacyJudgeModule; +import type { FallacyJudgeOutput, JudgeDecision } from "@roast/ai/fallacy-judge/types"; import { ModelSelector } from "./ModelSelector"; import { DocumentSelector } from "./DocumentSelector"; +/** Truncate string to fit terminal width */ +function truncate(str: string, maxLen: number): string { + if (str.length <= maxLen) return str; + return str.slice(0, maxLen - 1) + "…"; +} + +// Simple logger for the judge tool +const simpleLogger = { + info: (...args: unknown[]) => console.error("[INFO]", ...args), + warn: (...args: unknown[]) => console.error("[WARN]", ...args), + error: (...args: unknown[]) => console.error("[ERROR]", ...args), + debug: (...args: unknown[]) => {}, +}; + interface ExtractorLabProps { height: number; maxItems: number; @@ -29,7 +52,10 @@ type LabStep = | { type: "add-extractor" } | { type: "running" } | { type: "results"; result: MultiExtractorResult } - | { type: "issue-detail"; result: MultiExtractorResult; extractorIdx: number; issueIdx: number }; + | { type: "issue-detail"; result: MultiExtractorResult; extractorIdx: number; issueIdx: number } + | { type: "running-judge"; result: MultiExtractorResult } + | { type: "judge-results"; result: MultiExtractorResult; judgeResult: FallacyJudgeOutput } + | { type: "judge-decision-detail"; result: MultiExtractorResult; judgeResult: FallacyJudgeOutput; decision: JudgeDecision; isRejected: boolean }; // Load extractor configs from FALLACY_EXTRACTORS env var, fallback to default function getInitialExtractorConfigs(): ExtractorConfig[] { @@ -45,9 +71,24 @@ function getInitialExtractorConfigs(): ExtractorConfig[] { const TEMP_PRESETS = ["default", 0, 0.3, 0.5, 0.7, 1.0] as const; export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, onBack }: ExtractorLabProps) { + const { stdout } = useStdout(); const [step, setStep] = useState({ type: "select-document" }); const [selectedDoc, setSelectedDoc] = useState(null); const [documentText, setDocumentText] = 
useState(""); + + // Calculate available width for text based on terminal width + // Border overhead: β”‚ (1) + padding (1) + content + padding (1) + β”‚ (1) = 4 + // SelectInput indicator: "❯ " or " " = 2 + // Total frame overhead = 6 + const termWidth = stdout?.columns ?? 120; + + // For extraction results: " πŸ”΄ [issueType] text" + // Overhead: indicator(2) + spaces(2) + emoji(2) + space(1) + [type](~18) + space(1) = ~26 + const issueTextWidth = Math.max(40, termWidth - 6 - 26); + + // For judge decisions: "[+] type.padEnd(18) text [A,B]" + // Overhead: indicator(2) + [+]space(4) + type(18) + space(1) + space(1) + [A,B](10) = 36 + const judgeTextWidth = Math.max(40, termWidth - 6 - 36); const [extractorConfigs, setExtractorConfigs] = useState(getInitialExtractorConfigs); const [error, setError] = useState(null); const [highlightedItem, setHighlightedItem] = useState(""); @@ -95,7 +136,7 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o try { const result = await runMultiExtractor(documentText, { extractors: extractorConfigs, - judgeEnabled: extractorConfigs.length > 1, // Enable judge if multiple extractors + judge: { model: "", enabled: false }, // We'll run judge manually for instrumentation }); setStep({ type: "results", result }); @@ -105,12 +146,54 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o } } + async function runJudge(extractionResult: MultiExtractorResult) { + setStep({ type: "running-judge", result: extractionResult }); + + try { + // Flatten all issues from all extractors + const allIssues = extractionResult.extractorResults.flatMap((r) => + r.issues.map((issue) => ({ + extractorId: r.extractorId, + exactText: issue.exactText, + issueType: issue.issueType, + fallacyType: issue.fallacyType, + severityScore: issue.severityScore, + confidenceScore: issue.confidenceScore, + importanceScore: issue.importanceScore, + reasoning: issue.reasoning, + })) + ); + + const extractorIds = 
extractionResult.extractorResults + .filter((r) => !r.error) + .map((r) => r.extractorId); + + const judgeResult = await fallacyJudgeTool.execute( + { + documentText, + issues: allIssues, + extractorIds, + }, + { logger: simpleLogger } + ); + + setStep({ type: "judge-results", result: extractionResult, judgeResult }); + } catch (e) { + setError(`Judge failed: ${e}`); + setStep({ type: "results", result: extractionResult }); + } + } + // Handle keyboard input - use ref to avoid stale closure useInput((input, key) => { if (key.escape) { const currentStep = stepRef.current; if (currentStep.type === "issue-detail") { setStep({ type: "results", result: currentStep.result }); + } else if (currentStep.type === "judge-decision-detail") { + setStep({ type: "judge-results", result: currentStep.result, judgeResult: currentStep.judgeResult }); + } else if (currentStep.type === "judge-results") { + setStep({ type: "results", result: currentStep.result }); } else if (currentStep.type === "results") { setStep({ type: "configure-extractors" }); } else if (currentStep.type === "add-extractor") { @@ -120,7 +203,7 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o } else if (currentStep.type === "select-document") { onBack(); } - // Don't call onBack for running state + // Don't call onBack for running/running-judge states } // Handle 'd' to delete extractor and 't' to cycle temperature (only on configure screen) @@ -292,9 +375,11 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o if (step.type === "results") { const { result } = step; const totalIssues = result.extractorResults.reduce((sum, r) => sum + r.issues.length, 0); + const hasMultipleExtractors = result.extractorResults.filter((r) => !r.error).length > 1; // Build flat list of issues with extractor info const issueItems: Array<{ label: string; value: string }> = []; + result.extractorResults.forEach((r, extractorIdx) => { // Add extractor header const tempStr 
= r.config.temperature === 'default' ? 'tDef' : `t${r.config.temperature}`; @@ -307,17 +392,27 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o r.issues.forEach((issue, issueIdx) => { const severityColor = issue.severityScore >= 70 ? 'πŸ”΄' : issue.severityScore >= 40 ? '🟑' : '🟒'; issueItems.push({ - label: ` ${severityColor} [${issue.issueType}] ${truncate(issue.exactText.replace(/\n/g, ' '), 60)}`, + label: ` ${severityColor} [${issue.issueType}] ${truncate(issue.exactText.replace(/\n/g, ' '), issueTextWidth)}`, value: `issue-${extractorIdx}-${issueIdx}`, }); }); }); + + // Actions at the bottom + issueItems.push({ label: "───────────────────────────────────────────────────────────────────────────", value: "sep-1" }); + if (hasMultipleExtractors && totalIssues > 0) { + issueItems.push({ + label: `βš–οΈ Run Judge (aggregate ${totalIssues} issues from ${result.extractorResults.length} extractors)`, + value: "run-judge", + }); + } issueItems.push({ label: "← Back to Configure", value: "back" }); return ( - Extractor Lab - Results + Extractor Lab - Extraction Results: + {selectedDoc?.title} @@ -334,8 +429,13 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o items={issueItems} limit={maxItems - 3} onSelect={(item) => { - if (item.value === "back") { + if (item.value.startsWith("sep-") || item.value.startsWith("header-")) { + // Ignore separators and headers + return; + } else if (item.value === "back") { setStep({ type: "configure-extractors" }); + } else if (item.value === "run-judge") { + runJudge(result); } else if (item.value.startsWith("issue-")) { const [, extractorIdx, issueIdx] = item.value.split("-"); setStep({ @@ -396,5 +496,198 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o ); } + // Running judge + if (step.type === "running-judge") { + const totalIssues = step.result.extractorResults.reduce((sum, r) => sum + r.issues.length, 0); + return ( + + + 
Extractor Lab - Running Judge + + + + + Aggregating {totalIssues} issues from {step.result.extractorResults.length} extractors... + + + + + The judge will deduplicate, merge, and filter issues + + + ); + } + + // Judge results + if (step.type === "judge-results") { + const { result, judgeResult } = step; + const totalInputIssues = result.extractorResults.reduce((sum, r) => sum + r.issues.length, 0); + + // Create legend mapping extractor IDs to short keys (A, B, C, ...) + const extractorIds = result.extractorResults.map(r => r.extractorId); + const extractorKeys: Record = {}; + extractorIds.forEach((id, i) => { + extractorKeys[id] = String.fromCharCode(65 + i); // A, B, C, ... + }); + + // Helper to convert extractor IDs to short keys + const sourcesToKeys = (sources: string[]): string => { + return sources.map(s => extractorKeys[s] || "?").join(","); + }; + + // Build list of judge decisions + const decisionItems: Array<{ label: string; value: string }> = []; + + // Accepted/merged decisions + judgeResult.acceptedDecisions.forEach((decision, idx) => { + const symbol = decision.decision === "merge" ? 
"[*]" : "[+]"; + const keys = sourcesToKeys(decision.sourceExtractors); + const text = truncate(decision.finalText.replace(/\n/g, ' '), judgeTextWidth).padEnd(judgeTextWidth); + decisionItems.push({ + label: `${symbol} ${decision.finalIssueType.padEnd(18)} ${text} [${keys}]`, + value: `accepted-${idx}`, + }); + }); + + // Rejected decisions + judgeResult.rejectedDecisions.forEach((decision, idx) => { + const keys = sourcesToKeys(decision.sourceExtractors); + const text = truncate(decision.finalText.replace(/\n/g, ' '), judgeTextWidth).padEnd(judgeTextWidth); + decisionItems.push({ + label: `[x] ${decision.finalIssueType.padEnd(18)} ${text} [${keys}]`, + value: `rejected-${idx}`, + }); + }); + + decisionItems.push({ label: "───────────────────────────────────────────────────────────────────────────────────────", value: "sep-1" }); + decisionItems.push({ label: "Back to Extraction Results", value: "back" }); + + // Build legend string + const legendParts = extractorIds.map((id, i) => `${String.fromCharCode(65 + i)}=${id}`); + const legendStr = legendParts.join(" "); + + return ( + + + Extractor Lab - Judge Results: + {selectedDoc?.title} + + + + + Input: {totalInputIssues} issues + --> + {judgeResult.summary.acceptedCount} accepted + | + {judgeResult.summary.mergedCount} merged + | + {judgeResult.summary.rejectedCount} rejected + + Legend: [+]=accept [*]=merge [x]=reject | {legendStr} + + + { + if (item.value.startsWith("sep-")) { + return; // Ignore separators + } else if (item.value === "back") { + setStep({ type: "results", result }); + } else if (item.value.startsWith("accepted-")) { + const idx = parseInt(item.value.replace("accepted-", ""), 10); + setStep({ + type: "judge-decision-detail", + result, + judgeResult, + decision: judgeResult.acceptedDecisions[idx], + isRejected: false, + }); + } else if (item.value.startsWith("rejected-")) { + const idx = parseInt(item.value.replace("rejected-", ""), 10); + setStep({ + type: "judge-decision-detail", + result, + 
judgeResult, + decision: judgeResult.rejectedDecisions[idx], + isRejected: true, + }); + } + }} + /> + + + Enter=View Detail | Escape=Back + + + ); + } + + // Judge decision detail + if (step.type === "judge-decision-detail") { + const { decision, isRejected } = step; + + return ( + + + + Judge Decision: {decision.decision.toUpperCase()} + + + + + + Decision: + {decision.decision} + + + Type: + {decision.finalIssueType} + {decision.finalFallacyType && ({decision.finalFallacyType})} + + + Severity: + = 70 ? "red" : decision.finalSeverity >= 40 ? "yellow" : "green"}> + {decision.finalSeverity}/100 + + | + Confidence: {decision.finalConfidence}/100 + | + Importance: {decision.finalImportance}/100 + + + Source Extractors: + {decision.sourceExtractors.join(", ")} + + + + + Quoted Text: + + "{decision.finalText}" + + + + + Judge Reasoning: + + {decision.judgeReasoning} + + + + + Issue Reasoning: + + {decision.finalReasoning} + + + + + Press Escape to go back to judge results + + + ); + } + return null; } From e2943c968c56af4cae703166cd8fe0788d96feba Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 11 Jan 2026 19:06:27 +0000 Subject: [PATCH 32/72] feat(meta-evals): Add multi-judge selection and comparison view - Add checkbox multi-select for judges (can run multiple in parallel) - Add judge comparison view showing all judges' results side-by-side - Show agreement statistics (issues accepted by all vs any judge) - Drill down from comparison to individual judge results - Fix Zod schema issues by removing problematic `satisfies` constraints - Increase max_tokens for OpenRouter from 8000 to 16000 - Add better error handling for finish_reason: length (token limit) - Add getJudgesConfig() to support array of judges from env var - Add generateJudgeLabel() for display labels Co-Authored-By: Claude Opus 4.5 --- .../ai/src/tools/fallacy-judge/index.ts | 263 ++++++++++----- .../ai/src/tools/fallacy-judge/types.ts | 6 + internal-packages/ai/src/utils/openrouter.ts | 5 + 
meta-evals/src/components/ExtractorLab.tsx | 306 +++++++++++++++--- 4 files changed, 459 insertions(+), 121 deletions(-) diff --git a/internal-packages/ai/src/tools/fallacy-judge/index.ts b/internal-packages/ai/src/tools/fallacy-judge/index.ts index 366182fa..e9a4af95 100644 --- a/internal-packages/ai/src/tools/fallacy-judge/index.ts +++ b/internal-packages/ai/src/tools/fallacy-judge/index.ts @@ -10,8 +10,9 @@ */ import { z } from 'zod'; +import Anthropic from '@anthropic-ai/sdk'; import { Tool, type ToolContext } from '../base/Tool'; -import { callClaudeWithTool } from '../../claude/wrapper'; +import { callClaude, callClaudeWithTool } from '../../claude/wrapper'; import { callOpenRouterWithTool } from '../../utils/openrouter'; import { fallacyJudgeConfig } from './config'; import type { @@ -35,7 +36,67 @@ function isOpenRouterModel(model: string): boolean { } /** - * Parse FALLACY_JUDGE env var for full config + * Parse a single judge config object + */ +function parseJudgeConfigObject(parsed: unknown): JudgeConfig | null { + if (typeof parsed === 'object' && parsed !== null && typeof (parsed as Record).model === 'string') { + const obj = parsed as Record; + return { + model: obj.model as string, + temperature: typeof obj.temperature === 'number' ? obj.temperature : + obj.temperature === 'default' ? 'default' : undefined, + thinking: typeof obj.thinking === 'boolean' ? obj.thinking : undefined, + label: typeof obj.label === 'string' ? 
obj.label : undefined, + enabled: obj.enabled !== false, + }; + } + return null; +} + +/** + * Parse FALLACY_JUDGES env var for array of judge configs + * Also accepts array in FALLACY_JUDGE for convenience + * + * Example: + * FALLACY_JUDGES='[{"model":"claude-sonnet-4-5-20250929","thinking":true},{"model":"google/gemini-3-flash-preview","thinking":false}]' + */ +export function getJudgesConfig(): JudgeConfig[] { + // Try FALLACY_JUDGES first, then FALLACY_JUDGE (both can contain arrays) + const judgesEnv = process.env.FALLACY_JUDGES || process.env.FALLACY_JUDGE; + + if (judgesEnv) { + try { + const parsed = JSON.parse(judgesEnv); + if (Array.isArray(parsed)) { + const configs: JudgeConfig[] = []; + for (const item of parsed) { + const config = parseJudgeConfigObject(item); + if (config) { + configs.push(config); + } + } + if (configs.length > 0) { + return configs; + } + } else { + // Single object in FALLACY_JUDGE + const config = parseJudgeConfigObject(parsed); + if (config && config.enabled) { + return [config]; + } + } + console.warn('[FallacyJudge] Invalid FALLACY_JUDGES/FALLACY_JUDGE format'); + } catch (e) { + console.warn('[FallacyJudge] Failed to parse FALLACY_JUDGES/FALLACY_JUDGE:', e); + } + } + + // Default: empty array (no judges configured) + return []; +} + +/** + * Parse FALLACY_JUDGE env var for single judge config (legacy) * * Example: * FALLACY_JUDGE='{"model":"google/gemini-3-flash-preview","temperature":"default","thinking":false,"enabled":true}' @@ -46,14 +107,9 @@ export function getJudgeConfig(): JudgeConfig { if (judgeEnv) { try { const parsed = JSON.parse(judgeEnv); - if (typeof parsed === 'object' && parsed !== null && typeof parsed.model === 'string') { - return { - model: parsed.model, - temperature: typeof parsed.temperature === 'number' ? parsed.temperature : - parsed.temperature === 'default' ? 'default' : undefined, - thinking: typeof parsed.thinking === 'boolean' ? 
parsed.thinking : undefined, - enabled: parsed.enabled !== false, // Default to true if not specified - }; + const config = parseJudgeConfigObject(parsed); + if (config) { + return config; } console.warn('[FallacyJudge] Invalid FALLACY_JUDGE format, using defaults'); } catch (e) { @@ -68,6 +124,53 @@ export function getJudgeConfig(): JudgeConfig { }; } +/** + * Generate a display label for a judge config + */ +export function generateJudgeLabel(config: JudgeConfig): string { + if (config.label) { + return config.label; + } + + // Extract short model name + let shortName: string; + if (isOpenRouterModel(config.model)) { + const parts = config.model.split('/'); + shortName = parts[parts.length - 1].replace('-preview', '').replace('-latest', ''); + } else { + if (config.model.includes('opus')) { + shortName = 'opus'; + } else if (config.model.includes('sonnet')) { + shortName = 'sonnet'; + } else if (config.model.includes('haiku')) { + shortName = 'haiku'; + } else { + shortName = config.model.slice(0, 10); + } + } + + // Build suffix parts + const suffixParts: string[] = []; + + if (config.temperature === 'default') { + suffixParts.push('tDef'); + } else if (config.temperature !== undefined) { + suffixParts.push(`t${config.temperature}`); + } + + if (config.thinking === false) { + suffixParts.push('noThink'); + } else if (config.thinking === true) { + suffixParts.push('think'); + } + + if (suffixParts.length > 0) { + return `${shortName}-${suffixParts.join('-')}`; + } + + return shortName; +} + const extractorIssueInputSchema = z.object({ extractorId: z.string(), exactText: z.string(), @@ -77,13 +180,22 @@ const extractorIssueInputSchema = z.object({ confidenceScore: z.number(), importanceScore: z.number(), reasoning: z.string(), -}) satisfies z.ZodType; +}); + +const judgeConfigSchema = z.object({ + model: z.string(), + temperature: z.union([z.number(), z.literal('default')]).optional(), + thinking: z.boolean().optional(), + label: z.string().optional(), + enabled: 
z.boolean(), +}); const inputSchema = z.object({ documentText: z.string().min(1), issues: z.array(extractorIssueInputSchema), extractorIds: z.array(z.string()), -}) satisfies z.ZodType; + judgeConfig: judgeConfigSchema.optional(), +}); const judgeDecisionSchema = z.object({ decision: z.enum(['accept', 'merge', 'reject']), @@ -97,7 +209,7 @@ const judgeDecisionSchema = z.object({ sourceExtractors: z.array(z.string()), sourceIssueIndices: z.array(z.number()), judgeReasoning: z.string(), -}) satisfies z.ZodType; +}); const outputSchema = z.object({ acceptedDecisions: z.array(judgeDecisionSchema), @@ -109,7 +221,7 @@ const outputSchema = z.object({ mergedCount: z.number(), rejectedCount: z.number(), }), -}) satisfies z.ZodType; +}); export class FallacyJudgeTool extends Tool { config = fallacyJudgeConfig; @@ -120,6 +232,7 @@ export class FallacyJudgeTool extends Tool { + const startTime = Date.now(); context.logger.info( `[FallacyJudge] Aggregating ${input.issues.length} issues from ${input.extractorIds.length} extractors` ); @@ -227,7 +340,8 @@ Group similar issues together and provide your decisions. Remember: - Explain your reasoning for each decision`; try { - const judgeConfig = getJudgeConfig(); + // Use passed config if provided, otherwise fall back to env var config + const judgeConfig = input.judgeConfig ?? getJudgeConfig(); const useOpenRouter = isOpenRouterModel(judgeConfig.model); // Determine temperature @@ -336,11 +450,12 @@ Group similar issues together and provide your decisions. 
Remember: if (useOpenRouter) { // Use OpenRouter for non-Claude models + // Use 16000 max_tokens to handle large outputs with many issues result = await callOpenRouterWithTool({ model: judgeConfig.model, system: systemPrompt, messages: [{ role: 'user', content: userPrompt }], - max_tokens: 8000, + max_tokens: 16000, ...(temperature !== undefined && { temperature }), toolName: 'aggregate_fallacy_issues', toolDescription: 'Aggregate and deduplicate fallacy issues from multiple extractors', @@ -349,20 +464,52 @@ Group similar issues together and provide your decisions. Remember: }); } else { // Use Claude API directly - result = await callClaudeWithTool( - { - model: judgeConfig.model, - system: systemPrompt, - messages: [{ role: 'user', content: userPrompt }], - max_tokens: 8000, - ...(temperature !== undefined && { temperature }), - toolName: 'aggregate_fallacy_issues', - toolDescription: 'Aggregate and deduplicate fallacy issues from multiple extractors', - toolSchema, - thinking: thinkingEnabled, - }, - [] - ); + if (thinkingEnabled) { + // When thinking is enabled, use tool_choice: 'auto' to allow thinking + // (forced tool_choice like 'any' or specific tool is incompatible with extended thinking) + const claudeResult = await callClaude( + { + model: judgeConfig.model, + system: systemPrompt, + messages: [{ role: 'user', content: userPrompt }], + max_tokens: 16000, // Must be > thinking.budget_tokens (10000) + ...(temperature !== undefined && { temperature }), + tools: [{ + name: 'aggregate_fallacy_issues', + description: 'Aggregate and deduplicate fallacy issues from multiple extractors', + input_schema: toolSchema, + }], + tool_choice: { type: 'auto' }, + thinking: true, + }, + [] + ); + + // Extract tool result from response + const toolUse = claudeResult.response.content.find( + (c): c is Anthropic.Messages.ToolUseBlock => c.type === 'tool_use' + ); + if (!toolUse) { + throw new Error('Judge did not call the aggregation tool - no tool use in response'); + } + 
result = { toolResult: toolUse.input as JudgeResultType }; + } else { + // Without thinking, use forced tool_choice for guaranteed structure + result = await callClaudeWithTool( + { + model: judgeConfig.model, + system: systemPrompt, + messages: [{ role: 'user', content: userPrompt }], + max_tokens: 8000, + ...(temperature !== undefined && { temperature }), + toolName: 'aggregate_fallacy_issues', + toolDescription: 'Aggregate and deduplicate fallacy issues from multiple extractors', + toolSchema, + thinking: false, + }, + [] + ); + } } // Separate accepted/rejected decisions @@ -395,8 +542,9 @@ Group similar issues together and provide your decisions. Remember: } } + const durationMs = Date.now() - startTime; context.logger.info( - `[FallacyJudge] Aggregation complete: ${acceptedDecisions.length} accepted, ${mergedCount} merged, ${rejectedDecisions.length} rejected` + `[FallacyJudge] Aggregation complete in ${(durationMs / 1000).toFixed(1)}s: ${acceptedDecisions.length} accepted, ${mergedCount} merged, ${rejectedDecisions.length} rejected` ); return { @@ -412,57 +560,8 @@ Group similar issues together and provide your decisions. Remember: }; } catch (error) { context.logger.error('[FallacyJudge] Aggregation failed:', error); - - // Fallback: Simple deduplication without LLM - // Keep all issues, grouping by similar text - const groups = new Map(); - for (let i = 0; i < input.issues.length; i++) { - const issue = input.issues[i]; - const normalizedText = issue.exactText.toLowerCase().replace(/\s+/g, ' ').trim(); - const existing = groups.get(normalizedText); - if (existing) { - existing.push(i); - } else { - groups.set(normalizedText, [i]); - } - } - - const acceptedDecisions: JudgeDecision[] = []; - for (const [, indices] of groups) { - // Pick the issue with highest confidence - const bestIdx = indices.reduce((best, current) => - input.issues[current].confidenceScore > input.issues[best].confidenceScore - ? 
current - : best - ); - const bestIssue = input.issues[bestIdx]; - - acceptedDecisions.push({ - decision: indices.length > 1 ? 'merge' : 'accept', - finalText: bestIssue.exactText, - finalIssueType: bestIssue.issueType, - finalFallacyType: bestIssue.fallacyType, - finalSeverity: bestIssue.severityScore, - finalConfidence: bestIssue.confidenceScore, - finalImportance: bestIssue.importanceScore, - finalReasoning: bestIssue.reasoning, - sourceExtractors: [...new Set(indices.map((i) => input.issues[i].extractorId))], - sourceIssueIndices: indices, - judgeReasoning: 'Fallback deduplication (LLM judge unavailable)', - }); - } - - return { - acceptedDecisions, - rejectedDecisions: [], - summary: { - totalInputIssues: input.issues.length, - uniqueGroups: groups.size, - acceptedCount: acceptedDecisions.length, - mergedCount: acceptedDecisions.filter((d) => d.decision === 'merge').length, - rejectedCount: 0, - }, - }; + // Re-throw to surface error to user - don't silently fallback + throw error; } } } diff --git a/internal-packages/ai/src/tools/fallacy-judge/types.ts b/internal-packages/ai/src/tools/fallacy-judge/types.ts index 6ed986f8..ac4cd30d 100644 --- a/internal-packages/ai/src/tools/fallacy-judge/types.ts +++ b/internal-packages/ai/src/tools/fallacy-judge/types.ts @@ -23,6 +23,9 @@ export interface JudgeConfig { /** Enable extended thinking/reasoning */ thinking?: boolean; + /** Optional display label (auto-generated if not provided) */ + label?: string; + /** Whether the judge is enabled */ enabled: boolean; } @@ -68,6 +71,9 @@ export interface FallacyJudgeInput { /** List of extractor IDs that contributed */ extractorIds: string[]; + + /** Optional config override (if not provided, reads from FALLACY_JUDGE env var) */ + judgeConfig?: JudgeConfig; } /** diff --git a/internal-packages/ai/src/utils/openrouter.ts b/internal-packages/ai/src/utils/openrouter.ts index 71ec99ce..0dc07967 100644 --- a/internal-packages/ai/src/utils/openrouter.ts +++ 
b/internal-packages/ai/src/utils/openrouter.ts @@ -452,6 +452,11 @@ export async function callOpenRouterWithTool( console.error(` finish_reason: ${choice.finish_reason}`); console.error(` message.content: ${choice.message?.content?.substring(0, 500) || '(empty)'}`); console.error(` tool_calls: ${JSON.stringify(choice.message?.tool_calls || [])}`); + + // Provide specific error for finish_reason: length + if (choice.finish_reason === 'length') { + throw new Error(`Response truncated (max_tokens too small) - model ${options.model} ran out of tokens before completing the tool call`); + } throw new Error(`No tool call found for ${options.toolName}`); } diff --git a/meta-evals/src/components/ExtractorLab.tsx b/meta-evals/src/components/ExtractorLab.tsx index 654e50ee..b027bfba 100644 --- a/meta-evals/src/components/ExtractorLab.tsx +++ b/meta-evals/src/components/ExtractorLab.tsx @@ -18,9 +18,13 @@ import { } from "@roast/ai/fallacy-extraction/lab"; import { runMultiExtractor } from "@roast/ai/fallacy-extraction"; import fallacyJudgeModule from "@roast/ai/fallacy-judge"; -// CommonJS/ESM interop: default export is wrapped +// CommonJS/ESM interop: default export is wrapped, named exports need unwrapping too const fallacyJudgeTool = (fallacyJudgeModule as unknown as { default?: typeof fallacyJudgeModule }).default ?? 
fallacyJudgeModule; -import type { FallacyJudgeOutput, JudgeDecision } from "@roast/ai/fallacy-judge/types"; +const { getJudgesConfig, generateJudgeLabel } = fallacyJudgeModule as unknown as { + getJudgesConfig: () => import("@roast/ai/fallacy-judge/types").JudgeConfig[]; + generateJudgeLabel: (config: import("@roast/ai/fallacy-judge/types").JudgeConfig) => string; +}; +import type { FallacyJudgeOutput, JudgeDecision, JudgeConfig } from "@roast/ai/fallacy-judge/types"; import { ModelSelector } from "./ModelSelector"; import { DocumentSelector } from "./DocumentSelector"; @@ -46,6 +50,15 @@ interface ExtractorLabProps { onBack: () => void; } +/** Result from a single judge run with its config */ +interface JudgeRunResult { + config: JudgeConfig; + label: string; + result: FallacyJudgeOutput; + durationMs: number; + error?: string; +} + type LabStep = | { type: "select-document" } | { type: "configure-extractors" } @@ -53,9 +66,10 @@ type LabStep = | { type: "running" } | { type: "results"; result: MultiExtractorResult } | { type: "issue-detail"; result: MultiExtractorResult; extractorIdx: number; issueIdx: number } - | { type: "running-judge"; result: MultiExtractorResult } - | { type: "judge-results"; result: MultiExtractorResult; judgeResult: FallacyJudgeOutput } - | { type: "judge-decision-detail"; result: MultiExtractorResult; judgeResult: FallacyJudgeOutput; decision: JudgeDecision; isRejected: boolean }; + | { type: "running-judge"; result: MultiExtractorResult; judgeConfigs: JudgeConfig[] } + | { type: "judge-comparison"; result: MultiExtractorResult; judgeResults: JudgeRunResult[] } + | { type: "judge-results"; result: MultiExtractorResult; judgeResult: FallacyJudgeOutput; judgeLabel: string } + | { type: "judge-decision-detail"; result: MultiExtractorResult; judgeResult: FallacyJudgeOutput; decision: JudgeDecision; isRejected: boolean; judgeLabel: string }; // Load extractor configs from FALLACY_EXTRACTORS env var, fallback to default function 
getInitialExtractorConfigs(): ExtractorConfig[] { @@ -90,6 +104,8 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o // Overhead: indicator(2) + [+]space(4) + type(18) + space(1) + space(1) + [A,B](10) = 36 const judgeTextWidth = Math.max(40, termWidth - 6 - 36); const [extractorConfigs, setExtractorConfigs] = useState(getInitialExtractorConfigs); + const [availableJudges] = useState(() => getJudgesConfig()); + const [selectedJudgeIdxs, setSelectedJudgeIdxs] = useState>(() => new Set([0])); // First judge selected by default const [error, setError] = useState(null); const [highlightedItem, setHighlightedItem] = useState(""); @@ -146,40 +162,87 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o } } - async function runJudge(extractionResult: MultiExtractorResult) { - setStep({ type: "running-judge", result: extractionResult }); + async function runJudge(extractionResult: MultiExtractorResult, judgeConfig?: JudgeConfig, judgeLabel?: string): Promise { + // Flatten all issues from all extractors + const allIssues = extractionResult.extractorResults.flatMap((r) => + r.issues.map((issue) => ({ + extractorId: r.extractorId, + exactText: issue.exactText, + issueType: issue.issueType, + fallacyType: issue.fallacyType, + severityScore: issue.severityScore, + confidenceScore: issue.confidenceScore, + importanceScore: issue.importanceScore, + reasoning: issue.reasoning, + })) + ); - try { - // Flatten all issues from all extractors - const allIssues = extractionResult.extractorResults.flatMap((r) => - r.issues.map((issue) => ({ - extractorId: r.extractorId, - exactText: issue.exactText, - issueType: issue.issueType, - fallacyType: issue.fallacyType, - severityScore: issue.severityScore, - confidenceScore: issue.confidenceScore, - importanceScore: issue.importanceScore, - reasoning: issue.reasoning, - })) - ); + const extractorIds = extractionResult.extractorResults + .filter((r) => !r.error) + .map((r) => 
r.extractorId); - const extractorIds = extractionResult.extractorResults - .filter((r) => !r.error) - .map((r) => r.extractorId); + const startTime = Date.now(); + const label = judgeLabel || (judgeConfig ? generateJudgeLabel(judgeConfig) : "default"); + try { const judgeResult = await fallacyJudgeTool.execute( { documentText, issues: allIssues, extractorIds, + judgeConfig, }, { logger: simpleLogger } ); - setStep({ type: "judge-results", result: extractionResult, judgeResult }); + return { + config: judgeConfig || { model: "default", enabled: true }, + label, + result: judgeResult, + durationMs: Date.now() - startTime, + }; } catch (e) { - setError(`Judge failed: ${e}`); + return { + config: judgeConfig || { model: "default", enabled: true }, + label, + result: { + acceptedDecisions: [], + rejectedDecisions: [], + summary: { totalInputIssues: allIssues.length, uniqueGroups: 0, acceptedCount: 0, mergedCount: 0, rejectedCount: 0 }, + }, + durationMs: Date.now() - startTime, + error: String(e), + }; + } + } + + async function runMultipleJudges(extractionResult: MultiExtractorResult, judgeConfigs: JudgeConfig[]) { + setStep({ type: "running-judge", result: extractionResult, judgeConfigs }); + + try { + // Run all judges in parallel + const judgePromises = judgeConfigs.map(config => + runJudge(extractionResult, config, generateJudgeLabel(config)) + ); + + const judgeResults = await Promise.all(judgePromises); + + // Check if any had errors + const errored = judgeResults.filter(r => r.error); + if (errored.length === judgeResults.length) { + throw new Error(`All judges failed: ${errored[0].error}`); + } + + // If only one judge was selected, go directly to its results + if (judgeResults.length === 1) { + const single = judgeResults[0]; + setStep({ type: "judge-results", result: extractionResult, judgeResult: single.result, judgeLabel: single.label }); + } else { + // Multiple judges - show comparison view + setStep({ type: "judge-comparison", result: extractionResult, 
judgeResults }); + } + } catch (e) { + setError(`Judges failed: ${e}`); setStep({ type: "results", result: extractionResult }); } } @@ -191,9 +254,11 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o if (currentStep.type === "issue-detail") { setStep({ type: "results", result: currentStep.result }); } else if (currentStep.type === "judge-decision-detail") { - setStep({ type: "judge-results", result: currentStep.result, judgeResult: currentStep.judgeResult }); + setStep({ type: "judge-results", result: currentStep.result, judgeResult: currentStep.judgeResult, judgeLabel: currentStep.judgeLabel }); } else if (currentStep.type === "judge-results") { setStep({ type: "results", result: currentStep.result }); + } else if (currentStep.type === "judge-comparison") { + setStep({ type: "results", result: currentStep.result }); } else if (currentStep.type === "results") { setStep({ type: "configure-extractors" }); } else if (currentStep.type === "add-extractor") { @@ -400,11 +465,40 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o // Actions at the bottom issueItems.push({ label: "───────────────────────────────────────────────────────────────────────────", value: "sep-1" }); + + // Judge selection (only if we have multiple extractors with issues) if (hasMultipleExtractors && totalIssues > 0) { - issueItems.push({ - label: `βš–οΈ Run Judge (aggregate ${totalIssues} issues from ${result.extractorResults.length} extractors)`, - value: "run-judge", - }); + if (availableJudges.length > 0) { + // Show available judges with checkboxes for multi-select + availableJudges.forEach((judge, idx) => { + const label = generateJudgeLabel(judge); + const isSelected = selectedJudgeIdxs.has(idx); + const prefix = isSelected ? "[x]" : "[ ]"; + const thinkStr = judge.thinking ? "think" : "noThink"; + const tempStr = judge.temperature === 'default' ? 'tDef' : judge.temperature !== undefined ? 
`t${judge.temperature}` : ''; + issueItems.push({ + label: `${prefix} Judge: ${label} (${tempStr ? tempStr + ', ' : ''}${thinkStr})`, + value: `judge-${idx}`, + }); + }); + + issueItems.push({ label: "─────────────────────────────────────────", value: "sep-2" }); + + const selectedCount = selectedJudgeIdxs.size; + const judgeLabel = selectedCount === 1 + ? generateJudgeLabel(availableJudges[[...selectedJudgeIdxs][0]]) + : `${selectedCount} judges`; + issueItems.push({ + label: `βš–οΈ Run ${judgeLabel} (aggregate ${totalIssues} issues)`, + value: "run-judge", + }); + } else { + // No judges configured - show hint + issueItems.push({ + label: `⚠️ No judges configured. Set FALLACY_JUDGES or FALLACY_JUDGE env var`, + value: "no-judges", + }); + } } issueItems.push({ label: "← Back to Configure", value: "back" }); @@ -435,7 +529,24 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o } else if (item.value === "back") { setStep({ type: "configure-extractors" }); } else if (item.value === "run-judge") { - runJudge(result); + // Run all selected judges + const selectedConfigs = [...selectedJudgeIdxs].map(idx => availableJudges[idx]); + runMultipleJudges(result, selectedConfigs); + } else if (item.value.startsWith("judge-")) { + // Toggle multi-select + const idx = parseInt(item.value.replace("judge-", ""), 10); + setSelectedJudgeIdxs(prev => { + const next = new Set(prev); + if (next.has(idx)) { + // Don't allow deselecting the last one + if (next.size > 1) { + next.delete(idx); + } + } else { + next.add(idx); + } + return next; + }); } else if (item.value.startsWith("issue-")) { const [, extractorIdx, issueIdx] = item.value.split("-"); setStep({ @@ -496,13 +607,15 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o ); } - // Running judge + // Running judge(s) if (step.type === "running-judge") { const totalIssues = step.result.extractorResults.reduce((sum, r) => sum + r.issues.length, 0); + const judgeCount 
= step.judgeConfigs.length; + const judgeNames = step.judgeConfigs.map(c => generateJudgeLabel(c)).join(", "); return ( - Extractor Lab - Running Judge + Extractor Lab - Running {judgeCount > 1 ? `${judgeCount} Judges` : "Judge"} @@ -511,8 +624,9 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o - - The judge will deduplicate, merge, and filter issues + + The judge{judgeCount > 1 ? "s" : ""} will deduplicate, merge, and filter issues + {judgeCount > 1 && Running in parallel: {judgeNames}} ); @@ -520,7 +634,7 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o // Judge results if (step.type === "judge-results") { - const { result, judgeResult } = step; + const { result, judgeResult, judgeLabel } = step; const totalInputIssues = result.extractorResults.reduce((sum, r) => sum + r.issues.length, 0); // Create legend mapping extractor IDs to short keys (A, B, C, ...) @@ -569,8 +683,7 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o return ( - Extractor Lab - Judge Results: - {selectedDoc?.title} + Judge Results{judgeLabel ? 
`: ${judgeLabel}` : ""} @@ -602,6 +715,7 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o judgeResult, decision: judgeResult.acceptedDecisions[idx], isRejected: false, + judgeLabel: judgeLabel || "", }); } else if (item.value.startsWith("rejected-")) { const idx = parseInt(item.value.replace("rejected-", ""), 10); @@ -611,6 +725,7 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o judgeResult, decision: judgeResult.rejectedDecisions[idx], isRejected: true, + judgeLabel: judgeLabel || "", }); } }} @@ -689,5 +804,118 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o ); } + // Judge comparison view - comparing multiple judges + if (step.type === "judge-comparison") { + const { result, judgeResults } = step; + const totalInputIssues = result.extractorResults.reduce((sum, r) => sum + r.issues.length, 0); + + // Build comparison items + const comparisonItems: Array<{ label: string; value: string }> = []; + + // Header row + comparisonItems.push({ + label: `── Judge Comparison: ${judgeResults.length} judges, ${totalInputIssues} input issues ──`, + value: "header", + }); + + // Each judge row + judgeResults.forEach((jr, idx) => { + const status = jr.error ? 
"❌ Error" : `βœ… ${jr.result.summary.acceptedCount} accepted, ${jr.result.summary.mergedCount} merged, ${jr.result.summary.rejectedCount} rejected`; + const duration = `${(jr.durationMs / 1000).toFixed(1)}s`; + comparisonItems.push({ + label: `[${idx + 1}] ${jr.label.padEnd(30)} ${duration.padEnd(8)} ${status}`, + value: `judge-${idx}`, + }); + + // If error, show error details + if (jr.error) { + comparisonItems.push({ + label: ` Error: ${truncate(jr.error, termWidth - 20)}`, + value: `error-${idx}`, + }); + } + }); + + // Summary stats + comparisonItems.push({ + label: "────────────────────────────────────────────────────────────────────────────", + value: "sep-1", + }); + + // Agreement summary - find issues accepted by all judges + const successfulJudges = judgeResults.filter(jr => !jr.error); + if (successfulJudges.length > 1) { + // Get accepted issue texts from each judge for comparison + const acceptedByJudge = successfulJudges.map(jr => + new Set(jr.result.acceptedDecisions.map(d => d.finalText.toLowerCase().trim())) + ); + + // Find issues in ALL judges (intersection) + const unanimouslyAccepted = [...acceptedByJudge[0]].filter(text => + acceptedByJudge.every(set => set.has(text)) + ).length; + + // Find issues in ANY judge (union) + const allAccepted = new Set(acceptedByJudge.flatMap(set => [...set])).size; + + const agreementPct = allAccepted > 0 ? 
Math.round((unanimouslyAccepted / allAccepted) * 100) : 0; + + comparisonItems.push({ + label: `πŸ“Š Agreement: ${unanimouslyAccepted}/${allAccepted} issues accepted by all judges (${agreementPct}%)`, + value: "stats-1", + }); + } + + comparisonItems.push({ + label: "────────────────────────────────────────────────────────────────────────────", + value: "sep-2", + }); + comparisonItems.push({ label: "← Back to Extraction Results", value: "back" }); + + return ( + + + Extractor Lab - Judge Comparison: + {selectedDoc?.title} + + + + + Input: {totalInputIssues} issues from {result.extractorResults.length} extractors + | + Judges run: {judgeResults.length} + | + Successful: {judgeResults.filter(j => !j.error).length} + + + + !i.value.startsWith("sep-") && !i.value.startsWith("header") && !i.value.startsWith("stats-") && !i.value.startsWith("error-"))} + limit={maxItems - 5} + onSelect={(item) => { + if (item.value === "back") { + setStep({ type: "results", result }); + } else if (item.value.startsWith("judge-")) { + const idx = parseInt(item.value.replace("judge-", ""), 10); + const jr = judgeResults[idx]; + if (!jr.error) { + setStep({ + type: "judge-results", + result, + judgeResult: jr.result, + judgeLabel: jr.label, + }); + } + } + }} + /> + + + Enter=View Judge Details | Escape=Back to Results + + + ); + } + return null; } From ca24af47a4769b686076781fb382f545597214c0 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 11 Jan 2026 19:14:28 +0000 Subject: [PATCH 33/72] feat(meta-evals): Add deduplication step to Extractor Lab Add Phase 1.5 deduplication after judge results: - Remove exact text duplicates (case-insensitive) - Sort by priority score (severity*0.6 + importance*0.4) - Limit to max 25 issues - Show kept issues, duplicates removed, and limit-dropped items - Button on judge results screen to trigger dedup Co-Authored-By: Claude Opus 4.5 --- meta-evals/src/components/ExtractorLab.tsx | 151 ++++++++++++++++++++- 1 file changed, 149 insertions(+), 2 
deletions(-) diff --git a/meta-evals/src/components/ExtractorLab.tsx b/meta-evals/src/components/ExtractorLab.tsx index b027bfba..aa4768aa 100644 --- a/meta-evals/src/components/ExtractorLab.tsx +++ b/meta-evals/src/components/ExtractorLab.tsx @@ -59,6 +59,16 @@ interface JudgeRunResult { error?: string; } +/** Result from deduplication step */ +interface DedupResult { + /** Issues kept after dedup */ + kept: JudgeDecision[]; + /** Issues removed as duplicates */ + duplicates: JudgeDecision[]; + /** Issues removed due to limit */ + limitDropped: JudgeDecision[]; +} + type LabStep = | { type: "select-document" } | { type: "configure-extractors" } @@ -69,7 +79,8 @@ type LabStep = | { type: "running-judge"; result: MultiExtractorResult; judgeConfigs: JudgeConfig[] } | { type: "judge-comparison"; result: MultiExtractorResult; judgeResults: JudgeRunResult[] } | { type: "judge-results"; result: MultiExtractorResult; judgeResult: FallacyJudgeOutput; judgeLabel: string } - | { type: "judge-decision-detail"; result: MultiExtractorResult; judgeResult: FallacyJudgeOutput; decision: JudgeDecision; isRejected: boolean; judgeLabel: string }; + | { type: "judge-decision-detail"; result: MultiExtractorResult; judgeResult: FallacyJudgeOutput; decision: JudgeDecision; isRejected: boolean; judgeLabel: string } + | { type: "dedup-results"; result: MultiExtractorResult; judgeResult: FallacyJudgeOutput; judgeLabel: string; dedupResult: DedupResult }; // Load extractor configs from FALLACY_EXTRACTORS env var, fallback to default function getInitialExtractorConfigs(): ExtractorConfig[] { @@ -247,6 +258,51 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o } } + // Deduplication: remove duplicates, sort by priority, limit count + // Mirrors the pipeline's Phase 1.5 deduplication + const MAX_ISSUES = 25; + + function runDeduplication( + extractionResult: MultiExtractorResult, + judgeResult: FallacyJudgeOutput, + judgeLabel: string + ) { + const decisions = 
judgeResult.acceptedDecisions; + + // Step 1: Remove exact text duplicates (case-insensitive, whitespace normalized) + const seen = new Set(); + const unique: JudgeDecision[] = []; + const duplicates: JudgeDecision[] = []; + + for (const decision of decisions) { + const key = decision.finalText.toLowerCase().replace(/\s+/g, " ").trim(); + if (!seen.has(key)) { + seen.add(key); + unique.push(decision); + } else { + duplicates.push(decision); + } + } + + // Step 2: Calculate priority score and sort (higher = more important) + const priorityScore = (d: JudgeDecision) => + d.finalSeverity * 0.6 + d.finalImportance * 0.4; + + const sorted = [...unique].sort((a, b) => priorityScore(b) - priorityScore(a)); + + // Step 3: Limit to MAX_ISSUES + const kept = sorted.slice(0, MAX_ISSUES); + const limitDropped = sorted.slice(MAX_ISSUES); + + setStep({ + type: "dedup-results", + result: extractionResult, + judgeResult, + judgeLabel, + dedupResult: { kept, duplicates, limitDropped }, + }); + } + // Handle keyboard input - use ref to avoid stale closure useInput((input, key) => { if (key.escape) { @@ -259,6 +315,8 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o setStep({ type: "results", result: currentStep.result }); } else if (currentStep.type === "judge-comparison") { setStep({ type: "results", result: currentStep.result }); + } else if (currentStep.type === "dedup-results") { + setStep({ type: "judge-results", result: currentStep.result, judgeResult: currentStep.judgeResult, judgeLabel: currentStep.judgeLabel }); } else if (currentStep.type === "results") { setStep({ type: "configure-extractors" }); } else if (currentStep.type === "add-extractor") { @@ -674,7 +732,8 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o }); decisionItems.push({ label: "───────────────────────────────────────────────────────────────────────────────────────", value: "sep-1" }); - decisionItems.push({ label: "Back to Extraction 
Results", value: "back" }); + decisionItems.push({ label: `β–Ά Run Deduplication (${judgeResult.acceptedDecisions.length} issues)`, value: "run-dedup" }); + decisionItems.push({ label: "← Back to Extraction Results", value: "back" }); // Build legend string const legendParts = extractorIds.map((id, i) => `${String.fromCharCode(65 + i)}=${id}`); @@ -707,6 +766,8 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o return; // Ignore separators } else if (item.value === "back") { setStep({ type: "results", result }); + } else if (item.value === "run-dedup") { + runDeduplication(result, judgeResult, judgeLabel || ""); } else if (item.value.startsWith("accepted-")) { const idx = parseInt(item.value.replace("accepted-", ""), 10); setStep({ @@ -917,5 +978,91 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o ); } + // Deduplication results view + if (step.type === "dedup-results") { + const { result, judgeResult, judgeLabel, dedupResult } = step; + const { kept, duplicates, limitDropped } = dedupResult; + const totalInput = judgeResult.acceptedDecisions.length; + + // Calculate priority score for display + const priorityScore = (d: JudgeDecision) => + d.finalSeverity * 0.6 + d.finalImportance * 0.4; + + // Build list items + const dedupItems: Array<{ label: string; value: string }> = []; + + // Kept issues (sorted by priority) + dedupItems.push({ label: `── Kept (${kept.length}) ──`, value: "header-kept" }); + kept.forEach((d, idx) => { + const score = priorityScore(d).toFixed(0); + const text = truncate(d.finalText.replace(/\n/g, ' '), issueTextWidth); + dedupItems.push({ + label: ` [${score}] ${d.finalIssueType.padEnd(18)} ${text}`, + value: `kept-${idx}`, + }); + }); + + // Duplicates removed + if (duplicates.length > 0) { + dedupItems.push({ label: `── Duplicates Removed (${duplicates.length}) ──`, value: "header-dup" }); + duplicates.forEach((d, idx) => { + const text = 
truncate(d.finalText.replace(/\n/g, ' '), issueTextWidth); + dedupItems.push({ + label: ` [dup] ${d.finalIssueType.padEnd(18)} ${text}`, + value: `dup-${idx}`, + }); + }); + } + + // Limit dropped + if (limitDropped.length > 0) { + dedupItems.push({ label: `── Dropped by Limit (${limitDropped.length}) ──`, value: "header-limit" }); + limitDropped.forEach((d, idx) => { + const score = priorityScore(d).toFixed(0); + const text = truncate(d.finalText.replace(/\n/g, ' '), issueTextWidth); + dedupItems.push({ + label: ` [${score}] ${d.finalIssueType.padEnd(18)} ${text}`, + value: `limit-${idx}`, + }); + }); + } + + dedupItems.push({ label: "───────────────────────────────────────────────────────────────────────────", value: "sep-1" }); + dedupItems.push({ label: "← Back to Judge Results", value: "back" }); + + return ( + + + Deduplication Results + + + + + Input: {totalInput} issues + β†’ + {kept.length} kept + {duplicates.length > 0 && | {duplicates.length} duplicates} + {limitDropped.length > 0 && | {limitDropped.length} over limit} + + + + !i.value.startsWith("header-") && !i.value.startsWith("sep-"))} + limit={maxItems - 5} + onSelect={(item) => { + if (item.value === "back") { + setStep({ type: "judge-results", result, judgeResult, judgeLabel }); + } + // Could add detail view for individual items if needed + }} + /> + + + [score] = priority (sev*0.6 + imp*0.4) | Escape=Back + + + ); + } + return null; } From 24d419c1b03bca758d801c69f085f06adedf8e96 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 11 Jan 2026 19:29:50 +0000 Subject: [PATCH 34/72] fix(meta-evals): Show error messages in judge comparison + increase max_tokens - Increase OpenRouter max_tokens from 16000 to 32000 for large issue sets - Show error details in comparison view (was being filtered out) - Error lines are displayed but not clickable Co-Authored-By: Claude Opus 4.5 --- internal-packages/ai/src/tools/fallacy-judge/index.ts | 4 ++-- meta-evals/src/components/ExtractorLab.tsx | 5 ++++- 2 
files changed, 6 insertions(+), 3 deletions(-) diff --git a/internal-packages/ai/src/tools/fallacy-judge/index.ts b/internal-packages/ai/src/tools/fallacy-judge/index.ts index e9a4af95..cc672145 100644 --- a/internal-packages/ai/src/tools/fallacy-judge/index.ts +++ b/internal-packages/ai/src/tools/fallacy-judge/index.ts @@ -450,12 +450,12 @@ Group similar issues together and provide your decisions. Remember: if (useOpenRouter) { // Use OpenRouter for non-Claude models - // Use 16000 max_tokens to handle large outputs with many issues + // Use 32000 max_tokens to handle large outputs with many issues (esp. with thinking) result = await callOpenRouterWithTool({ model: judgeConfig.model, system: systemPrompt, messages: [{ role: 'user', content: userPrompt }], - max_tokens: 16000, + max_tokens: 32000, ...(temperature !== undefined && { temperature }), toolName: 'aggregate_fallacy_issues', toolDescription: 'Aggregate and deduplicate fallacy issues from multiple extractors', diff --git a/meta-evals/src/components/ExtractorLab.tsx b/meta-evals/src/components/ExtractorLab.tsx index aa4768aa..d1e2296a 100644 --- a/meta-evals/src/components/ExtractorLab.tsx +++ b/meta-evals/src/components/ExtractorLab.tsx @@ -951,11 +951,14 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o !i.value.startsWith("sep-") && !i.value.startsWith("header") && !i.value.startsWith("stats-") && !i.value.startsWith("error-"))} + items={comparisonItems.filter(i => !i.value.startsWith("sep-") && !i.value.startsWith("header") && !i.value.startsWith("stats-"))} limit={maxItems - 5} onSelect={(item) => { if (item.value === "back") { setStep({ type: "results", result }); + } else if (item.value.startsWith("error-")) { + // Error lines are not clickable, just informational + return; } else if (item.value.startsWith("judge-")) { const idx = parseInt(item.value.replace("judge-", ""), 10); const jr = judgeResults[idx]; From aa6f5802b7ab5a609293b0d703ce4e8c45983cb4 Mon Sep 17 
00:00:00 2001 From: Michael Ravits Date: Sun, 11 Jan 2026 20:33:46 +0000 Subject: [PATCH 35/72] feat(meta-evals): Add pre-judge deduplication step MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run mechanical deduplication before sending issues to the judge to: - Reduce token usage and avoid timeouts with large issue sets - Show users what duplicates are being removed - Clean separation: dedup summary is static, judge selection is interactive Flow: Extraction Results β†’ Pre-Judge Dedup β†’ Select Judges β†’ Run β†’ Results Co-Authored-By: Claude Opus 4.5 --- meta-evals/src/components/ExtractorLab.tsx | 434 ++++++++++----------- 1 file changed, 216 insertions(+), 218 deletions(-) diff --git a/meta-evals/src/components/ExtractorLab.tsx b/meta-evals/src/components/ExtractorLab.tsx index d1e2296a..a9f840e0 100644 --- a/meta-evals/src/components/ExtractorLab.tsx +++ b/meta-evals/src/components/ExtractorLab.tsx @@ -59,14 +59,26 @@ interface JudgeRunResult { error?: string; } -/** Result from deduplication step */ -interface DedupResult { - /** Issues kept after dedup */ - kept: JudgeDecision[]; - /** Issues removed as duplicates */ - duplicates: JudgeDecision[]; - /** Issues removed due to limit */ - limitDropped: JudgeDecision[]; +/** Issue with extractor source info for pre-judge dedup */ +interface ExtractorIssue { + extractorId: string; + exactText: string; + issueType: string; + fallacyType?: string; + severityScore: number; + confidenceScore: number; + importanceScore: number; + reasoning: string; +} + +/** Result from pre-judge deduplication */ +interface PreJudgeDedupResult { + /** Unique issues to send to judge */ + unique: ExtractorIssue[]; + /** Duplicate issues removed */ + duplicates: ExtractorIssue[]; + /** Original total count */ + originalCount: number; } type LabStep = @@ -76,11 +88,11 @@ type LabStep = | { type: "running" } | { type: "results"; result: MultiExtractorResult } | { type: "issue-detail"; result: 
MultiExtractorResult; extractorIdx: number; issueIdx: number } - | { type: "running-judge"; result: MultiExtractorResult; judgeConfigs: JudgeConfig[] } + | { type: "pre-judge-dedup"; result: MultiExtractorResult; dedupResult: PreJudgeDedupResult } + | { type: "running-judge"; result: MultiExtractorResult; dedupResult: PreJudgeDedupResult; judgeConfigs: JudgeConfig[] } | { type: "judge-comparison"; result: MultiExtractorResult; judgeResults: JudgeRunResult[] } - | { type: "judge-results"; result: MultiExtractorResult; judgeResult: FallacyJudgeOutput; judgeLabel: string } - | { type: "judge-decision-detail"; result: MultiExtractorResult; judgeResult: FallacyJudgeOutput; decision: JudgeDecision; isRejected: boolean; judgeLabel: string } - | { type: "dedup-results"; result: MultiExtractorResult; judgeResult: FallacyJudgeOutput; judgeLabel: string; dedupResult: DedupResult }; + | { type: "judge-results"; result: MultiExtractorResult; judgeResult: FallacyJudgeOutput; judgeLabel: string; judgeResults?: JudgeRunResult[] } + | { type: "judge-decision-detail"; result: MultiExtractorResult; judgeResult: FallacyJudgeOutput; decision: JudgeDecision; isRejected: boolean; judgeLabel: string; judgeResults?: JudgeRunResult[] }; // Load extractor configs from FALLACY_EXTRACTORS env var, fallback to default function getInitialExtractorConfigs(): ExtractorConfig[] { @@ -173,21 +185,12 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o } } - async function runJudge(extractionResult: MultiExtractorResult, judgeConfig?: JudgeConfig, judgeLabel?: string): Promise { - // Flatten all issues from all extractors - const allIssues = extractionResult.extractorResults.flatMap((r) => - r.issues.map((issue) => ({ - extractorId: r.extractorId, - exactText: issue.exactText, - issueType: issue.issueType, - fallacyType: issue.fallacyType, - severityScore: issue.severityScore, - confidenceScore: issue.confidenceScore, - importanceScore: issue.importanceScore, - 
reasoning: issue.reasoning, - })) - ); - + async function runJudge( + extractionResult: MultiExtractorResult, + dedupIssues: ExtractorIssue[], + judgeConfig?: JudgeConfig, + judgeLabel?: string + ): Promise { const extractorIds = extractionResult.extractorResults .filter((r) => !r.error) .map((r) => r.extractorId); @@ -199,7 +202,7 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o const judgeResult = await fallacyJudgeTool.execute( { documentText, - issues: allIssues, + issues: dedupIssues, extractorIds, judgeConfig, }, @@ -219,7 +222,7 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o result: { acceptedDecisions: [], rejectedDecisions: [], - summary: { totalInputIssues: allIssues.length, uniqueGroups: 0, acceptedCount: 0, mergedCount: 0, rejectedCount: 0 }, + summary: { totalInputIssues: dedupIssues.length, uniqueGroups: 0, acceptedCount: 0, mergedCount: 0, rejectedCount: 0 }, }, durationMs: Date.now() - startTime, error: String(e), @@ -227,13 +230,17 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o } } - async function runMultipleJudges(extractionResult: MultiExtractorResult, judgeConfigs: JudgeConfig[]) { - setStep({ type: "running-judge", result: extractionResult, judgeConfigs }); + async function runMultipleJudges( + extractionResult: MultiExtractorResult, + dedupResult: PreJudgeDedupResult, + judgeConfigs: JudgeConfig[] + ) { + setStep({ type: "running-judge", result: extractionResult, dedupResult, judgeConfigs }); try { - // Run all judges in parallel + // Run all judges in parallel using deduplicated issues const judgePromises = judgeConfigs.map(config => - runJudge(extractionResult, config, generateJudgeLabel(config)) + runJudge(extractionResult, dedupResult.unique, config, generateJudgeLabel(config)) ); const judgeResults = await Promise.all(judgePromises); @@ -258,49 +265,52 @@ export function ExtractorLab({ height, maxItems, documents, 
onSearchDocuments, o } } - // Deduplication: remove duplicates, sort by priority, limit count - // Mirrors the pipeline's Phase 1.5 deduplication - const MAX_ISSUES = 25; - - function runDeduplication( - extractionResult: MultiExtractorResult, - judgeResult: FallacyJudgeOutput, - judgeLabel: string - ) { - const decisions = judgeResult.acceptedDecisions; + // Pre-judge deduplication: remove duplicate issues before sending to judge + function runPreJudgeDedup(extractionResult: MultiExtractorResult, navigate = true): PreJudgeDedupResult { + // Flatten all issues from all extractors + const allIssues: ExtractorIssue[] = extractionResult.extractorResults.flatMap((r) => + r.issues.map((issue) => ({ + extractorId: r.extractorId, + exactText: issue.exactText, + issueType: issue.issueType, + fallacyType: issue.fallacyType, + severityScore: issue.severityScore, + confidenceScore: issue.confidenceScore, + importanceScore: issue.importanceScore, + reasoning: issue.reasoning, + })) + ); - // Step 1: Remove exact text duplicates (case-insensitive, whitespace normalized) + // Remove exact text duplicates (case-insensitive, whitespace normalized) const seen = new Set(); - const unique: JudgeDecision[] = []; - const duplicates: JudgeDecision[] = []; + const unique: ExtractorIssue[] = []; + const duplicates: ExtractorIssue[] = []; - for (const decision of decisions) { - const key = decision.finalText.toLowerCase().replace(/\s+/g, " ").trim(); + for (const issue of allIssues) { + const key = issue.exactText.toLowerCase().replace(/\s+/g, " ").trim(); if (!seen.has(key)) { seen.add(key); - unique.push(decision); + unique.push(issue); } else { - duplicates.push(decision); + duplicates.push(issue); } } - // Step 2: Calculate priority score and sort (higher = more important) - const priorityScore = (d: JudgeDecision) => - d.finalSeverity * 0.6 + d.finalImportance * 0.4; - - const sorted = [...unique].sort((a, b) => priorityScore(b) - priorityScore(a)); + const dedupResult: 
PreJudgeDedupResult = { + unique, + duplicates, + originalCount: allIssues.length, + }; - // Step 3: Limit to MAX_ISSUES - const kept = sorted.slice(0, MAX_ISSUES); - const limitDropped = sorted.slice(MAX_ISSUES); + if (navigate) { + setStep({ + type: "pre-judge-dedup", + result: extractionResult, + dedupResult, + }); + } - setStep({ - type: "dedup-results", - result: extractionResult, - judgeResult, - judgeLabel, - dedupResult: { kept, duplicates, limitDropped }, - }); + return dedupResult; } // Handle keyboard input - use ref to avoid stale closure @@ -310,13 +320,18 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o if (currentStep.type === "issue-detail") { setStep({ type: "results", result: currentStep.result }); } else if (currentStep.type === "judge-decision-detail") { - setStep({ type: "judge-results", result: currentStep.result, judgeResult: currentStep.judgeResult, judgeLabel: currentStep.judgeLabel }); + setStep({ type: "judge-results", result: currentStep.result, judgeResult: currentStep.judgeResult, judgeLabel: currentStep.judgeLabel, judgeResults: currentStep.judgeResults }); } else if (currentStep.type === "judge-results") { - setStep({ type: "results", result: currentStep.result }); + // Go back to comparison if we came from there, otherwise to extraction results + if (currentStep.judgeResults) { + setStep({ type: "judge-comparison", result: currentStep.result, judgeResults: currentStep.judgeResults }); + } else { + setStep({ type: "results", result: currentStep.result }); + } } else if (currentStep.type === "judge-comparison") { setStep({ type: "results", result: currentStep.result }); - } else if (currentStep.type === "dedup-results") { - setStep({ type: "judge-results", result: currentStep.result, judgeResult: currentStep.judgeResult, judgeLabel: currentStep.judgeLabel }); + } else if (currentStep.type === "pre-judge-dedup") { + setStep({ type: "results", result: currentStep.result }); } else if (currentStep.type 
=== "results") { setStep({ type: "configure-extractors" }); } else if (currentStep.type === "add-extractor") { @@ -524,39 +539,12 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o // Actions at the bottom issueItems.push({ label: "───────────────────────────────────────────────────────────────────────────", value: "sep-1" }); - // Judge selection (only if we have multiple extractors with issues) - if (hasMultipleExtractors && totalIssues > 0) { - if (availableJudges.length > 0) { - // Show available judges with checkboxes for multi-select - availableJudges.forEach((judge, idx) => { - const label = generateJudgeLabel(judge); - const isSelected = selectedJudgeIdxs.has(idx); - const prefix = isSelected ? "[x]" : "[ ]"; - const thinkStr = judge.thinking ? "think" : "noThink"; - const tempStr = judge.temperature === 'default' ? 'tDef' : judge.temperature !== undefined ? `t${judge.temperature}` : ''; - issueItems.push({ - label: `${prefix} Judge: ${label} (${tempStr ? tempStr + ', ' : ''}${thinkStr})`, - value: `judge-${idx}`, - }); - }); - - issueItems.push({ label: "─────────────────────────────────────────", value: "sep-2" }); - - const selectedCount = selectedJudgeIdxs.size; - const judgeLabel = selectedCount === 1 - ? generateJudgeLabel(availableJudges[[...selectedJudgeIdxs][0]]) - : `${selectedCount} judges`; - issueItems.push({ - label: `βš–οΈ Run ${judgeLabel} (aggregate ${totalIssues} issues)`, - value: "run-judge", - }); - } else { - // No judges configured - show hint - issueItems.push({ - label: `⚠️ No judges configured. 
Set FALLACY_JUDGES or FALLACY_JUDGE env var`, - value: "no-judges", - }); - } + // Deduplicate button (only if we have issues) + if (totalIssues > 0) { + issueItems.push({ + label: `β–Ά Deduplicate & Prepare for Judge (${totalIssues} issues)`, + value: "run-dedup", + }); } issueItems.push({ label: "← Back to Configure", value: "back" }); @@ -586,25 +574,8 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o return; } else if (item.value === "back") { setStep({ type: "configure-extractors" }); - } else if (item.value === "run-judge") { - // Run all selected judges - const selectedConfigs = [...selectedJudgeIdxs].map(idx => availableJudges[idx]); - runMultipleJudges(result, selectedConfigs); - } else if (item.value.startsWith("judge-")) { - // Toggle multi-select - const idx = parseInt(item.value.replace("judge-", ""), 10); - setSelectedJudgeIdxs(prev => { - const next = new Set(prev); - if (next.has(idx)) { - // Don't allow deselecting the last one - if (next.size > 1) { - next.delete(idx); - } - } else { - next.add(idx); - } - return next; - }); + } else if (item.value === "run-dedup") { + runPreJudgeDedup(result); } else if (item.value.startsWith("issue-")) { const [, extractorIdx, issueIdx] = item.value.split("-"); setStep({ @@ -665,11 +636,117 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o ); } + // Pre-judge deduplication results + if (step.type === "pre-judge-dedup") { + const { result, dedupResult } = step; + const { unique, duplicates, originalCount } = dedupResult; + + // Build judge selection items only + const judgeItems: Array<{ label: string; value: string }> = []; + + if (availableJudges.length > 0) { + availableJudges.forEach((judge, idx) => { + const label = generateJudgeLabel(judge); + const isSelected = selectedJudgeIdxs.has(idx); + const prefix = isSelected ? "[x]" : "[ ]"; + const thinkStr = judge.thinking ? "think" : "noThink"; + const tempStr = judge.temperature === 'default' ? 
'tDef' : judge.temperature !== undefined ? `t${judge.temperature}` : ''; + judgeItems.push({ + label: `${prefix} Judge: ${label} (${tempStr ? tempStr + ', ' : ''}${thinkStr})`, + value: `judge-${idx}`, + }); + }); + + const selectedCount = selectedJudgeIdxs.size; + const judgeLabel = selectedCount === 1 + ? generateJudgeLabel(availableJudges[[...selectedJudgeIdxs][0]]) + : `${selectedCount} judges`; + judgeItems.push({ + label: `βš–οΈ Run ${judgeLabel} (aggregate ${unique.length} issues)`, + value: "run-judge", + }); + } else { + judgeItems.push({ + label: `⚠️ No judges configured. Set FALLACY_JUDGES or FALLACY_JUDGE env var`, + value: "no-judges", + }); + } + + judgeItems.push({ label: "← Back to Extraction Results", value: "back" }); + + return ( + + + Pre-Judge Deduplication + + + {/* Summary stats */} + + + Original: {originalCount} + β†’ + {unique.length} unique + {duplicates.length > 0 && | {duplicates.length} duplicates removed} + + + + {/* Duplicates list (if any) */} + {duplicates.length > 0 && ( + + Duplicates removed: + {duplicates.slice(0, 3).map((d, idx) => ( + + {" "}[{d.extractorId}] {truncate(d.exactText.replace(/\n/g, ' '), issueTextWidth - 20)} + + ))} + {duplicates.length > 3 && ... 
and {duplicates.length - 3} more} + + )} + + {/* Judge selection */} + + Select Judges: + + + { + if (item.value === "back") { + setStep({ type: "results", result }); + } else if (item.value === "run-judge") { + const selectedConfigs = [...selectedJudgeIdxs].map(idx => availableJudges[idx]); + runMultipleJudges(result, dedupResult, selectedConfigs); + } else if (item.value.startsWith("judge-")) { + // Toggle multi-select + const idx = parseInt(item.value.replace("judge-", ""), 10); + setSelectedJudgeIdxs(prev => { + const next = new Set(prev); + if (next.has(idx)) { + if (next.size > 1) { + next.delete(idx); + } + } else { + next.add(idx); + } + return next; + }); + } + }} + /> + + + Toggle judges with Enter | Escape=Back + + + ); + } + // Running judge(s) if (step.type === "running-judge") { - const totalIssues = step.result.extractorResults.reduce((sum, r) => sum + r.issues.length, 0); - const judgeCount = step.judgeConfigs.length; - const judgeNames = step.judgeConfigs.map(c => generateJudgeLabel(c)).join(", "); + const { dedupResult, judgeConfigs } = step; + const judgeCount = judgeConfigs.length; + const judgeNames = judgeConfigs.map(c => generateJudgeLabel(c)).join(", "); return ( @@ -678,12 +755,12 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o - Aggregating {totalIssues} issues from {step.result.extractorResults.length} extractors... + Aggregating {dedupResult.unique.length} issues (from {dedupResult.originalCount} original)... - The judge{judgeCount > 1 ? "s" : ""} will deduplicate, merge, and filter issues + The judge{judgeCount > 1 ? 
"s" : ""} will merge and filter issues {judgeCount > 1 && Running in parallel: {judgeNames}} @@ -692,7 +769,7 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o // Judge results if (step.type === "judge-results") { - const { result, judgeResult, judgeLabel } = step; + const { result, judgeResult, judgeLabel, judgeResults } = step; const totalInputIssues = result.extractorResults.reduce((sum, r) => sum + r.issues.length, 0); // Create legend mapping extractor IDs to short keys (A, B, C, ...) @@ -732,8 +809,7 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o }); decisionItems.push({ label: "───────────────────────────────────────────────────────────────────────────────────────", value: "sep-1" }); - decisionItems.push({ label: `β–Ά Run Deduplication (${judgeResult.acceptedDecisions.length} issues)`, value: "run-dedup" }); - decisionItems.push({ label: "← Back to Extraction Results", value: "back" }); + decisionItems.push({ label: "← Back", value: "back" }); // Build legend string const legendParts = extractorIds.map((id, i) => `${String.fromCharCode(65 + i)}=${id}`); @@ -765,9 +841,14 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o if (item.value.startsWith("sep-")) { return; // Ignore separators } else if (item.value === "back") { - setStep({ type: "results", result }); - } else if (item.value === "run-dedup") { - runDeduplication(result, judgeResult, judgeLabel || ""); + // Go back to comparison if we came from there, otherwise to pre-judge dedup + if (judgeResults) { + setStep({ type: "judge-comparison", result, judgeResults }); + } else { + // Go back to pre-judge-dedup view (don't auto-navigate, just get result) + const dedupResult = runPreJudgeDedup(result, false); + setStep({ type: "pre-judge-dedup", result, dedupResult }); + } } else if (item.value.startsWith("accepted-")) { const idx = parseInt(item.value.replace("accepted-", ""), 10); setStep({ @@ -777,6 
+858,7 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o decision: judgeResult.acceptedDecisions[idx], isRejected: false, judgeLabel: judgeLabel || "", + judgeResults, }); } else if (item.value.startsWith("rejected-")) { const idx = parseInt(item.value.replace("rejected-", ""), 10); @@ -787,6 +869,7 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o decision: judgeResult.rejectedDecisions[idx], isRejected: true, judgeLabel: judgeLabel || "", + judgeResults, }); } }} @@ -968,6 +1051,7 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o result, judgeResult: jr.result, judgeLabel: jr.label, + judgeResults, // Pass so we can navigate back to comparison }); } } @@ -981,91 +1065,5 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o ); } - // Deduplication results view - if (step.type === "dedup-results") { - const { result, judgeResult, judgeLabel, dedupResult } = step; - const { kept, duplicates, limitDropped } = dedupResult; - const totalInput = judgeResult.acceptedDecisions.length; - - // Calculate priority score for display - const priorityScore = (d: JudgeDecision) => - d.finalSeverity * 0.6 + d.finalImportance * 0.4; - - // Build list items - const dedupItems: Array<{ label: string; value: string }> = []; - - // Kept issues (sorted by priority) - dedupItems.push({ label: `── Kept (${kept.length}) ──`, value: "header-kept" }); - kept.forEach((d, idx) => { - const score = priorityScore(d).toFixed(0); - const text = truncate(d.finalText.replace(/\n/g, ' '), issueTextWidth); - dedupItems.push({ - label: ` [${score}] ${d.finalIssueType.padEnd(18)} ${text}`, - value: `kept-${idx}`, - }); - }); - - // Duplicates removed - if (duplicates.length > 0) { - dedupItems.push({ label: `── Duplicates Removed (${duplicates.length}) ──`, value: "header-dup" }); - duplicates.forEach((d, idx) => { - const text = truncate(d.finalText.replace(/\n/g, ' '), 
issueTextWidth); - dedupItems.push({ - label: ` [dup] ${d.finalIssueType.padEnd(18)} ${text}`, - value: `dup-${idx}`, - }); - }); - } - - // Limit dropped - if (limitDropped.length > 0) { - dedupItems.push({ label: `── Dropped by Limit (${limitDropped.length}) ──`, value: "header-limit" }); - limitDropped.forEach((d, idx) => { - const score = priorityScore(d).toFixed(0); - const text = truncate(d.finalText.replace(/\n/g, ' '), issueTextWidth); - dedupItems.push({ - label: ` [${score}] ${d.finalIssueType.padEnd(18)} ${text}`, - value: `limit-${idx}`, - }); - }); - } - - dedupItems.push({ label: "───────────────────────────────────────────────────────────────────────────", value: "sep-1" }); - dedupItems.push({ label: "← Back to Judge Results", value: "back" }); - - return ( - - - Deduplication Results - - - - - Input: {totalInput} issues - β†’ - {kept.length} kept - {duplicates.length > 0 && | {duplicates.length} duplicates} - {limitDropped.length > 0 && | {limitDropped.length} over limit} - - - - !i.value.startsWith("header-") && !i.value.startsWith("sep-"))} - limit={maxItems - 5} - onSelect={(item) => { - if (item.value === "back") { - setStep({ type: "judge-results", result, judgeResult, judgeLabel }); - } - // Could add detail view for individual items if needed - }} - /> - - - [score] = priority (sev*0.6 + imp*0.4) | Escape=Back - - - ); - } - return null; } From 4d107932927716f4e74d34786ff27e7d55961356 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 11 Jan 2026 20:41:37 +0000 Subject: [PATCH 36/72] refactor(meta-evals): Split ExtractorLab into smaller modules Restructure the 1069-line ExtractorLab.tsx into focused modules: - ExtractorLab.tsx: 472 lines (main component, state, actions) - extractor-lab/types.ts: Type definitions - extractor-lab/utils.ts: Helper functions (truncate, dedup, etc.) 
- extractor-lab/views/: 10 view components (16-133 lines each) Each view is now a self-contained component: - ErrorView, RunningView, RunningJudgeView - ConfigureExtractorsView, ResultsView, IssueDetailView - PreJudgeDedupView, JudgeResultsView, JudgeDecisionDetailView - JudgeComparisonView Co-Authored-By: Claude Opus 4.5 --- meta-evals/src/components/ExtractorLab.tsx | 1015 ++++------------- .../src/components/extractor-lab/index.ts | 3 + .../src/components/extractor-lab/types.ts | 78 ++ .../src/components/extractor-lab/utils.ts | 95 ++ .../views/ConfigureExtractorsView.tsx | 86 ++ .../extractor-lab/views/ErrorView.tsx | 16 + .../extractor-lab/views/IssueDetailView.tsx | 49 + .../views/JudgeComparisonView.tsx | 133 +++ .../views/JudgeDecisionDetailView.tsx | 72 ++ .../extractor-lab/views/JudgeResultsView.tsx | 116 ++ .../extractor-lab/views/PreJudgeDedupView.tsx | 123 ++ .../extractor-lab/views/ResultsView.tsx | 102 ++ .../extractor-lab/views/RunningJudgeView.tsx | 35 + .../extractor-lab/views/RunningView.tsx | 28 + .../components/extractor-lab/views/index.ts | 10 + 15 files changed, 1155 insertions(+), 806 deletions(-) create mode 100644 meta-evals/src/components/extractor-lab/index.ts create mode 100644 meta-evals/src/components/extractor-lab/types.ts create mode 100644 meta-evals/src/components/extractor-lab/utils.ts create mode 100644 meta-evals/src/components/extractor-lab/views/ConfigureExtractorsView.tsx create mode 100644 meta-evals/src/components/extractor-lab/views/ErrorView.tsx create mode 100644 meta-evals/src/components/extractor-lab/views/IssueDetailView.tsx create mode 100644 meta-evals/src/components/extractor-lab/views/JudgeComparisonView.tsx create mode 100644 meta-evals/src/components/extractor-lab/views/JudgeDecisionDetailView.tsx create mode 100644 meta-evals/src/components/extractor-lab/views/JudgeResultsView.tsx create mode 100644 meta-evals/src/components/extractor-lab/views/PreJudgeDedupView.tsx create mode 100644 
meta-evals/src/components/extractor-lab/views/ResultsView.tsx create mode 100644 meta-evals/src/components/extractor-lab/views/RunningJudgeView.tsx create mode 100644 meta-evals/src/components/extractor-lab/views/RunningView.tsx create mode 100644 meta-evals/src/components/extractor-lab/views/index.ts diff --git a/meta-evals/src/components/ExtractorLab.tsx b/meta-evals/src/components/ExtractorLab.tsx index a9f840e0..56bf2629 100644 --- a/meta-evals/src/components/ExtractorLab.tsx +++ b/meta-evals/src/components/ExtractorLab.tsx @@ -5,95 +5,55 @@ * for quick iteration on extractor config and prompts. */ -import React, { useState, useEffect, useRef } from "react"; -import { Box, Text, useInput, useStdout } from "ink"; -import SelectInput from "ink-select-input"; -import Spinner from "ink-spinner"; +import React, { useState, useRef } from "react"; +import { useInput, useStdout } from "ink"; import { prisma, type DocumentChoice } from "@roast/db"; import { getMultiExtractorConfig, type ExtractorConfig, type MultiExtractorResult, - type ExtractorResult, } from "@roast/ai/fallacy-extraction/lab"; import { runMultiExtractor } from "@roast/ai/fallacy-extraction"; import fallacyJudgeModule from "@roast/ai/fallacy-judge"; -// CommonJS/ESM interop: default export is wrapped, named exports need unwrapping too -const fallacyJudgeTool = (fallacyJudgeModule as unknown as { default?: typeof fallacyJudgeModule }).default ?? 
fallacyJudgeModule; -const { getJudgesConfig, generateJudgeLabel } = fallacyJudgeModule as unknown as { - getJudgesConfig: () => import("@roast/ai/fallacy-judge/types").JudgeConfig[]; - generateJudgeLabel: (config: import("@roast/ai/fallacy-judge/types").JudgeConfig) => string; -}; -import type { FallacyJudgeOutput, JudgeDecision, JudgeConfig } from "@roast/ai/fallacy-judge/types"; +import type { JudgeConfig } from "@roast/ai/fallacy-judge/types"; import { ModelSelector } from "./ModelSelector"; import { DocumentSelector } from "./DocumentSelector"; -/** Truncate string to fit terminal width */ -function truncate(str: string, maxLen: number): string { - if (str.length <= maxLen) return str; - return str.slice(0, maxLen - 1) + "…"; -} - -// Simple logger for the judge tool -const simpleLogger = { - info: (...args: unknown[]) => console.error("[INFO]", ...args), - warn: (...args: unknown[]) => console.error("[WARN]", ...args), - error: (...args: unknown[]) => console.error("[ERROR]", ...args), - debug: (...args: unknown[]) => {}, +// Import extracted modules +import type { + ExtractorLabProps, + LabStep, + JudgeRunResult, + PreJudgeDedupResult, + ExtractorIssue, +} from "./extractor-lab/types"; +import { + truncate, + simpleLogger, + TEMP_PRESETS, + calculateTextWidths, + runPreJudgeDedup as runPreJudgeDedupUtil, +} from "./extractor-lab/utils"; +import { + ErrorView, + RunningView, + RunningJudgeView, + ConfigureExtractorsView, + IssueDetailView, + ResultsView, + PreJudgeDedupView, + JudgeResultsView, + JudgeDecisionDetailView, + JudgeComparisonView, +} from "./extractor-lab/views"; + +// CommonJS/ESM interop +const fallacyJudgeTool = (fallacyJudgeModule as unknown as { default?: typeof fallacyJudgeModule }).default ?? 
fallacyJudgeModule; +const { getJudgesConfig, generateJudgeLabel } = fallacyJudgeModule as unknown as { + getJudgesConfig: () => JudgeConfig[]; + generateJudgeLabel: (config: JudgeConfig) => string; }; -interface ExtractorLabProps { - height: number; - maxItems: number; - documents: DocumentChoice[]; - onSearchDocuments: (filter: string) => void; - onBack: () => void; -} - -/** Result from a single judge run with its config */ -interface JudgeRunResult { - config: JudgeConfig; - label: string; - result: FallacyJudgeOutput; - durationMs: number; - error?: string; -} - -/** Issue with extractor source info for pre-judge dedup */ -interface ExtractorIssue { - extractorId: string; - exactText: string; - issueType: string; - fallacyType?: string; - severityScore: number; - confidenceScore: number; - importanceScore: number; - reasoning: string; -} - -/** Result from pre-judge deduplication */ -interface PreJudgeDedupResult { - /** Unique issues to send to judge */ - unique: ExtractorIssue[]; - /** Duplicate issues removed */ - duplicates: ExtractorIssue[]; - /** Original total count */ - originalCount: number; -} - -type LabStep = - | { type: "select-document" } - | { type: "configure-extractors" } - | { type: "add-extractor" } - | { type: "running" } - | { type: "results"; result: MultiExtractorResult } - | { type: "issue-detail"; result: MultiExtractorResult; extractorIdx: number; issueIdx: number } - | { type: "pre-judge-dedup"; result: MultiExtractorResult; dedupResult: PreJudgeDedupResult } - | { type: "running-judge"; result: MultiExtractorResult; dedupResult: PreJudgeDedupResult; judgeConfigs: JudgeConfig[] } - | { type: "judge-comparison"; result: MultiExtractorResult; judgeResults: JudgeRunResult[] } - | { type: "judge-results"; result: MultiExtractorResult; judgeResult: FallacyJudgeOutput; judgeLabel: string; judgeResults?: JudgeRunResult[] } - | { type: "judge-decision-detail"; result: MultiExtractorResult; judgeResult: FallacyJudgeOutput; decision: 
JudgeDecision; isRejected: boolean; judgeLabel: string; judgeResults?: JudgeRunResult[] }; - // Load extractor configs from FALLACY_EXTRACTORS env var, fallback to default function getInitialExtractorConfigs(): ExtractorConfig[] { try { @@ -104,45 +64,33 @@ function getInitialExtractorConfigs(): ExtractorConfig[] { } } -// Temperature presets for cycling -const TEMP_PRESETS = ["default", 0, 0.3, 0.5, 0.7, 1.0] as const; - export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, onBack }: ExtractorLabProps) { const { stdout } = useStdout(); const [step, setStep] = useState({ type: "select-document" }); const [selectedDoc, setSelectedDoc] = useState(null); const [documentText, setDocumentText] = useState(""); - // Calculate available width for text based on terminal width - // Border overhead: β”‚ (1) + padding (1) + content + padding (1) + β”‚ (1) = 4 - // SelectInput indicator: "❯ " or " " = 2 - // Total frame overhead = 6 + // Calculate widths const termWidth = stdout?.columns ?? 
120; + const { issueTextWidth, judgeTextWidth } = calculateTextWidths(termWidth); - // For extraction results: " πŸ”΄ [issueType] text" - // Overhead: indicator(2) + spaces(2) + emoji(2) + space(1) + [type](~18) + space(1) = ~26 - const issueTextWidth = Math.max(40, termWidth - 6 - 26); - - // For judge decisions: "[+] type.padEnd(18) text [A,B]" - // Overhead: indicator(2) + [+]space(4) + type(18) + space(1) + space(1) + [A,B](10) = 36 - const judgeTextWidth = Math.max(40, termWidth - 6 - 36); const [extractorConfigs, setExtractorConfigs] = useState(getInitialExtractorConfigs); const [availableJudges] = useState(() => getJudgesConfig()); - const [selectedJudgeIdxs, setSelectedJudgeIdxs] = useState>(() => new Set([0])); // First judge selected by default + const [selectedJudgeIdxs, setSelectedJudgeIdxs] = useState>(() => new Set([0])); const [error, setError] = useState(null); const [highlightedItem, setHighlightedItem] = useState(""); - // Use ref to track current step for useInput (avoids stale closure) const stepRef = useRef(step); stepRef.current = step; - - // Track highlighted item for keyboard shortcuts const highlightedRef = useRef(highlightedItem); highlightedRef.current = highlightedItem; + // ───────────────────────────────────────────────────────────────────────────── + // Actions + // ───────────────────────────────────────────────────────────────────────────── + async function loadDocumentText(docId: string) { try { - // Get latest document version with content const doc = await prisma.document.findUnique({ where: { id: docId }, include: { @@ -153,38 +101,38 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o }, }, }); - const content = doc?.versions[0]?.content; - if (content) { - setDocumentText(content); + if (doc?.versions[0]?.content) { + setDocumentText(doc.versions[0].content); } else { setError("Document has no content"); } - } catch (e) { - setError(`Failed to load document text: ${e}`); + } catch (err) { + 
setError(`Failed to load document: ${err}`); } } async function runExtraction() { - if (!documentText) { - setError("No document text loaded"); - return; - } - setStep({ type: "running" }); - try { const result = await runMultiExtractor(documentText, { extractors: extractorConfigs, judge: { model: "", enabled: false }, // We'll run judge manually for instrumentation }); - setStep({ type: "results", result }); - } catch (e) { - setError(`Extraction failed: ${e}`); + } catch (err) { + setError(`Extraction failed: ${err}`); setStep({ type: "configure-extractors" }); } } + function runPreJudgeDedup(extractionResult: MultiExtractorResult, navigate = true): PreJudgeDedupResult { + const dedupResult = runPreJudgeDedupUtil(extractionResult); + if (navigate) { + setStep({ type: "pre-judge-dedup", result: extractionResult, dedupResult }); + } + return dedupResult; + } + async function runJudge( extractionResult: MultiExtractorResult, dedupIssues: ExtractorIssue[], @@ -196,7 +144,6 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o .map((r) => r.extractorId); const startTime = Date.now(); - const label = judgeLabel || (judgeConfig ? 
generateJudgeLabel(judgeConfig) : "default"); try { const judgeResult = await fallacyJudgeTool.execute( @@ -210,22 +157,22 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o ); return { - config: judgeConfig || { model: "default", enabled: true }, - label, + config: judgeConfig!, + label: judgeLabel || "default", result: judgeResult, durationMs: Date.now() - startTime, }; - } catch (e) { + } catch (err) { return { - config: judgeConfig || { model: "default", enabled: true }, - label, + config: judgeConfig!, + label: judgeLabel || "default", result: { acceptedDecisions: [], rejectedDecisions: [], summary: { totalInputIssues: dedupIssues.length, uniqueGroups: 0, acceptedCount: 0, mergedCount: 0, rejectedCount: 0 }, }, durationMs: Date.now() - startTime, - error: String(e), + error: err instanceof Error ? err.message : String(err), }; } } @@ -237,96 +184,44 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o ) { setStep({ type: "running-judge", result: extractionResult, dedupResult, judgeConfigs }); - try { - // Run all judges in parallel using deduplicated issues - const judgePromises = judgeConfigs.map(config => + const results = await Promise.all( + judgeConfigs.map((config) => runJudge(extractionResult, dedupResult.unique, config, generateJudgeLabel(config)) - ); - - const judgeResults = await Promise.all(judgePromises); - - // Check if any had errors - const errored = judgeResults.filter(r => r.error); - if (errored.length === judgeResults.length) { - throw new Error(`All judges failed: ${errored[0].error}`); - } - - // If only one judge was selected, go directly to its results - if (judgeResults.length === 1) { - const single = judgeResults[0]; - setStep({ type: "judge-results", result: extractionResult, judgeResult: single.result, judgeLabel: single.label }); - } else { - // Multiple judges - show comparison view - setStep({ type: "judge-comparison", result: extractionResult, judgeResults }); - } 
- } catch (e) { - setError(`Judges failed: ${e}`); - setStep({ type: "results", result: extractionResult }); - } - } - - // Pre-judge deduplication: remove duplicate issues before sending to judge - function runPreJudgeDedup(extractionResult: MultiExtractorResult, navigate = true): PreJudgeDedupResult { - // Flatten all issues from all extractors - const allIssues: ExtractorIssue[] = extractionResult.extractorResults.flatMap((r) => - r.issues.map((issue) => ({ - extractorId: r.extractorId, - exactText: issue.exactText, - issueType: issue.issueType, - fallacyType: issue.fallacyType, - severityScore: issue.severityScore, - confidenceScore: issue.confidenceScore, - importanceScore: issue.importanceScore, - reasoning: issue.reasoning, - })) + ) ); - // Remove exact text duplicates (case-insensitive, whitespace normalized) - const seen = new Set(); - const unique: ExtractorIssue[] = []; - const duplicates: ExtractorIssue[] = []; - - for (const issue of allIssues) { - const key = issue.exactText.toLowerCase().replace(/\s+/g, " ").trim(); - if (!seen.has(key)) { - seen.add(key); - unique.push(issue); - } else { - duplicates.push(issue); - } - } - - const dedupResult: PreJudgeDedupResult = { - unique, - duplicates, - originalCount: allIssues.length, - }; - - if (navigate) { + if (results.length === 1 && !results[0].error) { setStep({ - type: "pre-judge-dedup", + type: "judge-results", result: extractionResult, - dedupResult, + judgeResult: results[0].result, + judgeLabel: results[0].label, }); + } else { + setStep({ type: "judge-comparison", result: extractionResult, judgeResults: results }); } - - return dedupResult; } - // Handle keyboard input - use ref to avoid stale closure + // ───────────────────────────────────────────────────────────────────────────── + // Keyboard handling + // ───────────────────────────────────────────────────────────────────────────── + useInput((input, key) => { if (key.escape) { const currentStep = stepRef.current; + if (currentStep.type === 
"issue-detail") { setStep({ type: "results", result: currentStep.result }); } else if (currentStep.type === "judge-decision-detail") { - setStep({ type: "judge-results", result: currentStep.result, judgeResult: currentStep.judgeResult, judgeLabel: currentStep.judgeLabel, judgeResults: currentStep.judgeResults }); + const { result, judgeResult, judgeLabel, judgeResults } = currentStep; + setStep({ type: "judge-results", result, judgeResult, judgeLabel, judgeResults }); } else if (currentStep.type === "judge-results") { - // Go back to comparison if we came from there, otherwise to extraction results - if (currentStep.judgeResults) { - setStep({ type: "judge-comparison", result: currentStep.result, judgeResults: currentStep.judgeResults }); + const { result, judgeResults } = currentStep; + if (judgeResults) { + setStep({ type: "judge-comparison", result, judgeResults }); } else { - setStep({ type: "results", result: currentStep.result }); + const dedupResult = runPreJudgeDedup(result, false); + setStep({ type: "pre-judge-dedup", result, dedupResult }); } } else if (currentStep.type === "judge-comparison") { setStep({ type: "results", result: currentStep.result }); @@ -334,30 +229,26 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o setStep({ type: "results", result: currentStep.result }); } else if (currentStep.type === "results") { setStep({ type: "configure-extractors" }); - } else if (currentStep.type === "add-extractor") { - setStep({ type: "configure-extractors" }); - } else if (currentStep.type === "configure-extractors") { + } else if (currentStep.type === "configure-extractors" || currentStep.type === "add-extractor") { setStep({ type: "select-document" }); } else if (currentStep.type === "select-document") { onBack(); } - // Don't call onBack for running/running-judge states } - // Handle 'd' to delete extractor and 't' to cycle temperature (only on configure screen) + // Keyboard shortcuts for configure screen if 
(stepRef.current.type === "configure-extractors") { const highlighted = highlightedRef.current; + if (highlighted.startsWith("config-")) { const idx = parseInt(highlighted.replace("config-", ""), 10); if (input === "d") { - // Delete extractor (but keep at least one) setExtractorConfigs(configs => { if (configs.length <= 1) return configs; return configs.filter((_, i) => i !== idx); }); } else if (input === "t") { - // Cycle temperature setExtractorConfigs(configs => configs.map((c, i) => { if (i !== idx) return c; @@ -372,16 +263,14 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o } }); + // ───────────────────────────────────────────────────────────────────────────── + // Render + // ───────────────────────────────────────────────────────────────────────────── + if (error) { - return ( - - Error: {error} - Press Escape to go back - - ); + return ; } - // Document selection using reusable DocumentSelector if (step.type === "select-document") { return ( ({ - label: `[${idx + 1}] ${config.model} (t=${config.temperature}, think=${config.thinking})`, - value: `config-${idx}`, - })), - { label: "+ Add Extractor", value: "add" }, - { label: "─────────────────", value: "divider2" }, - { label: "← Back to Documents", value: "back" }, - ]; - return ( - - - Extractor Lab - Configure - - - - - - Document: - {selectedDoc?.title} - - - Text length: - {documentText.length} chars - - - Extractors: - {extractorConfigs.length} - - - - - !i.value.startsWith("divider"))} - onHighlight={(item) => setHighlightedItem(item.value)} - onSelect={(item) => { - if (item.value === "back") { - setStep({ type: "select-document" }); - } else if (item.value === "run") { - runExtraction(); - } else if (item.value === "add") { - // Go to model selection - setStep({ type: "add-extractor" }); - } else if (item.value.startsWith("config-")) { - // Toggle thinking for this extractor - const idx = parseInt(item.value.replace("config-", ""), 10); - 
setExtractorConfigs(configs => - configs.map((c, i) => i === idx ? { ...c, thinking: !c.thinking } : c) - ); - } - }} - /> - - - Enter=toggle think | t=cycle temp | d=delete | Esc=back - - + setStep({ type: "select-document" })} + onRun={runExtraction} + onAdd={() => setStep({ type: "add-extractor" })} + onToggleThinking={(idx) => { + setExtractorConfigs(configs => + configs.map((c, i) => i === idx ? { ...c, thinking: !c.thinking } : c) + ); + }} + /> ); } - // Add extractor - model selection using reusable ModelSelector if (step.type === "add-extractor") { return ( { - // Add new extractor with selected model setExtractorConfigs([ ...extractorConfigs, { model: model.id, temperature: "default", thinking: false }, @@ -488,580 +330,141 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o ); } - // Running if (step.type === "running") { - return ( - - - Extractor Lab - Running - - - - - Running {extractorConfigs.length} extractor(s)... - - - - - This may take a minute... - - - ); + return ; } - // Results - scrollable list of issues if (step.type === "results") { - const { result } = step; - const totalIssues = result.extractorResults.reduce((sum, r) => sum + r.issues.length, 0); - const hasMultipleExtractors = result.extractorResults.filter((r) => !r.error).length > 1; - - // Build flat list of issues with extractor info - const issueItems: Array<{ label: string; value: string }> = []; - - result.extractorResults.forEach((r, extractorIdx) => { - // Add extractor header - const tempStr = r.config.temperature === 'default' ? 'tDef' : `t${r.config.temperature}`; - const thinkStr = r.config.thinking ? '' : ' noThink'; - issueItems.push({ - label: `── ${r.extractorId} (${tempStr}${thinkStr}) - ${r.issues.length} issues, ${(r.durationMs / 1000).toFixed(1)}s ──`, - value: `header-${extractorIdx}`, - }); - // Add issues for this extractor - r.issues.forEach((issue, issueIdx) => { - const severityColor = issue.severityScore >= 70 ? 
'πŸ”΄' : issue.severityScore >= 40 ? '🟑' : '🟒'; - issueItems.push({ - label: ` ${severityColor} [${issue.issueType}] ${truncate(issue.exactText.replace(/\n/g, ' '), issueTextWidth)}`, - value: `issue-${extractorIdx}-${issueIdx}`, - }); - }); - }); - - // Actions at the bottom - issueItems.push({ label: "───────────────────────────────────────────────────────────────────────────", value: "sep-1" }); - - // Deduplicate button (only if we have issues) - if (totalIssues > 0) { - issueItems.push({ - label: `β–Ά Deduplicate & Prepare for Judge (${totalIssues} issues)`, - value: "run-dedup", - }); - } - issueItems.push({ label: "← Back to Configure", value: "back" }); - return ( - - - Extractor Lab - Extraction Results: - {selectedDoc?.title} - - - - - Duration: {(result.totalDurationMs / 1000).toFixed(1)}s - | - Issues: {totalIssues} - | - Extractors: {result.extractorResults.length} - - - - { - if (item.value.startsWith("sep-") || item.value.startsWith("header-")) { - // Ignore separators and headers - return; - } else if (item.value === "back") { - setStep({ type: "configure-extractors" }); - } else if (item.value === "run-dedup") { - runPreJudgeDedup(result); - } else if (item.value.startsWith("issue-")) { - const [, extractorIdx, issueIdx] = item.value.split("-"); - setStep({ - type: "issue-detail", - result, - extractorIdx: parseInt(extractorIdx), - issueIdx: parseInt(issueIdx), - }); - } - }} - /> - - - Enter View Detail | Escape Back - - + setStep({ type: "configure-extractors" })} + onRunDedup={() => runPreJudgeDedup(step.result)} + onViewIssue={(extractorIdx, issueIdx) => { + setStep({ type: "issue-detail", result: step.result, extractorIdx, issueIdx }); + }} + /> ); } - // Issue detail view if (step.type === "issue-detail") { - const { result, extractorIdx, issueIdx } = step; - const extractor = result.extractorResults[extractorIdx]; - const issue = extractor.issues[issueIdx]; - return ( - - - Issue Detail - - - - Extractor: {extractor.extractorId} - Type: 
{issue.issueType}{issue.fallacyType && ({issue.fallacyType})} - Severity: = 70 ? 'red' : issue.severityScore >= 40 ? 'yellow' : 'green'}>{issue.severityScore}/100 - Confidence: {issue.confidenceScore}/100 - Importance: {issue.importanceScore}/100 - - - - Quoted Text: - - "{issue.exactText}" - - - - - Reasoning: - - {issue.reasoning} - - - - - Press Escape to go back to results - - + ); } - // Pre-judge deduplication results if (step.type === "pre-judge-dedup") { - const { result, dedupResult } = step; - const { unique, duplicates, originalCount } = dedupResult; - - // Build judge selection items only - const judgeItems: Array<{ label: string; value: string }> = []; - - if (availableJudges.length > 0) { - availableJudges.forEach((judge, idx) => { - const label = generateJudgeLabel(judge); - const isSelected = selectedJudgeIdxs.has(idx); - const prefix = isSelected ? "[x]" : "[ ]"; - const thinkStr = judge.thinking ? "think" : "noThink"; - const tempStr = judge.temperature === 'default' ? 'tDef' : judge.temperature !== undefined ? `t${judge.temperature}` : ''; - judgeItems.push({ - label: `${prefix} Judge: ${label} (${tempStr ? tempStr + ', ' : ''}${thinkStr})`, - value: `judge-${idx}`, - }); - }); - - const selectedCount = selectedJudgeIdxs.size; - const judgeLabel = selectedCount === 1 - ? generateJudgeLabel(availableJudges[[...selectedJudgeIdxs][0]]) - : `${selectedCount} judges`; - judgeItems.push({ - label: `βš–οΈ Run ${judgeLabel} (aggregate ${unique.length} issues)`, - value: "run-judge", - }); - } else { - judgeItems.push({ - label: `⚠️ No judges configured. 
Set FALLACY_JUDGES or FALLACY_JUDGE env var`, - value: "no-judges", - }); - } - - judgeItems.push({ label: "← Back to Extraction Results", value: "back" }); - return ( - - - Pre-Judge Deduplication - - - {/* Summary stats */} - - - Original: {originalCount} - β†’ - {unique.length} unique - {duplicates.length > 0 && | {duplicates.length} duplicates removed} - - - - {/* Duplicates list (if any) */} - {duplicates.length > 0 && ( - - Duplicates removed: - {duplicates.slice(0, 3).map((d, idx) => ( - - {" "}[{d.extractorId}] {truncate(d.exactText.replace(/\n/g, ' '), issueTextWidth - 20)} - - ))} - {duplicates.length > 3 && ... and {duplicates.length - 3} more} - - )} - - {/* Judge selection */} - - Select Judges: - - - { - if (item.value === "back") { - setStep({ type: "results", result }); - } else if (item.value === "run-judge") { - const selectedConfigs = [...selectedJudgeIdxs].map(idx => availableJudges[idx]); - runMultipleJudges(result, dedupResult, selectedConfigs); - } else if (item.value.startsWith("judge-")) { - // Toggle multi-select - const idx = parseInt(item.value.replace("judge-", ""), 10); - setSelectedJudgeIdxs(prev => { - const next = new Set(prev); - if (next.has(idx)) { - if (next.size > 1) { - next.delete(idx); - } - } else { - next.add(idx); - } - return next; - }); + setStep({ type: "results", result: step.result })} + onRunJudges={(configs) => runMultipleJudges(step.result, step.dedupResult, configs)} + onToggleJudge={(idx) => { + setSelectedJudgeIdxs(prev => { + const next = new Set(prev); + if (next.has(idx)) { + if (next.size > 1) next.delete(idx); + } else { + next.add(idx); } - }} - /> - - - Toggle judges with Enter | Escape=Back - - + return next; + }); + }} + /> ); } - // Running judge(s) if (step.type === "running-judge") { - const { dedupResult, judgeConfigs } = step; - const judgeCount = judgeConfigs.length; - const judgeNames = judgeConfigs.map(c => generateJudgeLabel(c)).join(", "); return ( - - - Extractor Lab - Running {judgeCount > 
1 ? `${judgeCount} Judges` : "Judge"} - - - - - Aggregating {dedupResult.unique.length} issues (from {dedupResult.originalCount} original)... - - - - - The judge{judgeCount > 1 ? "s" : ""} will merge and filter issues - {judgeCount > 1 && Running in parallel: {judgeNames}} - - + ); } - // Judge results - if (step.type === "judge-results") { - const { result, judgeResult, judgeLabel, judgeResults } = step; - const totalInputIssues = result.extractorResults.reduce((sum, r) => sum + r.issues.length, 0); - - // Create legend mapping extractor IDs to short keys (A, B, C, ...) - const extractorIds = result.extractorResults.map(r => r.extractorId); - const extractorKeys: Record = {}; - extractorIds.forEach((id, i) => { - extractorKeys[id] = String.fromCharCode(65 + i); // A, B, C, ... - }); - - // Helper to convert extractor IDs to short keys - const sourcesToKeys = (sources: string[]): string => { - return sources.map(s => extractorKeys[s] || "?").join(","); - }; - - // Build list of judge decisions - const decisionItems: Array<{ label: string; value: string }> = []; - - // Accepted/merged decisions - judgeResult.acceptedDecisions.forEach((decision, idx) => { - const symbol = decision.decision === "merge" ? 
"[*]" : "[+]"; - const keys = sourcesToKeys(decision.sourceExtractors); - const text = truncate(decision.finalText.replace(/\n/g, ' '), judgeTextWidth).padEnd(judgeTextWidth); - decisionItems.push({ - label: `${symbol} ${decision.finalIssueType.padEnd(18)} ${text} [${keys}]`, - value: `accepted-${idx}`, - }); - }); - - // Rejected decisions - judgeResult.rejectedDecisions.forEach((decision, idx) => { - const keys = sourcesToKeys(decision.sourceExtractors); - const text = truncate(decision.finalText.replace(/\n/g, ' '), judgeTextWidth).padEnd(judgeTextWidth); - decisionItems.push({ - label: `[x] ${decision.finalIssueType.padEnd(18)} ${text} [${keys}]`, - value: `rejected-${idx}`, - }); - }); - - decisionItems.push({ label: "───────────────────────────────────────────────────────────────────────────────────────", value: "sep-1" }); - decisionItems.push({ label: "← Back", value: "back" }); - - // Build legend string - const legendParts = extractorIds.map((id, i) => `${String.fromCharCode(65 + i)}=${id}`); - const legendStr = legendParts.join(" "); - + if (step.type === "judge-comparison") { return ( - - - Judge Results{judgeLabel ? 
`: ${judgeLabel}` : ""} - - - - - Input: {totalInputIssues} issues - --> - {judgeResult.summary.acceptedCount} accepted - | - {judgeResult.summary.mergedCount} merged - | - {judgeResult.summary.rejectedCount} rejected - - Legend: [+]=accept [*]=merge [x]=reject | {legendStr} - - - { - if (item.value.startsWith("sep-")) { - return; // Ignore separators - } else if (item.value === "back") { - // Go back to comparison if we came from there, otherwise to pre-judge dedup - if (judgeResults) { - setStep({ type: "judge-comparison", result, judgeResults }); - } else { - // Go back to pre-judge-dedup view (don't auto-navigate, just get result) - const dedupResult = runPreJudgeDedup(result, false); - setStep({ type: "pre-judge-dedup", result, dedupResult }); - } - } else if (item.value.startsWith("accepted-")) { - const idx = parseInt(item.value.replace("accepted-", ""), 10); - setStep({ - type: "judge-decision-detail", - result, - judgeResult, - decision: judgeResult.acceptedDecisions[idx], - isRejected: false, - judgeLabel: judgeLabel || "", - judgeResults, - }); - } else if (item.value.startsWith("rejected-")) { - const idx = parseInt(item.value.replace("rejected-", ""), 10); - setStep({ - type: "judge-decision-detail", - result, - judgeResult, - decision: judgeResult.rejectedDecisions[idx], - isRejected: true, - judgeLabel: judgeLabel || "", - judgeResults, - }); - } - }} - /> - - - Enter=View Detail | Escape=Back - - + setStep({ type: "results", result: step.result })} + onViewJudge={(jr) => { + setStep({ + type: "judge-results", + result: step.result, + judgeResult: jr.result, + judgeLabel: jr.label, + judgeResults: step.judgeResults, + }); + }} + /> ); } - // Judge decision detail - if (step.type === "judge-decision-detail") { - const { decision, isRejected } = step; - + if (step.type === "judge-results") { + const { result, judgeResult, judgeLabel, judgeResults } = step; return ( - - - - Judge Decision: {decision.decision.toUpperCase()} - - - - - - Decision: - 
{decision.decision} - - - Type: - {decision.finalIssueType} - {decision.finalFallacyType && ({decision.finalFallacyType})} - - - Severity: - = 70 ? "red" : decision.finalSeverity >= 40 ? "yellow" : "green"}> - {decision.finalSeverity}/100 - - | - Confidence: {decision.finalConfidence}/100 - | - Importance: {decision.finalImportance}/100 - - - Source Extractors: - {decision.sourceExtractors.join(", ")} - - - - - Quoted Text: - - "{decision.finalText}" - - - - - Judge Reasoning: - - {decision.judgeReasoning} - - - - - Issue Reasoning: - - {decision.finalReasoning} - - - - - Press Escape to go back to judge results - - + { + if (judgeResults) { + setStep({ type: "judge-comparison", result, judgeResults }); + } else { + const dedupResult = runPreJudgeDedup(result, false); + setStep({ type: "pre-judge-dedup", result, dedupResult }); + } + }} + onViewDecision={(decision, isRejected) => { + setStep({ + type: "judge-decision-detail", + result, + judgeResult, + decision, + isRejected, + judgeLabel, + judgeResults, + }); + }} + /> ); } - // Judge comparison view - comparing multiple judges - if (step.type === "judge-comparison") { - const { result, judgeResults } = step; - const totalInputIssues = result.extractorResults.reduce((sum, r) => sum + r.issues.length, 0); - - // Build comparison items - const comparisonItems: Array<{ label: string; value: string }> = []; - - // Header row - comparisonItems.push({ - label: `── Judge Comparison: ${judgeResults.length} judges, ${totalInputIssues} input issues ──`, - value: "header", - }); - - // Each judge row - judgeResults.forEach((jr, idx) => { - const status = jr.error ? 
"❌ Error" : `βœ… ${jr.result.summary.acceptedCount} accepted, ${jr.result.summary.mergedCount} merged, ${jr.result.summary.rejectedCount} rejected`; - const duration = `${(jr.durationMs / 1000).toFixed(1)}s`; - comparisonItems.push({ - label: `[${idx + 1}] ${jr.label.padEnd(30)} ${duration.padEnd(8)} ${status}`, - value: `judge-${idx}`, - }); - - // If error, show error details - if (jr.error) { - comparisonItems.push({ - label: ` Error: ${truncate(jr.error, termWidth - 20)}`, - value: `error-${idx}`, - }); - } - }); - - // Summary stats - comparisonItems.push({ - label: "────────────────────────────────────────────────────────────────────────────", - value: "sep-1", - }); - - // Agreement summary - find issues accepted by all judges - const successfulJudges = judgeResults.filter(jr => !jr.error); - if (successfulJudges.length > 1) { - // Get accepted issue texts from each judge for comparison - const acceptedByJudge = successfulJudges.map(jr => - new Set(jr.result.acceptedDecisions.map(d => d.finalText.toLowerCase().trim())) - ); - - // Find issues in ALL judges (intersection) - const unanimouslyAccepted = [...acceptedByJudge[0]].filter(text => - acceptedByJudge.every(set => set.has(text)) - ).length; - - // Find issues in ANY judge (union) - const allAccepted = new Set(acceptedByJudge.flatMap(set => [...set])).size; - - const agreementPct = allAccepted > 0 ? 
Math.round((unanimouslyAccepted / allAccepted) * 100) : 0; - - comparisonItems.push({ - label: `πŸ“Š Agreement: ${unanimouslyAccepted}/${allAccepted} issues accepted by all judges (${agreementPct}%)`, - value: "stats-1", - }); - } - - comparisonItems.push({ - label: "────────────────────────────────────────────────────────────────────────────", - value: "sep-2", - }); - comparisonItems.push({ label: "← Back to Extraction Results", value: "back" }); - + if (step.type === "judge-decision-detail") { return ( - - - Extractor Lab - Judge Comparison: - {selectedDoc?.title} - - - - - Input: {totalInputIssues} issues from {result.extractorResults.length} extractors - | - Judges run: {judgeResults.length} - | - Successful: {judgeResults.filter(j => !j.error).length} - - - - !i.value.startsWith("sep-") && !i.value.startsWith("header") && !i.value.startsWith("stats-"))} - limit={maxItems - 5} - onSelect={(item) => { - if (item.value === "back") { - setStep({ type: "results", result }); - } else if (item.value.startsWith("error-")) { - // Error lines are not clickable, just informational - return; - } else if (item.value.startsWith("judge-")) { - const idx = parseInt(item.value.replace("judge-", ""), 10); - const jr = judgeResults[idx]; - if (!jr.error) { - setStep({ - type: "judge-results", - result, - judgeResult: jr.result, - judgeLabel: jr.label, - judgeResults, // Pass so we can navigate back to comparison - }); - } - } - }} - /> - - - Enter=View Judge Details | Escape=Back to Results - - + ); } diff --git a/meta-evals/src/components/extractor-lab/index.ts b/meta-evals/src/components/extractor-lab/index.ts new file mode 100644 index 00000000..e2adaf3d --- /dev/null +++ b/meta-evals/src/components/extractor-lab/index.ts @@ -0,0 +1,3 @@ +export * from "./types"; +export * from "./utils"; +export * from "./views"; diff --git a/meta-evals/src/components/extractor-lab/types.ts b/meta-evals/src/components/extractor-lab/types.ts new file mode 100644 index 00000000..f5570fde --- 
/dev/null +++ b/meta-evals/src/components/extractor-lab/types.ts @@ -0,0 +1,78 @@ +/** + * Types for Extractor Lab component + */ + +import type { DocumentChoice } from "@roast/db"; +import type { + ExtractorConfig, + MultiExtractorResult, +} from "@roast/ai/fallacy-extraction/lab"; +import type { + FallacyJudgeOutput, + JudgeDecision, + JudgeConfig, +} from "@roast/ai/fallacy-judge/types"; + +export type { DocumentChoice, ExtractorConfig, MultiExtractorResult, FallacyJudgeOutput, JudgeDecision, JudgeConfig }; + +/** Props for the main ExtractorLab component */ +export interface ExtractorLabProps { + height: number; + maxItems: number; + documents: DocumentChoice[]; + onSearchDocuments: (filter: string) => void; + onBack: () => void; +} + +/** Result from a single judge run with its config */ +export interface JudgeRunResult { + config: JudgeConfig; + label: string; + result: FallacyJudgeOutput; + durationMs: number; + error?: string; +} + +/** Issue with extractor source info for pre-judge dedup */ +export interface ExtractorIssue { + extractorId: string; + exactText: string; + issueType: string; + fallacyType?: string; + severityScore: number; + confidenceScore: number; + importanceScore: number; + reasoning: string; +} + +/** Result from pre-judge deduplication */ +export interface PreJudgeDedupResult { + /** Unique issues to send to judge */ + unique: ExtractorIssue[]; + /** Duplicate issues removed */ + duplicates: ExtractorIssue[]; + /** Original total count */ + originalCount: number; +} + +/** All possible steps/views in the Extractor Lab */ +export type LabStep = + | { type: "select-document" } + | { type: "configure-extractors" } + | { type: "add-extractor" } + | { type: "running" } + | { type: "results"; result: MultiExtractorResult } + | { type: "issue-detail"; result: MultiExtractorResult; extractorIdx: number; issueIdx: number } + | { type: "pre-judge-dedup"; result: MultiExtractorResult; dedupResult: PreJudgeDedupResult } + | { type: "running-judge"; 
result: MultiExtractorResult; dedupResult: PreJudgeDedupResult; judgeConfigs: JudgeConfig[] } + | { type: "judge-comparison"; result: MultiExtractorResult; judgeResults: JudgeRunResult[] } + | { type: "judge-results"; result: MultiExtractorResult; judgeResult: FallacyJudgeOutput; judgeLabel: string; judgeResults?: JudgeRunResult[] } + | { type: "judge-decision-detail"; result: MultiExtractorResult; judgeResult: FallacyJudgeOutput; decision: JudgeDecision; isRejected: boolean; judgeLabel: string; judgeResults?: JudgeRunResult[] }; + +/** Logger interface for judge tool */ +export interface SimpleLogger { + info: (...args: unknown[]) => void; + warn: (...args: unknown[]) => void; + error: (...args: unknown[]) => void; + debug: (...args: unknown[]) => void; +} diff --git a/meta-evals/src/components/extractor-lab/utils.ts b/meta-evals/src/components/extractor-lab/utils.ts new file mode 100644 index 00000000..b5cc0397 --- /dev/null +++ b/meta-evals/src/components/extractor-lab/utils.ts @@ -0,0 +1,95 @@ +/** + * Utility functions for Extractor Lab + */ + +import { + getMultiExtractorConfig, + type ExtractorConfig, +} from "@roast/ai/fallacy-extraction/lab"; +import type { SimpleLogger, ExtractorIssue, PreJudgeDedupResult, MultiExtractorResult } from "./types"; + +/** Temperature presets for cycling */ +export const TEMP_PRESETS = ["default", 0, 0.3, 0.5, 0.7, 1.0] as const; + +/** Truncate string to fit terminal width */ +export function truncate(str: string, maxLen: number): string { + if (str.length <= maxLen) return str; + return str.slice(0, maxLen - 1) + "…"; +} + +/** Simple logger for the judge tool */ +export const simpleLogger: SimpleLogger = { + info: (...args: unknown[]) => console.error("[INFO]", ...args), + warn: (...args: unknown[]) => console.error("[WARN]", ...args), + error: (...args: unknown[]) => console.error("[ERROR]", ...args), + debug: (..._args: unknown[]) => {}, +}; + +/** Load extractor configs from FALLACY_EXTRACTORS env var, fallback to 
default */ +export function getInitialExtractorConfigs(): ExtractorConfig[] { + try { + const config = getMultiExtractorConfig(); + return config.extractors; + } catch { + return [{ model: "claude-sonnet-4-5-20250929", temperature: "default", thinking: false }]; + } +} + +/** Generate a label for an extractor config */ +export function generateExtractorLabel(config: ExtractorConfig): string { + const modelShort = config.model.split("/").pop()?.replace(/-\d{8}$/, "") ?? config.model; + const tempStr = config.temperature === "default" ? "tDef" : `t${config.temperature}`; + const thinkStr = config.thinking ? "think" : "noThink"; + return `${modelShort}-${tempStr}-${thinkStr}`; +} + +/** Run pre-judge deduplication on extractor results */ +export function runPreJudgeDedup(extractionResult: MultiExtractorResult): PreJudgeDedupResult { + // Flatten all issues from all extractors + const allIssues: ExtractorIssue[] = extractionResult.extractorResults.flatMap((r) => + r.issues.map((issue) => ({ + extractorId: r.extractorId, + exactText: issue.exactText, + issueType: issue.issueType, + fallacyType: issue.fallacyType, + severityScore: issue.severityScore, + confidenceScore: issue.confidenceScore, + importanceScore: issue.importanceScore, + reasoning: issue.reasoning, + })) + ); + + // Remove exact text duplicates (case-insensitive, whitespace normalized) + const seen = new Set(); + const unique: ExtractorIssue[] = []; + const duplicates: ExtractorIssue[] = []; + + for (const issue of allIssues) { + const key = issue.exactText.toLowerCase().replace(/\s+/g, " ").trim(); + if (!seen.has(key)) { + seen.add(key); + unique.push(issue); + } else { + duplicates.push(issue); + } + } + + return { + unique, + duplicates, + originalCount: allIssues.length, + }; +} + +/** Calculate text widths based on terminal width */ +export function calculateTextWidths(termWidth: number) { + // For extraction results: " πŸ”΄ [issueType] text" + // Overhead: indicator(2) + spaces(2) + emoji(2) + 
space(1) + [type](~18) + space(1) = ~26 + const issueTextWidth = Math.max(40, termWidth - 6 - 26); + + // For judge decisions: "[+] type.padEnd(18) text [A,B]" + // Overhead: indicator(2) + [+]space(4) + type(18) + space(1) + space(1) + [A,B](10) = 36 + const judgeTextWidth = Math.max(40, termWidth - 6 - 36); + + return { issueTextWidth, judgeTextWidth }; +} diff --git a/meta-evals/src/components/extractor-lab/views/ConfigureExtractorsView.tsx b/meta-evals/src/components/extractor-lab/views/ConfigureExtractorsView.tsx new file mode 100644 index 00000000..5c103075 --- /dev/null +++ b/meta-evals/src/components/extractor-lab/views/ConfigureExtractorsView.tsx @@ -0,0 +1,86 @@ +import React from "react"; +import { Box, Text } from "ink"; +import SelectInput from "ink-select-input"; +import type { ExtractorConfig, DocumentChoice } from "../types"; + +interface ConfigureExtractorsViewProps { + height: number; + selectedDoc: DocumentChoice | null; + documentText: string; + extractorConfigs: ExtractorConfig[]; + onHighlight: (value: string) => void; + onBack: () => void; + onRun: () => void; + onAdd: () => void; + onToggleThinking: (idx: number) => void; +} + +export function ConfigureExtractorsView({ + height, + selectedDoc, + documentText, + extractorConfigs, + onHighlight, + onBack, + onRun, + onAdd, + onToggleThinking, +}: ConfigureExtractorsViewProps) { + const items = [ + { label: "β–Ά Run Extraction", value: "run" }, + { label: "─────────────────", value: "divider" }, + ...extractorConfigs.map((config, idx) => ({ + label: `[${idx + 1}] ${config.model} (t=${config.temperature}, think=${config.thinking})`, + value: `config-${idx}`, + })), + { label: "+ Add Extractor", value: "add" }, + { label: "─────────────────", value: "divider2" }, + { label: "← Back to Documents", value: "back" }, + ]; + + return ( + + + Extractor Lab - Configure + + + + + + Document: + {selectedDoc?.title} + + + Text length: + {documentText.length} chars + + + Extractors: + 
{extractorConfigs.length} + + + + + !i.value.startsWith("divider"))} + onHighlight={(item) => onHighlight(item.value)} + onSelect={(item) => { + if (item.value === "back") { + onBack(); + } else if (item.value === "run") { + onRun(); + } else if (item.value === "add") { + onAdd(); + } else if (item.value.startsWith("config-")) { + const idx = parseInt(item.value.replace("config-", ""), 10); + onToggleThinking(idx); + } + }} + /> + + + Enter=toggle think | t=cycle temp | d=delete | Esc=back + + + ); +} diff --git a/meta-evals/src/components/extractor-lab/views/ErrorView.tsx b/meta-evals/src/components/extractor-lab/views/ErrorView.tsx new file mode 100644 index 00000000..feba37a2 --- /dev/null +++ b/meta-evals/src/components/extractor-lab/views/ErrorView.tsx @@ -0,0 +1,16 @@ +import React from "react"; +import { Box, Text } from "ink"; + +interface ErrorViewProps { + error: string; + height: number; +} + +export function ErrorView({ error, height }: ErrorViewProps) { + return ( + + Error: {error} + Press Escape to go back + + ); +} diff --git a/meta-evals/src/components/extractor-lab/views/IssueDetailView.tsx b/meta-evals/src/components/extractor-lab/views/IssueDetailView.tsx new file mode 100644 index 00000000..eac56efd --- /dev/null +++ b/meta-evals/src/components/extractor-lab/views/IssueDetailView.tsx @@ -0,0 +1,49 @@ +import React from "react"; +import { Box, Text } from "ink"; +import type { MultiExtractorResult } from "../types"; + +interface IssueDetailViewProps { + height: number; + result: MultiExtractorResult; + extractorIdx: number; + issueIdx: number; +} + +export function IssueDetailView({ height, result, extractorIdx, issueIdx }: IssueDetailViewProps) { + const extractor = result.extractorResults[extractorIdx]; + const issue = extractor.issues[issueIdx]; + + return ( + + + Issue Detail + + + + Extractor: {extractor.extractorId} + Type: {issue.issueType}{issue.fallacyType && ({issue.fallacyType})} + Severity: = 70 ? 'red' : issue.severityScore >= 40 ? 
'yellow' : 'green'}>{issue.severityScore}/100 + Confidence: {issue.confidenceScore}/100 + Importance: {issue.importanceScore}/100 + + + + Quoted Text: + + "{issue.exactText}" + + + + + Reasoning: + + {issue.reasoning} + + + + + Press Escape to go back to results + + + ); +} diff --git a/meta-evals/src/components/extractor-lab/views/JudgeComparisonView.tsx b/meta-evals/src/components/extractor-lab/views/JudgeComparisonView.tsx new file mode 100644 index 00000000..a07fa963 --- /dev/null +++ b/meta-evals/src/components/extractor-lab/views/JudgeComparisonView.tsx @@ -0,0 +1,133 @@ +import React from "react"; +import { Box, Text } from "ink"; +import SelectInput from "ink-select-input"; +import type { MultiExtractorResult, JudgeRunResult, DocumentChoice } from "../types"; +import { truncate } from "../utils"; + +interface JudgeComparisonViewProps { + height: number; + maxItems: number; + result: MultiExtractorResult; + judgeResults: JudgeRunResult[]; + selectedDoc: DocumentChoice | null; + termWidth: number; + onBack: () => void; + onViewJudge: (judgeResult: JudgeRunResult) => void; +} + +export function JudgeComparisonView({ + height, + maxItems, + result, + judgeResults, + selectedDoc, + termWidth, + onBack, + onViewJudge, +}: JudgeComparisonViewProps) { + const totalInputIssues = result.extractorResults.reduce((sum, r) => sum + r.issues.length, 0); + + // Build comparison items + const comparisonItems: Array<{ label: string; value: string }> = []; + + // Header row + comparisonItems.push({ + label: `── Judge Comparison: ${judgeResults.length} judges, ${totalInputIssues} input issues ──`, + value: "header", + }); + + // Each judge row + judgeResults.forEach((jr, idx) => { + const status = jr.error ? 
"❌ Error" : `βœ… ${jr.result.summary.acceptedCount} accepted, ${jr.result.summary.mergedCount} merged, ${jr.result.summary.rejectedCount} rejected`; + const duration = `${(jr.durationMs / 1000).toFixed(1)}s`; + comparisonItems.push({ + label: `[${idx + 1}] ${jr.label.padEnd(30)} ${duration.padEnd(8)} ${status}`, + value: `judge-${idx}`, + }); + + // If error, show error details + if (jr.error) { + comparisonItems.push({ + label: ` Error: ${truncate(jr.error, termWidth - 20)}`, + value: `error-${idx}`, + }); + } + }); + + // Summary stats + comparisonItems.push({ + label: "────────────────────────────────────────────────────────────────────────────", + value: "sep-1", + }); + + // Agreement summary - find issues accepted by all judges + const successfulJudges = judgeResults.filter(jr => !jr.error); + if (successfulJudges.length > 1) { + // Get accepted issue texts from each judge for comparison + const acceptedByJudge = successfulJudges.map(jr => + new Set(jr.result.acceptedDecisions.map(d => d.finalText.toLowerCase().trim())) + ); + + // Find issues in ALL judges (intersection) + const unanimouslyAccepted = [...acceptedByJudge[0]].filter(text => + acceptedByJudge.every(set => set.has(text)) + ).length; + + // Find issues in ANY judge (union) + const allAccepted = new Set(acceptedByJudge.flatMap(set => [...set])).size; + + const agreementPct = allAccepted > 0 ? 
Math.round((unanimouslyAccepted / allAccepted) * 100) : 0; + + comparisonItems.push({ + label: `πŸ“Š Agreement: ${unanimouslyAccepted}/${allAccepted} issues accepted by all judges (${agreementPct}%)`, + value: "stats-1", + }); + } + + comparisonItems.push({ + label: "────────────────────────────────────────────────────────────────────────────", + value: "sep-2", + }); + comparisonItems.push({ label: "← Back to Extraction Results", value: "back" }); + + return ( + + + Extractor Lab - Judge Comparison: + {selectedDoc?.title} + + + + + Input: {totalInputIssues} issues from {result.extractorResults.length} extractors + | + Judges run: {judgeResults.length} + | + Successful: {judgeResults.filter(j => !j.error).length} + + + + !i.value.startsWith("sep-") && !i.value.startsWith("header") && !i.value.startsWith("stats-"))} + limit={maxItems - 5} + onSelect={(item) => { + if (item.value === "back") { + onBack(); + } else if (item.value.startsWith("error-")) { + return; + } else if (item.value.startsWith("judge-")) { + const idx = parseInt(item.value.replace("judge-", ""), 10); + const jr = judgeResults[idx]; + if (!jr.error) { + onViewJudge(jr); + } + } + }} + /> + + + Enter=View Judge Details | Escape=Back to Results + + + ); +} diff --git a/meta-evals/src/components/extractor-lab/views/JudgeDecisionDetailView.tsx b/meta-evals/src/components/extractor-lab/views/JudgeDecisionDetailView.tsx new file mode 100644 index 00000000..419f56ac --- /dev/null +++ b/meta-evals/src/components/extractor-lab/views/JudgeDecisionDetailView.tsx @@ -0,0 +1,72 @@ +import React from "react"; +import { Box, Text } from "ink"; +import type { JudgeDecision } from "../types"; + +interface JudgeDecisionDetailViewProps { + height: number; + decision: JudgeDecision; + isRejected: boolean; +} + +export function JudgeDecisionDetailView({ height, decision, isRejected }: JudgeDecisionDetailViewProps) { + return ( + + + + Judge Decision: {decision.decision.toUpperCase()} + + + + + + Decision: + 
{decision.decision} + + + Type: + {decision.finalIssueType} + {decision.finalFallacyType && ({decision.finalFallacyType})} + + + Severity: + = 70 ? "red" : decision.finalSeverity >= 40 ? "yellow" : "green"}> + {decision.finalSeverity}/100 + + | + Confidence: {decision.finalConfidence}/100 + | + Importance: {decision.finalImportance}/100 + + + Source Extractors: + {decision.sourceExtractors.join(", ")} + + + + + Quoted Text: + + "{decision.finalText}" + + + + + Judge Reasoning: + + {decision.judgeReasoning} + + + + + Issue Reasoning: + + {decision.finalReasoning} + + + + + Press Escape to go back to judge results + + + ); +} diff --git a/meta-evals/src/components/extractor-lab/views/JudgeResultsView.tsx b/meta-evals/src/components/extractor-lab/views/JudgeResultsView.tsx new file mode 100644 index 00000000..f392fd54 --- /dev/null +++ b/meta-evals/src/components/extractor-lab/views/JudgeResultsView.tsx @@ -0,0 +1,116 @@ +import React from "react"; +import { Box, Text } from "ink"; +import SelectInput from "ink-select-input"; +import type { MultiExtractorResult, FallacyJudgeOutput, JudgeDecision, JudgeRunResult } from "../types"; +import { truncate } from "../utils"; + +interface JudgeResultsViewProps { + height: number; + maxItems: number; + result: MultiExtractorResult; + judgeResult: FallacyJudgeOutput; + judgeLabel: string; + judgeResults?: JudgeRunResult[]; + judgeTextWidth: number; + onBack: () => void; + onViewDecision: (decision: JudgeDecision, isRejected: boolean) => void; +} + +export function JudgeResultsView({ + height, + maxItems, + result, + judgeResult, + judgeLabel, + judgeTextWidth, + onBack, + onViewDecision, +}: JudgeResultsViewProps) { + const totalInputIssues = result.extractorResults.reduce((sum, r) => sum + r.issues.length, 0); + + // Create legend mapping extractor IDs to short keys (A, B, C, ...) 
+ const extractorIds = result.extractorResults.map(r => r.extractorId); + const extractorKeys: Record = {}; + extractorIds.forEach((id, i) => { + extractorKeys[id] = String.fromCharCode(65 + i); // A, B, C, ... + }); + + // Helper to convert extractor IDs to short keys + const sourcesToKeys = (sources: string[]): string => { + return sources.map(s => extractorKeys[s] || "?").join(","); + }; + + // Build list of judge decisions + const decisionItems: Array<{ label: string; value: string }> = []; + + // Accepted/merged decisions + judgeResult.acceptedDecisions.forEach((decision, idx) => { + const symbol = decision.decision === "merge" ? "[*]" : "[+]"; + const keys = sourcesToKeys(decision.sourceExtractors); + const text = truncate(decision.finalText.replace(/\n/g, ' '), judgeTextWidth).padEnd(judgeTextWidth); + decisionItems.push({ + label: `${symbol} ${decision.finalIssueType.padEnd(18)} ${text} [${keys}]`, + value: `accepted-${idx}`, + }); + }); + + // Rejected decisions + judgeResult.rejectedDecisions.forEach((decision, idx) => { + const keys = sourcesToKeys(decision.sourceExtractors); + const text = truncate(decision.finalText.replace(/\n/g, ' '), judgeTextWidth).padEnd(judgeTextWidth); + decisionItems.push({ + label: `[x] ${decision.finalIssueType.padEnd(18)} ${text} [${keys}]`, + value: `rejected-${idx}`, + }); + }); + + decisionItems.push({ label: "───────────────────────────────────────────────────────────────────────────────────────", value: "sep-1" }); + decisionItems.push({ label: "← Back", value: "back" }); + + // Build legend string + const legendParts = extractorIds.map((id, i) => `${String.fromCharCode(65 + i)}=${id}`); + const legendStr = legendParts.join(" "); + + return ( + + + Judge Results{judgeLabel ? 
`: ${judgeLabel}` : ""} + + + + + Input: {totalInputIssues} issues + --> + {judgeResult.summary.acceptedCount} accepted + | + {judgeResult.summary.mergedCount} merged + | + {judgeResult.summary.rejectedCount} rejected + + Legend: [+]=accept [*]=merge [x]=reject | {legendStr} + + + { + if (item.value.startsWith("sep-")) { + return; + } else if (item.value === "back") { + onBack(); + } else if (item.value.startsWith("accepted-")) { + const idx = parseInt(item.value.replace("accepted-", ""), 10); + onViewDecision(judgeResult.acceptedDecisions[idx], false); + } else if (item.value.startsWith("rejected-")) { + const idx = parseInt(item.value.replace("rejected-", ""), 10); + onViewDecision(judgeResult.rejectedDecisions[idx], true); + } + }} + /> + + + Enter=View Detail | Escape=Back + + + ); +} diff --git a/meta-evals/src/components/extractor-lab/views/PreJudgeDedupView.tsx b/meta-evals/src/components/extractor-lab/views/PreJudgeDedupView.tsx new file mode 100644 index 00000000..4a8d3cc3 --- /dev/null +++ b/meta-evals/src/components/extractor-lab/views/PreJudgeDedupView.tsx @@ -0,0 +1,123 @@ +import React from "react"; +import { Box, Text } from "ink"; +import SelectInput from "ink-select-input"; +import type { MultiExtractorResult, PreJudgeDedupResult, JudgeConfig } from "../types"; +import { truncate } from "../utils"; + +interface PreJudgeDedupViewProps { + height: number; + maxItems: number; + result: MultiExtractorResult; + dedupResult: PreJudgeDedupResult; + availableJudges: JudgeConfig[]; + selectedJudgeIdxs: Set; + issueTextWidth: number; + generateJudgeLabel: (config: JudgeConfig) => string; + onBack: () => void; + onRunJudges: (selectedConfigs: JudgeConfig[]) => void; + onToggleJudge: (idx: number) => void; +} + +export function PreJudgeDedupView({ + height, + maxItems, + dedupResult, + availableJudges, + selectedJudgeIdxs, + issueTextWidth, + generateJudgeLabel, + onBack, + onRunJudges, + onToggleJudge, +}: PreJudgeDedupViewProps) { + const { unique, 
duplicates, originalCount } = dedupResult; + + // Build judge selection items only + const judgeItems: Array<{ label: string; value: string }> = []; + + if (availableJudges.length > 0) { + availableJudges.forEach((judge, idx) => { + const label = generateJudgeLabel(judge); + const isSelected = selectedJudgeIdxs.has(idx); + const prefix = isSelected ? "[x]" : "[ ]"; + const thinkStr = judge.thinking ? "think" : "noThink"; + const tempStr = judge.temperature === 'default' ? 'tDef' : judge.temperature !== undefined ? `t${judge.temperature}` : ''; + judgeItems.push({ + label: `${prefix} Judge: ${label} (${tempStr ? tempStr + ', ' : ''}${thinkStr})`, + value: `judge-${idx}`, + }); + }); + + const selectedCount = selectedJudgeIdxs.size; + const judgeLabel = selectedCount === 1 + ? generateJudgeLabel(availableJudges[[...selectedJudgeIdxs][0]]) + : `${selectedCount} judges`; + judgeItems.push({ + label: `βš–οΈ Run ${judgeLabel} (aggregate ${unique.length} issues)`, + value: "run-judge", + }); + } else { + judgeItems.push({ + label: `⚠️ No judges configured. Set FALLACY_JUDGES or FALLACY_JUDGE env var`, + value: "no-judges", + }); + } + + judgeItems.push({ label: "← Back to Extraction Results", value: "back" }); + + return ( + + + Pre-Judge Deduplication + + + {/* Summary stats */} + + + Original: {originalCount} + β†’ + {unique.length} unique + {duplicates.length > 0 && | {duplicates.length} duplicates removed} + + + + {/* Duplicates list (if any) */} + {duplicates.length > 0 && ( + + Duplicates removed: + {duplicates.slice(0, 3).map((d, idx) => ( + + {" "}[{d.extractorId}] {truncate(d.exactText.replace(/\n/g, ' '), issueTextWidth - 20)} + + ))} + {duplicates.length > 3 && ... 
and {duplicates.length - 3} more} + + )} + + {/* Judge selection */} + + Select Judges: + + + { + if (item.value === "back") { + onBack(); + } else if (item.value === "run-judge") { + const selectedConfigs = [...selectedJudgeIdxs].map(idx => availableJudges[idx]); + onRunJudges(selectedConfigs); + } else if (item.value.startsWith("judge-")) { + const idx = parseInt(item.value.replace("judge-", ""), 10); + onToggleJudge(idx); + } + }} + /> + + + Toggle judges with Enter | Escape=Back + + + ); +} diff --git a/meta-evals/src/components/extractor-lab/views/ResultsView.tsx b/meta-evals/src/components/extractor-lab/views/ResultsView.tsx new file mode 100644 index 00000000..d2100b0a --- /dev/null +++ b/meta-evals/src/components/extractor-lab/views/ResultsView.tsx @@ -0,0 +1,102 @@ +import React from "react"; +import { Box, Text } from "ink"; +import SelectInput from "ink-select-input"; +import type { MultiExtractorResult, DocumentChoice } from "../types"; +import { truncate } from "../utils"; + +interface ResultsViewProps { + height: number; + maxItems: number; + result: MultiExtractorResult; + selectedDoc: DocumentChoice | null; + issueTextWidth: number; + onBack: () => void; + onRunDedup: () => void; + onViewIssue: (extractorIdx: number, issueIdx: number) => void; +} + +export function ResultsView({ + height, + maxItems, + result, + selectedDoc, + issueTextWidth, + onBack, + onRunDedup, + onViewIssue, +}: ResultsViewProps) { + const totalIssues = result.extractorResults.reduce((sum, r) => sum + r.issues.length, 0); + + // Build flat list of issues with extractor info + const issueItems: Array<{ label: string; value: string }> = []; + + result.extractorResults.forEach((r, extractorIdx) => { + // Add extractor header + const tempStr = r.config.temperature === 'default' ? 'tDef' : `t${r.config.temperature}`; + const thinkStr = r.config.thinking ? 
'' : ' noThink'; + issueItems.push({ + label: `── ${r.extractorId} (${tempStr}${thinkStr}) - ${r.issues.length} issues, ${(r.durationMs / 1000).toFixed(1)}s ──`, + value: `header-${extractorIdx}`, + }); + // Add issues for this extractor + r.issues.forEach((issue, issueIdx) => { + const severityColor = issue.severityScore >= 70 ? 'πŸ”΄' : issue.severityScore >= 40 ? '🟑' : '🟒'; + issueItems.push({ + label: ` ${severityColor} [${issue.issueType}] ${truncate(issue.exactText.replace(/\n/g, ' '), issueTextWidth)}`, + value: `issue-${extractorIdx}-${issueIdx}`, + }); + }); + }); + + // Actions at the bottom + issueItems.push({ label: "───────────────────────────────────────────────────────────────────────────", value: "sep-1" }); + + // Deduplicate button (only if we have issues) + if (totalIssues > 0) { + issueItems.push({ + label: `β–Ά Deduplicate & Prepare for Judge (${totalIssues} issues)`, + value: "run-dedup", + }); + } + issueItems.push({ label: "← Back to Configure", value: "back" }); + + return ( + + + Extractor Lab - Extraction Results: + {selectedDoc?.title} + + + + + Duration: {(result.totalDurationMs / 1000).toFixed(1)}s + | + Issues: {totalIssues} + | + Extractors: {result.extractorResults.length} + + + + { + if (item.value.startsWith("sep-") || item.value.startsWith("header-")) { + return; + } else if (item.value === "back") { + onBack(); + } else if (item.value === "run-dedup") { + onRunDedup(); + } else if (item.value.startsWith("issue-")) { + const [, extractorIdx, issueIdx] = item.value.split("-"); + onViewIssue(parseInt(extractorIdx), parseInt(issueIdx)); + } + }} + /> + + + Enter View Detail | Escape Back + + + ); +} diff --git a/meta-evals/src/components/extractor-lab/views/RunningJudgeView.tsx b/meta-evals/src/components/extractor-lab/views/RunningJudgeView.tsx new file mode 100644 index 00000000..28b83a62 --- /dev/null +++ b/meta-evals/src/components/extractor-lab/views/RunningJudgeView.tsx @@ -0,0 +1,35 @@ +import React from "react"; +import { 
Box, Text } from "ink"; +import Spinner from "ink-spinner"; +import type { JudgeConfig, PreJudgeDedupResult } from "../types"; + +interface RunningJudgeViewProps { + height: number; + judgeConfigs: JudgeConfig[]; + dedupResult: PreJudgeDedupResult; + generateJudgeLabel: (config: JudgeConfig) => string; +} + +export function RunningJudgeView({ height, judgeConfigs, dedupResult, generateJudgeLabel }: RunningJudgeViewProps) { + const judgeLabels = judgeConfigs.map(c => generateJudgeLabel(c)).join(", "); + + return ( + + + Running LLM Judge + + + + + Running {judgeConfigs.length} judge(s): {judgeLabels} + + + + + + Aggregating {dedupResult.unique.length} unique issues (from {dedupResult.originalCount} total)... + + + + ); +} diff --git a/meta-evals/src/components/extractor-lab/views/RunningView.tsx b/meta-evals/src/components/extractor-lab/views/RunningView.tsx new file mode 100644 index 00000000..2d4cc78b --- /dev/null +++ b/meta-evals/src/components/extractor-lab/views/RunningView.tsx @@ -0,0 +1,28 @@ +import React from "react"; +import { Box, Text } from "ink"; +import Spinner from "ink-spinner"; + +interface RunningViewProps { + height: number; + extractorCount: number; +} + +export function RunningView({ height, extractorCount }: RunningViewProps) { + return ( + + + Extractor Lab - Running + + + + + Running {extractorCount} extractor(s)... + + + + + This may take a minute... 
+ + + ); +} diff --git a/meta-evals/src/components/extractor-lab/views/index.ts b/meta-evals/src/components/extractor-lab/views/index.ts new file mode 100644 index 00000000..725d1572 --- /dev/null +++ b/meta-evals/src/components/extractor-lab/views/index.ts @@ -0,0 +1,10 @@ +export { ErrorView } from "./ErrorView"; +export { RunningView } from "./RunningView"; +export { RunningJudgeView } from "./RunningJudgeView"; +export { ConfigureExtractorsView } from "./ConfigureExtractorsView"; +export { IssueDetailView } from "./IssueDetailView"; +export { ResultsView } from "./ResultsView"; +export { PreJudgeDedupView } from "./PreJudgeDedupView"; +export { JudgeResultsView } from "./JudgeResultsView"; +export { JudgeDecisionDetailView } from "./JudgeDecisionDetailView"; +export { JudgeComparisonView } from "./JudgeComparisonView"; From 59e5ac6f57bbfe8b950b1d395152b494e53a9abe Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 11 Jan 2026 20:59:06 +0000 Subject: [PATCH 37/72] refactor(fallacy-check): Extract dedup into separate module Split deduplicateIssues() into two focused functions: - deduplicateIssues(): Remove exact text duplicates - prioritizeAndLimitIssues(): Score, sort, and limit issues New file: fallacy-check/dedup.ts with: - normalizeTextForDedup() - calculatePriorityScore() - deduplicateIssues() - prioritizeAndLimitIssues() - deduplicateAndPrioritize() (convenience wrapper) This prepares for improving the dedup algorithm with fuzzy matching. 
Co-Authored-By: Claude Opus 4.5 --- .../plugins/fallacy-check/dedup.ts | 95 +++++++++++++++++++ .../plugins/fallacy-check/index.ts | 56 +---------- 2 files changed, 98 insertions(+), 53 deletions(-) create mode 100644 internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/dedup.ts diff --git a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/dedup.ts b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/dedup.ts new file mode 100644 index 00000000..e1b4e6bd --- /dev/null +++ b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/dedup.ts @@ -0,0 +1,95 @@ +/** + * Deduplication and prioritization utilities for fallacy issues + */ + +import { logger } from "../../../shared/logger"; +import type { FallacyIssue } from "./FallacyIssue"; +import { LIMITS } from "./constants"; + +/** + * Calculate priority score for an issue. + * Higher score = more important to address. + */ +export function calculatePriorityScore(issue: FallacyIssue): number { + return issue.severityScore * 0.6 + issue.importanceScore * 0.4; +} + +/** + * Deduplicate issues by removing exact text matches. + * Uses case-insensitive, whitespace-normalized comparison. + * + * TODO: This is too strict - different extractors quoting slightly different + * portions of the same passage won't match. Consider fuzzy matching. + */ +export function deduplicateIssues(issues: FallacyIssue[]): FallacyIssue[] { + const seen = new Set(); + const unique: FallacyIssue[] = []; + + for (const issue of issues) { + const key = normalizeTextForDedup(issue.text); + if (!seen.has(key)) { + seen.add(key); + unique.push(issue); + } + } + + return unique; +} + +/** + * Normalize text for deduplication comparison. + * - Lowercase + * - Collapse whitespace + * - Trim + */ +export function normalizeTextForDedup(text: string): string { + return text.toLowerCase().replace(/\s+/g, " ").trim(); +} + +/** + * Prioritize and limit issues based on severity and importance scores. 
+ * - Sorts by priority score (highest first) + * - Limits to MAX_ISSUES_TO_PROCESS if too many + */ +export function prioritizeAndLimitIssues(issues: FallacyIssue[]): FallacyIssue[] { + // Sort by priority score (most important issues first) + const sortedIssues = [...issues].sort( + (a, b) => calculatePriorityScore(b) - calculatePriorityScore(a) + ); + + // Limit to maximum issues if we have too many + if (sortedIssues.length > LIMITS.MAX_ISSUES_TO_PROCESS) { + logger.info( + `Limiting issues from ${sortedIssues.length} to ${LIMITS.MAX_ISSUES_TO_PROCESS} based on priority scores` + ); + + const keptIssues = sortedIssues.slice(0, LIMITS.MAX_ISSUES_TO_PROCESS); + const discardedIssues = sortedIssues.slice(LIMITS.MAX_ISSUES_TO_PROCESS); + + const avgKeptScore = + keptIssues.reduce((sum, i) => sum + calculatePriorityScore(i), 0) / + keptIssues.length; + const avgDiscardedScore = + discardedIssues.length > 0 + ? discardedIssues.reduce((sum, i) => sum + calculatePriorityScore(i), 0) / + discardedIssues.length + : 0; + + logger.debug( + `Priority scores - Kept issues avg: ${avgKeptScore.toFixed(1)}, ` + + `Discarded issues avg: ${avgDiscardedScore.toFixed(1)}` + ); + + return keptIssues; + } + + return sortedIssues; +} + +/** + * Full deduplication pipeline: deduplicate, then prioritize and limit. 
+ */ +export function deduplicateAndPrioritize(issues: FallacyIssue[]): FallacyIssue[] { + const deduplicated = deduplicateIssues(issues); + return prioritizeAndLimitIssues(deduplicated); +} diff --git a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts index 58f73a80..31148708 100644 --- a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts +++ b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/index.ts @@ -34,6 +34,7 @@ import { getConfigSummary, } from "./extraction/config"; import { runMultiExtractor, simpleDeduplication } from "./extraction/multiExtractor"; +import { deduplicateIssues, prioritizeAndLimitIssues } from "./dedup"; export class FallacyCheckPlugin implements SimpleAnalysisPlugin { private documentText: string; @@ -176,7 +177,8 @@ export class FallacyCheckPlugin implements SimpleAnalysisPlugin { // Phase 1.5: Deduplicate issues by similar text telemetry.startStage(PIPELINE_STAGES.DEDUPLICATION, allIssues.length); - const deduplicatedIssues = this.deduplicateIssues(allIssues); + const uniqueIssues = deduplicateIssues(allIssues); + const deduplicatedIssues = prioritizeAndLimitIssues(uniqueIssues); telemetry.endStage(deduplicatedIssues.length); telemetry.setFinalCounts({ issuesAfterDedup: deduplicatedIssues.length }); @@ -523,58 +525,6 @@ export class FallacyCheckPlugin implements SimpleAnalysisPlugin { return counts; } - private deduplicateIssues(issues: FallacyIssue[]): FallacyIssue[] { - const seen = new Set(); - const unique: FallacyIssue[] = []; - - for (const issue of issues) { - const key = issue.text.toLowerCase().replace(/\s+/g, " ").trim(); - if (!seen.has(key)) { - seen.add(key); - unique.push(issue); - } - } - - // Calculate priority score for each issue - // Higher score = more important to address - const priorityScore = (issue: FallacyIssue) => - issue.severityScore * 0.6 + issue.importanceScore * 0.4; - - 
// Sort by priority score (most important issues first) - const sortedIssues = unique.sort( - (a, b) => priorityScore(b) - priorityScore(a) - ); - - // Limit to maximum issues if we have too many - if (sortedIssues.length > LIMITS.MAX_ISSUES_TO_PROCESS) { - logger.info( - `Limiting issues from ${sortedIssues.length} to ${LIMITS.MAX_ISSUES_TO_PROCESS} based on priority scores` - ); - - // Log details about what's being kept and discarded - const keptIssues = sortedIssues.slice(0, LIMITS.MAX_ISSUES_TO_PROCESS); - const discardedIssues = sortedIssues.slice(LIMITS.MAX_ISSUES_TO_PROCESS); - - const avgKeptScore = - keptIssues.reduce((sum, i) => sum + priorityScore(i), 0) / - keptIssues.length; - const avgDiscardedScore = - discardedIssues.length > 0 - ? discardedIssues.reduce((sum, i) => sum + priorityScore(i), 0) / - discardedIssues.length - : 0; - - logger.debug( - `Priority scores - Kept issues avg: ${avgKeptScore.toFixed(1)}, ` + - `Discarded issues avg: ${avgDiscardedScore.toFixed(1)}` - ); - - return keptIssues; - } - - return sortedIssues; - } - /** * Run the supported-elsewhere filter to remove false positives */ From d0199f476956d729068a57bedeae72741b6828a1 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 11 Jan 2026 22:25:26 +0000 Subject: [PATCH 38/72] feat(meta-evals): Add quality-based deduplication with Jaccard similarity - Replace "first seen wins" dedup with quality-based selection - Issues with longer text and higher scores are kept when duplicates found - Add computeIssueQuality() scoring: text length (40%), confidence (25%), severity (20%), importance (15%) - Simplify PreJudgeDedupView to use only Jaccard strategy - Add export function for full dedup analysis validation - Disable uFuzzy (performance issues), keep Fuse.js available - Add tmux key sending best practices to CLAUDE.md Co-Authored-By: Claude Opus 4.5 --- CLAUDE.md | 16 + internal-packages/ai/package.json | 1 + meta-evals/package.json | 2 + meta-evals/src/components/ExtractorLab.tsx 
| 55 ++- .../components/extractor-lab/fuzzy-dedup.ts | 323 ++++++++++++++++++ .../src/components/extractor-lab/types.ts | 28 +- .../src/components/extractor-lab/utils.ts | 42 +-- .../extractor-lab/views/PreJudgeDedupView.tsx | 174 +++++++--- pnpm-lock.yaml | 17 + 9 files changed, 559 insertions(+), 99 deletions(-) create mode 100644 meta-evals/src/components/extractor-lab/fuzzy-dedup.ts diff --git a/CLAUDE.md b/CLAUDE.md index 8bcdcef9..b27299d1 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -278,6 +278,22 @@ Details here" /bin/rm, /bin/cat, /bin/echo # Use full paths ``` +## Tmux Key Sending + +When sending multiple keystrokes to tmux sessions (e.g., navigating CLI menus), use a loop with delays between keystrokes instead of sending them all at once. + +**Bad** (keys may be dropped or processed incorrectly): +```bash +tmux send-keys -t session Down Down Down Down Down Enter +``` + +**Good** (reliable keystroke delivery): +```bash +for i in {1..5}; do tmux send-keys -t session Down; sleep 0.1; done; tmux send-keys -t session Enter +``` + +This ensures each keystroke is processed before the next is sent, preventing navigation issues in terminal UIs. 
+ ## Documentation Structure - `/dev/docs/README.md` - Documentation index - `/dev/docs/development/` - Development guides diff --git a/internal-packages/ai/package.json b/internal-packages/ai/package.json index ba79f6cc..1baa85c9 100644 --- a/internal-packages/ai/package.json +++ b/internal-packages/ai/package.json @@ -90,6 +90,7 @@ "dependencies": { "@anthropic-ai/sdk": "^0.54.0", "@leeoniya/ufuzzy": "^1.0.18", + "fuse.js": "^7.1.0", "mathjs": "^14.0.1", "openai": "^4.77.0", "tiktoken": "^1.0.17", diff --git a/meta-evals/package.json b/meta-evals/package.json index bf838fe2..937fe2bb 100644 --- a/meta-evals/package.json +++ b/meta-evals/package.json @@ -8,9 +8,11 @@ "start": "node --import tsx/esm src/index.tsx" }, "dependencies": { + "@leeoniya/ufuzzy": "^1.0.18", "@roast/ai": "workspace:*", "@roast/db": "workspace:*", "dotenv": "^16.4.5", + "fuse.js": "^7.1.0", "ink": "^6.5.1", "ink-select-input": "^6.2.0", "ink-spinner": "^5.0.0", diff --git a/meta-evals/src/components/ExtractorLab.tsx b/meta-evals/src/components/ExtractorLab.tsx index 56bf2629..d2f5e2fc 100644 --- a/meta-evals/src/components/ExtractorLab.tsx +++ b/meta-evals/src/components/ExtractorLab.tsx @@ -24,15 +24,17 @@ import type { ExtractorLabProps, LabStep, JudgeRunResult, - PreJudgeDedupResult, ExtractorIssue, + DedupStrategy, + DedupComparison, + MultiStrategyDedupResult, } from "./extractor-lab/types"; import { truncate, simpleLogger, TEMP_PRESETS, calculateTextWidths, - runPreJudgeDedup as runPreJudgeDedupUtil, + runMultiStrategyDedup, } from "./extractor-lab/utils"; import { ErrorView, @@ -77,6 +79,7 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o const [extractorConfigs, setExtractorConfigs] = useState(getInitialExtractorConfigs); const [availableJudges] = useState(() => getJudgesConfig()); const [selectedJudgeIdxs, setSelectedJudgeIdxs] = useState>(() => new Set([0])); + const [selectedStrategy, setSelectedStrategy] = useState("jaccard"); const [error, 
setError] = useState(null); const [highlightedItem, setHighlightedItem] = useState(""); @@ -125,12 +128,17 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o } } - function runPreJudgeDedup(extractionResult: MultiExtractorResult, navigate = true): PreJudgeDedupResult { - const dedupResult = runPreJudgeDedupUtil(extractionResult); - if (navigate) { - setStep({ type: "pre-judge-dedup", result: extractionResult, dedupResult }); + function runPreJudgeDedup(extractionResult: MultiExtractorResult, navigate = true): MultiStrategyDedupResult | null { + try { + const multiDedup = runMultiStrategyDedup(extractionResult); + if (navigate) { + setStep({ type: "pre-judge-dedup", result: extractionResult, multiDedup, selectedStrategy }); + } + return multiDedup; + } catch (err) { + setError(`Dedup failed: ${err}`); + return null; } - return dedupResult; } async function runJudge( @@ -179,10 +187,17 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o async function runMultipleJudges( extractionResult: MultiExtractorResult, - dedupResult: PreJudgeDedupResult, + dedupResult: DedupComparison, judgeConfigs: JudgeConfig[] ) { - setStep({ type: "running-judge", result: extractionResult, dedupResult, judgeConfigs }); + // Convert DedupComparison to PreJudgeDedupResult for running-judge step + // Extract just the duplicate issues (not the match info) + const preDedupResult = { + unique: dedupResult.unique, + duplicates: dedupResult.duplicates.map(m => m.duplicate), + originalCount: dedupResult.originalCount, + }; + setStep({ type: "running-judge", result: extractionResult, dedupResult: preDedupResult, judgeConfigs }); const results = await Promise.all( judgeConfigs.map((config) => @@ -220,8 +235,10 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o if (judgeResults) { setStep({ type: "judge-comparison", result, judgeResults }); } else { - const dedupResult = runPreJudgeDedup(result, false); - 
setStep({ type: "pre-judge-dedup", result, dedupResult }); + const multiDedup = runPreJudgeDedup(result, false); + if (multiDedup) { + setStep({ type: "pre-judge-dedup", result, multiDedup, selectedStrategy }); + } } } else if (currentStep.type === "judge-comparison") { setStep({ type: "results", result: currentStep.result }); @@ -363,18 +380,20 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o } if (step.type === "pre-judge-dedup") { + const currentDedup = step.multiDedup[step.selectedStrategy]; return ( setStep({ type: "results", result: step.result })} - onRunJudges={(configs) => runMultipleJudges(step.result, step.dedupResult, configs)} + onRunJudges={(configs, dedupResult) => runMultipleJudges(step.result, dedupResult, configs)} onToggleJudge={(idx) => { setSelectedJudgeIdxs(prev => { const next = new Set(prev); @@ -386,6 +405,10 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o return next; }); }} + onSelectStrategy={(strategy) => { + setSelectedStrategy(strategy); + setStep({ type: "pre-judge-dedup", result: step.result, multiDedup: step.multiDedup, selectedStrategy: strategy }); + }} /> ); } @@ -439,8 +462,10 @@ export function ExtractorLab({ height, maxItems, documents, onSearchDocuments, o if (judgeResults) { setStep({ type: "judge-comparison", result, judgeResults }); } else { - const dedupResult = runPreJudgeDedup(result, false); - setStep({ type: "pre-judge-dedup", result, dedupResult }); + const multiDedup = runPreJudgeDedup(result, false); + if (multiDedup) { + setStep({ type: "pre-judge-dedup", result, multiDedup, selectedStrategy }); + } } }} onViewDecision={(decision, isRejected) => { diff --git a/meta-evals/src/components/extractor-lab/fuzzy-dedup.ts b/meta-evals/src/components/extractor-lab/fuzzy-dedup.ts new file mode 100644 index 00000000..81d15492 --- /dev/null +++ b/meta-evals/src/components/extractor-lab/fuzzy-dedup.ts @@ -0,0 +1,323 @@ +/** + * Fuzzy deduplication 
strategies for comparing extraction issues. + * + * Four strategies: + * 1. Exact - Normalized exact match + * 2. Jaccard - Word overlap similarity + * 3. Fuse.js - Fuzzy search with Bitap algorithm + * 4. uFuzzy - Lightweight fuzzy search + */ + +import Fuse from "fuse.js"; +import uFuzzy from "@leeoniya/ufuzzy"; +import type { + ExtractorIssue, + DedupStrategy, + DedupComparison, + DuplicateMatch, + MultiStrategyDedupResult, +} from "./types"; + +// ============================================================================ +// Normalization +// ============================================================================ + +function normalizeText(text: string): string { + return text.toLowerCase().replace(/\s+/g, " ").trim(); +} + +function getWords(text: string): Set { + return new Set( + normalizeText(text) + .split(/\s+/) + .filter((w) => w.length > 2) + ); +} + +// ============================================================================ +// Similarity Functions +// ============================================================================ + +/** + * Jaccard similarity: intersection over union of words + */ +export function jaccardSimilarity(a: string, b: string): number { + const wordsA = getWords(a); + const wordsB = getWords(b); + + if (wordsA.size === 0 && wordsB.size === 0) return 1; + if (wordsA.size === 0 || wordsB.size === 0) return 0; + + const intersection = [...wordsA].filter((w) => wordsB.has(w)).length; + const union = new Set([...wordsA, ...wordsB]).size; + + return intersection / union; +} + +/** + * Check if one text contains the other (after normalization) + */ +export function isSubstring(a: string, b: string): boolean { + const normA = normalizeText(a); + const normB = normalizeText(b); + return normA.includes(normB) || normB.includes(normA); +} + +/** + * Fuse.js similarity score (0 = perfect match, 1 = no match) + */ +export function fuseSimilarity(a: string, b: string): number { + const fuse = new Fuse([{ text: b }], { + keys: 
["text"], + includeScore: true, + threshold: 1.0, // Accept all results, we'll check score ourselves + ignoreLocation: true, + minMatchCharLength: 2, + }); + + const results = fuse.search(a); + if (results.length > 0 && results[0].score !== undefined) { + return results[0].score; + } + return 1; +} + +/** + * uFuzzy similarity (returns 0-1, higher = more similar) + */ +export function ufuzzySimilarity(a: string, b: string): number { + const uf = new uFuzzy({ + intraMode: 1, + intraIns: 1, + intraSub: 1, + intraTrn: 1, + intraDel: 1, + }); + + const haystack = [b]; + const [idxs, info] = uf.search(haystack, a); + + if (idxs && idxs.length > 0 && info && info.ranges[0]) { + const ranges = info.ranges[0]; + let matchedChars = 0; + for (let i = 0; i < ranges.length; i += 2) { + matchedChars += ranges[i + 1] - ranges[i]; + } + return matchedChars / Math.max(a.length, b.length); + } + + return 0; +} + +// ============================================================================ +// Deduplication Strategies +// ============================================================================ + +/** + * Calculate similarity between two issues using the specified strategy. + * Returns { isDuplicate, similarity } where similarity is 0-1 (higher = more similar) + */ +export function calculateSimilarity( + a: ExtractorIssue, + b: ExtractorIssue, + strategy: DedupStrategy, + threshold = 0.5 +): { isDuplicate: boolean; similarity: number } { + const textA = a.exactText; + const textB = b.exactText; + + switch (strategy) { + case "exact": { + const isMatch = normalizeText(textA) === normalizeText(textB); + return { isDuplicate: isMatch, similarity: isMatch ? 
1 : 0 }; + } + + case "jaccard": { + // Check substring first + if (isSubstring(textA, textB)) { + return { isDuplicate: true, similarity: 1 }; + } + const sim = jaccardSimilarity(textA, textB); + return { isDuplicate: sim >= threshold, similarity: sim }; + } + + case "fuse": { + // Check substring first + if (isSubstring(textA, textB)) { + return { isDuplicate: true, similarity: 1 }; + } + // Fuse score: 0 = perfect, 1 = no match. Convert to 0-1 similarity. + const fuseScore = fuseSimilarity(textA, textB); + const sim = 1 - fuseScore; + return { isDuplicate: fuseScore < 0.4, similarity: sim }; + } + + case "ufuzzy": { + // Check substring first + if (isSubstring(textA, textB)) { + return { isDuplicate: true, similarity: 1 }; + } + const sim = ufuzzySimilarity(textA, textB); + return { isDuplicate: sim > threshold, similarity: sim }; + } + + default: + return { isDuplicate: false, similarity: 0 }; + } +} + +/** + * Compute a quality score for an issue. + * Higher = better quality (prefer to keep). + * Factors: text length (more context), severity, confidence, importance. + */ +function computeIssueQuality(issue: ExtractorIssue): number { + // Normalize text length (log scale to prevent extremely long texts from dominating) + const lengthScore = Math.log10(issue.exactText.length + 1) / 4; // ~0.5-1.0 for typical lengths + + // Combine severity, confidence, importance (each 0-100, normalize to 0-1) + const severityNorm = issue.severityScore / 100; + const confidenceNorm = issue.confidenceScore / 100; + const importanceNorm = issue.importanceScore / 100; + + // Weighted combination: prefer longer text, then higher scores + // Length is most important (40%), then confidence (25%), severity (20%), importance (15%) + return ( + lengthScore * 0.4 + + confidenceNorm * 0.25 + + severityNorm * 0.2 + + importanceNorm * 0.15 + ); +} + +/** + * Deduplicate issues using a specific strategy. + * Returns unique issues and duplicate matches with similarity info. 
+ * + * When duplicates are found, KEEPS the issue with higher quality score + * (longer text + higher severity/confidence/importance). + */ +export function deduplicateWithStrategy( + issues: ExtractorIssue[], + strategy: DedupStrategy +): DedupComparison { + const unique: ExtractorIssue[] = []; + const duplicates: DuplicateMatch[] = []; + + for (const issue of issues) { + // Check if this issue is a duplicate of any already-kept issue + let bestMatch: { keptIdx: number; kept: ExtractorIssue; similarity: number } | null = null; + + for (let i = 0; i < unique.length; i++) { + const kept = unique[i]; + const { isDuplicate, similarity } = calculateSimilarity(issue, kept, strategy); + if (isDuplicate) { + if (!bestMatch || similarity > bestMatch.similarity) { + bestMatch = { keptIdx: i, kept, similarity }; + } + } + } + + if (bestMatch) { + // Found a duplicate - decide which to keep based on quality score + const newQuality = computeIssueQuality(issue); + const keptQuality = computeIssueQuality(bestMatch.kept); + + if (newQuality > keptQuality) { + // New issue is better - swap: remove kept, add new, mark kept as duplicate + duplicates.push({ + duplicate: bestMatch.kept, + matchedTo: issue, + similarity: bestMatch.similarity, + }); + unique[bestMatch.keptIdx] = issue; + } else { + // Kept issue is better - mark new as duplicate + duplicates.push({ + duplicate: issue, + matchedTo: bestMatch.kept, + similarity: bestMatch.similarity, + }); + } + } else { + unique.push(issue); + } + } + + return { + strategy, + unique, + duplicates, + originalCount: issues.length, + }; +} + +/** + * Run all dedup strategies and return comparison results + */ +export function runAllDedupStrategies( + issues: ExtractorIssue[] +): MultiStrategyDedupResult { + console.error(`[DEDUP] Running dedup on ${issues.length} issues...`); + + const t0 = Date.now(); + const exact = deduplicateWithStrategy(issues, "exact"); + console.error(`[DEDUP] exact: ${Date.now() - t0}ms`); + + const t1 = 
Date.now(); + const jaccard = deduplicateWithStrategy(issues, "jaccard"); + console.error(`[DEDUP] jaccard: ${Date.now() - t1}ms`); + + const t2 = Date.now(); + const fuse = deduplicateWithStrategy(issues, "fuse"); + console.error(`[DEDUP] fuse: ${Date.now() - t2}ms`); + + // NOTE: uFuzzy is disabled due to performance issues (hangs on large texts) + // const t3 = Date.now(); + // const ufuzzy = deduplicateWithStrategy(issues, "ufuzzy"); + // console.error(`[DEDUP] ufuzzy: ${Date.now() - t3}ms`); + + // Return same as jaccard for now (uFuzzy disabled) + const ufuzzy: DedupComparison = { + strategy: "ufuzzy", + unique: jaccard.unique, + duplicates: jaccard.duplicates, + originalCount: jaccard.originalCount, + }; + console.error(`[DEDUP] ufuzzy: DISABLED (using jaccard results)`); + + console.error(`[DEDUP] Total: ${Date.now() - t0}ms`); + + return { exact, jaccard, fuse, ufuzzy }; +} + +/** + * Flatten extractor results into issues with extractor IDs + */ +export function flattenExtractorResults( + extractorResults: Array<{ + extractorId: string; + issues: Array<{ + exactText: string; + issueType: string; + fallacyType?: string; + severityScore: number; + confidenceScore: number; + importanceScore: number; + reasoning: string; + }>; + }> +): ExtractorIssue[] { + return extractorResults.flatMap((r) => + r.issues.map((issue) => ({ + extractorId: r.extractorId, + exactText: issue.exactText, + issueType: issue.issueType, + fallacyType: issue.fallacyType, + severityScore: issue.severityScore, + confidenceScore: issue.confidenceScore, + importanceScore: issue.importanceScore, + reasoning: issue.reasoning, + })) + ); +} diff --git a/meta-evals/src/components/extractor-lab/types.ts b/meta-evals/src/components/extractor-lab/types.ts index f5570fde..16c08853 100644 --- a/meta-evals/src/components/extractor-lab/types.ts +++ b/meta-evals/src/components/extractor-lab/types.ts @@ -55,6 +55,32 @@ export interface PreJudgeDedupResult { originalCount: number; } +/** Dedup strategy 
identifier */ +export type DedupStrategy = "exact" | "jaccard" | "fuse" | "ufuzzy"; + +/** A duplicate issue with info about what it matched */ +export interface DuplicateMatch { + duplicate: ExtractorIssue; + matchedTo: ExtractorIssue; + similarity: number; // 0-1 similarity score +} + +/** Result from a single dedup strategy */ +export interface DedupComparison { + strategy: DedupStrategy; + unique: ExtractorIssue[]; + duplicates: DuplicateMatch[]; + originalCount: number; +} + +/** Results from all dedup strategies for comparison */ +export interface MultiStrategyDedupResult { + exact: DedupComparison; + jaccard: DedupComparison; + fuse: DedupComparison; + ufuzzy: DedupComparison; +} + /** All possible steps/views in the Extractor Lab */ export type LabStep = | { type: "select-document" } @@ -63,7 +89,7 @@ export type LabStep = | { type: "running" } | { type: "results"; result: MultiExtractorResult } | { type: "issue-detail"; result: MultiExtractorResult; extractorIdx: number; issueIdx: number } - | { type: "pre-judge-dedup"; result: MultiExtractorResult; dedupResult: PreJudgeDedupResult } + | { type: "pre-judge-dedup"; result: MultiExtractorResult; multiDedup: MultiStrategyDedupResult; selectedStrategy: DedupStrategy } | { type: "running-judge"; result: MultiExtractorResult; dedupResult: PreJudgeDedupResult; judgeConfigs: JudgeConfig[] } | { type: "judge-comparison"; result: MultiExtractorResult; judgeResults: JudgeRunResult[] } | { type: "judge-results"; result: MultiExtractorResult; judgeResult: FallacyJudgeOutput; judgeLabel: string; judgeResults?: JudgeRunResult[] } diff --git a/meta-evals/src/components/extractor-lab/utils.ts b/meta-evals/src/components/extractor-lab/utils.ts index b5cc0397..dd26fb1d 100644 --- a/meta-evals/src/components/extractor-lab/utils.ts +++ b/meta-evals/src/components/extractor-lab/utils.ts @@ -6,7 +6,8 @@ import { getMultiExtractorConfig, type ExtractorConfig, } from "@roast/ai/fallacy-extraction/lab"; -import type { SimpleLogger, 
ExtractorIssue, PreJudgeDedupResult, MultiExtractorResult } from "./types"; +import type { SimpleLogger, MultiExtractorResult, MultiStrategyDedupResult } from "./types"; +import { flattenExtractorResults, runAllDedupStrategies } from "./fuzzy-dedup"; /** Temperature presets for cycling */ export const TEMP_PRESETS = ["default", 0, 0.3, 0.5, 0.7, 1.0] as const; @@ -43,42 +44,13 @@ export function generateExtractorLabel(config: ExtractorConfig): string { return `${modelShort}-${tempStr}-${thinkStr}`; } -/** Run pre-judge deduplication on extractor results */ -export function runPreJudgeDedup(extractionResult: MultiExtractorResult): PreJudgeDedupResult { +/** Run all pre-judge deduplication strategies on extractor results */ +export function runMultiStrategyDedup(extractionResult: MultiExtractorResult): MultiStrategyDedupResult { // Flatten all issues from all extractors - const allIssues: ExtractorIssue[] = extractionResult.extractorResults.flatMap((r) => - r.issues.map((issue) => ({ - extractorId: r.extractorId, - exactText: issue.exactText, - issueType: issue.issueType, - fallacyType: issue.fallacyType, - severityScore: issue.severityScore, - confidenceScore: issue.confidenceScore, - importanceScore: issue.importanceScore, - reasoning: issue.reasoning, - })) - ); + const allIssues = flattenExtractorResults(extractionResult.extractorResults); - // Remove exact text duplicates (case-insensitive, whitespace normalized) - const seen = new Set(); - const unique: ExtractorIssue[] = []; - const duplicates: ExtractorIssue[] = []; - - for (const issue of allIssues) { - const key = issue.exactText.toLowerCase().replace(/\s+/g, " ").trim(); - if (!seen.has(key)) { - seen.add(key); - unique.push(issue); - } else { - duplicates.push(issue); - } - } - - return { - unique, - duplicates, - originalCount: allIssues.length, - }; + // Run all dedup strategies for comparison + return runAllDedupStrategies(allIssues); } /** Calculate text widths based on terminal width */ diff --git 
a/meta-evals/src/components/extractor-lab/views/PreJudgeDedupView.tsx b/meta-evals/src/components/extractor-lab/views/PreJudgeDedupView.tsx index 4a8d3cc3..496d010e 100644 --- a/meta-evals/src/components/extractor-lab/views/PreJudgeDedupView.tsx +++ b/meta-evals/src/components/extractor-lab/views/PreJudgeDedupView.tsx @@ -1,113 +1,191 @@ import React from "react"; import { Box, Text } from "ink"; import SelectInput from "ink-select-input"; -import type { MultiExtractorResult, PreJudgeDedupResult, JudgeConfig } from "../types"; -import { truncate } from "../utils"; +import * as fs from "fs"; +import * as path from "path"; +import type { MultiExtractorResult, MultiStrategyDedupResult, DedupStrategy, JudgeConfig, DedupComparison } from "../types"; interface PreJudgeDedupViewProps { height: number; maxItems: number; result: MultiExtractorResult; - dedupResult: PreJudgeDedupResult; + multiDedup: MultiStrategyDedupResult; + selectedStrategy: DedupStrategy; availableJudges: JudgeConfig[]; selectedJudgeIdxs: Set; issueTextWidth: number; generateJudgeLabel: (config: JudgeConfig) => string; onBack: () => void; - onRunJudges: (selectedConfigs: JudgeConfig[]) => void; + onRunJudges: (selectedConfigs: JudgeConfig[], dedupResult: DedupComparison) => void; onToggleJudge: (idx: number) => void; + onSelectStrategy: (strategy: DedupStrategy) => void; +} + +const STRATEGY_LABELS: Record = { + exact: "Exact Match", + jaccard: "Jaccard (word overlap)", + fuse: "Fuse.js (fuzzy)", + ufuzzy: "uFuzzy (fuzzy)", +}; + +/** Export full dedup analysis to a file for validation */ +function exportDedupAnalysis(multiDedup: MultiStrategyDedupResult, selectedStrategy: DedupStrategy): string { + const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); + const filename = `dedup-analysis-${timestamp}.txt`; + const filepath = path.join(process.cwd(), filename); + + let output = "=".repeat(80) + "\n"; + output += "DEDUP ANALYSIS EXPORT\n"; + output += `Generated: ${new Date().toISOString()}\n`; 
+ output += "=".repeat(80) + "\n\n"; + + // Summary + output += "STRATEGY COMPARISON:\n"; + output += "-".repeat(40) + "\n"; + for (const strategy of ["exact", "jaccard", "fuse", "ufuzzy"] as DedupStrategy[]) { + const dedup = multiDedup[strategy]; + output += `${strategy.padEnd(10)}: ${dedup.unique.length} unique, ${dedup.duplicates.length} duplicates (from ${dedup.originalCount} total)\n`; + } + output += "\n"; + + // Detailed analysis for each strategy + for (const strategy of ["exact", "jaccard", "fuse", "ufuzzy"] as DedupStrategy[]) { + const dedup = multiDedup[strategy]; + + output += "=".repeat(80) + "\n"; + output += `STRATEGY: ${STRATEGY_LABELS[strategy].toUpperCase()}\n`; + output += "=".repeat(80) + "\n\n"; + + if (dedup.duplicates.length === 0) { + output += "No duplicates found.\n\n"; + } else { + output += `DUPLICATE PAIRS (${dedup.duplicates.length}):\n`; + output += "-".repeat(40) + "\n\n"; + + dedup.duplicates.forEach((match, idx) => { + output += `${idx + 1}. REMOVED [${match.duplicate.extractorId}]:\n`; + output += ` "${match.duplicate.exactText}"\n\n`; + output += ` KEPT [${match.matchedTo.extractorId}] (similarity: ${Math.round(match.similarity * 100)}%):\n`; + output += ` "${match.matchedTo.exactText}"\n\n`; + output += "-".repeat(40) + "\n\n"; + }); + } + + output += `UNIQUE ISSUES AFTER DEDUP (${dedup.unique.length}):\n`; + output += "-".repeat(40) + "\n\n"; + dedup.unique.forEach((issue, idx) => { + output += `${idx + 1}. [${issue.extractorId}] ${issue.issueType}${issue.fallacyType ? 
` (${issue.fallacyType})` : ""}\n`; + output += ` "${issue.exactText}"\n\n`; + }); + output += "\n"; + } + + fs.writeFileSync(filepath, output); + return filepath; } export function PreJudgeDedupView({ height, maxItems, - dedupResult, + multiDedup, + selectedStrategy, availableJudges, selectedJudgeIdxs, - issueTextWidth, generateJudgeLabel, onBack, onRunJudges, onToggleJudge, }: PreJudgeDedupViewProps) { - const { unique, duplicates, originalCount } = dedupResult; + // Use jaccard as the default/only strategy for now + const currentDedup = multiDedup.jaccard; + const { unique, duplicates, originalCount } = currentDedup; - // Build judge selection items only - const judgeItems: Array<{ label: string; value: string }> = []; + // Build items list + const items: Array<{ label: string; value: string }> = []; + // Judge selection + items.push({ label: "── Select Judges ──", value: "header-judges" }); if (availableJudges.length > 0) { availableJudges.forEach((judge, idx) => { const label = generateJudgeLabel(judge); const isSelected = selectedJudgeIdxs.has(idx); const prefix = isSelected ? "[x]" : "[ ]"; const thinkStr = judge.thinking ? "think" : "noThink"; - const tempStr = judge.temperature === 'default' ? 'tDef' : judge.temperature !== undefined ? `t${judge.temperature}` : ''; - judgeItems.push({ - label: `${prefix} Judge: ${label} (${tempStr ? tempStr + ', ' : ''}${thinkStr})`, + const tempStr = + judge.temperature === "default" + ? "tDef" + : judge.temperature !== undefined + ? `t${judge.temperature}` + : ""; + items.push({ + label: `${prefix} ${label} (${tempStr ? tempStr + ", " : ""}${thinkStr})`, value: `judge-${idx}`, }); }); + items.push({ label: "────────────────────────────────────────", value: "sep-3" }); + const selectedCount = selectedJudgeIdxs.size; - const judgeLabel = selectedCount === 1 - ? 
generateJudgeLabel(availableJudges[[...selectedJudgeIdxs][0]]) - : `${selectedCount} judges`; - judgeItems.push({ - label: `βš–οΈ Run ${judgeLabel} (aggregate ${unique.length} issues)`, + const judgeLabel = + selectedCount === 1 + ? generateJudgeLabel(availableJudges[[...selectedJudgeIdxs][0]]) + : `${selectedCount} judges`; + items.push({ + label: `βš–οΈ Run ${judgeLabel} (${unique.length} deduplicated issues)`, value: "run-judge", }); } else { - judgeItems.push({ - label: `⚠️ No judges configured. Set FALLACY_JUDGES or FALLACY_JUDGE env var`, + items.push({ + label: `⚠️ No judges configured. Set FALLACY_JUDGES env var`, value: "no-judges", }); } - judgeItems.push({ label: "← Back to Extraction Results", value: "back" }); + items.push({ label: "πŸ“„ Export Full Analysis to File", value: "export" }); + items.push({ label: "← Back to Extraction Results", value: "back" }); + + // Filter non-selectable items (headers and separators) + const selectableItems = items.filter( + (i) => + !i.value.startsWith("header-") && + !i.value.startsWith("sep-") + ); return ( - Pre-Judge Deduplication + + Pre-Judge Deduplication + - {/* Summary stats */} - + {/* Dedup summary */} + - Original: {originalCount} - β†’ + Deduplication: + {originalCount} issues β†’ {unique.length} unique - {duplicates.length > 0 && | {duplicates.length} duplicates removed} + {duplicates.length > 0 && ( + ({duplicates.length} duplicates removed) + )} + + + Using Jaccard word-overlap similarity. Quality-based selection keeps longer/higher-scored issues. - - - {/* Duplicates list (if any) */} - {duplicates.length > 0 && ( - - Duplicates removed: - {duplicates.slice(0, 3).map((d, idx) => ( - - {" "}[{d.extractorId}] {truncate(d.exactText.replace(/\n/g, ' '), issueTextWidth - 20)} - - ))} - {duplicates.length > 3 && ... 
and {duplicates.length - 3} more} - - )} - - {/* Judge selection */} - - Select Judges: { if (item.value === "back") { onBack(); + } else if (item.value === "export") { + const filepath = exportDedupAnalysis(multiDedup, selectedStrategy); + console.error(`\nπŸ“„ Exported full analysis to: ${filepath}\n`); } else if (item.value === "run-judge") { - const selectedConfigs = [...selectedJudgeIdxs].map(idx => availableJudges[idx]); - onRunJudges(selectedConfigs); + const selectedConfigs = [...selectedJudgeIdxs].map((idx) => availableJudges[idx]); + onRunJudges(selectedConfigs, currentDedup); } else if (item.value.startsWith("judge-")) { const idx = parseInt(item.value.replace("judge-", ""), 10); onToggleJudge(idx); @@ -116,7 +194,7 @@ export function PreJudgeDedupView({ /> - Toggle judges with Enter | Escape=Back + Enter=Select | Escape=Back ); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 678df111..631fc66f 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -401,6 +401,9 @@ importers: '@roast/db': specifier: workspace:* version: link:../db + fuse.js: + specifier: ^7.1.0 + version: 7.1.0 mathjs: specifier: ^14.0.1 version: 14.6.0 @@ -515,6 +518,9 @@ importers: meta-evals: dependencies: + '@leeoniya/ufuzzy': + specifier: ^1.0.18 + version: 1.0.18 '@roast/ai': specifier: workspace:* version: link:../internal-packages/ai @@ -524,6 +530,9 @@ importers: dotenv: specifier: ^16.4.5 version: 16.6.1 + fuse.js: + specifier: ^7.1.0 + version: 7.1.0 ink: specifier: ^6.5.1 version: 6.5.1(@types/react@19.2.7)(react@19.2.1) @@ -4216,6 +4225,10 @@ packages: functions-have-names@1.2.3: resolution: {integrity: sha512-xckBUXyTIqT97tq2x2AMb+g163b5JFysYk0x4qxNFwbfQkmNZoiRHb6sPzI9/QV33WeuvVYBUIiD4NzNIyqaRQ==} + fuse.js@7.1.0: + resolution: {integrity: sha512-trLf4SzuuUxfusZADLINj+dE8clK1frKdmqiJNb1Es75fmI5oY6X2mxLVUciLLjxqw/xr72Dhy+lER6dGd02FQ==} + engines: {node: '>=10'} + gensync@1.0.0-beta.2: resolution: {integrity: 
sha512-3hN7NaskYvMDLQY55gnW3NQ+mesEAepTqlg+VEbj7zzqEMBVNhzcGYYeqFo/TlYz6eQiFcp1HcsCZO+nGgS8zg==} engines: {node: '>=6.9.0'} @@ -5088,6 +5101,7 @@ packages: next@15.3.6: resolution: {integrity: sha512-oI6D1zbbsh6JzzZFDCSHnnx6Qpvd1fSkVJu/5d8uluqnxzuoqtodVZjYvNovooznUq8udSAiKp7MbwlfZ8Gm6w==} engines: {node: ^18.18.0 || ^19.8.0 || >= 20.0.0} + deprecated: This version has a security vulnerability. Please upgrade to a patched version. See https://nextjs.org/blog/security-update-2025-12-11 for more details. hasBin: true peerDependencies: '@opentelemetry/api': ^1.1.0 @@ -6582,6 +6596,7 @@ packages: whatwg-encoding@3.1.1: resolution: {integrity: sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==} engines: {node: '>=18'} + deprecated: Use @exodus/bytes instead for a more spec-conformant and faster implementation whatwg-mimetype@3.0.0: resolution: {integrity: sha512-nt+N2dzIutVRxARx1nghPKGv1xHikU7HKdfafKkLNLindmPU/ch3U31NOCGGA/dmPcmb1VlofO0vnKAcsm0o/Q==} @@ -11239,6 +11254,8 @@ snapshots: functions-have-names@1.2.3: {} + fuse.js@7.1.0: {} + gensync@1.0.0-beta.2: {} get-east-asian-width@1.4.0: {} From 1c63858bf947e9c7dee7b3068a9f032bf4e86c5f Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 11 Jan 2026 22:27:27 +0000 Subject: [PATCH 39/72] feat(fallacy-check): Add Jaccard similarity dedup with quality-based selection Replace exact-match deduplication with Jaccard word-overlap similarity: - Uses 70% similarity threshold for duplicate detection - When duplicates found, keeps higher-quality issue (longer text + higher scores) - Quality scoring: text length (40%), confidence (25%), severity (20%), importance (15%) - Adds logging for dedup decisions This addresses the TODO about exact matching being too strict. 
Co-Authored-By: Claude Opus 4.5 --- .../plugins/fallacy-check/dedup.ts | 121 +++++++++++++++--- 1 file changed, 102 insertions(+), 19 deletions(-) diff --git a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/dedup.ts b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/dedup.ts index e1b4e6bd..78a87a3f 100644 --- a/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/dedup.ts +++ b/internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/dedup.ts @@ -1,11 +1,18 @@ /** * Deduplication and prioritization utilities for fallacy issues + * + * Uses Jaccard word-overlap similarity with quality-based selection: + * - When duplicates are found, keeps the higher-quality issue + * - Quality based on text length (more context) + scores (severity, confidence, importance) */ import { logger } from "../../../shared/logger"; import type { FallacyIssue } from "./FallacyIssue"; import { LIMITS } from "./constants"; +/** Similarity threshold for considering two issues as duplicates (70%) */ +const JACCARD_THRESHOLD = 0.7; + /** * Calculate priority score for an issue. * Higher score = more important to address. @@ -15,35 +22,111 @@ export function calculatePriorityScore(issue: FallacyIssue): number { } /** - * Deduplicate issues by removing exact text matches. - * Uses case-insensitive, whitespace-normalized comparison. - * - * TODO: This is too strict - different extractors quoting slightly different - * portions of the same passage won't match. Consider fuzzy matching. + * Normalize text for comparison. + * - Lowercase + * - Collapse whitespace + * - Trim + */ +export function normalizeTextForDedup(text: string): string { + return text.toLowerCase().replace(/\s+/g, " ").trim(); +} + +/** + * Calculate Jaccard similarity between two texts based on word overlap. + * Returns a value between 0 (no overlap) and 1 (identical). 
+ */ +function calculateJaccardSimilarity(textA: string, textB: string): number { + const wordsA = new Set(normalizeTextForDedup(textA).split(/\s+/).filter(Boolean)); + const wordsB = new Set(normalizeTextForDedup(textB).split(/\s+/).filter(Boolean)); + + if (wordsA.size === 0 && wordsB.size === 0) return 1; + if (wordsA.size === 0 || wordsB.size === 0) return 0; + + let intersection = 0; + for (const word of wordsA) { + if (wordsB.has(word)) intersection++; + } + + const union = wordsA.size + wordsB.size - intersection; + return union > 0 ? intersection / union : 0; +} + +/** + * Compute a quality score for an issue. + * Higher = better quality (prefer to keep). + * Factors: text length (more context), severity, confidence, importance. + */ +function computeIssueQuality(issue: FallacyIssue): number { + // Normalize text length (log scale to prevent extremely long texts from dominating) + const lengthScore = Math.log10(issue.text.length + 1) / 4; // ~0.5-1.0 for typical lengths + + // Combine severity, confidence, importance (each 0-100, normalize to 0-1) + const severityNorm = issue.severityScore / 100; + const confidenceNorm = issue.confidenceScore / 100; + const importanceNorm = issue.importanceScore / 100; + + // Weighted combination: prefer longer text, then higher scores + // Length is most important (40%), then confidence (25%), severity (20%), importance (15%) + return ( + lengthScore * 0.4 + + confidenceNorm * 0.25 + + severityNorm * 0.2 + + importanceNorm * 0.15 + ); +} + +/** + * Deduplicate issues using Jaccard word-overlap similarity. + * When duplicates are found, keeps the higher-quality issue + * (longer text + higher severity/confidence/importance). 
*/ export function deduplicateIssues(issues: FallacyIssue[]): FallacyIssue[] { - const seen = new Set(); const unique: FallacyIssue[] = []; for (const issue of issues) { - const key = normalizeTextForDedup(issue.text); - if (!seen.has(key)) { - seen.add(key); + // Check if this issue is a duplicate of any already-kept issue + let bestMatch: { keptIdx: number; kept: FallacyIssue; similarity: number } | null = null; + + for (let i = 0; i < unique.length; i++) { + const kept = unique[i]; + const similarity = calculateJaccardSimilarity(issue.text, kept.text); + + if (similarity >= JACCARD_THRESHOLD) { + if (!bestMatch || similarity > bestMatch.similarity) { + bestMatch = { keptIdx: i, kept, similarity }; + } + } + } + + if (bestMatch) { + // Found a duplicate - decide which to keep based on quality score + const newQuality = computeIssueQuality(issue); + const keptQuality = computeIssueQuality(bestMatch.kept); + + if (newQuality > keptQuality) { + // New issue is better - swap: replace kept with new + logger.debug( + `[Dedup] Replacing issue (quality ${keptQuality.toFixed(2)}) with better duplicate (quality ${newQuality.toFixed(2)})` + ); + unique[bestMatch.keptIdx] = issue; + } else { + // Kept issue is better - discard new + logger.debug( + `[Dedup] Discarding duplicate (quality ${newQuality.toFixed(2)}), keeping (quality ${keptQuality.toFixed(2)})` + ); + } + } else { unique.push(issue); } } - return unique; -} + if (unique.length < issues.length) { + logger.info( + `[Dedup] Reduced ${issues.length} issues to ${unique.length} unique (${issues.length - unique.length} duplicates removed)` + ); + } -/** - * Normalize text for deduplication comparison. 
- * - Lowercase - * - Collapse whitespace - * - Trim - */ -export function normalizeTextForDedup(text: string): string { - return text.toLowerCase().replace(/\s+/g, " ").trim(); + return unique; } /** From 05756140a3f36089313c8541b02aae9278cf9c74 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Thu, 15 Jan 2026 10:57:33 +0000 Subject: [PATCH 40/72] feat(meta-evals): Add filtered items drilldown and improve validation display - Add filtered items section showing pipeline-filtered issues - Show filter reason and support location when clicking filtered items - Fix truncate to sanitize newlines/tabs for cleaner display - Add pipelineCounts to comparisonData for accurate pipeline math - Use pipelineCounts as source of truth for dedup count - Show per-model extraction breakdown - Add search filter to baseline document selection - Delete dump file after import in setup_db.sh Co-Authored-By: Claude Opus 4.5 --- dev/scripts/dev/db/setup_db.sh | 3 + .../repositories/MetaEvaluationRepository.ts | 13 +- meta-evals/src/components/Validation.tsx | 279 +++++++++++++----- meta-evals/src/components/helpers.ts | 6 +- 4 files changed, 227 insertions(+), 74 deletions(-) diff --git a/dev/scripts/dev/db/setup_db.sh b/dev/scripts/dev/db/setup_db.sh index 3365009e..0166dcc7 100755 --- a/dev/scripts/dev/db/setup_db.sh +++ b/dev/scripts/dev/db/setup_db.sh @@ -70,5 +70,8 @@ sed -i "s/$PROD_DB_USER/$LOCAL_DB_USER/g; s/doadmin/$LOCAL_DB_USER/g" schema.sql echo "Importing production schema and data..." cat schema.sql | psql_local "$LOCAL_DB_NAME" +# Clean up dump file +echo "Cleaning up dump file..." +rm -f schema.sql echo "Database setup completed successfully!" 
diff --git a/internal-packages/db/src/repositories/MetaEvaluationRepository.ts b/internal-packages/db/src/repositories/MetaEvaluationRepository.ts index 549f1d77..70cd6f75 100644 --- a/internal-packages/db/src/repositories/MetaEvaluationRepository.ts +++ b/internal-packages/db/src/repositories/MetaEvaluationRepository.ts @@ -724,7 +724,7 @@ export class MetaEvaluationRepository { */ async getValidationCorpusDocuments( agentId: string, - options: { limit?: number; minContentLength?: number } = {} + options: { limit?: number; minContentLength?: number; filter?: string } = {} ): Promise< Array<{ documentId: string; @@ -734,11 +734,18 @@ export class MetaEvaluationRepository { evaluationCount: number; }> > { - const { limit = 50, minContentLength = 100 } = options; + const { limit = 50, minContentLength = 100, filter } = options; // Get documents that have evaluations from this agent const evaluations = await this.prisma.evaluation.findMany({ - where: { agentId }, + where: { + agentId, + ...(filter && { + document: { + versions: { some: { title: { contains: filter, mode: "insensitive" } } }, + }, + }), + }, include: { document: { include: { diff --git a/meta-evals/src/components/Validation.tsx b/meta-evals/src/components/Validation.tsx index ea03a061..5a7b27cb 100644 --- a/meta-evals/src/components/Validation.tsx +++ b/meta-evals/src/components/Validation.tsx @@ -28,6 +28,11 @@ import { type Tab = "baselines" | "run" | "history"; +/** Sanitize baseline name - remove newlines and extra whitespace */ +function sanitizeName(name: string): string { + return name.replace(/\s+/g, " ").trim(); +} + interface ValidationProps { height: number; maxItems: number; @@ -213,15 +218,17 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati } } - async function loadCorpus(agentId: string) { + async function loadCorpus(agentId: string, filter?: string) { try { const docs = await metaEvaluationRepository.getValidationCorpusDocuments( agentId, - { limit: 50, 
minContentLength: 200 } + { limit: 50, minContentLength: 200, filter } ); setCorpusDocuments(docs); - // Pre-select all documents by default - setSelectedDocIds(new Set(docs.map((d) => d.documentId))); + // Only reset selection on initial load, not on filter changes + if (!filter) { + setSelectedDocIds(new Set()); + } } catch (e) { setError(String(e)); } @@ -417,6 +424,13 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati filteredItems: currentEval.pipelineTelemetry?.filteredItems, // Include extraction phase telemetry for drill-down extractionPhase: currentEval.pipelineTelemetry?.extractionPhase, + // Include pipeline counts for accurate math display + pipelineCounts: currentEval.pipelineTelemetry ? { + issuesAfterDedup: currentEval.pipelineTelemetry.issuesAfterDedup, + issuesAfterFiltering: currentEval.pipelineTelemetry.issuesAfterFiltering, + commentsGenerated: currentEval.pipelineTelemetry.commentsGenerated, + commentsKept: currentEval.pipelineTelemetry.commentsKept, + } : undefined, }, }); } @@ -508,6 +522,8 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati multiSelect={true} selectedIds={selectedDocIds} onSelectionChange={setSelectedDocIds} + showFilter={true} + onFilterChange={(f) => selectedAgent && loadCorpus(selectedAgent.id, f)} confirmLabel="Create Baseline" onConfirm={() => createBaseline()} onCancel={() => { @@ -563,7 +579,7 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati - Baseline: {selectedBaseline.name} + Baseline: {sanitizeName(selectedBaseline.name)} {" "}({selectedBaseline.snapshotCount} docs) @@ -648,6 +664,50 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati }; } } + } else if (selectedCommentKey.startsWith("filtered-")) { + // Show filtered item detail view + const idx = parseInt(selectedCommentKey.replace("filtered-", ""), 10); + const filteredItem = filteredItems[idx]; + if (filteredItem) { + const 
stageName = filteredItem.stage === 'supported-elsewhere-filter' + ? 'Supported Elsewhere Filter' + : filteredItem.stage === 'review' + ? 'Review Filter' + : filteredItem.stage; + return ( + + + + {filteredItem.header || "(no header)"} + + + + Quoted Text: + {filteredItem.quotedText} + + + + Why Filtered: + {filteredItem.filterReason} + + + {filteredItem.supportLocation && ( + + Support Found At: + {filteredItem.supportLocation} + + )} + + + setSelectedCommentKey(null)} + /> + + + + ); + } } if (baselineComment || currentComment) { @@ -656,7 +716,7 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati // For Kept comments, show both versions side by side if (commentType === "Kept" && baselineComment && currentComment) { return ( - + {baselineComment.header || currentComment.header || "(no header)"} @@ -688,7 +748,7 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati // For Lost comments with filter reason, show detailed view if (commentType === "Lost" && baselineComment && filterInfo) { return ( - + {baselineComment.header || "(no header)"} @@ -728,7 +788,7 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati // For New/Lost (without filter reason), show single version with label const comment = currentComment || baselineComment; - const versionLabel = commentType === "New" ? "(from current run)" : "(from baseline)"; + const versionLabel = commentType === "New" ? "- new vs baseline" : "- in baseline only"; return ( @@ -749,11 +809,11 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati {commentType === "Lost" && !filterInfo && ( - Why was this comment lost? + Why is this missing from the current run? {data?.filteredItems !== undefined - ? "This issue was not extracted by the current pipeline run. The LLM did not identify it as an issue during extraction (this is normal variance between runs)." 
- : "No filter telemetry available for this run (run predates telemetry feature)."} + ? "The LLM extractors did not detect this issue in the current run. This is normal variance between runs - LLMs don't always find the same issues." + : "No telemetry available for this run (run predates telemetry feature)."} )} @@ -803,6 +863,12 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati sourceExtractors: string[]; }>; }; + pipelineCounts?: { + issuesAfterDedup: number; + issuesAfterFiltering: number; + commentsGenerated: number; + commentsKept: number; + }; } | null; const matched = data?.matchedComments || []; @@ -810,6 +876,7 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati const lost = data?.lostComments || []; const filteredItems = data?.filteredItems || []; const extractionPhase = data?.extractionPhase; + const pipelineCounts = data?.pipelineCounts; // Helper to check if a lost comment has a filter reason const hasFilterReason = (lostComment: { quotedText: string; header: string | null }) => { @@ -825,34 +892,51 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati // Build scrollable list of ALL comments - no truncation const commentItems: Array<{ label: string; value: string }> = []; - // Add all kept comments - matched.forEach((c, i) => { - const comment = c.baselineComment || c.currentComment; - const label = comment ? (comment.header || truncate(comment.quotedText, 50)) : "Unknown"; - commentItems.push({ - label: ` βœ“ ${label}`, - value: `kept-${i}`, + // Add items grouped by category + if (matched.length > 0) { + matched.forEach((c, i) => { + const comment = c.baselineComment || c.currentComment; + const label = comment ? 
(comment.header || truncate(comment.quotedText, 50)) : "Unknown"; + commentItems.push({ + label: `= ${label}`, + value: `kept-${i}`, + }); }); - }); + } - // Add all new comments - newComments.forEach((c, i) => { - commentItems.push({ - label: ` + ${c.header || truncate(c.quotedText, 50)}`, - value: `new-${i}`, + if (newComments.length > 0) { + newComments.forEach((c, i) => { + commentItems.push({ + label: `+ ${c.header || truncate(c.quotedText, 50)}`, + value: `new-${i}`, + }); }); - }); + } - // Add all lost comments - mark those with filter reasons differently - lost.forEach((c, i) => { - const hasReason = hasFilterReason(c); - // ⊘ = filtered with reason, βˆ’ = not extracted (no reason) - const indicator = hasReason ? "⊘" : "βˆ’"; - commentItems.push({ - label: ` ${indicator} ${c.header || truncate(c.quotedText, 50)}`, - value: `lost-${i}`, + if (lost.length > 0) { + lost.forEach((c, i) => { + const hasReason = hasFilterReason(c); + const suffix = hasReason ? " [filtered]" : ""; + // Use truncated quotedText for consistency with filtered items + commentItems.push({ + label: `- ${truncate(c.quotedText, 50)}${suffix}`, + value: `lost-${i}`, + }); }); - }); + } + + // Build filtered items list separately + const filteredItemsList: Array<{ label: string; value: string }> = []; + if (filteredItems.length > 0) { + filteredItemsList.push({ label: "--- Filtered by pipeline ---", value: "sep-filtered" }); + filteredItems.forEach((f, i) => { + const stageLabel = f.stage === 'supported-elsewhere-filter' ? 'F' : f.stage === 'review' ? 
'R' : '?'; + filteredItemsList.push({ + label: `[${stageLabel}] ${truncate(f.quotedText, 50)}`, + value: `filtered-${i}`, + }); + }); + } if (commentItems.length === 0) { commentItems.push({ label: " No comments in this comparison", value: "empty" }); @@ -860,53 +944,110 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati commentItems.push({ label: " ← Back", value: "back" }); - // Count lost with/without filter reasons + // Count lost with filter reasons const lostWithReason = lost.filter((c) => hasFilterReason(c)).length; - const lostWithoutReason = lost.length - lostWithReason; + + // Calculate totals + const baselineTotal = matched.length + lost.length; + const currentTotal = matched.length + newComments.length; + + // Determine if there are any differences + const isUnchanged = lost.length === 0 && newComments.length === 0; return ( - - - βœ“ {matched.length} kept - - - + {newComments.length} new - - - βˆ’ {lost.length} lost - {lost.length > 0 && ( - ({lostWithReason} filtered, {lostWithoutReason} not extracted) - )} - + {/* Summary counts */} + + + Baseline: + {baselineTotal} issues + β†’ Current run: + {currentTotal} issues + - - Legend: βœ“ kept + new ⊘ filtered (has reason) βˆ’ not extracted + + {/* Comparison: what changed between baseline and current */} + + Comparison: + + βœ“ {matched.length} issues appear in BOTH baseline and current + + + + {newComments.length} issues are NEW (in current run, not in baseline) + + + βˆ’ {lost.length} issues are GONE (were in baseline, not in current run) + - {extractionPhase && extractionPhase.multiExtractorEnabled && ( - - - Extraction: + + {/* Current run details: extraction β†’ filter β†’ review */} + {(extractionPhase || pipelineCounts) && (() => { + // Count filtered items by stage + const supportedElsewhereCount = filteredItems.filter(f => f.stage === 'supported-elsewhere-filter').length; + const reviewFilteredCount = filteredItems.filter(f => f.stage === 'review').length; + + // 
Use actual pipeline counts when available (pipelineCounts is source of truth) + const afterDedup = pipelineCounts?.issuesAfterDedup ?? extractionPhase?.totalIssuesAfterJudge; + const afterFilter = pipelineCounts?.issuesAfterFiltering; + const commentsGenerated = pipelineCounts?.commentsGenerated; + const commentsKept = pipelineCounts?.commentsKept; + + // Calculate what was filtered at each stage + const filteredBySupported = afterDedup !== undefined && afterFilter !== undefined ? afterDedup - afterFilter : supportedElsewhereCount; + const filteredByGeneration = afterFilter !== undefined && commentsGenerated !== undefined ? afterFilter - commentsGenerated : 0; + const filteredByReview = commentsGenerated !== undefined && commentsKept !== undefined ? commentsGenerated - commentsKept : reviewFilteredCount; + + return ( + + Current run details: + {extractionPhase && ( + <> + + Extraction: {extractionPhase.extractors?.length || 0} models β†’ {extractionPhase.totalIssuesBeforeJudge} issues β†’ dedup β†’ {afterDedup} + + {extractionPhase.extractors && extractionPhase.extractors.length > 0 && ( + + {" "}({extractionPhase.extractors.map(e => `${e.model.split('/').pop()}: ${e.issuesFound}`).join(', ')}) + + )} + + )} + {filteredBySupported > 0 && ( + + Filter: {filteredBySupported} removed (supported elsewhere) β†’ {afterFilter} + + )} + {filteredByGeneration > 0 && ( + + Comment gen: {filteredByGeneration} failed (empty/invalid) β†’ {commentsGenerated} + + )} + {filteredByReview > 0 && ( + + Review: {filteredByReview} removed (redundant/low-value) β†’ {commentsKept} + + )} - {extractionPhase.extractors.map(e => { - const tempStr = e.temperatureConfig === 'default' ? 'tDef' : `t${e.temperature}`; - const thinkStr = e.thinkingEnabled ? '' : ' noThink'; - return `${e.extractorId}(${tempStr}${thinkStr}):${e.issuesFound}`; - }).join(' | ')} β†’ {extractionPhase.judgeDurationMs ? 
'Judge' : 'Dedup'} β†’ {extractionPhase.totalIssuesAfterJudge}/{extractionPhase.totalIssuesBeforeJudge} kept + Result: {commentsKept ?? currentTotal} comments kept - + ); + })()} + + {/* Simple status - no judgments, just facts */} + {isUnchanged && ( + βœ“ No differences )} { if (item.value === "back") { setSelectedSnapshotId(null); - } else if (item.value.startsWith("kept-") || item.value.startsWith("new-") || item.value.startsWith("lost-")) { + } else if (item.value.startsWith("kept-") || item.value.startsWith("new-") || item.value.startsWith("lost-") || item.value.startsWith("filtered-")) { setSelectedCommentKey(item.value); } }} @@ -924,9 +1065,9 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati if (selectedRunDetail) { const formatChangeSummary = (s: { keptCount: number; newCount: number; lostCount: number }) => { const parts: string[] = []; - if (s.keptCount > 0) parts.push(`${s.keptCount} kept`); + if (s.keptCount > 0) parts.push(`${s.keptCount} matched`); if (s.newCount > 0) parts.push(`+${s.newCount} new`); - if (s.lostCount > 0) parts.push(`-${s.lostCount} lost`); + if (s.lostCount > 0) parts.push(`-${s.lostCount} missing`); return parts.length > 0 ? parts.join(", ") : "no comments"; }; @@ -952,7 +1093,7 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati {" | "} [~] {changedCount} changed {" | "} - Baseline: {selectedRunDetail.baseline.name} + Baseline: {sanitizeName(selectedRunDetail.baseline.name)} @@ -1006,7 +1147,7 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati - Baseline: {selectedBaseline?.name || "None"} + Baseline: {selectedBaseline ? sanitizeName(selectedBaseline.name) : "None"} {" | "} {validationRuns.length} run{validationRuns.length !== 1 ? 
"s" : ""} @@ -1042,7 +1183,7 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati const items = [ { label: "+ Create New Baseline", value: "create" }, ...baselines.map((b) => ({ - label: `${selectedBaseline?.id === b.id ? "● " : "β—‹ "}${b.name} (${b.snapshotCount} docs)`, + label: `${selectedBaseline?.id === b.id ? "● " : "β—‹ "}${sanitizeName(b.name)} (${b.snapshotCount} docs)`, value: `select:${b.id}`, })), ...(selectedBaseline ? [{ label: "- Delete Selected Baseline", value: "delete" }] : []), @@ -1059,7 +1200,7 @@ export function Validation({ height, maxItems, onBack, onCreateBatch }: Validati {selectedBaseline && ( <> {" | "} - Selected: {selectedBaseline.name} + Selected: {sanitizeName(selectedBaseline.name)} )} diff --git a/meta-evals/src/components/helpers.ts b/meta-evals/src/components/helpers.ts index 6157899a..7f7a9e55 100644 --- a/meta-evals/src/components/helpers.ts +++ b/meta-evals/src/components/helpers.ts @@ -3,8 +3,10 @@ */ export function truncate(str: string, maxLen: number): string { - if (str.length <= maxLen) return str; - return str.slice(0, maxLen - 3) + "..."; + // Sanitize: replace newlines/tabs with spaces, collapse multiple spaces + const clean = str.replace(/[\n\r\t]+/g, ' ').replace(/\s+/g, ' ').trim(); + if (clean.length <= maxLen) return clean; + return clean.slice(0, maxLen - 3) + "..."; } export function formatDate(date: Date): string { From 81a46a16a27e530d41aceea113dc2b8318c7e893 Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sat, 17 Jan 2026 22:14:33 +0000 Subject: [PATCH 41/72] feat(web): Add Validation Lab UI for pipeline regression testing Port the meta-evals Validation functionality to a web UI at /monitor/lab. This enables regression testing of the fallacy detection pipeline through the browser instead of CLI. 
Features: - Unified single-page layout with baselines sidebar and run/history view - Create baselines from corpus documents with latest evaluations - Run validation against baseline (creates jobs, polls, compares) - Pipeline view with timing data per stage and per extractor - Drill-down into filtered items and comparison details - Auto-prefilled names for baselines and runs API routes: - /api/monitor/lab/baselines - CRUD for validation baselines - /api/monitor/lab/corpus - Fetch documents for baseline creation - /api/monitor/lab/runs - Start runs, get history - /api/monitor/lab/runs/[id]/finalize - Compare results with baseline - /api/monitor/lab/jobs/status - Poll job completion Co-Authored-By: Claude Opus 4.5 --- .../api/monitor/lab/baselines/[id]/route.ts | 27 + .../app/api/monitor/lab/baselines/route.ts | 95 ++++ .../src/app/api/monitor/lab/corpus/route.ts | 33 ++ .../api/monitor/lab/corpus/snapshots/route.ts | 38 ++ .../app/api/monitor/lab/jobs/status/route.ts | 59 +++ .../monitor/lab/runs/[id]/finalize/route.ts | 277 +++++++++++ .../app/api/monitor/lab/runs/[id]/route.ts | 51 ++ .../web/src/app/api/monitor/lab/runs/route.ts | 59 +++ .../app/api/monitor/lab/runs/start/route.ts | 114 +++++ .../api/monitor/lab/snapshots/[id]/route.ts | 100 ++++ apps/web/src/app/monitor/client-layout.tsx | 6 + .../lab/components/baselines/BaselineCard.tsx | 49 ++ .../lab/components/baselines/BaselineList.tsx | 27 + .../baselines/CreateBaselineModal.tsx | 228 +++++++++ .../lab/components/history/RunDetail.tsx | 126 +++++ .../lab/components/snapshots/PipelineView.tsx | 468 ++++++++++++++++++ .../snapshots/SnapshotComparison.tsx | 228 +++++++++ .../lab/components/tabs/BaselinesTab.tsx | 105 ++++ .../lab/components/tabs/HistoryTab.tsx | 303 ++++++++++++ .../monitor/lab/components/tabs/RunTab.tsx | 306 ++++++++++++ .../src/app/monitor/lab/hooks/useBaselines.ts | 63 +++ .../app/monitor/lab/hooks/useCorpusDocs.ts | 40 ++ apps/web/src/app/monitor/lab/hooks/useRuns.ts | 73 +++ 
apps/web/src/app/monitor/lab/page.tsx | 331 +++++++++++++ apps/web/src/app/monitor/lab/types.ts | 122 +++++ .../src/app/monitor/lab/utils/formatters.ts | 54 ++ 26 files changed, 3382 insertions(+) create mode 100644 apps/web/src/app/api/monitor/lab/baselines/[id]/route.ts create mode 100644 apps/web/src/app/api/monitor/lab/baselines/route.ts create mode 100644 apps/web/src/app/api/monitor/lab/corpus/route.ts create mode 100644 apps/web/src/app/api/monitor/lab/corpus/snapshots/route.ts create mode 100644 apps/web/src/app/api/monitor/lab/jobs/status/route.ts create mode 100644 apps/web/src/app/api/monitor/lab/runs/[id]/finalize/route.ts create mode 100644 apps/web/src/app/api/monitor/lab/runs/[id]/route.ts create mode 100644 apps/web/src/app/api/monitor/lab/runs/route.ts create mode 100644 apps/web/src/app/api/monitor/lab/runs/start/route.ts create mode 100644 apps/web/src/app/api/monitor/lab/snapshots/[id]/route.ts create mode 100644 apps/web/src/app/monitor/lab/components/baselines/BaselineCard.tsx create mode 100644 apps/web/src/app/monitor/lab/components/baselines/BaselineList.tsx create mode 100644 apps/web/src/app/monitor/lab/components/baselines/CreateBaselineModal.tsx create mode 100644 apps/web/src/app/monitor/lab/components/history/RunDetail.tsx create mode 100644 apps/web/src/app/monitor/lab/components/snapshots/PipelineView.tsx create mode 100644 apps/web/src/app/monitor/lab/components/snapshots/SnapshotComparison.tsx create mode 100644 apps/web/src/app/monitor/lab/components/tabs/BaselinesTab.tsx create mode 100644 apps/web/src/app/monitor/lab/components/tabs/HistoryTab.tsx create mode 100644 apps/web/src/app/monitor/lab/components/tabs/RunTab.tsx create mode 100644 apps/web/src/app/monitor/lab/hooks/useBaselines.ts create mode 100644 apps/web/src/app/monitor/lab/hooks/useCorpusDocs.ts create mode 100644 apps/web/src/app/monitor/lab/hooks/useRuns.ts create mode 100644 apps/web/src/app/monitor/lab/page.tsx create mode 100644 
apps/web/src/app/monitor/lab/types.ts create mode 100644 apps/web/src/app/monitor/lab/utils/formatters.ts diff --git a/apps/web/src/app/api/monitor/lab/baselines/[id]/route.ts b/apps/web/src/app/api/monitor/lab/baselines/[id]/route.ts new file mode 100644 index 00000000..7273070d --- /dev/null +++ b/apps/web/src/app/api/monitor/lab/baselines/[id]/route.ts @@ -0,0 +1,27 @@ +import { NextRequest, NextResponse } from "next/server"; +import { logger } from "@/infrastructure/logging/logger"; +import { authenticateRequest } from "@/infrastructure/auth/auth-helpers"; +import { commonErrors } from "@/infrastructure/http/api-response-helpers"; +import { isAdmin } from "@/infrastructure/auth/auth"; +import { metaEvaluationRepository } from "@roast/db"; + +export async function DELETE( + request: NextRequest, + { params }: { params: Promise<{ id: string }> } +) { + const userId = await authenticateRequest(request); + if (!userId) return commonErrors.unauthorized(); + + const adminCheck = await isAdmin(); + if (!adminCheck) return commonErrors.forbidden(); + + const { id } = await params; + + try { + await metaEvaluationRepository.deleteValidationBaseline(id); + return NextResponse.json({ success: true }); + } catch (error) { + logger.error("Error deleting baseline:", error); + return commonErrors.serverError("Failed to delete baseline"); + } +} diff --git a/apps/web/src/app/api/monitor/lab/baselines/route.ts b/apps/web/src/app/api/monitor/lab/baselines/route.ts new file mode 100644 index 00000000..d2a88cb7 --- /dev/null +++ b/apps/web/src/app/api/monitor/lab/baselines/route.ts @@ -0,0 +1,95 @@ +import { NextRequest, NextResponse } from "next/server"; +import { logger } from "@/infrastructure/logging/logger"; +import { authenticateRequest } from "@/infrastructure/auth/auth-helpers"; +import { commonErrors } from "@/infrastructure/http/api-response-helpers"; +import { isAdmin } from "@/infrastructure/auth/auth"; +import { metaEvaluationRepository, prisma } from "@roast/db"; + 
+export async function GET(request: NextRequest) { + const userId = await authenticateRequest(request); + if (!userId) return commonErrors.unauthorized(); + + const adminCheck = await isAdmin(); + if (!adminCheck) return commonErrors.forbidden(); + + const agentId = request.nextUrl.searchParams.get("agentId"); + if (!agentId) { + return NextResponse.json({ error: "agentId is required" }, { status: 400 }); + } + + try { + const baselines = await metaEvaluationRepository.getValidationBaselines(agentId); + return NextResponse.json({ baselines }); + } catch (error) { + logger.error("Error fetching baselines:", error); + return commonErrors.serverError("Failed to fetch baselines"); + } +} + +export async function POST(request: NextRequest) { + const userId = await authenticateRequest(request); + if (!userId) return commonErrors.unauthorized(); + + const adminCheck = await isAdmin(); + if (!adminCheck) return commonErrors.forbidden(); + + try { + const body = await request.json(); + const { name, description, agentId, documentIds, evaluationVersionIds } = body; + + if (!name || !agentId) { + return NextResponse.json( + { error: "name and agentId are required" }, + { status: 400 } + ); + } + + // Get evaluation version IDs from document IDs if not provided directly + let evalVersionIds = evaluationVersionIds; + if (!evalVersionIds?.length && documentIds?.length) { + // Get the latest evaluation version for each document + const evaluations = await prisma.evaluationVersion.findMany({ + where: { + agentId, + evaluation: { + documentId: { in: documentIds }, + }, + }, + orderBy: { createdAt: "desc" }, + select: { + id: true, + evaluation: { select: { documentId: true } }, + }, + }); + + // Keep only the latest version per document + const latestByDoc = new Map(); + for (const ev of evaluations) { + if (!latestByDoc.has(ev.evaluation.documentId)) { + latestByDoc.set(ev.evaluation.documentId, ev.id); + } + } + evalVersionIds = Array.from(latestByDoc.values()); + } + + if 
(!evalVersionIds?.length) { + return NextResponse.json( + { error: "No evaluation versions found for the selected documents" }, + { status: 400 } + ); + } + + const baseline = await metaEvaluationRepository.createValidationBaseline({ + name, + description, + agentId, + evaluationVersionIds: evalVersionIds, + createdById: userId, + }); + + return NextResponse.json({ baseline }); + } catch (error) { + logger.error("Error creating baseline:", error); + return commonErrors.serverError("Failed to create baseline"); + } +} diff --git a/apps/web/src/app/api/monitor/lab/corpus/route.ts b/apps/web/src/app/api/monitor/lab/corpus/route.ts new file mode 100644 index 00000000..1c2336b2 --- /dev/null +++ b/apps/web/src/app/api/monitor/lab/corpus/route.ts @@ -0,0 +1,33 @@ +import { NextRequest, NextResponse } from "next/server"; +import { logger } from "@/infrastructure/logging/logger"; +import { authenticateRequest } from "@/infrastructure/auth/auth-helpers"; +import { commonErrors } from "@/infrastructure/http/api-response-helpers"; +import { isAdmin } from "@/infrastructure/auth/auth"; +import { metaEvaluationRepository } from "@roast/db"; + +export async function GET(request: NextRequest) { + const userId = await authenticateRequest(request); + if (!userId) return commonErrors.unauthorized(); + + const adminCheck = await isAdmin(); + if (!adminCheck) return commonErrors.forbidden(); + + const agentId = request.nextUrl.searchParams.get("agentId"); + const filter = request.nextUrl.searchParams.get("filter") || undefined; + const limit = parseInt(request.nextUrl.searchParams.get("limit") || "500", 10); + + if (!agentId) { + return NextResponse.json({ error: "agentId is required" }, { status: 400 }); + } + + try { + const documents = await metaEvaluationRepository.getValidationCorpusDocuments(agentId, { + filter, + limit, + }); + return NextResponse.json({ documents }); + } catch (error) { + logger.error("Error fetching corpus documents:", error); + return 
commonErrors.serverError("Failed to fetch corpus documents"); + } +} diff --git a/apps/web/src/app/api/monitor/lab/corpus/snapshots/route.ts b/apps/web/src/app/api/monitor/lab/corpus/snapshots/route.ts new file mode 100644 index 00000000..50a5fcae --- /dev/null +++ b/apps/web/src/app/api/monitor/lab/corpus/snapshots/route.ts @@ -0,0 +1,38 @@ +import { NextRequest, NextResponse } from "next/server"; +import { logger } from "@/infrastructure/logging/logger"; +import { authenticateRequest } from "@/infrastructure/auth/auth-helpers"; +import { commonErrors } from "@/infrastructure/http/api-response-helpers"; +import { isAdmin } from "@/infrastructure/auth/auth"; +import { metaEvaluationRepository } from "@roast/db"; + +// Get evaluation snapshots for a set of documents (used when creating baselines) +export async function GET(request: NextRequest) { + const userId = await authenticateRequest(request); + if (!userId) return commonErrors.unauthorized(); + + const adminCheck = await isAdmin(); + if (!adminCheck) return commonErrors.forbidden(); + + const agentId = request.nextUrl.searchParams.get("agentId"); + const documentIdsParam = request.nextUrl.searchParams.get("documentIds"); + + if (!agentId || !documentIdsParam) { + return NextResponse.json( + { error: "agentId and documentIds are required" }, + { status: 400 } + ); + } + + const documentIds = documentIdsParam.split(",").filter(Boolean); + if (documentIds.length === 0) { + return NextResponse.json({ error: "documentIds cannot be empty" }, { status: 400 }); + } + + try { + const snapshots = await metaEvaluationRepository.getEvaluationSnapshots(documentIds, agentId); + return NextResponse.json({ snapshots }); + } catch (error) { + logger.error("Error fetching evaluation snapshots:", error); + return commonErrors.serverError("Failed to fetch evaluation snapshots"); + } +} diff --git a/apps/web/src/app/api/monitor/lab/jobs/status/route.ts b/apps/web/src/app/api/monitor/lab/jobs/status/route.ts new file mode 100644 
index 00000000..dfc0f67a --- /dev/null +++ b/apps/web/src/app/api/monitor/lab/jobs/status/route.ts @@ -0,0 +1,59 @@ +import { NextRequest, NextResponse } from "next/server"; +import { authenticateRequest } from "@/infrastructure/auth/auth-helpers"; +import { commonErrors } from "@/infrastructure/http/api-response-helpers"; +import { isAdmin } from "@/infrastructure/auth/auth"; +import { prisma } from "@roast/db"; + +/** + * Get status of multiple jobs by ID + */ +export async function GET(request: NextRequest) { + const userId = await authenticateRequest(request); + if (!userId) return commonErrors.unauthorized(); + + const adminCheck = await isAdmin(); + if (!adminCheck) return commonErrors.forbidden(); + + const jobIdsParam = request.nextUrl.searchParams.get("jobIds"); + if (!jobIdsParam) { + return NextResponse.json({ error: "jobIds is required" }, { status: 400 }); + } + + const jobIds = jobIdsParam.split(",").filter(Boolean); + if (jobIds.length === 0) { + return NextResponse.json({ error: "jobIds cannot be empty" }, { status: 400 }); + } + + try { + const jobs = await prisma.job.findMany({ + where: { id: { in: jobIds } }, + select: { + id: true, + status: true, + evaluationVersionId: true, + error: true, + }, + }); + + const completed = jobs.filter((j) => j.status === "COMPLETED").length; + const failed = jobs.filter((j) => j.status === "FAILED").length; + const pending = jobs.filter((j) => j.status === "PENDING").length; + const running = jobs.filter((j) => j.status === "RUNNING").length; + + const allDone = completed + failed === jobs.length; + + return NextResponse.json({ + jobs, + summary: { + total: jobs.length, + completed, + failed, + pending, + running, + allDone, + }, + }); + } catch (error) { + return commonErrors.serverError("Failed to get job status"); + } +} diff --git a/apps/web/src/app/api/monitor/lab/runs/[id]/finalize/route.ts b/apps/web/src/app/api/monitor/lab/runs/[id]/finalize/route.ts new file mode 100644 index 00000000..5179def3 --- 
/dev/null +++ b/apps/web/src/app/api/monitor/lab/runs/[id]/finalize/route.ts @@ -0,0 +1,277 @@ +import { NextRequest, NextResponse } from "next/server"; +import { logger } from "@/infrastructure/logging/logger"; +import { authenticateRequest } from "@/infrastructure/auth/auth-helpers"; +import { commonErrors } from "@/infrastructure/http/api-response-helpers"; +import { isAdmin } from "@/infrastructure/auth/auth"; +import { prisma, metaEvaluationRepository } from "@roast/db"; + +interface CommentData { + id: string; + quotedText: string; + header: string | null; + description: string; + importance: number | null; + startOffset: number; + endOffset: number; +} + +interface EvaluationSnapshot { + evaluationVersionId: string; + documentId: string; + comments: CommentData[]; + pipelineTelemetry?: { + filteredItems?: unknown[]; + extractionPhase?: unknown; + finalCounts?: { + issuesExtracted?: number; + issuesAfterDedup?: number; + issuesAfterFiltering?: number; + commentsGenerated?: number; + commentsKept?: number; + }; + }; +} + +/** + * Finalize a validation run: + * 1. Get the new evaluation versions from completed jobs + * 2. Compare with baseline + * 3. Save comparison results + * 4. 
Update run status + */ +export async function POST( + request: NextRequest, + { params }: { params: Promise<{ id: string }> } +) { + const userId = await authenticateRequest(request); + if (!userId) return commonErrors.unauthorized(); + + const adminCheck = await isAdmin(); + if (!adminCheck) return commonErrors.forbidden(); + + const { id: runId } = await params; + + try { + // Get the run + const run = await prisma.validationRun.findUnique({ + where: { id: runId }, + include: { + baseline: { + select: { id: true, agentId: true }, + }, + }, + }); + + if (!run) { + return NextResponse.json({ error: "Run not found" }, { status: 404 }); + } + + if (run.status === "completed") { + return NextResponse.json({ error: "Run already finalized" }, { status: 400 }); + } + + // Get baseline snapshots + const baselineSnapshots = await metaEvaluationRepository.getBaselineSnapshots(run.baselineId); + + if (baselineSnapshots.length === 0) { + await metaEvaluationRepository.updateValidationRunStatus(runId, "failed", "Baseline has no snapshots"); + return NextResponse.json({ error: "Baseline has no snapshots" }, { status: 400 }); + } + + // Get the document IDs + const documentIds = [...new Set(baselineSnapshots.map((s) => s.documentId))]; + + // Get the latest evaluation versions for these documents + const newSnapshots = await metaEvaluationRepository.getEvaluationSnapshots( + documentIds, + run.baseline.agentId + ); + + // Compare and save results + let unchangedCount = 0; + let changedCount = 0; + + for (const baselineSnapshot of baselineSnapshots) { + const newSnapshot = newSnapshots.find( + (s) => s && s.documentId === baselineSnapshot.documentId + ); + + if (newSnapshot) { + // Compare comments + const comparison = compareSnapshots( + toEvaluationSnapshot(baselineSnapshot), + toEvaluationSnapshot(newSnapshot) + ); + + // Get baseline snapshot record ID + const baselineSnapshotRecord = await metaEvaluationRepository.getBaselineSnapshotByDocument( + run.baselineId, + 
baselineSnapshot.documentId + ); + + if (baselineSnapshotRecord) { + const status = + comparison.newComments.length === 0 && comparison.lostComments.length === 0 + ? "unchanged" + : "changed"; + + if (status === "unchanged") unchangedCount++; + else changedCount++; + + // Get pipeline telemetry from new snapshot + const telemetry = newSnapshot.pipelineTelemetry as EvaluationSnapshot["pipelineTelemetry"]; + const finalCounts = telemetry?.finalCounts; + + // Get full telemetry record for stages + const fullTelemetry = newSnapshot.pipelineTelemetry as { + stages?: Array<{ + stageName: string; + durationMs: number; + inputCount: number; + outputCount: number; + model?: string; + costUsd?: number; + }>; + totalDurationMs?: number; + } & EvaluationSnapshot["pipelineTelemetry"]; + + await metaEvaluationRepository.addValidationRunSnapshot({ + runId, + baselineSnapshotId: baselineSnapshotRecord.id, + newEvaluationId: newSnapshot.evaluationVersionId, + status: status as "unchanged" | "changed", + keptCount: comparison.matchedComments.length, + newCount: comparison.newComments.length, + lostCount: comparison.lostComments.length, + comparisonData: { + matchedComments: comparison.matchedComments, + newComments: comparison.newComments, + lostComments: comparison.lostComments, + filteredItems: telemetry?.filteredItems, + extractionPhase: telemetry?.extractionPhase, + stages: fullTelemetry?.stages, + totalDurationMs: fullTelemetry?.totalDurationMs, + pipelineCounts: finalCounts + ? { + issuesAfterDedup: finalCounts.issuesAfterDedup ?? 0, + issuesAfterFiltering: finalCounts.issuesAfterFiltering ?? 0, + commentsGenerated: finalCounts.commentsGenerated ?? 0, + commentsKept: finalCounts.commentsKept ?? 
0, + } + : undefined, + }, + }); + } + } + } + + // Update run status + const summary = `${unchangedCount} unchanged, ${changedCount} changed`; + await metaEvaluationRepository.updateValidationRunStatus(runId, "completed", summary); + + logger.info("Validation run finalized", { + runId, + unchangedCount, + changedCount, + }); + + return NextResponse.json({ + success: true, + summary, + unchangedCount, + changedCount, + }); + } catch (error) { + logger.error("Error finalizing validation run:", error); + + // Mark run as failed + try { + await metaEvaluationRepository.updateValidationRunStatus( + runId, + "failed", + error instanceof Error ? error.message : "Unknown error" + ); + } catch { + // Ignore secondary error + } + + return commonErrors.serverError("Failed to finalize validation run"); + } +} + +// Helper to convert snapshot format +function toEvaluationSnapshot(snapshot: { + evaluationVersionId: string; + documentId: string; + comments: CommentData[]; + pipelineTelemetry?: unknown; +}): EvaluationSnapshot { + return { + evaluationVersionId: snapshot.evaluationVersionId, + documentId: snapshot.documentId, + comments: snapshot.comments, + pipelineTelemetry: snapshot.pipelineTelemetry as EvaluationSnapshot["pipelineTelemetry"], + }; +} + +// Simple comment comparison +function compareSnapshots(baseline: EvaluationSnapshot, current: EvaluationSnapshot) { + const matchedComments: Array<{ + baselineComment: CommentData; + currentComment: CommentData; + matchConfidence: number; + status: string; + }> = []; + const newComments: CommentData[] = []; + const lostComments: CommentData[] = []; + + const usedCurrentIndices = new Set(); + + // Find matches based on quoted text similarity + for (const baselineComment of baseline.comments) { + let bestMatch: { index: number; score: number } | null = null; + + for (let i = 0; i < current.comments.length; i++) { + if (usedCurrentIndices.has(i)) continue; + + const currentComment = current.comments[i]; + const score = 
calculateSimilarity(baselineComment.quotedText, currentComment.quotedText); + + if (score > 0.6 && (!bestMatch || score > bestMatch.score)) { + bestMatch = { index: i, score }; + } + } + + if (bestMatch) { + usedCurrentIndices.add(bestMatch.index); + matchedComments.push({ + baselineComment, + currentComment: current.comments[bestMatch.index], + matchConfidence: bestMatch.score, + status: "matched", + }); + } else { + lostComments.push(baselineComment); + } + } + + // Find new comments (not matched to any baseline) + for (let i = 0; i < current.comments.length; i++) { + if (!usedCurrentIndices.has(i)) { + newComments.push(current.comments[i]); + } + } + + return { matchedComments, newComments, lostComments }; +} + +// Simple text similarity (Jaccard on words) +function calculateSimilarity(a: string, b: string): number { + const wordsA = new Set(a.toLowerCase().split(/\s+/)); + const wordsB = new Set(b.toLowerCase().split(/\s+/)); + + const intersection = new Set([...wordsA].filter((x) => wordsB.has(x))); + const union = new Set([...wordsA, ...wordsB]); + + return intersection.size / union.size; +} diff --git a/apps/web/src/app/api/monitor/lab/runs/[id]/route.ts b/apps/web/src/app/api/monitor/lab/runs/[id]/route.ts new file mode 100644 index 00000000..c56fe885 --- /dev/null +++ b/apps/web/src/app/api/monitor/lab/runs/[id]/route.ts @@ -0,0 +1,51 @@ +import { NextRequest, NextResponse } from "next/server"; +import { logger } from "@/infrastructure/logging/logger"; +import { authenticateRequest } from "@/infrastructure/auth/auth-helpers"; +import { commonErrors } from "@/infrastructure/http/api-response-helpers"; +import { isAdmin } from "@/infrastructure/auth/auth"; +import { metaEvaluationRepository } from "@roast/db"; + +export async function GET( + request: NextRequest, + { params }: { params: Promise<{ id: string }> } +) { + const userId = await authenticateRequest(request); + if (!userId) return commonErrors.unauthorized(); + + const adminCheck = await isAdmin(); 
+ if (!adminCheck) return commonErrors.forbidden(); + + const { id } = await params; + + try { + const run = await metaEvaluationRepository.getValidationRunDetail(id); + if (!run) { + return NextResponse.json({ error: "Run not found" }, { status: 404 }); + } + return NextResponse.json({ run }); + } catch (error) { + logger.error("Error fetching run detail:", error); + return commonErrors.serverError("Failed to fetch run detail"); + } +} + +export async function DELETE( + request: NextRequest, + { params }: { params: Promise<{ id: string }> } +) { + const userId = await authenticateRequest(request); + if (!userId) return commonErrors.unauthorized(); + + const adminCheck = await isAdmin(); + if (!adminCheck) return commonErrors.forbidden(); + + const { id } = await params; + + try { + await metaEvaluationRepository.deleteValidationRun(id); + return NextResponse.json({ success: true }); + } catch (error) { + logger.error("Error deleting run:", error); + return commonErrors.serverError("Failed to delete run"); + } +} diff --git a/apps/web/src/app/api/monitor/lab/runs/route.ts b/apps/web/src/app/api/monitor/lab/runs/route.ts new file mode 100644 index 00000000..e16bcaf2 --- /dev/null +++ b/apps/web/src/app/api/monitor/lab/runs/route.ts @@ -0,0 +1,59 @@ +import { NextRequest, NextResponse } from "next/server"; +import { logger } from "@/infrastructure/logging/logger"; +import { authenticateRequest } from "@/infrastructure/auth/auth-helpers"; +import { commonErrors } from "@/infrastructure/http/api-response-helpers"; +import { isAdmin } from "@/infrastructure/auth/auth"; +import { metaEvaluationRepository } from "@roast/db"; + +export async function GET(request: NextRequest) { + const userId = await authenticateRequest(request); + if (!userId) return commonErrors.unauthorized(); + + const adminCheck = await isAdmin(); + if (!adminCheck) return commonErrors.forbidden(); + + const baselineId = request.nextUrl.searchParams.get("baselineId"); + if (!baselineId) { + return 
NextResponse.json({ error: "baselineId is required" }, { status: 400 }); + } + + try { + const runs = await metaEvaluationRepository.getValidationRuns(baselineId); + return NextResponse.json({ runs }); + } catch (error) { + logger.error("Error fetching runs:", error); + return commonErrors.serverError("Failed to fetch runs"); + } +} + +export async function POST(request: NextRequest) { + const userId = await authenticateRequest(request); + if (!userId) return commonErrors.unauthorized(); + + const adminCheck = await isAdmin(); + if (!adminCheck) return commonErrors.forbidden(); + + try { + const body = await request.json(); + const { baselineId, name } = body; + + if (!baselineId) { + return NextResponse.json({ error: "baselineId is required" }, { status: 400 }); + } + + // Create the run record + const run = await metaEvaluationRepository.createValidationRun({ + baselineId, + name, + }); + + // Note: The actual evaluation execution would be triggered separately + // (e.g., via a job queue). For now, we just create the run record. + // The CLI handles the actual pipeline execution. 
+ + return NextResponse.json({ run }); + } catch (error) { + logger.error("Error creating run:", error); + return commonErrors.serverError("Failed to create run"); + } +} diff --git a/apps/web/src/app/api/monitor/lab/runs/start/route.ts b/apps/web/src/app/api/monitor/lab/runs/start/route.ts new file mode 100644 index 00000000..b583b99c --- /dev/null +++ b/apps/web/src/app/api/monitor/lab/runs/start/route.ts @@ -0,0 +1,114 @@ +import { NextRequest, NextResponse } from "next/server"; +import { logger } from "@/infrastructure/logging/logger"; +import { authenticateRequest } from "@/infrastructure/auth/auth-helpers"; +import { commonErrors } from "@/infrastructure/http/api-response-helpers"; +import { isAdmin } from "@/infrastructure/auth/auth"; +import { prisma, metaEvaluationRepository, generateId } from "@roast/db"; +import { getServices } from "@/application/services/ServiceFactory"; + +/** + * Start a validation run: + * 1. Create ValidationRun record + * 2. Get documents from baseline + * 3. Create batch jobs to re-evaluate each document + * 4. 
Return run ID and job IDs for polling + */ +export async function POST(request: NextRequest) { + const userId = await authenticateRequest(request); + if (!userId) return commonErrors.unauthorized(); + + const adminCheck = await isAdmin(); + if (!adminCheck) return commonErrors.forbidden(); + + try { + const body = await request.json(); + const { baselineId, name } = body; + + if (!baselineId) { + return NextResponse.json({ error: "baselineId is required" }, { status: 400 }); + } + + // Get baseline info + const baseline = await prisma.validationBaseline.findUnique({ + where: { id: baselineId }, + select: { id: true, name: true, agentId: true }, + }); + + if (!baseline) { + return NextResponse.json({ error: "Baseline not found" }, { status: 404 }); + } + + // Get document IDs from baseline + const documentIds = await metaEvaluationRepository.getBaselineDocumentIds(baselineId); + + if (documentIds.length === 0) { + return NextResponse.json({ error: "Baseline has no documents" }, { status: 400 }); + } + + // Create the validation run + const run = await metaEvaluationRepository.createValidationRun({ + baselineId, + name: name || `Run ${new Date().toLocaleString()}`, + }); + + // Create batch for the jobs + const batch = await prisma.agentEvalBatch.create({ + data: { + name: `Validation run ${run.id.slice(0, 8)}`, + agentId: baseline.agentId, + requestedDocumentIds: documentIds, + userId, + }, + }); + + // Create evaluations and jobs for each document + const jobIds: string[] = []; + const { jobService } = getServices(); + + for (const documentId of documentIds) { + // Check if evaluation exists + let evaluation = await prisma.evaluation.findFirst({ + where: { + documentId, + agentId: baseline.agentId, + }, + }); + + // Create evaluation if it doesn't exist + if (!evaluation) { + evaluation = await prisma.evaluation.create({ + data: { + documentId, + agentId: baseline.agentId, + }, + }); + } + + // Create job + const job = await jobService.createJob(evaluation.id, 
batch.id); + jobIds.push(job.id); + } + + logger.info("Validation run started", { + runId: run.id, + baselineId, + documentCount: documentIds.length, + jobCount: jobIds.length, + }); + + return NextResponse.json({ + run: { + id: run.id, + status: "running", + }, + batch: { + id: batch.id, + }, + jobIds, + documentCount: documentIds.length, + }); + } catch (error) { + logger.error("Error starting validation run:", error); + return commonErrors.serverError("Failed to start validation run"); + } +} diff --git a/apps/web/src/app/api/monitor/lab/snapshots/[id]/route.ts b/apps/web/src/app/api/monitor/lab/snapshots/[id]/route.ts new file mode 100644 index 00000000..72a8e5b3 --- /dev/null +++ b/apps/web/src/app/api/monitor/lab/snapshots/[id]/route.ts @@ -0,0 +1,100 @@ +import { NextRequest, NextResponse } from "next/server"; +import { logger } from "@/infrastructure/logging/logger"; +import { authenticateRequest } from "@/infrastructure/auth/auth-helpers"; +import { commonErrors } from "@/infrastructure/http/api-response-helpers"; +import { isAdmin } from "@/infrastructure/auth/auth"; +import { prisma } from "@roast/db"; + +export async function GET( + request: NextRequest, + { params }: { params: Promise<{ id: string }> } +) { + const userId = await authenticateRequest(request); + if (!userId) return commonErrors.unauthorized(); + + const adminCheck = await isAdmin(); + if (!adminCheck) return commonErrors.forbidden(); + + const { id } = await params; + + try { + // Get the run snapshot with full comparison data + const snapshot = await prisma.validationRunSnapshot.findUnique({ + where: { id }, + include: { + baselineSnapshot: { + include: { + evaluationVersion: { + include: { + evaluation: { + include: { + document: { + include: { + versions: { + orderBy: { version: "desc" }, + take: 1, + select: { title: true }, + }, + }, + }, + }, + }, + comments: { + include: { highlight: true }, + }, + }, + }, + }, + }, + newEvaluation: { + include: { + comments: { + include: { 
highlight: true }, + }, + }, + }, + }, + }); + + if (!snapshot) { + return NextResponse.json({ error: "Snapshot not found" }, { status: 404 }); + } + + // Format baseline comments + const baselineComments = snapshot.baselineSnapshot.evaluationVersion.comments.map((c) => ({ + id: c.id, + quotedText: c.highlight.quotedText, + header: c.header, + description: c.description, + importance: c.importance, + })); + + // Format current comments + const currentComments = snapshot.newEvaluation.comments.map((c) => ({ + id: c.id, + quotedText: c.highlight.quotedText, + header: c.header, + description: c.description, + importance: c.importance, + })); + + return NextResponse.json({ + snapshot: { + id: snapshot.id, + status: snapshot.status, + keptCount: snapshot.keptCount, + newCount: snapshot.newCount, + lostCount: snapshot.lostCount, + documentTitle: + snapshot.baselineSnapshot.evaluationVersion.evaluation.document.versions[0]?.title || + "Unknown", + comparisonData: snapshot.comparisonData, + baselineComments, + currentComments, + }, + }); + } catch (error) { + logger.error("Error fetching snapshot:", error); + return commonErrors.serverError("Failed to fetch snapshot"); + } +} diff --git a/apps/web/src/app/monitor/client-layout.tsx b/apps/web/src/app/monitor/client-layout.tsx index be6fd5bd..16201927 100644 --- a/apps/web/src/app/monitor/client-layout.tsx +++ b/apps/web/src/app/monitor/client-layout.tsx @@ -55,6 +55,12 @@ export default function MonitorLayout({ children }: MonitorLayoutProps) { > Docs + + Lab + void; + onDelete: () => void; +} + +export function BaselineCard({ baseline, isSelected, onSelect, onDelete }: BaselineCardProps) { + return ( +
+
+
+

{baseline.name}

+ {baseline.description && ( +

{baseline.description}

+ )} +
+ {baseline.snapshotCount} documents + {formatDate(baseline.createdAt)} + {baseline.commitHash && ( + {baseline.commitHash.slice(0, 7)} + )} +
+
+ +
+
+ ); +} diff --git a/apps/web/src/app/monitor/lab/components/baselines/BaselineList.tsx b/apps/web/src/app/monitor/lab/components/baselines/BaselineList.tsx new file mode 100644 index 00000000..cfd43778 --- /dev/null +++ b/apps/web/src/app/monitor/lab/components/baselines/BaselineList.tsx @@ -0,0 +1,27 @@ +"use client"; + +import { BaselineCard } from "./BaselineCard"; +import type { Baseline } from "../../types"; + +interface BaselineListProps { + baselines: Baseline[]; + selectedId: string | null; + onSelect: (baseline: Baseline) => void; + onDelete: (id: string) => void; +} + +export function BaselineList({ baselines, selectedId, onSelect, onDelete }: BaselineListProps) { + return ( +
+ {baselines.map((baseline) => ( + onSelect(baseline)} + onDelete={() => onDelete(baseline.id)} + /> + ))} +
+ ); +} diff --git a/apps/web/src/app/monitor/lab/components/baselines/CreateBaselineModal.tsx b/apps/web/src/app/monitor/lab/components/baselines/CreateBaselineModal.tsx new file mode 100644 index 00000000..037c3579 --- /dev/null +++ b/apps/web/src/app/monitor/lab/components/baselines/CreateBaselineModal.tsx @@ -0,0 +1,228 @@ +"use client"; + +import { useState, useEffect, useCallback } from "react"; +import { XMarkIcon, MagnifyingGlassIcon } from "@heroicons/react/24/outline"; +import type { CorpusDocument } from "../../types"; +import { truncate } from "../../utils/formatters"; + +interface CreateBaselineModalProps { + agentId: string; + onClose: () => void; + onCreated: () => void; +} + +function getDefaultName(): string { + const now = new Date(); + const date = now.toISOString().split("T")[0]; + return `Baseline ${date}`; +} + +export function CreateBaselineModal({ agentId, onClose, onCreated }: CreateBaselineModalProps) { + const [name, setName] = useState(getDefaultName); + const [description, setDescription] = useState(""); + const [searchQuery, setSearchQuery] = useState(""); + const [documents, setDocuments] = useState([]); + const [loading, setLoading] = useState(true); + const [selectedIds, setSelectedIds] = useState>(new Set()); + const [creating, setCreating] = useState(false); + + const fetchDocuments = useCallback(async (filter?: string) => { + setLoading(true); + try { + const params = new URLSearchParams({ agentId }); + if (filter) params.set("filter", filter); + const res = await fetch(`/api/monitor/lab/corpus?${params}`); + if (res.ok) { + const data = await res.json(); + setDocuments(data.documents); + } + } finally { + setLoading(false); + } + }, [agentId]); + + useEffect(() => { + fetchDocuments(); + }, [fetchDocuments]); + + const handleSearch = () => { + fetchDocuments(searchQuery || undefined); + }; + + const toggleDocument = (docId: string) => { + const newSet = new Set(selectedIds); + if (newSet.has(docId)) { + newSet.delete(docId); + } 
else { + newSet.add(docId); + } + setSelectedIds(newSet); + }; + + const handleSelectAll = () => { + setSelectedIds(new Set(documents.map((d) => d.documentId))); + }; + + const handleSelectNone = () => { + setSelectedIds(new Set()); + }; + + const handleCreate = async () => { + if (!name.trim() || selectedIds.size === 0) return; + setCreating(true); + try { + const res = await fetch("/api/monitor/lab/baselines", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + agentId, + name: name.trim(), + description: description.trim() || undefined, + documentIds: Array.from(selectedIds), + }), + }); + if (res.ok) { + onCreated(); + } + } finally { + setCreating(false); + } + }; + + return ( +
+
+ {/* Header */} +
+

Create Validation Baseline

+ +
+ + {/* Content */} +
+ {/* Name & Description */} +
+
+ + setName(e.target.value)} + placeholder="e.g., Pre-refactor baseline" + className="w-full px-3 py-2 border border-gray-300 rounded-md focus:outline-none focus:ring-2 focus:ring-blue-500" + /> +
+
+ + setDescription(e.target.value)} + placeholder="e.g., Baseline before filter changes" + className="w-full px-3 py-2 border border-gray-300 rounded-md focus:outline-none focus:ring-2 focus:ring-blue-500" + /> +
+
+ + {/* Document Selection */} +
+
+ +
+ + +
+
+ + {/* Search */} +
+
+ + setSearchQuery(e.target.value)} + onKeyDown={(e) => e.key === "Enter" && handleSearch()} + placeholder="Search documents..." + className="w-full pl-9 pr-3 py-2 border border-gray-300 rounded-md focus:outline-none focus:ring-2 focus:ring-blue-500 text-sm" + /> +
+ +
+ + {/* Document List */} +
+ {loading ? ( +
Loading documents...
+ ) : documents.length === 0 ? ( +
No documents found
+ ) : ( +
+ {documents.map((doc) => ( + + ))} +
+ )} +
+
+
+ + {/* Footer */} +
+ + +
+
+
+ ); +} diff --git a/apps/web/src/app/monitor/lab/components/history/RunDetail.tsx b/apps/web/src/app/monitor/lab/components/history/RunDetail.tsx new file mode 100644 index 00000000..7ec73754 --- /dev/null +++ b/apps/web/src/app/monitor/lab/components/history/RunDetail.tsx @@ -0,0 +1,126 @@ +"use client"; + +import { useState, useEffect } from "react"; +import { ChevronRightIcon, ChevronLeftIcon } from "@heroicons/react/24/outline"; +import { formatDate } from "../../utils/formatters"; +import type { ValidationRunDetail, RunSnapshot } from "../../types"; +import { SnapshotComparison } from "../snapshots/SnapshotComparison"; + +interface RunDetailProps { + runId: string; +} + +export function RunDetail({ runId }: RunDetailProps) { + const [run, setRun] = useState(null); + const [loading, setLoading] = useState(true); + const [selectedSnapshot, setSelectedSnapshot] = useState(null); + + useEffect(() => { + const fetchRun = async () => { + setLoading(true); + try { + const res = await fetch(`/api/monitor/lab/runs/${runId}`); + if (res.ok) { + const data = await res.json(); + setRun(data.run); + } + } finally { + setLoading(false); + } + }; + fetchRun(); + }, [runId]); + + if (loading) { + return
Loading run details...
; + } + + if (!run) { + return
Run not found
; + } + + if (selectedSnapshot) { + return ( +
+ + setSelectedSnapshot(null)} /> +
+ ); + } + + const changedSnapshots = run.snapshots.filter((s) => s.status === "changed"); + const unchangedSnapshots = run.snapshots.filter((s) => s.status === "unchanged"); + + return ( +
+ {/* Summary */} +
+ {formatDate(run.createdAt)} + {run.summary && {run.summary}} +
+ + {/* Changed First */} + {changedSnapshots.length > 0 && ( +
+

+ Changed ({changedSnapshots.length}) +

+
+ {changedSnapshots.map((snapshot) => ( + setSelectedSnapshot(snapshot)} + /> + ))} +
+
+ )} + + {/* Unchanged */} + {unchangedSnapshots.length > 0 && ( +
+

+ Unchanged ({unchangedSnapshots.length}) +

+
+ {unchangedSnapshots.map((snapshot) => ( + setSelectedSnapshot(snapshot)} + /> + ))} +
+
+ )} + + {run.snapshots.length === 0 && ( +
No snapshots in this run
+ )} +
+ ); +} + +function SnapshotRow({ snapshot, onClick }: { snapshot: RunSnapshot; onClick: () => void }) { + return ( +
+
+

{snapshot.documentTitle}

+

+ {snapshot.keptCount} kept Β· {snapshot.newCount} new Β· {snapshot.lostCount} lost +

+
+ +
+ ); +} diff --git a/apps/web/src/app/monitor/lab/components/snapshots/PipelineView.tsx b/apps/web/src/app/monitor/lab/components/snapshots/PipelineView.tsx new file mode 100644 index 00000000..fd67d6ab --- /dev/null +++ b/apps/web/src/app/monitor/lab/components/snapshots/PipelineView.tsx @@ -0,0 +1,468 @@ +"use client"; + +import { useState } from "react"; +import { ChevronDownIcon, ChevronRightIcon } from "@heroicons/react/24/outline"; +import type { ExtractionPhase, PipelineCounts, FilteredItem, Comment, StageMetrics } from "../../types"; +import { truncate } from "../../utils/formatters"; + +interface PipelineViewProps { + extraction?: ExtractionPhase; + counts?: PipelineCounts; + filteredItems: FilteredItem[]; + stages?: StageMetrics[]; + totalDurationMs?: number; + finalComments: Comment[]; + lostComments: Comment[]; +} + +function formatDuration(ms: number | undefined): string { + if (ms === undefined) return "β€”"; + if (ms < 1000) return `${ms}ms`; + if (ms < 60000) return `${(ms / 1000).toFixed(1)}s`; + return `${(ms / 60000).toFixed(1)}m`; +} + +function formatCost(usd: number | undefined): string { + if (usd === undefined) return ""; + return `$${usd.toFixed(4)}`; +} + +export function PipelineView({ + extraction, + counts, + filteredItems, + stages, + totalDurationMs, + finalComments, + lostComments, +}: PipelineViewProps) { + // Helper to get stage timing + const getStageTiming = (stageName: string): StageMetrics | undefined => { + return stages?.find((s) => s.stageName === stageName); + }; + const [expandedSteps, setExpandedSteps] = useState>(new Set()); + + const toggleStep = (step: string) => { + const newSet = new Set(expandedSteps); + if (newSet.has(step)) { + newSet.delete(step); + } else { + newSet.add(step); + } + setExpandedSteps(newSet); + }; + + const extractors = extraction?.extractors ?? []; + const totalExtracted = extraction?.totalIssuesBeforeJudge ?? 0; + const afterDedup = counts?.issuesAfterDedup ?? 
extraction?.totalIssuesAfterJudge ?? 0; + const afterFilter = counts?.issuesAfterFiltering ?? 0; + const commentsGenerated = counts?.commentsGenerated ?? 0; + const commentsKept = counts?.commentsKept ?? 0; + + const dedupRemoved = totalExtracted - afterDedup; + const filterRemoved = afterDedup - afterFilter; + const reviewRemoved = commentsGenerated - commentsKept; + + // Separate filtered items by stage + const filterStageItems = filteredItems.filter((item) => item.stage === "supported-elsewhere-filter"); + const reviewStageItems = filteredItems.filter((item) => item.stage === "review"); + + return ( +
+
+

Pipeline Flow

+
+ +
+ {/* Step 1: Extraction */} + toggleStep("extraction")} + color="blue" + > +
+ {extractors.map((ext, i) => ( +
+
+ {ext.extractorId} +
+ {ext.durationMs !== undefined && ( + {formatDuration(ext.durationMs)} + )} + {ext.costUsd !== undefined && ( + {formatCost(ext.costUsd)} + )} + {ext.issuesFound} issues +
+
+
{ext.model}
+
+ ))} + {extractors.length === 0 && ( +

No extractor data available

+ )} + {extraction?.judgeDurationMs !== undefined && ( +
+ Judge aggregation: {formatDuration(extraction.judgeDurationMs)} +
+ )} +
+
+ + {/* Step 2: Deduplication */} + toggleStep("dedup")} + color="purple" + > +
+ {/* Per-model input breakdown */} + {extractors.length > 0 && ( +
+
Input by Model
+
+ {extractors.map((ext, i) => { + // Calculate approximate survival rate (proportional) + const survivalRate = totalExtracted > 0 + ? (afterDedup / totalExtracted) + : 0; + const estimatedKept = Math.round(ext.issuesFound * survivalRate); + + return ( +
+ {ext.extractorId} +
+ {ext.issuesFound} + β†’ + ~{estimatedKept} + (est.) +
+
+ ); + })} +
+
+ )} + + {/* Summary stats */} +
+
+
+ Total Input: + {totalExtracted} +
+
+ Total Output: + {afterDedup} +
+
+ Duplicates Removed: + -{dedupRemoved} +
+
+ Dedup Rate: + + {totalExtracted > 0 ? Math.round((dedupRemoved / totalExtracted) * 100) : 0}% + +
+
+
+ +

+ Semantic deduplication merges similar issues across models. Per-model estimates assume uniform dedup rate. +

+
+
+ + {/* Step 3: Filtering */} + toggleStep("filter")} + color="orange" + > +
+
+
+
+ Input: + {afterDedup} +
+
+ Output: + {afterFilter} +
+
+
+ + {filterStageItems.length > 0 && ( +
+
+ Filtered Items ({filterStageItems.length}) +
+
+ {filterStageItems.map((item, i) => ( + + ))} +
+
+ )} + + {filterStageItems.length === 0 && filterRemoved > 0 && ( +

+ {filterRemoved} items filtered (details not available) +

+ )} +
+
+ + {/* Step 4: Comment Generation */} + toggleStep("generation")} + color="teal" + > +
+
+
+ Input (issues): + {afterFilter} +
+
+ Output (comments): + {commentsGenerated} +
+
+

+ Issues are converted to user-facing comments with proper formatting +

+
+
+ + {/* Step 5: Review */} + toggleStep("review")} + color="green" + > +
+
+
+
+ Input: + {commentsGenerated} +
+
+ Final Output: + {commentsKept} +
+
+
+ + {/* Removed by review */} + {reviewStageItems.length > 0 && ( +
+
+ Removed by Review ({reviewStageItems.length}) +
+
+ {reviewStageItems.map((item, i) => ( + + ))} +
+
+ )} + + {reviewStageItems.length === 0 && reviewRemoved > 0 && ( +

+ {reviewRemoved} comments removed (details not available) +

+ )} + + {/* Final kept comments */} + {finalComments.length > 0 && ( +
+
+ Final Comments ({finalComments.length}) +
+
+ {finalComments.map((comment, i) => ( + + ))} +
+
+ )} +
+
+
+ + {/* Summary Bar */} +
+
+ + {totalExtracted} extracted β†’{" "} + {afterDedup} deduped β†’{" "} + {afterFilter} filtered β†’{" "} + {commentsGenerated} generated β†’{" "} + {commentsKept} final + +
+ {totalDurationMs !== undefined && ( + {formatDuration(totalDurationMs)} + )} + + {totalExtracted > 0 + ? `${Math.round((commentsKept / totalExtracted) * 100)}% yield` + : "β€”"} + +
+
+
+
+ ); +} + +interface PipelineStepProps { + step: string; + title: string; + summary: string; + timing?: number; + isExpanded: boolean; + onToggle: () => void; + color: "blue" | "purple" | "orange" | "teal" | "green"; + children: React.ReactNode; +} + +function PipelineStep({ + title, + summary, + timing, + isExpanded, + onToggle, + color, + children, +}: PipelineStepProps) { + const colorClasses = { + blue: "bg-blue-100 text-blue-800 border-blue-200", + purple: "bg-purple-100 text-purple-800 border-purple-200", + orange: "bg-orange-100 text-orange-800 border-orange-200", + teal: "bg-teal-100 text-teal-800 border-teal-200", + green: "bg-green-100 text-green-800 border-green-200", + }; + + return ( +
+ + {isExpanded &&
{children}
} +
+ ); +} + +function FilteredItemCard({ item }: { item: FilteredItem }) { + const [expanded, setExpanded] = useState(false); + + return ( +
+
setExpanded(!expanded)} + > +
+
+ + {item.stage === "supported-elsewhere-filter" ? "Filter" : "Review"} + + {item.header && ( + [{item.header}] + )} +
+

{truncate(item.quotedText, 80)}

+
+ +
+ {expanded && ( +
+

+ Reason: {item.filterReason} +

+ {item.supportLocation && ( +

+ Support found at: {item.supportLocation} +

+ )} +
+ )} +
+ ); +} + +function CommentCard({ comment, variant }: { comment: Comment; variant: "kept" | "lost" }) { + const [expanded, setExpanded] = useState(false); + const bgColor = variant === "kept" ? "bg-green-50 border-green-100" : "bg-red-50 border-red-100"; + + return ( +
+
setExpanded(!expanded)} + > +
+ {comment.header || "Comment"} +

{truncate(comment.quotedText, 80)}

+
+ +
+ {expanded && ( +
+

{comment.description}

+ {comment.importance && ( +

+ Importance: {comment.importance} +

+ )} +
+ )} +
+ ); +} diff --git a/apps/web/src/app/monitor/lab/components/snapshots/SnapshotComparison.tsx b/apps/web/src/app/monitor/lab/components/snapshots/SnapshotComparison.tsx new file mode 100644 index 00000000..2ceed9ee --- /dev/null +++ b/apps/web/src/app/monitor/lab/components/snapshots/SnapshotComparison.tsx @@ -0,0 +1,228 @@ +"use client"; + +import { useState } from "react"; +import { ArrowLeftIcon } from "@heroicons/react/24/outline"; +import type { RunSnapshot, ComparisonData, CommentMatch, Comment } from "../../types"; +import { truncate } from "../../utils/formatters"; +import { PipelineView } from "./PipelineView"; + +interface SnapshotComparisonProps { + snapshot: RunSnapshot; + onBack: () => void; +} + +type ViewTab = "pipeline" | "comparison"; + +export function SnapshotComparison({ snapshot, onBack }: SnapshotComparisonProps) { + const [activeTab, setActiveTab] = useState("pipeline"); + + const comparison = snapshot.comparisonData as ComparisonData | null; + const matched = comparison?.matchedComments ?? []; + const newComments = comparison?.newComments ?? []; + const lostComments = comparison?.lostComments ?? []; + const filteredItems = comparison?.filteredItems ?? []; + const pipelineCounts = comparison?.pipelineCounts; + const extractionPhase = comparison?.extractionPhase; + const stages = comparison?.stages; + const totalDurationMs = comparison?.totalDurationMs; + + // Collect all final comments for the pipeline view + const allFinalComments: Comment[] = [ + ...matched.map((m) => m.currentComment || m.baselineComment).filter(Boolean), + ...newComments, + ]; + + return ( +
+ {/* Header */} +
+ +

{snapshot.documentTitle}

+ +
+ + {/* Tab Navigation */} +
+ +
+ + {/* Tab Content */} +
+ {activeTab === "pipeline" ? ( + + ) : ( + + )} +
+
+ ); +} + +function TabButton({ + active, + onClick, + label, +}: { + active: boolean; + onClick: () => void; + label: string; +}) { + return ( + + ); +} + +function StatusSummary({ snapshot }: { snapshot: RunSnapshot }) { + return ( +
+ + {snapshot.status === "unchanged" ? "Unchanged" : "Changed"} + + | + {snapshot.keptCount} matched + {snapshot.newCount} new + {snapshot.lostCount} gone +
+ ); +} + +interface ComparisonViewProps { + matched: CommentMatch[]; + newComments: Comment[]; + lostComments: Comment[]; +} + +function ComparisonView({ matched, newComments, lostComments }: ComparisonViewProps) { + return ( +
+ {/* Matched Comments */} + + {matched.map((match, i) => ( + + ))} + + + {/* New Comments */} + + {newComments.map((comment, i) => ( + + ))} + + + {/* Lost Comments */} + + {lostComments.map((comment, i) => ( + + ))} + +
+ ); +} + +interface ComparisonSectionProps { + title: string; + titleColor: string; + isEmpty: boolean; + emptyMessage: string; + children: React.ReactNode; +} + +function ComparisonSection({ + title, + titleColor, + isEmpty, + emptyMessage, + children, +}: ComparisonSectionProps) { + return ( +
+

{title}

+ {isEmpty ? ( +

{emptyMessage}

+ ) : ( +
{children}
+ )} +
+ ); +} + +function MatchedCommentItem({ match }: { match: CommentMatch }) { + const comment = match.baselineComment || match.currentComment; + if (!comment) return null; + + return ( +
+
+ {comment.header || "Comment"} + + (confidence: {Math.round((match.matchConfidence ?? 1) * 100)}%) + +
+

{truncate(comment.quotedText, 100)}

+

{truncate(comment.description, 150)}

+
+ ); +} + +function CommentItem({ comment }: { comment: Comment }) { + return ( +
+
{comment.header || "Comment"}
+

{truncate(comment.quotedText, 100)}

+

{truncate(comment.description, 150)}

+
+ ); +} diff --git a/apps/web/src/app/monitor/lab/components/tabs/BaselinesTab.tsx b/apps/web/src/app/monitor/lab/components/tabs/BaselinesTab.tsx new file mode 100644 index 00000000..50ba0422 --- /dev/null +++ b/apps/web/src/app/monitor/lab/components/tabs/BaselinesTab.tsx @@ -0,0 +1,105 @@ +"use client"; + +import { useEffect, useState } from "react"; +import { useBaselines } from "../../hooks/useBaselines"; +import { BaselineList } from "../baselines/BaselineList"; +import { CreateBaselineModal } from "../baselines/CreateBaselineModal"; +import type { Baseline } from "../../types"; + +interface BaselinesTabProps { + agentId: string; + selectedBaseline: Baseline | null; + onSelectBaseline: (baseline: Baseline | null) => void; +} + +export function BaselinesTab({ agentId, selectedBaseline, onSelectBaseline }: BaselinesTabProps) { + const { baselines, loading, error, refresh, deleteBaseline } = useBaselines(agentId); + const [showCreateModal, setShowCreateModal] = useState(false); + + useEffect(() => { + refresh(); + }, [refresh]); + + const handleBaselineCreated = () => { + setShowCreateModal(false); + refresh(); + }; + + const handleDelete = async (id: string) => { + if (confirm("Delete this baseline? This cannot be undone.")) { + await deleteBaseline(id); + if (selectedBaseline?.id === id) { + onSelectBaseline(null); + } + } + }; + + if (loading) { + return ; + } + + if (error) { + return ; + } + + return ( +
+
+

Validation Baselines

+ +
+ + {baselines.length === 0 ? ( + + ) : ( + + )} + + {showCreateModal && ( + setShowCreateModal(false)} + onCreated={handleBaselineCreated} + /> + )} +
+ ); +} + +function LoadingState({ message }: { message: string }) { + return ( +
+
{message}
+
+ ); +} + +function ErrorState({ message }: { message: string }) { + return ( +
+
Error: {message}
+
+ ); +} + +function EmptyState({ message, action }: { message: string; action: string }) { + return ( +
+

{message}

+

{action}

+
+ ); +} diff --git a/apps/web/src/app/monitor/lab/components/tabs/HistoryTab.tsx b/apps/web/src/app/monitor/lab/components/tabs/HistoryTab.tsx new file mode 100644 index 00000000..2e495f82 --- /dev/null +++ b/apps/web/src/app/monitor/lab/components/tabs/HistoryTab.tsx @@ -0,0 +1,303 @@ +"use client"; + +import { useEffect, useState } from "react"; +import { useBaselines } from "../../hooks/useBaselines"; +import { useRuns } from "../../hooks/useRuns"; +import { formatDate } from "../../utils/formatters"; +import type { Baseline, ValidationRun, ValidationRunDetail, RunSnapshot } from "../../types"; +import { SnapshotComparison } from "../snapshots/SnapshotComparison"; +import { ChevronRightIcon, TrashIcon } from "@heroicons/react/24/outline"; + +interface HistoryTabProps { + agentId: string; + selectedBaseline: Baseline | null; + onSelectBaseline: (baseline: Baseline | null) => void; +} + +export function HistoryTab({ agentId, selectedBaseline, onSelectBaseline }: HistoryTabProps) { + const { baselines, loading: baselinesLoading, refresh: refreshBaselines } = useBaselines(agentId); + const { runs, loading: runsLoading, refresh: refreshRuns, getRunDetail, deleteRun } = useRuns(selectedBaseline?.id ?? null); + const [selectedRun, setSelectedRun] = useState(null); + const [selectedSnapshot, setSelectedSnapshot] = useState(null); + const [loadingDetail, setLoadingDetail] = useState(false); + + useEffect(() => { + refreshBaselines(); + }, [refreshBaselines]); + + useEffect(() => { + if (selectedBaseline) { + refreshRuns(); + setSelectedRun(null); + setSelectedSnapshot(null); + } + }, [selectedBaseline, refreshRuns]); + + const handleSelectRun = async (run: ValidationRun) => { + setLoadingDetail(true); + setSelectedSnapshot(null); + try { + const detail = await getRunDetail(run.id); + setSelectedRun(detail); + } finally { + setLoadingDetail(false); + } + }; + + const handleDeleteRun = async (runId: string) => { + if (confirm("Delete this run? 
This cannot be undone.")) { + await deleteRun(runId); + if (selectedRun?.id === runId) { + setSelectedRun(null); + setSelectedSnapshot(null); + } + } + }; + + if (baselinesLoading) { + return ; + } + + if (baselines.length === 0) { + return ( + + ); + } + + return ( +
+ {/* Baseline Selector */} +
+ +
+ + {selectedBaseline && ( +
+ {/* Run List */} +
+
+

Validation Runs

+
+
+ {runsLoading ? ( +
Loading...
+ ) : runs.length === 0 ? ( +
No runs yet
+ ) : ( + runs.map((run) => ( + handleSelectRun(run)} + onDelete={() => handleDeleteRun(run.id)} + /> + )) + )} +
+
+ + {/* Run Detail / Snapshot List */} +
+ {loadingDetail ? ( + + ) : selectedSnapshot ? ( + setSelectedSnapshot(null)} + /> + ) : selectedRun ? ( + + ) : ( +
+ Select a run to view details +
+ )} +
+
+ )} +
+ ); +} + +interface RunListItemProps { + run: ValidationRun; + isSelected: boolean; + onSelect: () => void; + onDelete: () => void; +} + +function RunListItem({ run, isSelected, onSelect, onDelete }: RunListItemProps) { + return ( +
+
+
+
+ + {run.name || `Run ${run.id.slice(0, 8)}`} + + +
+
{formatDate(run.createdAt)}
+ {run.status === "completed" && ( +
+ {run.unchangedCount} unchanged + {run.changedCount > 0 && ( + {run.changedCount} changed + )} +
+ )} +
+ +
+
+ ); +} + +interface RunDetailProps { + run: ValidationRunDetail; + onSelectSnapshot: (snapshot: RunSnapshot) => void; +} + +function RunDetail({ run, onSelectSnapshot }: RunDetailProps) { + const unchangedSnapshots = run.snapshots.filter((s) => s.status === "unchanged"); + const changedSnapshots = run.snapshots.filter((s) => s.status === "changed"); + + return ( +
+
+

+ {run.name || `Run ${run.id.slice(0, 8)}`} +

+

+ {formatDate(run.createdAt)} | Baseline: {run.baseline.name} +

+ {run.summary &&

{run.summary}

} +
+ +
+ {/* Changed Snapshots First */} + {changedSnapshots.length > 0 && ( +
+

+ Changed ({changedSnapshots.length}) +

+
+ {changedSnapshots.map((snapshot) => ( + onSelectSnapshot(snapshot)} + /> + ))} +
+
+ )} + + {/* Unchanged Snapshots */} + {unchangedSnapshots.length > 0 && ( +
+

+ Unchanged ({unchangedSnapshots.length}) +

+
+ {unchangedSnapshots.map((snapshot) => ( + onSelectSnapshot(snapshot)} + /> + ))} +
+
+ )} +
+
+ ); +} + +function SnapshotListItem({ + snapshot, + onSelect, +}: { + snapshot: RunSnapshot; + onSelect: () => void; +}) { + return ( +
+
+

{snapshot.documentTitle}

+

+ {snapshot.keptCount} matched | {snapshot.newCount} new | {snapshot.lostCount} gone +

+
+ +
+ ); +} + +function StatusBadge({ status }: { status: string }) { + const colors = { + running: "bg-yellow-100 text-yellow-800", + completed: "bg-green-100 text-green-800", + failed: "bg-red-100 text-red-800", + }; + return ( + + {status} + + ); +} + +function LoadingState({ message }: { message: string }) { + return ( +
+
{message}
+
+ ); +} + +function EmptyState({ message, action }: { message: string; action: string }) { + return ( +
+

{message}

+

{action}

+
+ ); +} diff --git a/apps/web/src/app/monitor/lab/components/tabs/RunTab.tsx b/apps/web/src/app/monitor/lab/components/tabs/RunTab.tsx new file mode 100644 index 00000000..133456b8 --- /dev/null +++ b/apps/web/src/app/monitor/lab/components/tabs/RunTab.tsx @@ -0,0 +1,306 @@ +"use client"; + +import { useEffect, useState, useCallback } from "react"; +import { useBaselines } from "../../hooks/useBaselines"; +import { formatDate } from "../../utils/formatters"; +import type { Baseline } from "../../types"; +import { PlayIcon, ArrowPathIcon, CheckCircleIcon, XCircleIcon } from "@heroicons/react/24/outline"; + +interface RunTabProps { + agentId: string; + selectedBaseline: Baseline | null; + onSelectBaseline: (baseline: Baseline | null) => void; +} + +interface RunProgress { + phase: "idle" | "starting" | "running" | "comparing" | "done" | "error"; + message: string; + completed: number; + total: number; + runId?: string; + error?: string; +} + +export function RunTab({ agentId, selectedBaseline, onSelectBaseline }: RunTabProps) { + const { baselines, loading: baselinesLoading, refresh: refreshBaselines } = useBaselines(agentId); + const [runName, setRunName] = useState(""); + const [progress, setProgress] = useState({ + phase: "idle", + message: "", + completed: 0, + total: 0, + }); + + useEffect(() => { + refreshBaselines(); + }, [refreshBaselines]); + + const pollJobStatus = useCallback(async (jobIds: string[]): Promise => { + const res = await fetch(`/api/monitor/lab/jobs/status?jobIds=${jobIds.join(",")}`); + if (!res.ok) throw new Error("Failed to check job status"); + + const data = await res.json(); + setProgress((p) => ({ + ...p, + completed: data.summary.completed + data.summary.failed, + total: data.summary.total, + message: `${data.summary.completed} completed, ${data.summary.running} running, ${data.summary.pending} pending`, + })); + + return data.summary.allDone; + }, []); + + const startRun = async () => { + if (!selectedBaseline) return; + + 
setProgress({ + phase: "starting", + message: "Creating validation run...", + completed: 0, + total: 0, + }); + + try { + // Start the run + const startRes = await fetch("/api/monitor/lab/runs/start", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + baselineId: selectedBaseline.id, + name: runName || undefined, + }), + }); + + if (!startRes.ok) { + const err = await startRes.json(); + throw new Error(err.error || "Failed to start run"); + } + + const startData = await startRes.json(); + const runId = startData.run.id; + const { jobIds } = startData; + + setProgress({ + phase: "running", + message: `Evaluating ${jobIds.length} documents...`, + completed: 0, + total: jobIds.length, + runId, + }); + + // Poll for job completion + const maxWaitMs = 10 * 60 * 1000; // 10 minutes + const pollIntervalMs = 3000; // 3 seconds + const startTime = Date.now(); + + while (Date.now() - startTime < maxWaitMs) { + const allDone = await pollJobStatus(jobIds); + if (allDone) break; + await new Promise((r) => setTimeout(r, pollIntervalMs)); + } + + // Finalize the run (compare results) + setProgress((p) => ({ + ...p, + phase: "comparing", + message: "Comparing results with baseline...", + })); + + const finalizeRes = await fetch(`/api/monitor/lab/runs/${runId}/finalize`, { + method: "POST", + }); + + if (!finalizeRes.ok) { + const err = await finalizeRes.json(); + throw new Error(err.error || "Failed to finalize run"); + } + + const finalizeData = await finalizeRes.json(); + + setProgress({ + phase: "done", + message: finalizeData.summary, + completed: finalizeData.unchangedCount + finalizeData.changedCount, + total: finalizeData.unchangedCount + finalizeData.changedCount, + runId, + }); + + setRunName(""); + } catch (error) { + setProgress((p) => ({ + ...p, + phase: "error", + message: error instanceof Error ? error.message : "Unknown error", + error: error instanceof Error ? 
error.message : "Unknown error", + })); + } + }; + + if (baselinesLoading) { + return ; + } + + if (baselines.length === 0) { + return ( + + ); + } + + const isRunning = progress.phase === "starting" || progress.phase === "running" || progress.phase === "comparing"; + + return ( +
+ {/* Baseline Selector */} +
+

Select Baseline

+ +
+ + {/* Run Configuration */} + {selectedBaseline && ( +
+

Run Validation

+
+
+ + setRunName(e.target.value)} + placeholder="e.g., After filter changes" + disabled={isRunning} + className="w-full px-3 py-2 border border-gray-300 rounded-md focus:outline-none focus:ring-2 focus:ring-blue-500 disabled:bg-gray-100" + /> +
+
+ + + Will re-evaluate {selectedBaseline.snapshotCount} documents + +
+
+
+ )} + + {/* Progress */} + {progress.phase !== "idle" && ( + + )} +
+ ); +} + +function ProgressPanel({ progress }: { progress: RunProgress }) { + const getStatusColor = () => { + switch (progress.phase) { + case "done": + return "bg-green-50 border-green-200"; + case "error": + return "bg-red-50 border-red-200"; + default: + return "bg-blue-50 border-blue-200"; + } + }; + + const getIcon = () => { + switch (progress.phase) { + case "done": + return ; + case "error": + return ; + default: + return ; + } + }; + + const progressPercent = progress.total > 0 ? Math.round((progress.completed / progress.total) * 100) : 0; + + return ( +
+
+ {getIcon()} +
+

+ {progress.phase === "starting" && "Starting..."} + {progress.phase === "running" && "Running Evaluations"} + {progress.phase === "comparing" && "Comparing Results"} + {progress.phase === "done" && "Completed"} + {progress.phase === "error" && "Error"} +

+

{progress.message}

+ + {/* Progress bar */} + {(progress.phase === "running" || progress.phase === "comparing") && progress.total > 0 && ( +
+
+ {progress.completed} / {progress.total} + {progressPercent}% +
+
+
+
+
+ )} + + {/* Link to history */} + {progress.phase === "done" && progress.runId && ( +

+ View results in the History tab +

+ )} +
+
+
+ ); +} + +function LoadingState({ message }: { message: string }) { + return ( +
+
{message}
+
+ ); +} + +function EmptyState({ message, action }: { message: string; action: string }) { + return ( +
+

{message}

+

{action}

+
+ ); +} diff --git a/apps/web/src/app/monitor/lab/hooks/useBaselines.ts b/apps/web/src/app/monitor/lab/hooks/useBaselines.ts new file mode 100644 index 00000000..a5f3ebc3 --- /dev/null +++ b/apps/web/src/app/monitor/lab/hooks/useBaselines.ts @@ -0,0 +1,63 @@ +import { useState, useCallback, useEffect } from "react"; +import type { Baseline } from "../types"; + +interface UseBaselinesReturn { + baselines: Baseline[]; + loading: boolean; + error: string | null; + refresh: () => Promise; + createBaseline: (name: string, description: string, evaluationVersionIds: string[]) => Promise; + deleteBaseline: (id: string) => Promise; +} + +export function useBaselines(agentId: string): UseBaselinesReturn { + const [baselines, setBaselines] = useState([]); + const [loading, setLoading] = useState(true); + const [error, setError] = useState(null); + + const refresh = useCallback(async () => { + if (!agentId) return; + setLoading(true); + setError(null); + try { + const res = await fetch(`/api/monitor/lab/baselines?agentId=${agentId}`); + if (!res.ok) throw new Error("Failed to fetch baselines"); + const data = await res.json(); + setBaselines(data.baselines); + } catch (err) { + setError(err instanceof Error ? 
err.message : "Unknown error"); + } finally { + setLoading(false); + } + }, [agentId]); + + useEffect(() => { + refresh(); + }, [refresh]); + + const createBaseline = useCallback( + async (name: string, description: string, evaluationVersionIds: string[]): Promise => { + const res = await fetch("/api/monitor/lab/baselines", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ name, description, agentId, evaluationVersionIds }), + }); + if (!res.ok) throw new Error("Failed to create baseline"); + const data = await res.json(); + await refresh(); + return data.baseline; + }, + [agentId, refresh] + ); + + const deleteBaseline = useCallback( + async (id: string) => { + const res = await fetch(`/api/monitor/lab/baselines/${id}`, { method: "DELETE" }); + if (!res.ok) throw new Error("Failed to delete baseline"); + await refresh(); + }, + [refresh] + ); + + return { baselines, loading, error, refresh, createBaseline, deleteBaseline }; +} diff --git a/apps/web/src/app/monitor/lab/hooks/useCorpusDocs.ts b/apps/web/src/app/monitor/lab/hooks/useCorpusDocs.ts new file mode 100644 index 00000000..8fe9f0d6 --- /dev/null +++ b/apps/web/src/app/monitor/lab/hooks/useCorpusDocs.ts @@ -0,0 +1,40 @@ +import { useState, useCallback } from "react"; +import type { CorpusDocument } from "../types"; + +interface UseCorpusDocsReturn { + documents: CorpusDocument[]; + loading: boolean; + error: string | null; + refresh: (filter?: string) => Promise; +} + +export function useCorpusDocs(agentId: string): UseCorpusDocsReturn { + const [documents, setDocuments] = useState([]); + const [loading, setLoading] = useState(false); + const [error, setError] = useState(null); + + const refresh = useCallback( + async (filter?: string) => { + if (!agentId) return; + setLoading(true); + setError(null); + try { + const url = new URL("/api/monitor/lab/corpus", window.location.origin); + url.searchParams.set("agentId", agentId); + if (filter) 
url.searchParams.set("filter", filter); + + const res = await fetch(url.toString()); + if (!res.ok) throw new Error("Failed to fetch corpus documents"); + const data = await res.json(); + setDocuments(data.documents); + } catch (err) { + setError(err instanceof Error ? err.message : "Unknown error"); + } finally { + setLoading(false); + } + }, + [agentId] + ); + + return { documents, loading, error, refresh }; +} diff --git a/apps/web/src/app/monitor/lab/hooks/useRuns.ts b/apps/web/src/app/monitor/lab/hooks/useRuns.ts new file mode 100644 index 00000000..c8551272 --- /dev/null +++ b/apps/web/src/app/monitor/lab/hooks/useRuns.ts @@ -0,0 +1,73 @@ +import { useState, useCallback, useEffect } from "react"; +import type { ValidationRun, ValidationRunDetail } from "../types"; + +interface UseRunsReturn { + runs: ValidationRun[]; + loading: boolean; + error: string | null; + refresh: () => Promise; + startRun: (baselineId: string, name?: string) => Promise; + getRunDetail: (runId: string) => Promise; + deleteRun: (runId: string) => Promise; +} + +export function useRuns(baselineId: string | null): UseRunsReturn { + const [runs, setRuns] = useState([]); + const [loading, setLoading] = useState(false); + const [error, setError] = useState(null); + + const refresh = useCallback(async () => { + if (!baselineId) { + setRuns([]); + return; + } + setLoading(true); + setError(null); + try { + const res = await fetch(`/api/monitor/lab/runs?baselineId=${baselineId}`); + if (!res.ok) throw new Error("Failed to fetch runs"); + const data = await res.json(); + setRuns(data.runs); + } catch (err) { + setError(err instanceof Error ? 
err.message : "Unknown error"); + } finally { + setLoading(false); + } + }, [baselineId]); + + useEffect(() => { + refresh(); + }, [refresh]); + + const startRun = useCallback( + async (baselineId: string, name?: string): Promise => { + const res = await fetch("/api/monitor/lab/runs", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ baselineId, name }), + }); + if (!res.ok) throw new Error("Failed to start run"); + const data = await res.json(); + return data.run; + }, + [] + ); + + const getRunDetail = useCallback(async (runId: string): Promise => { + const res = await fetch(`/api/monitor/lab/runs/${runId}`); + if (!res.ok) return null; + const data = await res.json(); + return data.run; + }, []); + + const deleteRun = useCallback( + async (runId: string) => { + const res = await fetch(`/api/monitor/lab/runs/${runId}`, { method: "DELETE" }); + if (!res.ok) throw new Error("Failed to delete run"); + await refresh(); + }, + [refresh] + ); + + return { runs, loading, error, refresh, startRun, getRunDetail, deleteRun }; +} diff --git a/apps/web/src/app/monitor/lab/page.tsx b/apps/web/src/app/monitor/lab/page.tsx new file mode 100644 index 00000000..77e5c422 --- /dev/null +++ b/apps/web/src/app/monitor/lab/page.tsx @@ -0,0 +1,331 @@ +"use client"; + +import { useState, useCallback } from "react"; +import { useBaselines } from "./hooks/useBaselines"; +import { useRuns } from "./hooks/useRuns"; +import type { Baseline } from "./types"; +import { formatDate } from "./utils/formatters"; +import { PlusIcon, PlayIcon, ArrowPathIcon, TrashIcon } from "@heroicons/react/24/outline"; +import { CreateBaselineModal } from "./components/baselines/CreateBaselineModal"; +import { RunDetail } from "./components/history/RunDetail"; + +const AGENT_ID = "system-fallacy-check"; + +function getDefaultRunName(): string { + const now = new Date(); + return `Run ${now.toLocaleString("en-US", { month: "short", day: "numeric", hour: "numeric", 
minute: "2-digit" })}`; +} + +export default function LabPage() { + const { baselines, loading: baselinesLoading, refresh: refreshBaselines, deleteBaseline } = useBaselines(AGENT_ID); + const [selectedBaseline, setSelectedBaseline] = useState(null); + const [showCreateModal, setShowCreateModal] = useState(false); + const [expandedRun, setExpandedRun] = useState(null); + + // Run state + const [runName, setRunName] = useState(getDefaultRunName); + const [runProgress, setRunProgress] = useState<{ + phase: "idle" | "starting" | "running" | "comparing" | "done" | "error"; + message: string; + completed: number; + total: number; + }>({ phase: "idle", message: "", completed: 0, total: 0 }); + + // Get runs for selected baseline + const { runs, loading: runsLoading, refresh: refreshRuns } = useRuns(selectedBaseline?.id ?? null); + + const pollJobStatus = useCallback(async (jobIds: string[]): Promise => { + const res = await fetch(`/api/monitor/lab/jobs/status?jobIds=${jobIds.join(",")}`); + if (!res.ok) throw new Error("Failed to check job status"); + const data = await res.json(); + setRunProgress((p) => ({ + ...p, + completed: data.summary.completed + data.summary.failed, + total: data.summary.total, + message: `${data.summary.completed} completed, ${data.summary.running} running, ${data.summary.pending} pending`, + })); + return data.summary.allDone; + }, []); + + const startRun = async () => { + if (!selectedBaseline) return; + + setRunProgress({ phase: "starting", message: "Creating validation run...", completed: 0, total: 0 }); + + try { + const startRes = await fetch("/api/monitor/lab/runs/start", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ baselineId: selectedBaseline.id, name: runName || undefined }), + }); + + if (!startRes.ok) { + const err = await startRes.json(); + throw new Error(err.error || "Failed to start run"); + } + + const startData = await startRes.json(); + const runId = startData.run.id; + const { 
jobIds } = startData; + + setRunProgress({ + phase: "running", + message: `Evaluating ${jobIds.length} documents...`, + completed: 0, + total: jobIds.length, + }); + + // Poll for completion + const maxWaitMs = 10 * 60 * 1000; + const pollIntervalMs = 3000; + const startTime = Date.now(); + + while (Date.now() - startTime < maxWaitMs) { + const allDone = await pollJobStatus(jobIds); + if (allDone) break; + await new Promise((r) => setTimeout(r, pollIntervalMs)); + } + + setRunProgress((p) => ({ ...p, phase: "comparing", message: "Comparing results..." })); + + const finalizeRes = await fetch(`/api/monitor/lab/runs/${runId}/finalize`, { method: "POST" }); + if (!finalizeRes.ok) { + const err = await finalizeRes.json(); + throw new Error(err.error || "Failed to finalize run"); + } + + const finalizeData = await finalizeRes.json(); + setRunProgress({ + phase: "done", + message: finalizeData.summary, + completed: finalizeData.unchangedCount + finalizeData.changedCount, + total: finalizeData.unchangedCount + finalizeData.changedCount, + }); + + setRunName(getDefaultRunName()); + refreshRuns(); + } catch (error) { + setRunProgress((p) => ({ + ...p, + phase: "error", + message: error instanceof Error ? error.message : "Unknown error", + })); + } + }; + + const isRunning = runProgress.phase === "starting" || runProgress.phase === "running" || runProgress.phase === "comparing"; + const progressPercent = runProgress.total > 0 ? Math.round((runProgress.completed / runProgress.total) * 100) : 0; + + const handleBaselineCreated = () => { + setShowCreateModal(false); + refreshBaselines(); + }; + + const handleDeleteBaseline = async (id: string) => { + if (!confirm("Delete this baseline?")) return; + await deleteBaseline(id); + if (selectedBaseline?.id === id) { + setSelectedBaseline(null); + } + }; + + return ( +
+ {/* Left Sidebar - Baselines */} +
+
+
+

Baselines

+ +
+

Select a baseline to run validation

+
+ +
+ {baselinesLoading ? ( +
Loading...
+ ) : baselines.length === 0 ? ( +
No baselines yet
+ ) : ( +
+ {baselines.map((baseline) => ( +
{ + setSelectedBaseline(baseline); + setExpandedRun(null); + }} + className={`p-3 cursor-pointer hover:bg-gray-100 ${ + selectedBaseline?.id === baseline.id ? "bg-blue-50 border-l-4 border-blue-500" : "" + }`} + > +
+
+
{baseline.name}
+
+ {baseline.snapshotCount} docs Β· {formatDate(baseline.createdAt)} +
+
+ +
+
+ ))} +
+ )} +
+
+ + {/* Main Content */} +
+ {!selectedBaseline ? ( +
+
+

Select a baseline to get started

+

or create a new one

+
+
+ ) : ( + <> + {/* Run Controls Header */} +
+
+
+

{selectedBaseline.name}

+

+ {selectedBaseline.snapshotCount} documents Β· Created {formatDate(selectedBaseline.createdAt)} +

+
+
+ setRunName(e.target.value)} + placeholder="Run name (optional)" + disabled={isRunning} + className="px-3 py-2 border rounded-md text-sm w-48 disabled:bg-gray-100" + /> + +
+
+ + {/* Progress Bar */} + {runProgress.phase !== "idle" && ( +
+
+ + {runProgress.phase === "starting" && "Starting..."} + {runProgress.phase === "running" && "Running evaluations"} + {runProgress.phase === "comparing" && "Comparing results"} + {runProgress.phase === "done" && "Complete"} + {runProgress.phase === "error" && "Error"} + + {runProgress.message} +
+ {(runProgress.phase === "running" || runProgress.phase === "comparing") && ( +
+
+
+ )} +
+ )} +
+ + {/* Run History */} +
+

Run History

+ {runsLoading ? ( +
Loading runs...
+ ) : runs.length === 0 ? ( +
+

No runs yet for this baseline

+

Click "Run Validation" to start

+
+ ) : ( +
+ {runs.map((run) => ( +
+
setExpandedRun(expandedRun === run.id ? null : run.id)} + className="p-4 cursor-pointer hover:bg-gray-50 flex items-center justify-between" + > +
+
{run.name}
+
+ {formatDate(run.createdAt)} Β· {run.summary || run.status} +
+
+
+ + {expandedRun === run.id ? "β–Ό" : "β–Ά"} +
+
+ {expandedRun === run.id && ( +
+ +
+ )} +
+ ))} +
+ )} +
+ + )} +
+ + {/* Create Baseline Modal */} + {showCreateModal && ( + setShowCreateModal(false)} + onCreated={handleBaselineCreated} + /> + )} +
+ ); +} + +function RunStatusBadge({ status }: { status: string }) { + const styles = { + completed: "bg-green-100 text-green-800", + running: "bg-blue-100 text-blue-800", + failed: "bg-red-100 text-red-800", + pending: "bg-gray-100 text-gray-800", + }; + return ( + + {status} + + ); +} diff --git a/apps/web/src/app/monitor/lab/types.ts b/apps/web/src/app/monitor/lab/types.ts new file mode 100644 index 00000000..8d3b3b58 --- /dev/null +++ b/apps/web/src/app/monitor/lab/types.ts @@ -0,0 +1,122 @@ +// Types for the Lab (Validation) feature + +export interface Baseline { + id: string; + name: string; + description: string | null; + commitHash: string | null; + createdAt: string; + snapshotCount: number; +} + +export interface CorpusDocument { + documentId: string; + title: string; + contentLength: number; + lastEvaluatedAt: string | null; + evaluationCount: number; +} + +export interface ValidationRun { + id: string; + name: string | null; + commitHash: string | null; + status: "running" | "completed" | "failed"; + summary: string | null; + createdAt: string; + completedAt: string | null; + snapshotCount: number; + unchangedCount: number; + changedCount: number; +} + +export interface RunSnapshot { + id: string; + status: "unchanged" | "changed"; + keptCount: number; + newCount: number; + lostCount: number; + documentId: string; + documentTitle: string; + comparisonData: ComparisonData | null; +} + +export interface ComparisonData { + matchedComments: CommentMatch[]; + newComments: Comment[]; + lostComments: Comment[]; + filteredItems?: FilteredItem[]; + pipelineCounts?: PipelineCounts; + extractionPhase?: ExtractionPhase; + stages?: StageMetrics[]; + totalDurationMs?: number; +} + +export interface CommentMatch { + baselineComment: Comment; + currentComment: Comment; + matchConfidence: number; + status: string; +} + +export interface Comment { + id: string; + quotedText: string; + header: string | null; + description: string; + importance: number | null; +} + +export 
interface FilteredItem { + stage: "supported-elsewhere-filter" | "review"; + filterReason: string; + quotedText: string; + header?: string; + originalIndex?: number; + supportLocation?: string; +} + +export interface PipelineCounts { + issuesAfterDedup: number; + issuesAfterFiltering: number; + commentsGenerated: number; + commentsKept: number; +} + +export interface ExtractorInfo { + extractorId: string; + model: string; + issuesFound: number; + durationMs?: number; + costUsd?: number; +} + +export interface ExtractionPhase { + totalIssuesBeforeJudge: number; + totalIssuesAfterJudge: number; + extractors?: ExtractorInfo[]; + judgeDurationMs?: number; +} + +export interface StageMetrics { + stageName: string; + durationMs: number; + inputCount: number; + outputCount: number; + model?: string; + costUsd?: number; +} + +export interface ValidationRunDetail { + id: string; + name: string | null; + commitHash: string | null; + status: string; + summary: string | null; + createdAt: string; + completedAt: string | null; + baseline: { id: string; name: string }; + snapshots: RunSnapshot[]; +} + +export type TabId = "baselines" | "run" | "history"; diff --git a/apps/web/src/app/monitor/lab/utils/formatters.ts b/apps/web/src/app/monitor/lab/utils/formatters.ts new file mode 100644 index 00000000..d855ac6e --- /dev/null +++ b/apps/web/src/app/monitor/lab/utils/formatters.ts @@ -0,0 +1,54 @@ +// Display formatting utilities + +export function formatDate(dateString: string): string { + const date = new Date(dateString); + return date.toLocaleDateString("en-US", { + month: "short", + day: "numeric", + year: "numeric", + hour: "2-digit", + minute: "2-digit", + }); +} + +export function formatDateShort(dateString: string): string { + const date = new Date(dateString); + return date.toLocaleDateString("en-US", { + month: "short", + day: "numeric", + }); +} + +export function truncate(str: string, maxLen: number): string { + const clean = str.replace(/[\n\r\t]+/g, " 
").replace(/\s+/g, " ").trim(); + if (clean.length <= maxLen) return clean; + return clean.slice(0, maxLen - 3) + "..."; +} + +export function formatStatus(status: string): string { + switch (status) { + case "running": + return "Running"; + case "completed": + return "Completed"; + case "failed": + return "Failed"; + case "unchanged": + return "Unchanged"; + case "changed": + return "Changed"; + default: + return status; + } +} + +export function formatFilterStage(stage: string): string { + switch (stage) { + case "supported-elsewhere-filter": + return "Filter"; + case "review": + return "Review"; + default: + return stage; + } +} From f93f350fc4f2d28c6238936e241626dcabe7a51e Mon Sep 17 00:00:00 2001 From: Michael Ravits Date: Sun, 18 Jan 2026 16:15:43 +0000 Subject: [PATCH 42/72] feat(lab): Add profile editor with configurable filter chain - Add profile management UI with full CRUD operations - Add filter chain editor with add/remove/reorder capabilities - Support temperature, reasoning, and custom prompt settings for filters - Add model selector with Anthropic + OpenRouter models - Extend Claude wrapper to support ThinkingConfig with custom budget_tokens - Wire pipeline to read filter config from profile - Add profile-types and profile-loader for backend config validation - Add migrations for FallacyCheckerProfile and ValidationRun.profileId - Remove deprecated ProfileEditorModal (replaced by Profiles tab) - Improve flow summary with clearer labels (max/model, similarity, intake) Co-Authored-By: Claude Opus 4.5 --- .../src/app/api/monitor/lab/models/route.ts | 26 + .../api/monitor/lab/profiles/[id]/route.ts | 150 +++++ .../src/app/api/monitor/lab/profiles/route.ts | 133 ++++ .../src/app/api/monitor/lab/prompts/route.ts | 22 + .../app/api/monitor/lab/runs/start/route.ts | 4 +- .../components/profiles/ExtractorEditor.tsx | 523 +++++++++++++++ .../components/profiles/FilterChainEditor.tsx | 592 +++++++++++++++++ .../lab/components/profiles/JudgeEditor.tsx | 201 
++++++ .../lab/components/profiles/ModelSelector.tsx | 160 +++++ .../components/profiles/ProfileDetailView.tsx | 606 ++++++++++++++++++ .../lab/components/profiles/ProfilesList.tsx | 130 ++++ .../monitor/lab/hooks/useDefaultPrompts.ts | 37 ++ .../src/app/monitor/lab/hooks/useModels.ts | 86 +++ .../src/app/monitor/lab/hooks/useProfiles.ts | 110 ++++ apps/web/src/app/monitor/lab/page.tsx | 263 ++++++-- apps/web/src/app/monitor/lab/types.ts | 124 ++++ internal-packages/ai/package.json | 12 + .../ai/src/analysis-plugins/PluginManager.ts | 23 +- .../fallacy-check/extraction/config.ts | 30 + .../extraction/multiExtractor.ts | 11 +- .../plugins/fallacy-check/extraction/types.ts | 14 + .../plugins/fallacy-check/index.ts | 218 ++++++- .../plugins/fallacy-check/profile-loader.ts | 439 +++++++++++++ .../plugins/fallacy-check/profile-types.ts | 308 +++++++++ .../telemetry/PipelineTelemetry.ts | 11 + .../plugins/fallacy-check/telemetry/index.ts | 1 + .../plugins/fallacy-check/telemetry/types.ts | 122 +++- internal-packages/ai/src/claude/wrapper.ts | 93 ++- internal-packages/ai/src/index.ts | 7 + .../ai/src/tools/fallacy-extractor/index.ts | 131 +--- .../ai/src/tools/fallacy-extractor/prompts.ts | 117 ++++ .../ai/src/tools/fallacy-extractor/types.ts | 25 + .../ai/src/tools/fallacy-judge/index.ts | 30 +- .../ai/src/tools/fallacy-judge/prompts.ts | 33 + .../ai/src/tools/fallacy-judge/types.ts | 3 + .../ai/src/tools/fallacy-review/index.ts | 6 +- .../ai/src/tools/fallacy-review/types.ts | 3 + .../ai/src/tools/generated-schemas.ts | 26 +- .../tools/supported-elsewhere-filter/index.ts | 100 +-- .../supported-elsewhere-filter/prompts.ts | 53 ++ .../tools/supported-elsewhere-filter/types.ts | 18 + internal-packages/ai/src/utils/allModels.ts | 168 +++++ internal-packages/ai/src/utils/openrouter.ts | 51 ++ .../documentAnalysis/analyzeDocument.ts | 49 +- .../documentAnalysis/unified/index.ts | 7 + .../migration.sql | 22 + .../migration.sql | 2 + internal-packages/db/prisma/schema.prisma | 
17 + .../repositories/MetaEvaluationRepository.ts | 5 +- meta-evals/src/components/ModelSelector.tsx | 2 +- 50 files changed, 5028 insertions(+), 296 deletions(-) create mode 100644 apps/web/src/app/api/monitor/lab/models/route.ts create mode 100644 apps/web/src/app/api/monitor/lab/profiles/[id]/route.ts create mode 100644 apps/web/src/app/api/monitor/lab/profiles/route.ts create mode 100644 apps/web/src/app/api/monitor/lab/prompts/route.ts create mode 100644 apps/web/src/app/monitor/lab/components/profiles/ExtractorEditor.tsx create mode 100644 apps/web/src/app/monitor/lab/components/profiles/FilterChainEditor.tsx create mode 100644 apps/web/src/app/monitor/lab/components/profiles/JudgeEditor.tsx create mode 100644 apps/web/src/app/monitor/lab/components/profiles/ModelSelector.tsx create mode 100644 apps/web/src/app/monitor/lab/components/profiles/ProfileDetailView.tsx create mode 100644 apps/web/src/app/monitor/lab/components/profiles/ProfilesList.tsx create mode 100644 apps/web/src/app/monitor/lab/hooks/useDefaultPrompts.ts create mode 100644 apps/web/src/app/monitor/lab/hooks/useModels.ts create mode 100644 apps/web/src/app/monitor/lab/hooks/useProfiles.ts create mode 100644 internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/profile-loader.ts create mode 100644 internal-packages/ai/src/analysis-plugins/plugins/fallacy-check/profile-types.ts create mode 100644 internal-packages/ai/src/tools/fallacy-extractor/prompts.ts create mode 100644 internal-packages/ai/src/tools/fallacy-judge/prompts.ts create mode 100644 internal-packages/ai/src/tools/supported-elsewhere-filter/prompts.ts create mode 100644 internal-packages/ai/src/utils/allModels.ts create mode 100644 internal-packages/db/prisma/migrations/20260118095326_add_fallacy_checker_profile/migration.sql create mode 100644 internal-packages/db/prisma/migrations/20260118100032_add_profileid_to_validation_run/migration.sql diff --git a/apps/web/src/app/api/monitor/lab/models/route.ts 
b/apps/web/src/app/api/monitor/lab/models/route.ts new file mode 100644 index 00000000..58716f8e --- /dev/null +++ b/apps/web/src/app/api/monitor/lab/models/route.ts @@ -0,0 +1,26 @@ +import { NextRequest, NextResponse } from "next/server"; +import { logger } from "@/infrastructure/logging/logger"; +import { authenticateRequest } from "@/infrastructure/auth/auth-helpers"; +import { commonErrors } from "@/infrastructure/http/api-response-helpers"; +import { isAdmin } from "@/infrastructure/auth/auth"; +import { getAllModels } from "@roast/ai"; + +/** + * GET /api/monitor/lab/models + * Fetch all available models from Anthropic + OpenRouter + */ +export async function GET(request: NextRequest) { + const userId = await authenticateRequest(request); + if (!userId) return commonErrors.unauthorized(); + + const adminCheck = await isAdmin(); + if (!adminCheck) return commonErrors.forbidden(); + + try { + const models = await getAllModels(); + return NextResponse.json({ models }); + } catch (error) { + logger.error("Error fetching models:", error); + return commonErrors.serverError("Failed to fetch models"); + } +} diff --git a/apps/web/src/app/api/monitor/lab/profiles/[id]/route.ts b/apps/web/src/app/api/monitor/lab/profiles/[id]/route.ts new file mode 100644 index 00000000..035477d5 --- /dev/null +++ b/apps/web/src/app/api/monitor/lab/profiles/[id]/route.ts @@ -0,0 +1,150 @@ +import { NextRequest, NextResponse } from "next/server"; +import { logger } from "@/infrastructure/logging/logger"; +import { authenticateRequest } from "@/infrastructure/auth/auth-helpers"; +import { commonErrors } from "@/infrastructure/http/api-response-helpers"; +import { isAdmin } from "@/infrastructure/auth/auth"; +import { prisma } from "@roast/db"; + +/** + * GET /api/monitor/lab/profiles/[id] + * Get a single profile by ID + */ +export async function GET( + request: NextRequest, + { params }: { params: Promise<{ id: string }> } +) { + const userId = await authenticateRequest(request); + if 
(!userId) return commonErrors.unauthorized();

  // Profile management is admin-only.
  const adminCheck = await isAdmin();
  if (!adminCheck) return commonErrors.forbidden();

  const { id } = await params;

  try {
    const profile = await prisma.fallacyCheckerProfile.findUnique({
      where: { id },
    });
    if (!profile) {
      return NextResponse.json({ error: "Profile not found" }, { status: 404 });
    }
    return NextResponse.json({ profile });
  } catch (error) {
    logger.error("Error fetching profile:", error);
    return commonErrors.serverError("Failed to fetch profile");
  }
}

/**
 * PUT /api/monitor/lab/profiles/[id]
 * Update a profile. Partial updates: only fields present in the body change.
 */
export async function PUT(
  request: NextRequest,
  { params }: { params: Promise<{ id: string }> }
) {
  const userId = await authenticateRequest(request);
  if (!userId) return commonErrors.unauthorized();

  const adminCheck = await isAdmin();
  if (!adminCheck) return commonErrors.forbidden();

  const { id } = await params;

  try {
    const { name, description, config, isDefault } = await request.json();

    const existing = await prisma.fallacyCheckerProfile.findUnique({
      where: { id },
    });
    if (!existing) {
      return NextResponse.json({ error: "Profile not found" }, { status: 404 });
    }

    // Renames must stay unique per agent (the profile itself is excluded).
    if (name && name !== existing.name) {
      const duplicate = await prisma.fallacyCheckerProfile.findFirst({
        where: {
          agentId: existing.agentId,
          name,
          id: { not: id },
        },
      });
      if (duplicate) {
        return NextResponse.json(
          { error: "A profile with this name already exists" },
          { status: 400 }
        );
      }
    }

    // Promoting this profile to default demotes any other default first.
    if (isDefault && !existing.isDefault) {
      await prisma.fallacyCheckerProfile.updateMany({
        where: { agentId: existing.agentId, isDefault: true, id: { not: id } },
        data: { isDefault: false },
      });
    }

    const profile = await prisma.fallacyCheckerProfile.update({
      where: { id },
      data: {
        ...(name !== undefined && { name }),
        ...(description !== undefined && { description }),
        ...(config !== undefined && { config }),
        ...(isDefault !== undefined && { isDefault }),
      },
    });

    logger.info("Profile updated", { profileId: id });
    return NextResponse.json({ profile });
  } catch (error) {
    logger.error("Error updating profile:", error);
    return commonErrors.serverError("Failed to update profile");
  }
}

/**
 * DELETE /api/monitor/lab/profiles/[id]
 * Delete a profile. Returns 404 if it does not exist.
 */
export async function DELETE(
  request: NextRequest,
  { params }: { params: Promise<{ id: string }> }
) {
  const userId = await authenticateRequest(request);
  if (!userId) return commonErrors.unauthorized();

  const adminCheck = await isAdmin();
  if (!adminCheck) return commonErrors.forbidden();

  const { id } = await params;

  try {
    const existing = await prisma.fallacyCheckerProfile.findUnique({
      where: { id },
    });
    if (!existing) {
      return NextResponse.json({ error: "Profile not found" }, { status: 404 });
    }

    await prisma.fallacyCheckerProfile.delete({ where: { id } });

    logger.info("Profile deleted", { profileId: id });
    return NextResponse.json({ success: true });
  } catch (error) {
    logger.error("Error deleting profile:", error);
    return commonErrors.serverError("Failed to delete profile");
  }
}

// --- apps/web/src/app/api/monitor/lab/profiles/route.ts ---
import { NextRequest, NextResponse } from "next/server";
import { logger } from "@/infrastructure/logging/logger";
import { authenticateRequest } from "@/infrastructure/auth/auth-helpers";
import { commonErrors } from "@/infrastructure/http/api-response-helpers";
import { isAdmin } from "@/infrastructure/auth/auth";
import { prisma } from "@roast/db";
"@roast/db"; + +/** + * GET /api/monitor/lab/profiles + * List all profiles for an agent + */ +export async function GET(request: NextRequest) { + const userId = await authenticateRequest(request); + if (!userId) return commonErrors.unauthorized(); + + const adminCheck = await isAdmin(); + if (!adminCheck) return commonErrors.forbidden(); + + const agentId = request.nextUrl.searchParams.get("agentId"); + if (!agentId) { + return NextResponse.json({ error: "agentId is required" }, { status: 400 }); + } + + try { + const profiles = await prisma.fallacyCheckerProfile.findMany({ + where: { agentId }, + orderBy: [ + { isDefault: "desc" }, + { name: "asc" }, + ], + }); + + return NextResponse.json({ profiles }); + } catch (error) { + logger.error("Error fetching profiles:", error); + return commonErrors.serverError("Failed to fetch profiles"); + } +} + +/** + * POST /api/monitor/lab/profiles + * Create a new profile + */ +export async function POST(request: NextRequest) { + const userId = await authenticateRequest(request); + if (!userId) return commonErrors.unauthorized(); + + const adminCheck = await isAdmin(); + if (!adminCheck) return commonErrors.forbidden(); + + try { + const body = await request.json(); + const { name, description, agentId, config, isDefault } = body; + + if (!name || !agentId) { + return NextResponse.json( + { error: "name and agentId are required" }, + { status: 400 } + ); + } + + // Check for duplicate name + const existing = await prisma.fallacyCheckerProfile.findFirst({ + where: { agentId, name }, + }); + + if (existing) { + return NextResponse.json( + { error: "A profile with this name already exists" }, + { status: 400 } + ); + } + + // If setting as default, unset other defaults first + if (isDefault) { + await prisma.fallacyCheckerProfile.updateMany({ + where: { agentId, isDefault: true }, + data: { isDefault: false }, + }); + } + + const profile = await prisma.fallacyCheckerProfile.create({ + data: { + name, + description: description ?? 
null, + agentId, + config: config ?? getDefaultConfig(), + isDefault: isDefault ?? false, + }, + }); + + logger.info("Profile created", { profileId: profile.id, name, agentId }); + + return NextResponse.json({ profile }); + } catch (error) { + logger.error("Error creating profile:", error); + return commonErrors.serverError("Failed to create profile"); + } +} + +/** + * Default profile configuration - matches the real fallacy checker defaults + */ +function getDefaultConfig() { + return { + version: 1, + models: { + extractors: [ + { model: "claude-sonnet-4-5-20250929", temperature: 0, thinking: false }, + { model: "google/gemini-3-flash-preview", temperature: "default", thinking: true }, + { model: "google/gemini-2.5-flash", temperature: "default", thinking: true }, + ], + judge: { + model: "claude-sonnet-4-5-20250929", + enabled: false, + }, + }, + thresholds: { + minSeverityThreshold: 60, + maxIssues: 15, + dedupThreshold: 0.7, + maxIssuesToProcess: 25, + }, + filterChain: { + filters: [ + { type: "dedup", enabled: true }, + { type: "supported-elsewhere", enabled: true }, + { type: "severity", enabled: true }, + { type: "review", enabled: true }, + ], + }, + }; +} diff --git a/apps/web/src/app/api/monitor/lab/prompts/route.ts b/apps/web/src/app/api/monitor/lab/prompts/route.ts new file mode 100644 index 00000000..233d4043 --- /dev/null +++ b/apps/web/src/app/api/monitor/lab/prompts/route.ts @@ -0,0 +1,22 @@ +import { NextResponse } from "next/server"; +import { + DEFAULT_EXTRACTOR_SYSTEM_PROMPT, + DEFAULT_EXTRACTOR_USER_PROMPT, +} from "@roast/ai/fallacy-extractor/prompts"; +import { DEFAULT_JUDGE_SYSTEM_PROMPT } from "@roast/ai/fallacy-judge/prompts"; +import { DEFAULT_SUPPORTED_ELSEWHERE_SYSTEM_PROMPT } from "@roast/ai/supported-elsewhere-filter/prompts"; + +/** + * GET /api/monitor/lab/prompts + * + * Returns the default prompts for the fallacy extractor, judge, and filter. + * Used by the profile editor UI to show placeholders. 
+ */ +export async function GET() { + return NextResponse.json({ + extractorSystemPrompt: DEFAULT_EXTRACTOR_SYSTEM_PROMPT, + extractorUserPrompt: DEFAULT_EXTRACTOR_USER_PROMPT, + judgeSystemPrompt: DEFAULT_JUDGE_SYSTEM_PROMPT, + filterSystemPrompt: DEFAULT_SUPPORTED_ELSEWHERE_SYSTEM_PROMPT, + }); +} diff --git a/apps/web/src/app/api/monitor/lab/runs/start/route.ts b/apps/web/src/app/api/monitor/lab/runs/start/route.ts index b583b99c..aa358464 100644 --- a/apps/web/src/app/api/monitor/lab/runs/start/route.ts +++ b/apps/web/src/app/api/monitor/lab/runs/start/route.ts @@ -22,7 +22,7 @@ export async function POST(request: NextRequest) { try { const body = await request.json(); - const { baselineId, name } = body; + const { baselineId, name, profileId } = body; if (!baselineId) { return NextResponse.json({ error: "baselineId is required" }, { status: 400 }); @@ -49,6 +49,7 @@ export async function POST(request: NextRequest) { const run = await metaEvaluationRepository.createValidationRun({ baselineId, name: name || `Run ${new Date().toLocaleString()}`, + profileId: profileId || undefined, }); // Create batch for the jobs @@ -92,6 +93,7 @@ export async function POST(request: NextRequest) { logger.info("Validation run started", { runId: run.id, baselineId, + profileId: profileId || null, documentCount: documentIds.length, jobCount: jobIds.length, }); diff --git a/apps/web/src/app/monitor/lab/components/profiles/ExtractorEditor.tsx b/apps/web/src/app/monitor/lab/components/profiles/ExtractorEditor.tsx new file mode 100644 index 00000000..c2dce422 --- /dev/null +++ b/apps/web/src/app/monitor/lab/components/profiles/ExtractorEditor.tsx @@ -0,0 +1,523 @@ +"use client"; + +import { useState, useMemo, useRef, useEffect } from "react"; +import { PlusIcon, TrashIcon, ChevronDownIcon, MagnifyingGlassIcon } from "@heroicons/react/24/outline"; +import type { ExtractorConfig, ReasoningConfig, ReasoningEffort } from "../../types"; +import { useModels, type ModelInfo } from 
"../../hooks/useModels"; + +const REASONING_OPTIONS: Array<{ value: string; label: string; tokens: string }> = [ + { value: "off", label: "Off", tokens: "" }, + { value: "minimal", label: "Minimal", tokens: "1K" }, + { value: "low", label: "Low", tokens: "2K" }, + { value: "medium", label: "Medium", tokens: "8K" }, + { value: "high", label: "High", tokens: "16K" }, + { value: "xhigh", label: "Very High", tokens: "32K" }, +]; + +const TEMP_PRESETS: Array<{ value: "default" | number; label: string }> = [ + { value: "default", label: "Auto" }, + { value: 0, label: "0" }, + { value: 0.3, label: "0.3" }, + { value: 0.7, label: "0.7" }, + { value: 1.0, label: "1.0" }, +]; + +interface ExtractorEditorProps { + extractors: ExtractorConfig[]; + onChange: (extractors: ExtractorConfig[]) => void; + disabled?: boolean; +} + +export function ExtractorEditor({ extractors, onChange, disabled }: ExtractorEditorProps) { + const { models, loading: modelsLoading, error: modelsError } = useModels(); + const [addingExtractor, setAddingExtractor] = useState(false); + + const updateExtractor = (index: number, updates: Partial) => { + const newExtractors = [...extractors]; + newExtractors[index] = { ...newExtractors[index], ...updates }; + onChange(newExtractors); + }; + + const removeExtractor = (index: number) => { + if (extractors.length <= 1) return; + onChange(extractors.filter((_, i) => i !== index)); + }; + + const addExtractor = (model: ModelInfo) => { + onChange([ + ...extractors, + { model: model.id, temperature: "default", thinking: false }, + ]); + setAddingExtractor(false); + }; + + return ( +
+ {extractors.map((ext, index) => ( + updateExtractor(index, updates)} + onRemove={() => removeExtractor(index)} + canRemove={extractors.length > 1} + disabled={disabled} + /> + ))} + + {/* Add Extractor Button / Model Selector */} + {!disabled && ( + addingExtractor ? ( + setAddingExtractor(false)} + /> + ) : ( + + ) + )} +
+ ); +} + +interface ExtractorRowProps { + extractor: ExtractorConfig; + index: number; + models: ModelInfo[]; + modelsLoading: boolean; + onChange: (updates: Partial) => void; + onRemove: () => void; + canRemove: boolean; + disabled?: boolean; +} + +function ExtractorRow({ + extractor, + index, + models, + modelsLoading, + onChange, + onRemove, + canRemove, + disabled, +}: ExtractorRowProps) { + const [showModelDropdown, setShowModelDropdown] = useState(false); + const [showCustomTemp, setShowCustomTemp] = useState(false); + const [customTempValue, setCustomTempValue] = useState( + typeof extractor.temperature === "number" ? extractor.temperature : 0.5 + ); + + const modelName = getModelDisplayName(extractor.model); + + // Find model info for the selected model + const modelInfo = models.find((m) => m.id === extractor.model); + const supportsTemperature = modelInfo?.supportsTemperature ?? true; + const supportsReasoning = modelInfo?.supportsReasoning ?? true; + const defaultTemperature = modelInfo?.defaultTemperature; + const maxTemperature = modelInfo?.maxTemperature ?? 1; + + // Check if current value is a preset or custom + // Must include all dropdown option values, not just TEMP_PRESETS + const DROPDOWN_TEMPS = [0, 0.3, 0.7, 1, 1.5, 2]; + const isCustomTemp = typeof extractor.temperature === "number" && + !DROPDOWN_TEMPS.includes(extractor.temperature); + + // Build auto label with default temp if known + const autoLabel = defaultTemperature !== undefined + ? `Auto (${defaultTemperature})` + : "Auto"; + + return ( +
+ {/* Top row: model, reasoning, delete */} +
+ {/* Index */} + {index + 1} + + {/* Model Selector */} +
+ + {showModelDropdown && ( + { + onChange({ model: model.id }); + setShowModelDropdown(false); + }} + onCancel={() => setShowModelDropdown(false)} + compact + /> + )} +
+ + {/* Reasoning Dropdown - only show if model supports it */} + {supportsReasoning ? ( + + ) : ( + + Reasoning N/A + + )} + + {/* Delete Button */} + +
+ + {/* Temperature row - only show if model supports it */} + {supportsTemperature ? ( +
+
+ Temperature + +
+ + {/* Custom temperature slider */} + {(showCustomTemp || isCustomTemp) && !disabled && ( +
+ { + const val = parseFloat(e.target.value); + setCustomTempValue(val); + onChange({ temperature: val }); + }} + className="flex-1 h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer accent-blue-600" + /> + { + const val = parseFloat(e.target.value); + if (!isNaN(val) && val >= 0 && val <= maxTemperature) { + setCustomTempValue(val); + onChange({ temperature: val }); + } + }} + className="w-16 px-2 py-1 text-center text-sm border rounded" + /> +
+ )} +
+ ) : ( +
+ Temperature not supported by this model +
+ )} +
+ ); +} + +interface ModelSelectorProps { + models: ModelInfo[]; + loading: boolean; + error: string | null; + onSelect: (model: ModelInfo) => void; + onCancel: () => void; + compact?: boolean; +} + +function ModelSelector({ models, loading, error, onSelect, onCancel, compact }: ModelSelectorProps) { + const [search, setSearch] = useState(""); + const [highlightedIndex, setHighlightedIndex] = useState(0); + const inputRef = useRef(null); + const listRef = useRef(null); + + useEffect(() => { + inputRef.current?.focus(); + }, []); + + const filteredModels = useMemo(() => { + if (!search.trim()) return models; + const lowerSearch = search.toLowerCase(); + return models.filter( + (m) => + m.id.toLowerCase().includes(lowerSearch) || + m.name.toLowerCase().includes(lowerSearch) + ); + }, [models, search]); + + const anthropicModels = filteredModels.filter((m) => m.provider === "anthropic"); + const openRouterModels = filteredModels.filter((m) => m.provider === "openrouter"); + + // Handle keyboard navigation + const handleKeyDown = (e: React.KeyboardEvent) => { + if (e.key === "Escape") { + onCancel(); + } else if (e.key === "ArrowDown") { + e.preventDefault(); + setHighlightedIndex((prev) => Math.min(prev + 1, filteredModels.length - 1)); + } else if (e.key === "ArrowUp") { + e.preventDefault(); + setHighlightedIndex((prev) => Math.max(prev - 1, 0)); + } else if (e.key === "Enter" && filteredModels.length > 0) { + e.preventDefault(); + onSelect(filteredModels[highlightedIndex]); + } + }; + + // Scroll highlighted item into view + useEffect(() => { + const list = listRef.current; + if (!list) return; + const highlighted = list.querySelector(`[data-index="${highlightedIndex}"]`); + highlighted?.scrollIntoView({ block: "nearest" }); + }, [highlightedIndex]); + + if (loading) { + return ( +
+ Loading models... +
+ ); + } + + if (error) { + return ( +
+ {error} + +
+ ); + } + + return ( +
+ {/* Search Input */} +
+
+ + { + setSearch(e.target.value); + setHighlightedIndex(0); + }} + placeholder="Search models..." + className="flex-1 bg-transparent text-sm outline-none" + /> +
+
+ {filteredModels.length} models + +
+
+ + {/* Model List */} +
+ {anthropicModels.length > 0 && ( + <> +
+ Anthropic ({anthropicModels.length}) +
+ {anthropicModels.map((model, i) => { + const globalIndex = filteredModels.indexOf(model); + return ( + onSelect(model)} + onMouseEnter={() => setHighlightedIndex(globalIndex)} + dataIndex={globalIndex} + /> + ); + })} + + )} + + {openRouterModels.length > 0 && ( + <> +
+ OpenRouter ({openRouterModels.length}) +
+ {openRouterModels.map((model) => { + const globalIndex = filteredModels.indexOf(model); + return ( + onSelect(model)} + onMouseEnter={() => setHighlightedIndex(globalIndex)} + dataIndex={globalIndex} + /> + ); + })} + + )} + + {filteredModels.length === 0 && ( +
+ No models found matching "{search}" +
+ )} +
+
+ ); +} + +interface ModelItemProps { + model: ModelInfo; + isHighlighted: boolean; + onSelect: () => void; + onMouseEnter: () => void; + dataIndex: number; +} + +function ModelItem({ model, isHighlighted, onSelect, onMouseEnter, dataIndex }: ModelItemProps) { + return ( + + ); +} + +/** + * Shorten model ID for display + * e.g., "claude-sonnet-4-5-20250929" -> "claude-sonnet-4-5" + * e.g., "google/gemini-2.5-flash" -> "gemini-2.5-flash" + */ +function getModelDisplayName(modelId: string): string { + // Remove date suffix like -20250929 + let name = modelId.replace(/-\d{8}$/, ""); + + // Remove provider prefix like "google/" + if (name.includes("/")) { + name = name.split("/").pop() || name; + } + + return name; +} + +/** + * Convert ReasoningConfig to dropdown value string + * Handles both new reasoning config and legacy thinking boolean + */ +function getReasoningValue(reasoning: ReasoningConfig | undefined, thinking?: boolean): string { + // Handle new reasoning config + if (reasoning !== undefined) { + if (reasoning === false) return "off"; + if (typeof reasoning === "object" && "effort" in reasoning) { + return reasoning.effort; + } + // Custom budget_tokens - default to "high" in the dropdown + if (typeof reasoning === "object" && "budget_tokens" in reasoning) { + return "high"; + } + } + + // Fallback to legacy thinking boolean + if (thinking === true) return "medium"; // Default legacy "on" to medium + return "off"; +} diff --git a/apps/web/src/app/monitor/lab/components/profiles/FilterChainEditor.tsx b/apps/web/src/app/monitor/lab/components/profiles/FilterChainEditor.tsx new file mode 100644 index 00000000..f2d9c073 --- /dev/null +++ b/apps/web/src/app/monitor/lab/components/profiles/FilterChainEditor.tsx @@ -0,0 +1,592 @@ +"use client"; + +import { useState } from "react"; +import { + ChevronUpIcon, + ChevronDownIcon, + TrashIcon, + PlusIcon, + ChevronRightIcon, +} from "@heroicons/react/24/outline"; +import type { + FilterChainItem, + 
SupportedElsewhereFilterConfig, + SeverityFilterConfig, + ConfidenceFilterConfig, + ReasoningConfig, + ReasoningEffort, +} from "../../types"; +import { AVAILABLE_FILTER_TYPES, EFFORT_TO_BUDGET_TOKENS } from "../../types"; +import { useModels } from "../../hooks/useModels"; +import { ModelSelector, getModelDisplayName } from "./ModelSelector"; + +interface FilterChainEditorProps { + filters: FilterChainItem[]; + onChange: (filters: FilterChainItem[]) => void; + disabled?: boolean; + defaultFilterPrompt?: string; +} + +export function FilterChainEditor({ + filters, + onChange, + disabled, + defaultFilterPrompt, +}: FilterChainEditorProps) { + const [showAddMenu, setShowAddMenu] = useState(false); + + const moveFilter = (index: number, direction: "up" | "down") => { + if (disabled) return; + const newFilters = [...filters]; + const newIndex = direction === "up" ? index - 1 : index + 1; + if (newIndex < 0 || newIndex >= filters.length) return; + [newFilters[index], newFilters[newIndex]] = [newFilters[newIndex], newFilters[index]]; + onChange(newFilters); + }; + + const removeFilter = (index: number) => { + if (disabled) return; + onChange(filters.filter((_, i) => i !== index)); + }; + + const updateFilter = (index: number, updates: Partial) => { + if (disabled) return; + const newFilters = [...filters]; + newFilters[index] = { ...newFilters[index], ...updates } as FilterChainItem; + onChange(newFilters); + }; + + const toggleFilter = (index: number) => { + updateFilter(index, { enabled: !filters[index].enabled }); + }; + + const addFilter = (type: FilterChainItem["type"]) => { + if (disabled) return; + const id = `filter-${Date.now()}`; + let newFilter: FilterChainItem; + + switch (type) { + case "supported-elsewhere": + newFilter = { + id, + type: "supported-elsewhere", + enabled: true, + model: "claude-sonnet-4-5-20250929", + temperature: 0.1, + }; + break; + case "severity": + newFilter = { + id, + type: "severity", + enabled: true, + minSeverity: 50, + }; + break; 
+ case "confidence": + newFilter = { + id, + type: "confidence", + enabled: true, + minConfidence: 50, + }; + break; + } + + onChange([...filters, newFilter]); + setShowAddMenu(false); + }; + + return ( +
+
+
+ +

+ Filters run in sequence. Each filter can remove issues from the pipeline. +

+
+
+ + {/* Filter List */} +
+ {filters.length === 0 ? ( +
+ No filters configured. Add a filter to remove false positives. +
+ ) : ( + filters.map((filter, index) => ( + moveFilter(index, dir)} + onRemove={() => removeFilter(index)} + onUpdate={(updates) => updateFilter(index, updates)} + onToggle={() => toggleFilter(index)} + /> + )) + )} +
+ + {/* Add Filter Button */} + {!disabled && ( +
+ + + {showAddMenu && ( +
+
+ Available Filters +
+ {AVAILABLE_FILTER_TYPES.map((filterType) => ( + + ))} +
+ +
+
+ )} +
+ )} +
+ ); +} + +interface FilterItemEditorProps { + filter: FilterChainItem; + index: number; + totalFilters: number; + disabled?: boolean; + defaultFilterPrompt?: string; + onMove: (direction: "up" | "down") => void; + onRemove: () => void; + onUpdate: (updates: Partial) => void; + onToggle: () => void; +} + +function FilterItemEditor({ + filter, + index, + totalFilters, + disabled, + defaultFilterPrompt, + onMove, + onRemove, + onUpdate, + onToggle, +}: FilterItemEditorProps) { + const [isExpanded, setIsExpanded] = useState(false); + + const filterLabel = AVAILABLE_FILTER_TYPES.find((f) => f.type === filter.type)?.label || filter.type; + + return ( +
+ {/* Header Row */} +
+ {/* Order controls */} + {!disabled && ( +
+ + +
+ )} + + {/* Index badge */} + + {index + 1} + + + {/* Expand/collapse button */} + + + {/* Enable/Disable toggle */} + + + {/* Delete button */} + {!disabled && ( + + )} +
+ + {/* Expanded Settings */} + {isExpanded && ( +
+ {filter.type === "supported-elsewhere" && ( + + )} + {filter.type === "severity" && ( + + )} + {filter.type === "confidence" && ( + + )} +
+ )} +
+ ); +} + +interface SupportedElsewhereSettingsProps { + filter: SupportedElsewhereFilterConfig; + disabled?: boolean; + defaultPrompt?: string; + onUpdate: (updates: Partial) => void; +} + +const TEMP_PRESETS: Array = ["default", 0, 0.1, 0.3, 0.5, 0.7, 1.0]; +const REASONING_EFFORT_OPTIONS: ReasoningEffort[] = ["minimal", "low", "medium", "high", "xhigh"]; + +function SupportedElsewhereSettings({ + filter, + disabled, + defaultPrompt, + onUpdate, +}: SupportedElsewhereSettingsProps) { + const { models, loading: modelsLoading } = useModels(); + const [showModelDropdown, setShowModelDropdown] = useState(false); + const [showTempDropdown, setShowTempDropdown] = useState(false); + const [showReasoningDropdown, setShowReasoningDropdown] = useState(false); + + // Get display value for temperature + const tempDisplay = filter.temperature === undefined || filter.temperature === "default" + ? "default" + : filter.temperature; + + // Get display value for reasoning + const getReasoningDisplay = () => { + if (filter.reasoning === undefined || filter.reasoning === false) return "Off"; + if ("effort" in filter.reasoning) return filter.reasoning.effort; + if ("budget_tokens" in filter.reasoning) return `${filter.reasoning.budget_tokens} tokens`; + return "Off"; + }; + + // Check if reasoning is enabled + const isReasoningEnabled = filter.reasoning !== undefined && filter.reasoning !== false; + + return ( +
+

+ Uses an LLM to check if each flagged issue is actually supported, explained, or qualified + elsewhere in the document. Issues that are well-supported are filtered out. +

+ + {/* Model Selection */} +
+ Model +
+ + {showModelDropdown && ( + { + onUpdate({ model: model.id }); + setShowModelDropdown(false); + }} + onCancel={() => setShowModelDropdown(false)} + /> + )} +
+
+ + {/* Temperature Selection */} +
+ Temperature +
+ + {showTempDropdown && ( +
+ {TEMP_PRESETS.map((temp) => ( + + ))} +
+ +
+
+ )} +
+
+ + {/* Reasoning/Thinking Selection */} +
+ Reasoning +
+ + {showReasoningDropdown && ( +
+ {/* Off option */} + +
+ {/* Effort levels */} + {REASONING_EFFORT_OPTIONS.map((effort) => { + const isSelected = filter.reasoning && "effort" in filter.reasoning && filter.reasoning.effort === effort; + return ( + + ); + })} +
+ +
+
+ )} +
+
+ + {/* Custom Prompt */} +
+
+ Custom Prompt + {filter.customPrompt && ( + + )} +
+