Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
72 commits
Select commit Hold shift + click to select a range
af20c67
docs: Add fallacy checker refactor plan
michaelr524 Jan 3, 2026
ac79e4d
fix: Correct motte-bailey fallacy definition
michaelr524 Jan 3, 2026
8b6642a
feat(meta-evals): Add document search and improve UI
michaelr524 Jan 3, 2026
fa3fcbd
feat(meta-evals): Add delete series, better errors, tmux dev-env
michaelr524 Jan 3, 2026
2a4dd60
refactor: Switch fallacy extractor from chunked to single-pass analysis
michaelr524 Jan 3, 2026
e0e8b65
feat: Add supported-elsewhere filter to reduce false positives
michaelr524 Jan 3, 2026
85e39ed
feat: Add OpenRouter support for multi-model filter testing
michaelr524 Jan 4, 2026
02ea420
feat: Add OpenRouter support for fallacy extraction + improve dev-env…
michaelr524 Jan 4, 2026
c4a44a1
docs: Add prioritized implementation plan for fallacy checker refactor
michaelr524 Jan 7, 2026
19b6290
docs: Add Phase 5 (meta-eval scoring) to implementation plan
michaelr524 Jan 7, 2026
bb3a1df
feat: Add pipeline telemetry for fallacy checker observability
michaelr524 Jan 7, 2026
bd0d97c
feat: Add validation framework for fallacy checker regression testing
michaelr524 Jan 7, 2026
406bb27
feat: Add baseline management and pipeline execution to validation
michaelr524 Jan 7, 2026
f4b531b
refactor: Restructure main menu as clean router
michaelr524 Jan 7, 2026
0bb9314
feat(meta-evals): Add validation run persistence and filter reasoning UI
michaelr524 Jan 7, 2026
c35bd5c
feat: Add multi-extractor with thinking/temperature controls + OpenRo…
michaelr524 Jan 11, 2026
c997c6f
feat(meta-evals): Add Extractor Lab for testing extraction in isolation
michaelr524 Jan 11, 2026
82d1385
fix(meta-evals): Fix ESM import for fallacy-extraction module
michaelr524 Jan 11, 2026
025e30d
feat(meta-evals): Improve Extractor Lab with env config and scrollabl…
michaelr524 Jan 11, 2026
0c71d52
fix(meta-evals): Fix issue detail view truncation and escape navigation
michaelr524 Jan 11, 2026
ce504ba
fix(meta-evals): Let ExtractorLab handle escape navigation internally
michaelr524 Jan 11, 2026
236ba02
fix(meta-evals): Exclude all screens with internal escape navigation
michaelr524 Jan 11, 2026
f4be023
fix(meta-evals): Add proper escape handling to all screens
michaelr524 Jan 11, 2026
6bc2d6b
feat(meta-evals): Add reusable ModelSelector component
michaelr524 Jan 11, 2026
8950e86
feat(meta-evals): Add reusable DocumentSelector component
michaelr524 Jan 11, 2026
d57acd1
fix(meta-evals): Only show search spinner when actively filtering
michaelr524 Jan 11, 2026
fa743ad
refactor(meta-evals): Use DocumentSelector in Validation + full-width…
michaelr524 Jan 11, 2026
ec31142
refactor(meta-evals): Remove manual text truncation from DocumentSele…
michaelr524 Jan 11, 2026
e84e81f
fix(meta-evals): Don't replace initial documents on empty filter
michaelr524 Jan 11, 2026
6049147
feat(meta-evals): Add smart truncation with column alignment
michaelr524 Jan 11, 2026
6e64248
feat(meta-evals): Add LLM Judge integration to Extractor Lab
michaelr524 Jan 11, 2026
e2943c9
feat(meta-evals): Add multi-judge selection and comparison view
michaelr524 Jan 11, 2026
ca24af4
feat(meta-evals): Add deduplication step to Extractor Lab
michaelr524 Jan 11, 2026
24d419c
fix(meta-evals): Show error messages in judge comparison + increase m…
michaelr524 Jan 11, 2026
aa6f580
feat(meta-evals): Add pre-judge deduplication step
michaelr524 Jan 11, 2026
4d10793
refactor(meta-evals): Split ExtractorLab into smaller modules
michaelr524 Jan 11, 2026
59e5ac6
refactor(fallacy-check): Extract dedup into separate module
michaelr524 Jan 11, 2026
d0199f4
feat(meta-evals): Add quality-based deduplication with Jaccard simila…
michaelr524 Jan 11, 2026
1c63858
feat(fallacy-check): Add Jaccard similarity dedup with quality-based …
michaelr524 Jan 11, 2026
0575614
feat(meta-evals): Add filtered items drilldown and improve validation…
michaelr524 Jan 15, 2026
81a46a1
feat(web): Add Validation Lab UI for pipeline regression testing
michaelr524 Jan 17, 2026
f93f350
feat(lab): Add profile editor with configurable filter chain
michaelr524 Jan 18, 2026
c91e100
fix(ai): Use auto tool_choice when thinking is enabled
michaelr524 Jan 18, 2026
5486f63
fix(ai): Auto-adjust max_tokens when thinking is enabled
michaelr524 Jan 18, 2026
0a94c86
feat(ai): Add unified usage metrics with cost tracking
michaelr524 Jan 18, 2026
a366785
feat(lab): Add model endpoints API and improve pipeline infrastructure
michaelr524 Jan 18, 2026
022f5a2
feat(ai): Add Principle of Charity filter and dynamic filter chain ex…
michaelr524 Jan 18, 2026
d33edcf
feat(lab): Add version selection to baseline creation
michaelr524 Jan 21, 2026
785c930
refactor(ai): Unify model config handling and add provider support to…
michaelr524 Jan 21, 2026
e1c73c0
feat(ai): Add judge and filter telemetry capture
michaelr524 Jan 21, 2026
40222ac
feat(ai): Add telemetry capture to filters and pass to pipeline stages
michaelr524 Jan 21, 2026
5eceeb9
refactor(ai): Consolidate duplicated types into shared common.ts
michaelr524 Jan 21, 2026
4a8ac55
refactor(ai): Replace console.log with context.logger.debug in filter…
michaelr524 Jan 21, 2026
c35648f
refactor: Code review fixes - consolidate constants, cleanup logs, sp…
michaelr524 Jan 21, 2026
2b44e82
fix(ai): Separate client-safe exports to avoid async_hooks bundling
michaelr524 Jan 22, 2026
0f41ea6
fix(ai): Move verbose logs to debug level, improve 429 error messages
michaelr524 Jan 22, 2026
c3f2446
fix(ai): Add current date context to all fallacy checker prompts
michaelr524 Jan 22, 2026
f620317
feat(lab): Add "All Evals" tab and passed items telemetry
michaelr524 Jan 22, 2026
0a21ea5
fix(lab): Fix tab layout wrapping issue with three tabs
michaelr524 Jan 22, 2026
786fed6
chore(ai): Remove unused imports and dead code in fallacy-check
michaelr524 Jan 23, 2026
fbd437c
refactor(ai): Remove unused resolveReasoningEffortForExtractor method
michaelr524 Jan 23, 2026
6ff8463
refactor(ai): Fix strict lint warnings and add PR lint script
michaelr524 Jan 23, 2026
948fec5
refactor(db): Fix strict lint warnings in MetaEvaluationRepository
michaelr524 Jan 23, 2026
0c3d67c
refactor(jobs): Fix strict lint warnings and add lint config
michaelr524 Jan 23, 2026
40f7f1a
refactor(ai,jobs): Add DocumentAnalysisResult type and fix any warnings
michaelr524 Jan 23, 2026
e14450a
refactor(web): Fix strict lint warnings in Lab UI and API routes
michaelr524 Jan 23, 2026
ddef396
fix(ai): Add client-safe tool types to fix web app typecheck
michaelr524 Jan 23, 2026
df11a5e
docs: Add modify-check workflow loop to CLAUDE.md
michaelr524 Jan 23, 2026
80c7b5f
refactor: Replace inline types with named interfaces
michaelr524 Jan 23, 2026
bf2a022
docs: Add NO INLINE TYPES rule to CLAUDE.md
michaelr524 Jan 23, 2026
6db5d74
refactor(ai): Clean up comments and replace console.log with logger
michaelr524 Jan 23, 2026
7f3ced5
refactor(ai): Replace console.warn/error with logger across codebase
michaelr524 Jan 23, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,18 @@ pnpm --filter @roast/web run test:ci # MUST actually run, not assume
# TypeScript compiles ≠ tests pass
```

### Development Workflow: Modify → Check Loop

When making code changes, especially to internal packages (`@roast/ai`, `@roast/db`, `@roast/domain`, `@roast/jobs`), follow this verification loop before committing:

1. **After modifying internal packages**: Run `pnpm turbo run typecheck` (not just `pnpm --filter @roast/web typecheck`). Turbo handles the dependency graph—it rebuilds packages first, then typechecks consumers with fresh `.d.ts` files. This mimics CI's clean-build behavior.

2. **After modifying web app only**: Run `pnpm --filter @roast/web run typecheck && pnpm --filter @roast/web run lint`.

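3. **Before pushing**: Always run the full check: `pnpm turbo run typecheck lint`. This catches cross-package type errors that per-package checks miss due to stale `dist/` folders. Do not add `--parallel`: that flag makes Turbo ignore the task dependency graph, so packages would not be rebuilt before their consumers are typechecked.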

4. **Why this matters**: TypeScript project references use compiled `dist/` for type resolution. Local dev accumulates stale builds; CI starts fresh. If you see typecheck errors and assume they're "pre-existing," verify by rebuilding the source package first (`pnpm --filter @roast/ai run build`).

## Commands Quick Reference

### Development
Expand All @@ -186,6 +198,30 @@ pnpm --filter @roast/db run gen # Generate Prisma client
pnpm --filter @roast/db run db:push # Push schema changes
```

### Dev Environment & Database Access (Primary)

**🚨 MANDATORY: Use ONLY these scripts for database access - NO EXCEPTIONS 🚨**

```bash
dev/scripts/dev-env.sh start|stop|status|attach|restart # Manage tmux dev session
dev/scripts/dev-env.sh psql [args] # Connect to local DB via Docker
```

**FORBIDDEN database access methods** (DO NOT USE):
```bash
❌ psql -h localhost ... # Direct psql - FORBIDDEN
❌ PGPASSWORD=... psql ... # Direct psql with password - FORBIDDEN
❌ docker run ... postgres psql # Docker-based psql - FORBIDDEN
❌ Any other method # If it's not dev-env.sh psql, DON'T USE IT
```

**Database utilities** (Docker-based, no local psql needed):
- `dev/scripts/dev/db/lib/db_functions.sh` - Core DB functions (`psql_local`, `psql_prod`, `pg_dump_prod`, `copy_data`)
- `dev/scripts/dev/db/lib/common_utils.sh` - Shared bash utilities
- `dev/scripts/dev/db/setup_db.sh` - Example: sync prod schema to local

**AI agents MUST use `dev/scripts/dev-env.sh psql` - no alternatives allowed.**

### Testing
```bash
# Test categories by cost/dependencies:
Expand All @@ -203,6 +239,29 @@ pnpm --filter @roast/web run typecheck # TypeScript
# MUST run both - linter doesn't catch type errors!
```

### 🚨 Type Definitions: NO INLINE TYPES 🚨

**NEVER use inline types.** Always define named interfaces or type aliases.

```typescript
// ❌ WRONG - inline types
function Foo({ data }: { data: string; count: number }) { }
const [state, setState] = useState<{ loading: boolean; error?: string }>();
let result: { success: boolean; value: number };

// ✅ CORRECT - named types
interface FooProps { data: string; count: number; }
function Foo({ data }: FooProps) { }

interface LoadingState { loading: boolean; error?: string; }
const [state, setState] = useState<LoadingState>();

interface Result { success: boolean; value: number; }
let result: Result;
```

**Why:** Inline types hurt readability, reusability, and refactoring. Named types are self-documenting and can be exported/shared.

## MCP Server Quick Fix

**Problem**: Claude Code caches MCP servers, so changes don't take effect
Expand Down Expand Up @@ -278,6 +337,22 @@ Details here"
/bin/rm, /bin/cat, /bin/echo # Use full paths
```

## Tmux Key Sending

When sending multiple keystrokes to tmux sessions (e.g., navigating CLI menus), use a loop with delays between keystrokes instead of sending them all at once.

**Bad** (keys may be dropped or processed incorrectly):
```bash
tmux send-keys -t session Down Down Down Down Down Enter
```

**Good** (reliable keystroke delivery):
```bash
for i in {1..5}; do tmux send-keys -t session Down; sleep 0.1; done; tmux send-keys -t session Enter
```

The short delay gives the terminal UI time to process each keystroke before the next one is sent, which prevents dropped or misordered keys when navigating menus. Increase the delay if the UI is slow to redraw.

## Documentation Structure
- `/dev/docs/README.md` - Documentation index
- `/dev/docs/development/` - Development guides
Expand Down
28 changes: 28 additions & 0 deletions apps/web/src/app/api/monitor/lab/baselines/[id]/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import { NextRequest, NextResponse } from "next/server";
import { logger } from "@/infrastructure/logging/logger";
import { authenticateRequest } from "@/infrastructure/auth/auth-helpers";
import { commonErrors } from "@/infrastructure/http/api-response-helpers";
import { isAdmin } from "@/infrastructure/auth/auth";
import { metaEvaluationRepository } from "@roast/db";
import type { RouteIdParams } from "../../types";

/**
 * Deletes the validation baseline named by the `id` route parameter.
 *
 * The request must pass `authenticateRequest` and the `isAdmin` check; otherwise the
 * matching `commonErrors` response is returned before anything is deleted.
 *
 * @param request - Incoming request, used for authentication.
 * @param params - Route parameters; awaited to obtain the baseline `id`.
 * @returns `{ success: true }` as JSON on success, or a server-error response when the
 *   repository call throws (the error is logged).
 */
export async function DELETE(
  request: NextRequest,
  { params }: RouteIdParams
) {
  const userId = await authenticateRequest(request);
  if (!userId) {
    return commonErrors.unauthorized();
  }

  const callerIsAdmin = await isAdmin();
  if (!callerIsAdmin) {
    return commonErrors.forbidden();
  }

  const routeParams = await params;
  const baselineId = routeParams.id;

  try {
    await metaEvaluationRepository.deleteValidationBaseline(baselineId);
    return NextResponse.json({ success: true });
  } catch (err) {
    logger.error("Error deleting baseline:", err);
    return commonErrors.serverError("Failed to delete baseline");
  }
}
96 changes: 96 additions & 0 deletions apps/web/src/app/api/monitor/lab/baselines/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import { NextRequest, NextResponse } from "next/server";
import { logger } from "@/infrastructure/logging/logger";
import { authenticateRequest } from "@/infrastructure/auth/auth-helpers";
import { commonErrors } from "@/infrastructure/http/api-response-helpers";
import { isAdmin } from "@/infrastructure/auth/auth";
import { metaEvaluationRepository, prisma } from "@roast/db";

/**
 * Lists the validation baselines stored for one agent.
 *
 * Requires an authenticated admin caller and an `agentId` query parameter.
 *
 * @param request - Incoming request; `agentId` is read from its query string.
 * @returns `{ baselines }` as JSON, a 400 JSON error when `agentId` is missing, or a
 *   server-error response when the repository call throws (the error is logged).
 */
export async function GET(request: NextRequest) {
  const userId = await authenticateRequest(request);
  if (!userId) {
    return commonErrors.unauthorized();
  }

  const callerIsAdmin = await isAdmin();
  if (!callerIsAdmin) {
    return commonErrors.forbidden();
  }

  const query = request.nextUrl.searchParams;
  const agentId = query.get("agentId");
  if (!agentId) {
    return NextResponse.json({ error: "agentId is required" }, { status: 400 });
  }

  try {
    const baselines = await metaEvaluationRepository.getValidationBaselines(agentId);
    return NextResponse.json({ baselines });
  } catch (err) {
    logger.error("Error fetching baselines:", err);
    return commonErrors.serverError("Failed to fetch baselines");
  }
}

/**
 * Creates a validation baseline for an agent.
 *
 * The JSON body must contain `name` and `agentId`. The evaluation versions that make up
 * the baseline come from `evaluationVersionIds` when that array is non-empty; otherwise
 * they are derived from `documentIds` by taking, per document, the most recently created
 * evaluation version for this agent (restricted to versions created before `beforeDate`
 * when one is given).
 *
 * Client mistakes (missing fields, non-array id lists, unparseable `beforeDate`, nothing
 * resolved) are answered with a 400 JSON error instead of reaching the database, where
 * they would otherwise surface as a generic server error.
 *
 * @param request - Incoming request carrying the JSON body described above.
 * @returns `{ baseline }` as JSON on success, a 400 JSON error for invalid input, or a
 *   server-error response when anything else throws (the error is logged).
 */
export async function POST(request: NextRequest) {
  const userId = await authenticateRequest(request);
  if (!userId) return commonErrors.unauthorized();

  const adminCheck = await isAdmin();
  if (!adminCheck) return commonErrors.forbidden();

  try {
    const body = await request.json();
    const { name, description, agentId, documentIds, evaluationVersionIds, beforeDate } = body;

    if (!name || !agentId) {
      return NextResponse.json(
        { error: "name and agentId are required" },
        { status: 400 }
      );
    }

    // Both id lists are used with `.length` and passed to Prisma as arrays; a string or
    // object here would otherwise slip through the length checks and fail in the query.
    if (
      (documentIds != null && !Array.isArray(documentIds)) ||
      (evaluationVersionIds != null && !Array.isArray(evaluationVersionIds))
    ) {
      return NextResponse.json(
        { error: "documentIds and evaluationVersionIds must be arrays" },
        { status: 400 }
      );
    }

    // An unparseable cutoff produces an Invalid Date, which must not reach the filter.
    const cutoff = beforeDate ? new Date(beforeDate) : undefined;
    if (cutoff && Number.isNaN(cutoff.getTime())) {
      return NextResponse.json(
        { error: "beforeDate must be a valid date" },
        { status: 400 }
      );
    }

    // Get evaluation version IDs from document IDs if not provided directly
    let evalVersionIds = evaluationVersionIds;
    if (!evalVersionIds?.length && documentIds?.length) {
      // Newest first, so the first version seen for a document is its latest one.
      const evaluations = await prisma.evaluationVersion.findMany({
        where: {
          agentId,
          evaluation: {
            documentId: { in: documentIds },
          },
          ...(cutoff ? { createdAt: { lt: cutoff } } : {}),
        },
        orderBy: { createdAt: "desc" },
        select: {
          id: true,
          evaluation: { select: { documentId: true } },
        },
      });

      // Keep only the latest version per document (before cutoff if specified)
      const latestByDoc = new Map<string, string>();
      for (const ev of evaluations) {
        if (!latestByDoc.has(ev.evaluation.documentId)) {
          latestByDoc.set(ev.evaluation.documentId, ev.id);
        }
      }
      evalVersionIds = Array.from(latestByDoc.values());
    }

    if (!evalVersionIds?.length) {
      return NextResponse.json(
        { error: "No evaluation versions found for the selected documents" },
        { status: 400 }
      );
    }

    const baseline = await metaEvaluationRepository.createValidationBaseline({
      name,
      description,
      agentId,
      evaluationVersionIds: evalVersionIds,
      createdById: userId,
    });

    return NextResponse.json({ baseline });
  } catch (error) {
    logger.error("Error creating baseline:", error);
    return commonErrors.serverError("Failed to create baseline");
  }
}
33 changes: 33 additions & 0 deletions apps/web/src/app/api/monitor/lab/corpus/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import { NextRequest, NextResponse } from "next/server";
import { logger } from "@/infrastructure/logging/logger";
import { authenticateRequest } from "@/infrastructure/auth/auth-helpers";
import { commonErrors } from "@/infrastructure/http/api-response-helpers";
import { isAdmin } from "@/infrastructure/auth/auth";
import { metaEvaluationRepository } from "@roast/db";

/**
 * Lists corpus documents available for validation of one agent.
 *
 * Query parameters:
 * - `agentId` (required)
 * - `filter` (optional; an empty value is treated as absent)
 * - `limit` (optional; defaults to 500 when missing or empty, must be a positive integer)
 *
 * @param request - Incoming request; parameters are read from its query string.
 * @returns `{ documents }` as JSON, a 400 JSON error when `agentId` is missing or `limit`
 *   is not a positive integer, or a server-error response when the repository call
 *   throws (the error is logged).
 */
export async function GET(request: NextRequest) {
  const userId = await authenticateRequest(request);
  if (!userId) return commonErrors.unauthorized();

  const adminCheck = await isAdmin();
  if (!adminCheck) return commonErrors.forbidden();

  const searchParams = request.nextUrl.searchParams;
  const agentId = searchParams.get("agentId");
  const filter = searchParams.get("filter") || undefined;
  const limit = parseInt(searchParams.get("limit") || "500", 10);

  if (!agentId) {
    return NextResponse.json({ error: "agentId is required" }, { status: 400 });
  }

  // parseInt yields NaN for non-numeric input; reject that (and zero/negative values)
  // here rather than forwarding a meaningless limit to the repository.
  if (!Number.isInteger(limit) || limit < 1) {
    return NextResponse.json(
      { error: "limit must be a positive integer" },
      { status: 400 }
    );
  }

  try {
    const documents = await metaEvaluationRepository.getValidationCorpusDocuments(agentId, {
      filter,
      limit,
    });
    return NextResponse.json({ documents });
  } catch (error) {
    logger.error("Error fetching corpus documents:", error);
    return commonErrors.serverError("Failed to fetch corpus documents");
  }
}
38 changes: 38 additions & 0 deletions apps/web/src/app/api/monitor/lab/corpus/snapshots/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import { NextRequest, NextResponse } from "next/server";
import { logger } from "@/infrastructure/logging/logger";
import { authenticateRequest } from "@/infrastructure/auth/auth-helpers";
import { commonErrors } from "@/infrastructure/http/api-response-helpers";
import { isAdmin } from "@/infrastructure/auth/auth";
import { metaEvaluationRepository } from "@roast/db";

/**
 * Returns evaluation snapshots for a set of documents (used when creating baselines).
 *
 * Query parameters:
 * - `agentId` (required)
 * - `documentIds` (required): comma-separated list; empty entries are discarded.
 *
 * @param request - Incoming request; parameters are read from its query string.
 * @returns `{ snapshots }` as JSON, a 400 JSON error when a parameter is missing or the
 *   id list is empty after parsing, or a server-error response when the repository call
 *   throws (the error is logged).
 */
export async function GET(request: NextRequest) {
  const userId = await authenticateRequest(request);
  if (!userId) {
    return commonErrors.unauthorized();
  }

  const callerIsAdmin = await isAdmin();
  if (!callerIsAdmin) {
    return commonErrors.forbidden();
  }

  const query = request.nextUrl.searchParams;
  const agentId = query.get("agentId");
  const rawDocumentIds = query.get("documentIds");

  if (!agentId || !rawDocumentIds) {
    return NextResponse.json(
      { error: "agentId and documentIds are required" },
      { status: 400 }
    );
  }

  // Drop the empty pieces produced by leading, trailing or doubled commas.
  const documentIds = rawDocumentIds.split(",").filter((piece) => piece.length > 0);
  if (documentIds.length === 0) {
    return NextResponse.json({ error: "documentIds cannot be empty" }, { status: 400 });
  }

  try {
    const snapshots = await metaEvaluationRepository.getEvaluationSnapshots(documentIds, agentId);
    return NextResponse.json({ snapshots });
  } catch (err) {
    logger.error("Error fetching evaluation snapshots:", err);
    return commonErrors.serverError("Failed to fetch evaluation snapshots");
  }
}
56 changes: 56 additions & 0 deletions apps/web/src/app/api/monitor/lab/corpus/versions/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import { NextRequest, NextResponse } from "next/server";
import { logger } from "@/infrastructure/logging/logger";
import { authenticateRequest } from "@/infrastructure/auth/auth-helpers";
import { commonErrors } from "@/infrastructure/http/api-response-helpers";
import { isAdmin } from "@/infrastructure/auth/auth";
import { prisma } from "@roast/db";

/**
 * Lists the evaluation versions an agent has produced for one document, newest first.
 *
 * Query parameters:
 * - `agentId` (required)
 * - `documentId` (required)
 * - `beforeDate` (optional): when given, only versions created strictly before this
 *   date are returned. Must be parseable by `Date`.
 *
 * @param request - Incoming request; parameters are read from its query string.
 * @returns `{ versions }` as JSON (each with `id`, ISO-8601 `createdAt`, `grade`,
 *   `summary`, `version`), a 400 JSON error for missing parameters or an unparseable
 *   `beforeDate`, or a server-error response when the query throws (the error is logged).
 */
export async function GET(request: NextRequest) {
  const userId = await authenticateRequest(request);
  if (!userId) return commonErrors.unauthorized();

  const adminCheck = await isAdmin();
  if (!adminCheck) return commonErrors.forbidden();

  const searchParams = request.nextUrl.searchParams;
  const agentId = searchParams.get("agentId");
  const documentId = searchParams.get("documentId");
  const beforeDate = searchParams.get("beforeDate");

  if (!agentId || !documentId) {
    return NextResponse.json(
      { error: "agentId and documentId are required" },
      { status: 400 }
    );
  }

  // An unparseable cutoff produces an Invalid Date; report it as a client error instead
  // of letting it reach the database query.
  const cutoff = beforeDate ? new Date(beforeDate) : undefined;
  if (cutoff && Number.isNaN(cutoff.getTime())) {
    return NextResponse.json(
      { error: "beforeDate must be a valid date" },
      { status: 400 }
    );
  }

  try {
    const versions = await prisma.evaluationVersion.findMany({
      where: {
        agentId,
        evaluation: { documentId },
        ...(cutoff ? { createdAt: { lt: cutoff } } : {}),
      },
      orderBy: { createdAt: "desc" },
      select: {
        id: true,
        createdAt: true,
        grade: true,
        summary: true,
        version: true,
      },
    });

    return NextResponse.json({
      versions: versions.map((v) => ({
        id: v.id,
        // Serialize explicitly so the response shape does not depend on Date's toJSON.
        createdAt: v.createdAt.toISOString(),
        grade: v.grade,
        summary: v.summary,
        version: v.version,
      })),
    });
  } catch (error) {
    logger.error("Error fetching evaluation versions:", error);
    return commonErrors.serverError("Failed to fetch evaluation versions");
  }
}
Loading