quantified-uncertainty · michaelr524 · Jan 3, 2026 · Jan 3, 2026 · Jan 3, 2026 · Jan 3, 2026
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -177,6 +177,18 @@ pnpm --filter @roast/web run test:ci  # MUST actually run, not assume
 # TypeScript compiles ≠ tests pass
 ```
 
+### Development Workflow: Modify → Check Loop
+
+When making code changes, especially to internal packages (`@roast/ai`, `@roast/db`, `@roast/domain`, `@roast/jobs`), follow this verification loop before committing:
+
+1. **After modifying internal packages**: Run `pnpm turbo run typecheck` (not just `pnpm --filter @roast/web typecheck`). Turbo handles the dependency graph—it rebuilds packages first, then typechecks consumers with fresh `.d.ts` files. This mimics CI's clean-build behavior.
+
+2. **After modifying web app only**: Run `pnpm --filter @roast/web run typecheck && pnpm --filter @roast/web run lint`.
+
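+3. **Before pushing**: Always run the full check: `pnpm turbo run typecheck lint`. This catches cross-package type errors that per-package checks miss due to stale `dist/` folders. Do not add `--parallel`: that flag makes Turbo ignore the task dependency graph, so internal packages would not be rebuilt before their consumers are typechecked.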
+
+4. **Why this matters**: TypeScript project references use compiled `dist/` for type resolution. Local dev accumulates stale builds; CI starts fresh. If you see typecheck errors and assume they're "pre-existing," verify by rebuilding the source package first (`pnpm --filter @roast/ai run build`).
+
 ## Commands Quick Reference
 
 ### Development
@@ -186,6 +198,30 @@ pnpm --filter @roast/db run gen       # Generate Prisma client
 pnpm --filter @roast/db run db:push   # Push schema changes
 ```
 
+### Dev Environment & Database Access (Primary)
+
+**🚨 MANDATORY: Use ONLY these scripts for database access - NO EXCEPTIONS 🚨**
+
+```bash
+dev/scripts/dev-env.sh start|stop|status|attach|restart  # Manage tmux dev session
+dev/scripts/dev-env.sh psql [args]                       # Connect to local DB via Docker
+```
+
+**FORBIDDEN database access methods** (DO NOT USE):
+```bash
+❌ psql -h localhost ...           # Direct psql - FORBIDDEN
+❌ PGPASSWORD=... psql ...         # Direct psql with password - FORBIDDEN
+❌ docker run ... postgres psql    # Docker-based psql - FORBIDDEN
+❌ Any other method                # If it's not dev-env.sh psql, DON'T USE IT
+```
+
+**Database utilities** (Docker-based, no local psql needed):
+- `dev/scripts/dev/db/lib/db_functions.sh` - Core DB functions (`psql_local`, `psql_prod`, `pg_dump_prod`, `copy_data`)
+- `dev/scripts/dev/db/lib/common_utils.sh` - Shared bash utilities
+- `dev/scripts/dev/db/setup_db.sh` - Example: sync prod schema to local
+
+**AI agents MUST use `dev/scripts/dev-env.sh psql` - no alternatives allowed.**
+
 ### Testing
 ```bash
 # Test categories by cost/dependencies:
@@ -203,6 +239,29 @@ pnpm --filter @roast/web run typecheck  # TypeScript
 # MUST run both - linter doesn't catch type errors!
 ```
 
+### 🚨 Type Definitions: NO INLINE TYPES 🚨
+
+**NEVER use inline types.** Always define named interfaces or type aliases.
+
+```typescript
+// ❌ WRONG - inline types
+function Foo({ data }: { data: string; count: number }) { }
+const [state, setState] = useState<{ loading: boolean; error?: string }>();
+let result: { success: boolean; value: number };
+
+// ✅ CORRECT - named types
+interface FooProps { data: string; count: number; }
+function Foo({ data }: FooProps) { }
+
+interface LoadingState { loading: boolean; error?: string; }
+const [state, setState] = useState<LoadingState>();
+
+interface Result { success: boolean; value: number; }
+let result: Result;
+```
+
+**Why:** Inline types hurt readability, reusability, and refactoring. Named types are self-documenting and can be exported/shared.
+
 ## MCP Server Quick Fix
 
 **Problem**: Claude Code caches MCP servers, changes don't take effect
@@ -278,6 +337,22 @@ Details here"
 /bin/rm, /bin/cat, /bin/echo  # Use full paths
 ```
 
+## Tmux Key Sending
+
+When sending multiple keystrokes to tmux sessions (e.g., navigating CLI menus), use a loop with delays between keystrokes instead of sending them all at once.
+
+**Bad** (keys may be dropped or processed incorrectly):
+```bash
+tmux send-keys -t session Down Down Down Down Down Enter
+```
+
+**Good** (reliable keystroke delivery):
+```bash
+for i in {1..5}; do tmux send-keys -t session Down; sleep 0.1; done; tmux send-keys -t session Enter
+```
+
+This ensures each keystroke is processed before the next is sent, preventing navigation issues in terminal UIs.
+
 ## Documentation Structure
 - `/dev/docs/README.md` - Documentation index
 - `/dev/docs/development/` - Development guides

diff --git a/apps/web/src/app/api/monitor/lab/baselines/[id]/route.ts b/apps/web/src/app/api/monitor/lab/baselines/[id]/route.ts
@@ -0,0 +1,28 @@
+import { NextRequest, NextResponse } from "next/server";
+import { logger } from "@/infrastructure/logging/logger";
+import { authenticateRequest } from "@/infrastructure/auth/auth-helpers";
+import { commonErrors } from "@/infrastructure/http/api-response-helpers";
+import { isAdmin } from "@/infrastructure/auth/auth";
+import { metaEvaluationRepository } from "@roast/db";
+import type { RouteIdParams } from "../../types";
+
+/** Admin-only: deletes the validation baseline whose id is the route's `id` param. */
+export async function DELETE(request: NextRequest, { params }: RouteIdParams) {
+  // Reject unauthenticated callers first, then anyone who is not an admin.
+  const callerId = await authenticateRequest(request);
+  if (!callerId) {
+    return commonErrors.unauthorized();
+  }
+  if (!(await isAdmin())) {
+    return commonErrors.forbidden();
+  }
+
+  const baselineId = (await params).id;
+  try {
+    await metaEvaluationRepository.deleteValidationBaseline(baselineId);
+    return NextResponse.json({ success: true });
+  } catch (err) {
+    logger.error("Error deleting baseline:", err);
+    return commonErrors.serverError("Failed to delete baseline");
+  }
+}
diff --git a/apps/web/src/app/api/monitor/lab/baselines/route.ts b/apps/web/src/app/api/monitor/lab/baselines/route.ts
@@ -0,0 +1,96 @@
+import { NextRequest, NextResponse } from "next/server";
+import { logger } from "@/infrastructure/logging/logger";
+import { authenticateRequest } from "@/infrastructure/auth/auth-helpers";
+import { commonErrors } from "@/infrastructure/http/api-response-helpers";
+import { isAdmin } from "@/infrastructure/auth/auth";
+import { metaEvaluationRepository, prisma } from "@roast/db";
+
+/**
+ * Admin-only: returns `{ baselines }` for the agent named by the `agentId`
+ * query parameter, or a 400 JSON error when that parameter is missing.
+ */
+export async function GET(request: NextRequest) {
+  const callerId = await authenticateRequest(request);
+  if (!callerId) return commonErrors.unauthorized();
+  if (!(await isAdmin())) return commonErrors.forbidden();
+  const agentId = request.nextUrl.searchParams.get("agentId");
+  if (agentId === null || agentId === "") {
+    return NextResponse.json({ error: "agentId is required" }, { status: 400 });
+  }
+  try {
+    const found = await metaEvaluationRepository.getValidationBaselines(agentId);
+    return NextResponse.json({ baselines: found });
+  } catch (err) {
+    logger.error("Error fetching baselines:", err);
+    return commonErrors.serverError("Failed to fetch baselines");
+  }
+}
+
+/**
+ * Creates a validation baseline for an agent (admin-only).
+ *
+ * JSON body: `name` and `agentId` (required), `description`, and either
+ * `evaluationVersionIds` or `documentIds` (+ optional `beforeDate` cutoff);
+ * for `documentIds` the newest evaluation version per document is used.
+ * Malformed JSON, id lists that are not string arrays, and an unparseable
+ * `beforeDate` get a 400 instead of the generic server-error response.
+ */
+export async function POST(request: NextRequest) {
+  const userId = await authenticateRequest(request);
+  if (!userId) return commonErrors.unauthorized();
+
+  const adminCheck = await isAdmin();
+  if (!adminCheck) return commonErrors.forbidden();
+
+  const badRequest = (error: string) => NextResponse.json({ error }, { status: 400 });
+  const isIdList = (value: unknown): value is string[] =>
+    Array.isArray(value) && value.every((item) => typeof item === "string");
+
+  try {
+    // JSON.parse never yields undefined, so undefined here means the body was unparseable.
+    const body = await request.json().catch(() => undefined);
+    if (body === undefined) return badRequest("Request body must be valid JSON");
+    const { name, description, agentId, documentIds, evaluationVersionIds, beforeDate } = body ?? {};
+
+    if (typeof name !== "string" || !name || typeof agentId !== "string" || !agentId) {
+      return badRequest("name and agentId are required");
+    }
+    if (evaluationVersionIds != null && !isIdList(evaluationVersionIds)) {
+      return badRequest("evaluationVersionIds must be an array of strings");
+    }
+
+    // Get evaluation version IDs from document IDs if not provided directly
+    let evalVersionIds: string[] = evaluationVersionIds ?? [];
+    if (evalVersionIds.length === 0 && documentIds?.length) {
+      if (!isIdList(documentIds)) return badRequest("documentIds must be an array of strings");
+      const cutoff = beforeDate ? new Date(beforeDate) : undefined;
+      if (cutoff && Number.isNaN(cutoff.getTime())) return badRequest("beforeDate must be a valid date");
+      // Newest first, so the first row seen for a document is its latest version.
+      const evaluations = await prisma.evaluationVersion.findMany({
+        where: {
+          agentId,
+          evaluation: { documentId: { in: documentIds } },
+          ...(cutoff ? { createdAt: { lt: cutoff } } : {}),
+        },
+        orderBy: { createdAt: "desc" },
+        select: { id: true, evaluation: { select: { documentId: true } } },
+      });
+      const latestByDoc = new Map<string, string>();
+      for (const ev of evaluations) {
+        if (!latestByDoc.has(ev.evaluation.documentId)) latestByDoc.set(ev.evaluation.documentId, ev.id);
+      }
+      evalVersionIds = Array.from(latestByDoc.values());
+    }
+    if (evalVersionIds.length === 0) {
+      return badRequest("No evaluation versions found for the selected documents");
+    }
+
+    const baseline = await metaEvaluationRepository.createValidationBaseline({
+      name, description, agentId, evaluationVersionIds: evalVersionIds, createdById: userId,
+    });
+    return NextResponse.json({ baseline });
+  } catch (error) {
+    logger.error("Error creating baseline:", error);
+    return commonErrors.serverError("Failed to create baseline");
+  }
+}
diff --git a/apps/web/src/app/api/monitor/lab/corpus/route.ts b/apps/web/src/app/api/monitor/lab/corpus/route.ts
@@ -0,0 +1,33 @@
+import { NextRequest, NextResponse } from "next/server";
+import { logger } from "@/infrastructure/logging/logger";
+import { authenticateRequest } from "@/infrastructure/auth/auth-helpers";
+import { commonErrors } from "@/infrastructure/http/api-response-helpers";
+import { isAdmin } from "@/infrastructure/auth/auth";
+import { metaEvaluationRepository } from "@roast/db";
+
+/**
+ * Admin-only: lists validation corpus documents for `agentId`; `filter` is forwarded unchanged.
+ * `limit` defaults to 500; a value that is NaN or negative after parsing gets a 400 response.
+ */
+export async function GET(request: NextRequest) {
+  const userId = await authenticateRequest(request);
+  if (!userId) return commonErrors.unauthorized();
+  const adminCheck = await isAdmin();
+  if (!adminCheck) return commonErrors.forbidden();
+
+  const query = request.nextUrl.searchParams;
+  const agentId = query.get("agentId");
+  const filter = query.get("filter") || undefined;
+  const limit = Number.parseInt(query.get("limit") || "500", 10);
+  if (!agentId) return NextResponse.json({ error: "agentId is required" }, { status: 400 });
+  if (Number.isNaN(limit) || limit < 0) {
+    return NextResponse.json({ error: "limit must be a non-negative integer" }, { status: 400 });
+  }
+  try {
+    const documents = await metaEvaluationRepository.getValidationCorpusDocuments(agentId, { filter, limit });
+    return NextResponse.json({ documents });
+  } catch (error) {
+    logger.error("Error fetching corpus documents:", error);
+    return commonErrors.serverError("Failed to fetch corpus documents");
+  }
+}
diff --git a/apps/web/src/app/api/monitor/lab/corpus/snapshots/route.ts b/apps/web/src/app/api/monitor/lab/corpus/snapshots/route.ts
@@ -0,0 +1,38 @@
+import { NextRequest, NextResponse } from "next/server";
+import { logger } from "@/infrastructure/logging/logger";
+import { authenticateRequest } from "@/infrastructure/auth/auth-helpers";
+import { commonErrors } from "@/infrastructure/http/api-response-helpers";
+import { isAdmin } from "@/infrastructure/auth/auth";
+import { metaEvaluationRepository } from "@roast/db";
+
+/**
+ * Get evaluation snapshots for a set of documents (used when creating baselines).
+ *
+ * Admin-only. Requires the `agentId` query parameter and a comma-separated
+ * `documentIds` query parameter; empty segments in that list are dropped.
+ */
+export async function GET(request: NextRequest) {
+  const callerId = await authenticateRequest(request);
+  if (!callerId) return commonErrors.unauthorized();
+  if (!(await isAdmin())) return commonErrors.forbidden();
+
+  const { searchParams } = request.nextUrl;
+  const agentId = searchParams.get("agentId");
+  const rawIds = searchParams.get("documentIds");
+  if (!agentId || !rawIds) {
+    return NextResponse.json({ error: "agentId and documentIds are required" }, { status: 400 });
+  }
+
+  const documentIds = rawIds.split(",").filter((part) => part.length > 0);
+  if (documentIds.length === 0) {
+    return NextResponse.json({ error: "documentIds cannot be empty" }, { status: 400 });
+  }
+
+  try {
+    const snapshots = await metaEvaluationRepository.getEvaluationSnapshots(documentIds, agentId);
+    return NextResponse.json({ snapshots });
+  } catch (err) {
+    logger.error("Error fetching evaluation snapshots:", err);
+    return commonErrors.serverError("Failed to fetch evaluation snapshots");
+  }
+}
diff --git a/apps/web/src/app/api/monitor/lab/corpus/versions/route.ts b/apps/web/src/app/api/monitor/lab/corpus/versions/route.ts
@@ -0,0 +1,56 @@
+import { NextRequest, NextResponse } from "next/server";
+import { logger } from "@/infrastructure/logging/logger";
+import { authenticateRequest } from "@/infrastructure/auth/auth-helpers";
+import { commonErrors } from "@/infrastructure/http/api-response-helpers";
+import { isAdmin } from "@/infrastructure/auth/auth";
+import { prisma } from "@roast/db";
+
+/**
+ * Admin-only: lists the evaluation versions of one document for one agent,
+ * newest first. `agentId` and `documentId` query parameters are required.
+ * Optional `beforeDate` keeps only versions created strictly before it; an
+ * unparseable value is rejected with 400 rather than being handed to the
+ * query as an Invalid Date.
+ */
+export async function GET(request: NextRequest) {
+  const userId = await authenticateRequest(request);
+  if (!userId) return commonErrors.unauthorized();
+
+  const adminCheck = await isAdmin();
+  if (!adminCheck) return commonErrors.forbidden();
+
+  const agentId = request.nextUrl.searchParams.get("agentId");
+  const documentId = request.nextUrl.searchParams.get("documentId");
+  const beforeDate = request.nextUrl.searchParams.get("beforeDate");
+
+  if (!agentId || !documentId) {
+    return NextResponse.json(
+      { error: "agentId and documentId are required" },
+      { status: 400 }
+    );
+  }
+
+  // A missing or empty beforeDate means "no cutoff".
+  const cutoff = beforeDate ? new Date(beforeDate) : undefined;
+  if (cutoff && Number.isNaN(cutoff.getTime())) {
+    return NextResponse.json({ error: "beforeDate must be a valid date" }, { status: 400 });
+  }
+
+  try {
+    const versions = await prisma.evaluationVersion.findMany({
+      where: {
+        agentId,
+        evaluation: { documentId },
+        ...(cutoff ? { createdAt: { lt: cutoff } } : {}),
+      },
+      orderBy: { createdAt: "desc" },
+      select: { id: true, createdAt: true, grade: true, summary: true, version: true },
+    });
+    return NextResponse.json({
+      versions: versions.map((v) => ({ ...v, createdAt: v.createdAt.toISOString() })),
+    });
+  } catch (error) {
+    logger.error("Error fetching evaluation versions:", error);
+    return commonErrors.serverError("Failed to fetch evaluation versions");
+  }
+}