diff --git a/bench/README.md b/bench/README.md new file mode 100644 index 0000000..737d30b --- /dev/null +++ b/bench/README.md @@ -0,0 +1,64 @@ +# Quipbench + +Standalone benchmark runner + static dashboard for Quipslop models. + +## What it does + +- Runs live OpenRouter self-play rounds (same mechanics as the main game) +- Computes Elo-first leaderboard with wins/games/win-rate +- Stores run + match + rating records in `bench/quipbench.sqlite` +- Exports latest snapshot to `bench/out/latest.json` and `bench/out/latest.js` +- Renders a standalone dashboard at `bench/dashboard/index.html` + +## Prerequisites + +- Bun +- `OPENROUTER_API_KEY` set in environment for live runs + +## Commands + +From repo root: + +- `bun run quipbench:run` +- `bun run quipbench:export` +- `bun run quipbench:open` + +### Run options + +`quipbench:run` supports CLI flags: + +- `--rounds=100` +- `--concurrency=4` +- `--k=24` +- `--initialElo=1500` +- `--seed=12345` +- `--out=bench/out` +- `--db=bench/quipbench.sqlite` + +Example: + +```bash +bun bench/run.ts --rounds=150 --concurrency=6 --seed=42 +``` + +## Output contract (`latest` snapshot) + +`bench/out/latest.json` and `bench/out/latest.js` contain: + +- `runMeta`: `runId`, `startedAt`, `endedAt`, `roundsRequested`, `roundsCompleted`, `failures`, `concurrency`, `eloK`, `initialElo`, `seed` +- `leaderboard[]`: `rank`, `modelId`, `modelName`, `elo`, `wins`, `games`, `winRate` +- `chart[]`: `{ modelName, elo }` + +## Dashboard + +Open `bench/dashboard/index.html` directly or via: + +```bash +bun run quipbench:open +``` + +The dashboard reads `../out/latest.js` and shows: + +- run metadata summary +- vertical Elo bar chart with model names under each bar +- leaderboard table diff --git a/bench/config.ts b/bench/config.ts new file mode 100644 index 0000000..01b15ab --- /dev/null +++ b/bench/config.ts @@ -0,0 +1,29 @@ +import { join } from "node:path"; + +export const BENCH_DIR = import.meta.dir; + +export const DEFAULT_ROUNDS = 100; +export const DEFAULT_CONCURRENCY = 4; +export const DEFAULT_ELO_K = 24; +export const DEFAULT_INITIAL_ELO = 1500; + +export const DEFAULT_DB_PATH = join(BENCH_DIR, "quipbench.sqlite"); +export const DEFAULT_OUTPUT_DIR = join(BENCH_DIR, "out"); +export const DEFAULT_LATEST_JSON_PATH = join(DEFAULT_OUTPUT_DIR, "latest.json"); +export const DEFAULT_LATEST_JS_PATH = join(DEFAULT_OUTPUT_DIR, "latest.js"); + +export function parsePositiveInt( + value: string | undefined, + fallback: number, +): number { + const parsed = Number.parseInt(value ?? "", 10); + return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback; +} + +export function parsePositiveNumber( + value: string | undefined, + fallback: number, +): number { + const parsed = Number.parseFloat(value ?? ""); + return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback; +} diff --git a/bench/dashboard/app.js b/bench/dashboard/app.js new file mode 100644 index 0000000..42fa1f8 --- /dev/null +++ b/bench/dashboard/app.js @@ -0,0 +1,277 @@ +const root = document.getElementById("app"); +const snapshot = window.__QUIPBENCH_LATEST__; +const logoImageCache = new Map(); + +function logoFor(name) { + if (name.includes("Gemini")) return "./assets/logos/gemini.svg"; + if (name.includes("Kimi")) return "./assets/logos/kimi.svg"; + if (name.includes("DeepSeek")) return "./assets/logos/deepseek.svg"; + if (name.includes("GLM")) return "./assets/logos/glm.svg"; + if (name.includes("GPT")) return "./assets/logos/openai.svg"; + if (name.includes("Opus") || name.includes("Sonnet")) return "./assets/logos/claude.svg"; + if (name.includes("Grok")) return "./assets/logos/grok.svg"; + if (name.includes("MiniMax")) return "./assets/logos/minimax.svg"; + return null; +} + +function formatDate(ts) { + const date = new Date(ts); + if (Number.isNaN(date.getTime())) return ts; + return date.toLocaleString(); +} + +function rowHtml(row) { + const logo = logoFor(row.modelName); + return ` + + ${row.rank} + +
+ ${logo ? `` : ""} + ${row.modelName} +
+ + ${row.elo.toFixed(2)} + ${row.wins} + ${row.games} + ${row.winRate.toFixed(2)}% + + `; +} + +function renderChart(rows) { + const chartCanvas = document.getElementById("elo-chart"); + if (!chartCanvas || typeof Chart === "undefined" || rows.length === 0) return; + + const sorted = [...rows].sort((a, b) => b.elo - a.elo); + const labels = sorted.map((row) => row.modelName); + const data = sorted.map((row) => Number(row.elo.toFixed(2))); + const max = Math.max(...data); + const min = Math.min(...data); + const yMin = Math.floor(min - 20); + const yMax = Math.ceil(max + 10); + + const iconPlugin = { + id: "barIcons", + afterDatasetsDraw(chart) { + const { ctx } = chart; + const meta = chart.getDatasetMeta(0); + const topY = chart.scales.y.getPixelForValue(yMax); + + meta.data.forEach((bar, index) => { + const modelName = labels[index]; + const iconUrl = logoFor(modelName); + if (!iconUrl) return; + + let img = logoImageCache.get(iconUrl); + if (!img) { + img = new Image(); + img.src = iconUrl; + img.onload = () => chart.draw(); + logoImageCache.set(iconUrl, img); + } + if (!img.complete || !img.naturalWidth) return; + + const iconSize = 18; + const x = bar.x - iconSize / 2; + const y = Math.max(topY + 4, bar.y - iconSize - 6); + + ctx.save(); + ctx.fillStyle = "#0a0a0a"; + ctx.strokeStyle = "#2a2a2a"; + ctx.lineWidth = 1; + if (typeof ctx.roundRect === "function") { + ctx.beginPath(); + ctx.roundRect(x - 3, y - 3, iconSize + 6, iconSize + 6, 6); + ctx.fill(); + ctx.stroke(); + } else { + ctx.fillRect(x - 3, y - 3, iconSize + 6, iconSize + 6); + ctx.strokeRect(x - 3, y - 3, iconSize + 6, iconSize + 6); + } + ctx.drawImage(img, x, y, iconSize, iconSize); + ctx.restore(); + }); + }, + }; + + new Chart(chartCanvas, { + type: "bar", + data: { + labels, + datasets: [ + { + label: "Elo", + data, + borderWidth: 1, + borderColor: "#3c2018", + backgroundColor: [ + "#e8ab97", + "#e09a81", + "#d98367", + "#d97757", + "#ca6b4b", + "#bc6141", + "#ae5637", + "#9f4b2d", + ], + borderRadius: 6, + maxBarThickness: 72, + }, + ], + }, + options: { + responsive: true, + maintainAspectRatio: false, + plugins: { + legend: { display: false }, + tooltip: { + displayColors: false, + backgroundColor: "#101010", + borderColor: "#2d2d2d", + borderWidth: 1, + titleColor: "#f0f0f0", + bodyColor: "#d4d4d4", + callbacks: { + label(context) { + return `Elo ${Number(context.raw).toFixed(2)}`; + }, + }, + }, + }, + scales: { + x: { + ticks: { + color: "#b8b8b8", + maxRotation: 0, + autoSkip: false, + font: { family: "JetBrains Mono", size: 11 }, + }, + grid: { color: "rgba(255,255,255,0.04)" }, + }, + y: { + min: yMin, + max: yMax, + ticks: { + color: "#8b8b8b", + font: { family: "JetBrains Mono", size: 11 }, + }, + grid: { color: "rgba(255,255,255,0.07)" }, + }, + }, + }, + plugins: [iconPlugin], + }); +} + +function renderEmpty() { + root.innerHTML = ` +
+
+
+ Quipbench +

Quipbench

+
+
+
+

No snapshot found

+

Run a benchmark first: bun run quipbench:run

+

Then refresh this page. Snapshot expected at bench/out/latest.js.

+
+
+ `; +} + +function render(snapshotData) { + const meta = snapshotData.runMeta; + const leaderboard = snapshotData.leaderboard; + const champion = leaderboard[0]; + + root.innerHTML = ` +
+
+
+ Quipbench +

Quipbench

+
+ +
+ Run ${meta.runId} + ${meta.roundsCompleted}/${meta.roundsRequested} rounds + ${meta.failures} failures +
+
+ +
+
+
Started
+
${formatDate(meta.startedAt)}
+
+
+
Ended
+
${formatDate(meta.endedAt)}
+
+
+
Champion
+
${champion ? champion.modelName : "-"}
+
+
+
Seed
+
${meta.seed}
+
+
+ +
+
+

Elo Leaderboard

+
+
+ +
+
+ +
+
+

Leaderboard Table

+
+
+ + + + + + + + + + + + + ${leaderboard.map(rowHtml).join("")} + +
#ModelEloWinsGamesWin Rate
+
+
+
+ `; +} + +if (!snapshot || !snapshot.leaderboard) { + renderEmpty(); +} else { + render(snapshot); + renderChart(snapshot.leaderboard); +} diff --git a/bench/dashboard/app.ts b/bench/dashboard/app.ts new file mode 100644 index 0000000..8997c7e --- /dev/null +++ b/bench/dashboard/app.ts @@ -0,0 +1,3 @@ +// Source shim to satisfy TypeScript-based workflows. +// The standalone dashboard intentionally runs plain JS for direct browser compatibility. +import "./app.js"; diff --git a/bench/dashboard/assets/logo.svg b/bench/dashboard/assets/logo.svg new file mode 100644 index 0000000..719d15b --- /dev/null +++ b/bench/dashboard/assets/logo.svg @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/bench/dashboard/assets/logos/claude.svg b/bench/dashboard/assets/logos/claude.svg new file mode 100644 index 0000000..d300701 --- /dev/null +++ b/bench/dashboard/assets/logos/claude.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/bench/dashboard/assets/logos/deepseek.svg b/bench/dashboard/assets/logos/deepseek.svg new file mode 100644 index 0000000..1401c17 --- /dev/null +++ b/bench/dashboard/assets/logos/deepseek.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/bench/dashboard/assets/logos/gemini.svg b/bench/dashboard/assets/logos/gemini.svg new file mode 100644 index 0000000..87cce06 --- /dev/null +++ b/bench/dashboard/assets/logos/gemini.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/bench/dashboard/assets/logos/glm.svg b/bench/dashboard/assets/logos/glm.svg new file mode 100644 index 0000000..28ca728 --- /dev/null +++ b/bench/dashboard/assets/logos/glm.svg @@ -0,0 +1,215 @@ + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/bench/dashboard/assets/logos/grok.svg b/bench/dashboard/assets/logos/grok.svg new file mode 100644 index 0000000..06ab179 --- /dev/null +++ b/bench/dashboard/assets/logos/grok.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/bench/dashboard/assets/logos/kimi.svg b/bench/dashboard/assets/logos/kimi.svg new file mode 100644 index 0000000..db43fce --- /dev/null +++ b/bench/dashboard/assets/logos/kimi.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/bench/dashboard/assets/logos/minimax.svg b/bench/dashboard/assets/logos/minimax.svg new file mode 100644 index 0000000..2a60bd4 --- /dev/null +++ b/bench/dashboard/assets/logos/minimax.svg @@ -0,0 +1 @@ +Minimax \ No newline at end of file diff --git a/bench/dashboard/assets/logos/openai.svg b/bench/dashboard/assets/logos/openai.svg new file mode 100644 index 0000000..b6d542d --- /dev/null +++ b/bench/dashboard/assets/logos/openai.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/bench/dashboard/index.html b/bench/dashboard/index.html new file mode 100644 index 0000000..edcf297 --- /dev/null +++ b/bench/dashboard/index.html @@ -0,0 +1,21 @@ + + + + + + Quipbench Dashboard + + + + + + +
+ + + + + diff --git a/bench/dashboard/styles.css b/bench/dashboard/styles.css new file mode 100644 index 0000000..31a5699 --- /dev/null +++ b/bench/dashboard/styles.css @@ -0,0 +1,251 @@ +* { + margin: 0; + padding: 0; + box-sizing: border-box; +} + +:root { + --bg: #050505; + --surface: #0a0a0a; + --surface-2: #111111; + --border: #212121; + --border-light: #2d2d2d; + --text: #ededed; + --text-dim: #a2a2a2; + --text-muted: #6a6a6a; + --accent: #d97757; + --mono: "JetBrains Mono", "SF Mono", monospace; + --sans: "Inter", -apple-system, sans-serif; + --serif: "DM Serif Display", Georgia, serif; +} + +body { + background: var(--bg); + color: var(--text); + font-family: var(--sans); + min-height: 100vh; + -webkit-font-smoothing: antialiased; +} + +.app { + min-height: 100vh; +} + +.shell { + max-width: 1200px; + margin: 0 auto; + padding: 32px 20px 64px; + display: flex; + flex-direction: column; + gap: 20px; +} + +.header { + display: flex; + flex-wrap: wrap; + justify-content: space-between; + gap: 12px; + align-items: center; +} + +.brand { + display: inline-flex; + align-items: center; + gap: 12px; +} + +.brand img { + width: 28px; + height: 28px; +} + +.brand h1 { + font-family: var(--serif); + font-size: clamp(26px, 3.5vw, 36px); + letter-spacing: -0.4px; +} + +.header-links { + display: inline-flex; + align-items: center; + gap: 10px; +} + +.header-links a { + color: var(--text-dim); + text-decoration: none; + border: 1px solid var(--border); + background: rgba(255, 255, 255, 0.02); + border-radius: 999px; + padding: 6px 10px; + font-family: var(--mono); + font-size: 11px; + display: inline-flex; + align-items: center; + gap: 6px; +} + +.header-links a:hover { + color: var(--text); + border-color: var(--border-light); +} + +.header-links a svg { + width: 13px; + height: 13px; + fill: currentColor; +} + +.meta-pills { + display: flex; + flex-wrap: wrap; + gap: 8px; +} + +.pill { + border: 1px solid var(--border); + background: rgba(255, 255, 255, 0.02); + color: var(--text-dim); + border-radius: 999px; + padding: 6px 10px; + font-size: 11px; +} + +.panel { + border: 1px solid var(--border); + background: var(--surface); + border-radius: 12px; + padding: 18px; +} + +.panel--summary { + display: grid; + grid-template-columns: repeat(4, minmax(0, 1fr)); + gap: 12px; +} + +.summary-block { + border: 1px solid var(--border); + background: var(--surface-2); + border-radius: 8px; + padding: 10px 12px; + min-width: 0; +} + +.summary-label { + color: var(--text-muted); + font-size: 11px; + letter-spacing: 0.5px; + text-transform: uppercase; + margin-bottom: 6px; +} + +.summary-value { + font-size: 14px; + color: var(--text); + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} + +.panel-head { + margin-bottom: 16px; + display: flex; + flex-wrap: wrap; + align-items: baseline; + justify-content: space-between; + gap: 8px; +} + +.panel-head h2 { + font-size: 18px; + letter-spacing: -0.2px; +} + +.panel-head p { + color: var(--text-muted); + font-size: 11px; +} + +.chart-shell { + height: min(46vw, 420px); + min-height: 280px; + border: 1px solid var(--border); + border-radius: 10px; + background: linear-gradient(to bottom, #121212, #0d0d0d); + padding: 12px; +} + +.table-wrap { + overflow-x: auto; +} + +table { + width: 100%; + border-collapse: collapse; + min-width: 680px; +} + +th, +td { + text-align: left; + padding: 10px; + border-bottom: 1px solid var(--border); + font-size: 13px; +} + +th { + color: var(--text-muted); + font-weight: 600; +} + +.model-cell { + display: inline-flex; + align-items: center; + gap: 8px; +} + +.model-cell img { + width: 16px; + height: 16px; +} + +.rank { + color: var(--text-dim); + width: 36px; +} + +.mono { + font-family: var(--mono); +} + +.empty { + font-family: var(--mono); + font-size: 14px; + color: var(--text-muted); +} + +code { + font-family: var(--mono); + font-size: 12px; + border: 1px solid var(--border); + border-radius: 6px; + padding: 2px 6px; + background: var(--surface-2); +} + +@media (max-width: 900px) { + .panel--summary { + grid-template-columns: repeat(2, minmax(0, 1fr)); + } +} + +@media (max-width: 560px) { + .shell { + padding: 22px 12px 40px; + } + + .panel--summary { + grid-template-columns: 1fr; + } +} diff --git a/bench/db.ts b/bench/db.ts new file mode 100644 index 0000000..d01b196 --- /dev/null +++ b/bench/db.ts @@ -0,0 +1,372 @@ +import { Database } from "bun:sqlite"; +import type { LeaderboardRow, MatchRecord, RunMeta } from "./types"; + +export type RunRow = { + id: string; + started_at: string; + ended_at: string | null; + rounds_requested: number; + rounds_completed: number; + failures: number; + concurrency: number; + elo_k: number; + initial_elo: number; + seed: number; + status: string; + output_dir: string; +}; + +export function openBenchDb(path: string): Database { + const db = new Database(path, { create: true }); + initSchema(db); + return db; +} + +function initSchema(db: Database) { + db.exec(` + CREATE TABLE IF NOT EXISTS runs ( + id TEXT PRIMARY KEY, + started_at TEXT NOT NULL, + ended_at TEXT, + rounds_requested INTEGER NOT NULL, + rounds_completed INTEGER NOT NULL DEFAULT 0, + failures INTEGER NOT NULL DEFAULT 0, + concurrency INTEGER NOT NULL, + elo_k REAL NOT NULL, + initial_elo REAL NOT NULL, + seed INTEGER NOT NULL, + status TEXT NOT NULL, + output_dir TEXT NOT NULL + ); + + CREATE TABLE IF NOT EXISTS matches ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + run_id TEXT NOT NULL, + round_num INTEGER NOT NULL, + created_at DATETIME DEFAULT CURRENT_TIMESTAMP, + prompter_id TEXT NOT NULL, + prompter_name TEXT NOT NULL, + contestant_a_id TEXT NOT NULL, + contestant_a_name TEXT NOT NULL, + contestant_b_id TEXT NOT NULL, + contestant_b_name TEXT NOT NULL, + prompt TEXT, + answer_a TEXT, + answer_b TEXT, + votes_a INTEGER NOT NULL DEFAULT 0, + votes_b INTEGER NOT NULL DEFAULT 0, + winner TEXT NOT NULL, + error TEXT, + payload_json TEXT, + FOREIGN KEY(run_id) REFERENCES runs(id) + ); + + CREATE TABLE IF NOT EXISTS ratings ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + run_id TEXT NOT NULL, + model_id TEXT NOT NULL, + model_name TEXT NOT NULL, + elo REAL NOT NULL, + wins INTEGER NOT NULL, + games INTEGER NOT NULL, + win_rate REAL NOT NULL, + rank INTEGER NOT NULL, + FOREIGN KEY(run_id) REFERENCES runs(id) + ); + + CREATE INDEX IF NOT EXISTS idx_matches_run_id ON matches(run_id); + CREATE INDEX IF NOT EXISTS idx_ratings_run_id ON ratings(run_id); + CREATE INDEX IF NOT EXISTS idx_runs_started_at ON runs(started_at); + `); +} + +export function insertRunStart( + db: Database, + row: { + id: string; + startedAt: string; + roundsRequested: number; + concurrency: number; + eloK: number; + initialElo: number; + seed: number; + outputDir: string; + }, +) { + const stmt = db.prepare(` + INSERT INTO runs ( + id, + started_at, + rounds_requested, + concurrency, + elo_k, + initial_elo, + seed, + status, + output_dir + ) VALUES ( + $id, + $started_at, + $rounds_requested, + $concurrency, + $elo_k, + $initial_elo, + $seed, + 'running', + $output_dir + ) + `); + + stmt.run({ + $id: row.id, + $started_at: row.startedAt, + $rounds_requested: row.roundsRequested, + $concurrency: row.concurrency, + $elo_k: row.eloK, + $initial_elo: row.initialElo, + $seed: row.seed, + $output_dir: row.outputDir, + }); +} + +export function updateRunProgress( + db: Database, + runId: string, + progress: { roundsCompleted: number; failures: number }, +) { + const stmt = db.prepare(` + UPDATE runs + SET rounds_completed = $rounds_completed, + failures = $failures + WHERE id = $id + `); + stmt.run({ + $id: runId, + $rounds_completed: progress.roundsCompleted, + $failures: progress.failures, + }); +} + +export function finalizeRun( + db: Database, + runId: string, + status: "completed" | "failed", + endedAt: string, +) { + const stmt = db.prepare(` + UPDATE runs + SET status = $status, + ended_at = $ended_at + WHERE id = $id + `); + stmt.run({ + $id: runId, + $status: status, + $ended_at: endedAt, + }); +} + +export function insertMatch(db: Database, match: MatchRecord) { + const stmt = db.prepare(` + INSERT INTO matches ( + run_id, + round_num, + prompter_id, + prompter_name, + contestant_a_id, + contestant_a_name, + contestant_b_id, + contestant_b_name, + prompt, + answer_a, + answer_b, + votes_a, + votes_b, + winner, + error, + payload_json + ) VALUES ( + $run_id, + $round_num, + $prompter_id, + $prompter_name, + $contestant_a_id, + $contestant_a_name, + $contestant_b_id, + $contestant_b_name, + $prompt, + $answer_a, + $answer_b, + $votes_a, + $votes_b, + $winner, + $error, + $payload_json + ) + `); + + stmt.run({ + $run_id: match.runId, + $round_num: match.roundNum, + $prompter_id: match.prompter.id, + $prompter_name: match.prompter.name, + $contestant_a_id: match.contestantA.id, + $contestant_a_name: match.contestantA.name, + $contestant_b_id: match.contestantB.id, + $contestant_b_name: match.contestantB.name, + $prompt: match.prompt ?? null, + $answer_a: match.answerA ?? null, + $answer_b: match.answerB ?? null, + $votes_a: match.votesA, + $votes_b: match.votesB, + $winner: match.winner, + $error: match.error ?? null, + $payload_json: JSON.stringify(match), + }); +} + +export function replaceRatings( + db: Database, + runId: string, + leaderboard: LeaderboardRow[], +) { + db.prepare("DELETE FROM ratings WHERE run_id = $run_id").run({ $run_id: runId }); + + const stmt = db.prepare(` + INSERT INTO ratings ( + run_id, + model_id, + model_name, + elo, + wins, + games, + win_rate, + rank + ) VALUES ( + $run_id, + $model_id, + $model_name, + $elo, + $wins, + $games, + $win_rate, + $rank + ) + `); + + const tx = db.transaction((rows: LeaderboardRow[]) => { + for (const row of rows) { + stmt.run({ + $run_id: runId, + $model_id: row.modelId, + $model_name: row.modelName, + $elo: row.elo, + $wins: row.wins, + $games: row.games, + $win_rate: row.winRate, + $rank: row.rank, + }); + } + }); + + tx(leaderboard); +} + +export function getLatestCompletedRunId(db: Database): string | null { + const row = db + .query( + ` + SELECT id + FROM runs + WHERE status = 'completed' + ORDER BY datetime(ended_at) DESC, datetime(started_at) DESC + LIMIT 1 + `, + ) + .get() as { id: string } | null; + + return row?.id ?? null; +} + +export function getRunRow(db: Database, runId: string): RunRow | null { + return ( + (db + .query(` + SELECT + id, + started_at, + ended_at, + rounds_requested, + rounds_completed, + failures, + concurrency, + elo_k, + initial_elo, + seed, + status, + output_dir + FROM runs + WHERE id = $run_id + LIMIT 1 + `) + .get({ $run_id: runId }) as RunRow | null) ?? null + ); +} + +export function getRatingsForRun( + db: Database, + runId: string, +): LeaderboardRow[] { + const rows = db + .query(` + SELECT + rank, + model_id, + model_name, + elo, + wins, + games, + win_rate + FROM ratings + WHERE run_id = $run_id + ORDER BY rank ASC + `) + .all({ $run_id: runId }) as Array<{ + rank: number; + model_id: string; + model_name: string; + elo: number; + wins: number; + games: number; + win_rate: number; + }>; + + return rows.map((row) => ({ + rank: row.rank, + modelId: row.model_id, + modelName: row.model_name, + elo: row.elo, + wins: row.wins, + games: row.games, + winRate: row.win_rate, + })); +} + +export function runRowToMeta(row: RunRow): RunMeta { + if (!row.ended_at) { + throw new Error(`Run ${row.id} is not finalized`); + } + + return { + runId: row.id, + startedAt: row.started_at, + endedAt: row.ended_at, + roundsRequested: row.rounds_requested, + roundsCompleted: row.rounds_completed, + failures: row.failures, + concurrency: row.concurrency, + eloK: row.elo_k, + initialElo: row.initial_elo, + seed: row.seed, + }; +} diff --git a/bench/elo.test.ts b/bench/elo.test.ts new file mode 100644 index 0000000..080581b --- /dev/null +++ b/bench/elo.test.ts @@ -0,0 +1,21 @@ +import { expect, test } from "bun:test"; +import { expectedScore, updatePairElo } from "./elo"; + +test("expected score is symmetric", () => { + const a = expectedScore(1600, 1500); + const b = expectedScore(1500, 1600); + expect(Number((a + b).toFixed(8))).toBe(1); + expect(a).toBeGreaterThan(0.5); +}); + +test("winner gains rating and loser drops", () => { + const { nextA, nextB } = updatePairElo(1500, 1500, 1, 24); + expect(nextA).toBeGreaterThan(1500); + expect(nextB).toBeLessThan(1500); +}); + +test("tie moves ratings toward each other", () => { + const { nextA, nextB } = updatePairElo(1700, 1500, 0.5, 24); + expect(nextA).toBeLessThan(1700); + expect(nextB).toBeGreaterThan(1500); +}); diff --git a/bench/elo.ts b/bench/elo.ts new file mode 100644 index 0000000..f20dd3f --- /dev/null +++ b/bench/elo.ts @@ -0,0 +1,26 @@ +export function expectedScore(playerElo: number, opponentElo: number): number { + return 1 / (1 + 10 ** ((opponentElo - playerElo) / 400)); +} + +export function nextElo( + playerElo: number, + opponentElo: number, + actualScore: number, + k: number, +): number { + const exp = expectedScore(playerElo, opponentElo); + return playerElo + k * (actualScore - exp); +} + +export function updatePairElo( + eloA: number, + eloB: number, + actualA: number, + k: number, +): { nextA: number; nextB: number } { + const actualB = 1 - actualA; + return { + nextA: nextElo(eloA, eloB, actualA, k), + nextB: nextElo(eloB, eloA, actualB, k), + }; +} diff --git a/bench/export.ts b/bench/export.ts new file mode 100644 index 0000000..394bc51 --- /dev/null +++ b/bench/export.ts @@ -0,0 +1,96 @@ +import { mkdirSync, writeFileSync } from "node:fs"; +import { join } from "node:path"; +import { DEFAULT_DB_PATH, DEFAULT_OUTPUT_DIR } from "./config"; +import { + getLatestCompletedRunId, + getRatingsForRun, + getRunRow, + openBenchDb, + runRowToMeta, +} from "./db"; +import type { QuipbenchSnapshot } from "./types"; + +function parseArgs(argv: string[]) { + const args: Record = {}; + for (const raw of argv) { + if (!raw.startsWith("--")) continue; + const stripped = raw.slice(2); + const [key, ...rest] = stripped.split("="); + if (!key) continue; + args[key] = rest.length === 0 ? "true" : rest.join("="); + } + return args; +} + +export async function exportLatestSnapshot(options?: { + dbPath?: string; + outputDir?: string; + runId?: string; +}): Promise<{ + snapshot: QuipbenchSnapshot; + latestJsonPath: string; + latestJsPath: string; +}> { + const dbPath = options?.dbPath ?? DEFAULT_DB_PATH; + const outputDir = options?.outputDir ?? DEFAULT_OUTPUT_DIR; + + mkdirSync(outputDir, { recursive: true }); + + const db = openBenchDb(dbPath); + try { + const runId = options?.runId ?? getLatestCompletedRunId(db); + if (!runId) { + throw new Error("No completed Quipbench runs found"); + } + + const runRow = getRunRow(db, runId); + if (!runRow) { + throw new Error(`Run not found: ${runId}`); + } + + const leaderboard = getRatingsForRun(db, runId); + if (leaderboard.length === 0) { + throw new Error(`Run ${runId} has no ratings`); + } + + const snapshot: QuipbenchSnapshot = { + runMeta: runRowToMeta(runRow), + leaderboard, + chart: leaderboard.map((row) => ({ + modelName: row.modelName, + elo: row.elo, + })), + }; + + const latestJsonPath = join(outputDir, "latest.json"); + const latestJsPath = join(outputDir, "latest.js"); + + writeFileSync(latestJsonPath, JSON.stringify(snapshot, null, 2)); + writeFileSync( + latestJsPath, + `window.__QUIPBENCH_LATEST__ = ${JSON.stringify(snapshot, null, 2)};\n`, + ); + + return { snapshot, latestJsonPath, latestJsPath }; + } finally { + db.close(); + } +} + +async function main() { + const args = parseArgs(process.argv.slice(2)); + const dbPath = args.db ? join(process.cwd(), args.db) : DEFAULT_DB_PATH; + const outputDir = args.out ? join(process.cwd(), args.out) : DEFAULT_OUTPUT_DIR; + const runId = args.runId; + + const result = await exportLatestSnapshot({ dbPath, outputDir, runId }); + console.log(`Exported snapshot: ${result.latestJsonPath}`); + console.log(`Exported snapshot script: ${result.latestJsPath}`); +} + +if (import.meta.main) { + main().catch((error) => { + console.error(error instanceof Error ? error.message : String(error)); + process.exit(1); + }); +} diff --git a/bench/finalize-partial.ts b/bench/finalize-partial.ts new file mode 100644 index 0000000..9a4e46c --- /dev/null +++ b/bench/finalize-partial.ts @@ -0,0 +1,105 @@ +import { QUIPBENCH_MODELS } from "./models"; +import { updatePairElo } from "./elo"; +import { buildLeaderboard } from "./leaderboard"; +import { openBenchDb, replaceRatings } from "./db"; +import { DEFAULT_DB_PATH, DEFAULT_OUTPUT_DIR } from "./config"; +import { exportLatestSnapshot } from "./export"; +import type { RatingState } from "./types"; + +const db = openBenchDb(DEFAULT_DB_PATH); + +try { + const run = db.query(` + SELECT id, initial_elo, elo_k, rounds_requested + FROM runs + WHERE status = 'running' + ORDER BY datetime(started_at) DESC + LIMIT 1 + `).get() as { id: string; initial_elo: number; elo_k: number; rounds_requested: number } | null; + + if (!run) { + throw new Error("No running Quipbench run found to finalize"); + } + + const ratings = new Map(); + for (const model of QUIPBENCH_MODELS) { + ratings.set(model.id, { + model, + elo: run.initial_elo, + wins: 0, + games: 0, + }); + } + + const matches = db.query(` + SELECT contestant_a_id, contestant_b_id, winner + FROM matches + WHERE run_id = $run_id + ORDER BY round_num ASC, id ASC + `).all({ $run_id: run.id }) as Array<{ + contestant_a_id: string; + contestant_b_id: string; + winner: "A" | "B" | "TIE" | "ERROR"; + }>; + + let completed = 0; + let failures = 0; + + for (const match of matches) { + const a = ratings.get(match.contestant_a_id); + const b = ratings.get(match.contestant_b_id); + if (!a || !b) continue; + + if (match.winner === "ERROR") { + failures += 1; + continue; + } + + completed += 1; + a.games += 1; + b.games += 1; + + let scoreA = 0.5; + if (match.winner === "A") { + scoreA = 1; + a.wins += 1; + } else if (match.winner === "B") { + scoreA = 0; + b.wins += 1; + } + + const next = updatePairElo(a.elo, b.elo, scoreA, run.elo_k); + a.elo = next.nextA; + b.elo = next.nextB; + } + + const leaderboard = buildLeaderboard(Array.from(ratings.values())); + replaceRatings(db, run.id, leaderboard); + + db.prepare(` + UPDATE runs + SET status = 'completed', + ended_at = $ended_at, + rounds_completed = $rounds_completed, + failures = $failures + WHERE id = $run_id + `).run({ + $run_id: run.id, + $ended_at: new Date().toISOString(), + $rounds_completed: completed, + $failures: failures, + }); + + const out = await exportLatestSnapshot({ + dbPath: DEFAULT_DB_PATH, + outputDir: DEFAULT_OUTPUT_DIR, + runId: run.id, + }); + + console.log(`Finalized partial run: ${run.id}`); + console.log(`Completed rounds: ${completed}`); + console.log(`Failures: ${failures}`); + console.log(`Snapshot JS: ${out.latestJsPath}`); +} finally { + db.close(); +} diff --git a/bench/integration.test.ts b/bench/integration.test.ts new file mode 100644 index 0000000..bbe31cb --- /dev/null +++ b/bench/integration.test.ts @@ -0,0 +1,120 @@ +import { expect, test } from "bun:test"; +import { mkdtempSync, readFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { Database } from "bun:sqlite"; +import { runQuipbench } from "./run"; +import type { BenchModel, QuipbenchAi } from "./types"; + +const TEST_MODELS: BenchModel[] = [ + { id: "model/a", name: "Model A" }, + { id: "model/b", name: "Model B" }, + { id: "model/c", name: "Model C" }, + { id: "model/d", name: "Model D" }, +]; + +const STUB_AI: QuipbenchAi = { + async generatePrompt(model) { + return `Prompt by ${model.name}`; + }, + async generateAnswer(model, prompt) { + return `${model.name} answer for ${prompt}`; + }, + async vote(_voter, _prompt, answerA, answerB) { + return answerA.length >= answerB.length ? "A" : "B"; + }, +}; + +test("run writes DB rows and latest snapshot", async () => { + const tempRoot = mkdtempSync(join(tmpdir(), "quipbench-it-")); + const dbPath = join(tempRoot, "bench.sqlite"); + const outDir = join(tempRoot, "out"); + + const result = await runQuipbench({ + rounds: 8, + concurrency: 2, + seed: 123, + models: TEST_MODELS, + ai: STUB_AI, + dbPath, + outputDir: outDir, + }); + + expect(result.runMeta.roundsRequested).toBe(8); + expect(result.runMeta.roundsCompleted + result.runMeta.failures).toBe(8); + expect(result.leaderboard.length).toBe(TEST_MODELS.length); + + const db = new Database(dbPath); + const runsCount = (db.query("SELECT COUNT(*) as c FROM runs").get() as { c: number }).c; + const matchesCount = ( + db.query("SELECT COUNT(*) as c FROM matches WHERE run_id = $run_id").get({ + $run_id: result.runMeta.runId, + }) as { c: number } + ).c; + const ratingsCount = ( + db.query("SELECT COUNT(*) as c FROM ratings WHERE run_id = $run_id").get({ + $run_id: result.runMeta.runId, + }) as { c: number } + ).c; + db.close(); + + expect(runsCount).toBe(1); + expect(matchesCount).toBe(8); + expect(ratingsCount).toBe(TEST_MODELS.length); + + const latest = JSON.parse(readFileSync(result.snapshotPathJson, "utf8")) as { + runMeta: { runId: string }; + leaderboard: unknown[]; + chart: unknown[]; + }; + + expect(latest.runMeta.runId).toBe(result.runMeta.runId); + expect(latest.leaderboard.length).toBe(TEST_MODELS.length); + expect(latest.chart.length).toBe(TEST_MODELS.length); +}); + +test("round failures are persisted and run still finalizes", async () => { + const tempRoot = mkdtempSync(join(tmpdir(), "quipbench-it-fail-")); + const dbPath = join(tempRoot, "bench.sqlite"); + const outDir = join(tempRoot, "out"); + + const failingAi: QuipbenchAi = { + async generatePrompt() { + throw new Error("forced prompt failure"); + }, + async generateAnswer() { + return "unused"; + }, + async vote() { + return "A"; + }, + }; + + const result = await runQuipbench({ + rounds: 3, + concurrency: 2, + seed: 7, + models: TEST_MODELS, + ai: failingAi, + dbPath, + outputDir: outDir, + }); + + expect(result.runMeta.roundsCompleted).toBe(0); + expect(result.runMeta.failures).toBe(3); + + const db = new Database(dbPath); + const errorMatches = ( + db + .query("SELECT COUNT(*) as c FROM matches WHERE run_id = $run_id AND winner = 'ERROR'") + .get({ $run_id: result.runMeta.runId }) as { c: number } + ).c; + db.close(); + + expect(errorMatches).toBe(3); + + const latest = JSON.parse(readFileSync(result.snapshotPathJson, "utf8")) as { + runMeta: { failures: number }; + }; + expect(latest.runMeta.failures).toBe(3); +}); diff --git a/bench/leaderboard.test.ts b/bench/leaderboard.test.ts new file mode 100644 index 0000000..27cfa0b --- /dev/null +++ b/bench/leaderboard.test.ts @@ -0,0 +1,25 @@ +import { expect, test } from "bun:test"; +import { buildLeaderboard } from "./leaderboard"; + +const model = (id: string, name: string) => ({ id, name }); + +test("leaderboard sorts by elo, then wins, then win rate, then name", () => { + const rows = buildLeaderboard([ + { model: model("a", "Alpha"), elo: 1500, wins: 5, games: 10 }, + { model: model("b", "Beta"), elo: 1600, wins: 1, games: 1 }, + { model: model("c", "Charlie"), elo: 1500, wins: 6, games: 12 }, + { model: model("d", "Delta"), elo: 1500, wins: 6, games: 8 }, + { model: model("e", "Echo"), elo: 1500, wins: 6, games: 8 }, + ]); + + expect(rows.map((r) => r.modelName)).toEqual([ + "Beta", + "Delta", + "Echo", + "Charlie", + "Alpha", + ]); + expect(rows[0]?.rank).toBe(1); + expect(rows[4]?.rank).toBe(5); + expect(rows[1]?.winRate).toBe(75); +}); diff --git a/bench/leaderboard.ts b/bench/leaderboard.ts new file mode 100644 index 0000000..6e73f81 --- /dev/null +++ b/bench/leaderboard.ts @@ -0,0 +1,25 @@ +import type { LeaderboardRow, RatingState } from "./types"; + +export function buildLeaderboard(rows: RatingState[]): LeaderboardRow[] { + const sorted = [...rows].sort((a, b) => { + if (b.elo !== a.elo) return b.elo - a.elo; + if (b.wins !== a.wins) return b.wins - a.wins; + const wrA = a.games > 0 ? a.wins / a.games : 0; + const wrB = b.games > 0 ? b.wins / b.games : 0; + if (wrB !== wrA) return wrB - wrA; + return a.model.name.localeCompare(b.model.name); + }); + + return sorted.map((entry, index) => ({ + rank: index + 1, + modelId: entry.model.id, + modelName: entry.model.name, + elo: Number(entry.elo.toFixed(2)), + wins: entry.wins, + games: entry.games, + winRate: + entry.games > 0 + ? Number(((entry.wins / entry.games) * 100).toFixed(2)) + : 0, + })); +} diff --git a/bench/models.ts b/bench/models.ts new file mode 100644 index 0000000..1b4115d --- /dev/null +++ b/bench/models.ts @@ -0,0 +1,12 @@ +import type { BenchModel } from "./types"; + +// Quipbench source-of-truth model roster. +export const QUIPBENCH_MODELS: BenchModel[] = [ + { id: "google/gemini-3.1-pro-preview", name: "Gemini 3.1 Pro" }, + { id: "moonshotai/kimi-k2", name: "Kimi K2" }, + { id: "deepseek/deepseek-v3.2", name: "DeepSeek 3.2" }, + { id: "openai/gpt-5.2", name: "GPT-5.2" }, + { id: "anthropic/claude-opus-4.6", name: "Opus 4.6" }, + { id: "anthropic/claude-sonnet-4.6", name: "Sonnet 4.6" }, + { id: "x-ai/grok-4.1-fast", name: "Grok 4.1" }, +]; diff --git a/bench/open.ts b/bench/open.ts new file mode 100644 index 0000000..4f14248 --- /dev/null +++ b/bench/open.ts @@ -0,0 +1,30 @@ +import { spawnSync } from "node:child_process"; +import { resolve } from "node:path"; + +const dashboardPath = resolve(import.meta.dir, "dashboard", "index.html"); + +function openPath(target: string) { + if (process.platform === "darwin") { + return spawnSync("open", [target], { stdio: "inherit" }); + } + + if (process.platform === "win32") { + return spawnSync("cmd", ["/c", "start", "", target], { + stdio: "inherit", + shell: false, + }); + } + + return spawnSync("xdg-open", [target], { stdio: "inherit" }); +} + +const result = openPath(dashboardPath); +if (result.error) { + console.error(`Could not open dashboard: ${result.error.message}`); + process.exit(1); +} +if (typeof result.status === "number" && result.status !== 0) { + process.exit(result.status); +} + +console.log(`Opened Quipbench dashboard: ${dashboardPath}`); diff --git a/bench/run.ts b/bench/run.ts new file mode 100644 index 0000000..0e1e9cd --- /dev/null +++ b/bench/run.ts @@ -0,0 +1,525 @@ +import { mkdirSync } from "node:fs"; +import { join } from "node:path"; +import { + DEFAULT_CONCURRENCY, + DEFAULT_DB_PATH, + DEFAULT_ELO_K, + DEFAULT_INITIAL_ELO, + DEFAULT_OUTPUT_DIR, + DEFAULT_ROUNDS, + parsePositiveInt, + parsePositiveNumber, +} from "./config"; +import { QUIPBENCH_MODELS } from "./models"; +import type { + BenchModel, + MatchRecord, + QuipbenchAi, + QuipbenchRunResult, + RatingState, + VoteRecord, +} from "./types"; +import { updatePairElo } from "./elo"; +import { buildLeaderboard } from "./leaderboard"; +import { + finalizeRun, + insertMatch, + insertRunStart, + openBenchDb, + replaceRatings, + updateRunProgress, +} from "./db"; +import { exportLatestSnapshot } from "./export"; + +type RetryFn = ( + fn: () => Promise, + validate: (result: T) => boolean, + retries: number, + label: string, +) => Promise; + +function defaultIsRealString(value: string, minLength = 5): boolean { + return value.trim().length >= minLength; +} + +const defaultWithRetry: RetryFn = async (fn, validate, retries, label) => { + let lastError: unknown; + + for (let attempt = 1; attempt <= retries; attempt++) { + try { + const result = await fn(); + if (validate(result)) return result; + lastError = new Error(`${label}: validation failed`); + } catch (error) { + lastError = error; + } + + if (attempt < retries) { + await new Promise((resolve) => setTimeout(resolve, 250 * attempt)); + } + } + + throw lastError instanceof Error + ? lastError + : new Error(`${label}: all retry attempts failed`); +}; + +async function loadLiveAi(): Promise<{ + ai: QuipbenchAi; + retry: RetryFn; + isRealStringFn: (value: string, minLength?: number) => boolean; +}> { + const gameModule = await import("../game.ts"); + type GameModel = typeof gameModule.MODELS[number]; + + function toGameModel(model: BenchModel): GameModel { + return model as unknown as GameModel; + } + + return { + ai: { + async generatePrompt(model) { + return gameModule.callGeneratePrompt(toGameModel(model)); + }, + async generateAnswer(model, prompt) { + return gameModule.callGenerateAnswer(toGameModel(model), prompt); + }, + async vote(voter, prompt, answerA, answerB) { + return gameModule.callVote( + toGameModel(voter), + prompt, + { answer: answerA }, + { answer: answerB }, + ); + }, + }, + retry: gameModule.withRetry as RetryFn, + isRealStringFn: gameModule.isRealString, + }; +} + +function mulberry32(seed: number): () => number { + let t = seed >>> 0; + return () => { + t += 0x6d2b79f5; + let r = Math.imul(t ^ (t >>> 15), t | 1); + r ^= r + Math.imul(r ^ (r >>> 7), r | 61); + return ((r ^ (r >>> 14)) >>> 0) / 4294967296; + }; +} + +function roundRng(seed: number, roundNum: number): () => number { + const mixed = (seed ^ Math.imul(roundNum, 0x9e3779b1)) >>> 0; + return mulberry32(mixed); +} + +function shuffleWithRng(items: T[], rng: () => number): T[] { + const arr = [...items]; + for (let i = arr.length - 1; i > 0; i--) { + const j = Math.floor(rng() * (i + 1)); + [arr[i], arr[j]] = [arr[j]!, arr[i]!]; + } + return arr; +} + +function parseArgs(argv: string[]) { + const args: Record = {}; + for (const raw of argv) { + if (!raw.startsWith("--")) continue; + const trimmed = raw.slice(2); + const [key, ...rest] = trimmed.split("="); + if (!key) continue; + args[key] = rest.length === 0 ? "true" : rest.join("="); + } + return args; +} + +function createRunId(): string { + return `quipbench-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`; +} + +type RoundRunResult = { + match: MatchRecord; +}; + +async function runRound(options: { + runId: string; + roundNum: number; + models: BenchModel[]; + ai: QuipbenchAi; + seed: number; + retry: RetryFn; + isRealStringFn: (value: string, minLength?: number) => boolean; +}): Promise { + const { runId, roundNum, models, ai, seed, retry, isRealStringFn } = options; + const rng = roundRng(seed, roundNum); + const shuffled = shuffleWithRng(models, rng); + + const prompter = shuffled[0]!; + const contestantA = shuffled[1]!; + const contestantB = shuffled[2]!; + const voters = [prompter, ...shuffled.slice(3)]; + + const baseMatch: MatchRecord = { + runId, + roundNum, + prompter, + contestantA, + contestantB, + votesA: 0, + votesB: 0, + winner: "ERROR", + votes: [], + }; + + let prompt = ""; + try { + prompt = await retry( + () => ai.generatePrompt(prompter), + (value) => isRealStringFn(value, 10), + 3, + `QB:R${roundNum}:prompt:${prompter.name}`, + ); + } catch (error) { + return { + match: { + ...baseMatch, + winner: "ERROR", + error: + error instanceof Error + ? `Prompt failed: ${error.message}` + : "Prompt failed", + }, + }; + } + + let answerA = ""; + let answerB = ""; + try { + [answerA, answerB] = await Promise.all([ + retry( + () => ai.generateAnswer(contestantA, prompt), + (value) => isRealStringFn(value, 3), + 3, + `QB:R${roundNum}:answer:${contestantA.name}`, + ), + retry( + () => ai.generateAnswer(contestantB, prompt), + (value) => isRealStringFn(value, 3), + 3, + `QB:R${roundNum}:answer:${contestantB.name}`, + ), + ]); + } catch (error) { + return { + match: { + ...baseMatch, + prompt, + winner: "ERROR", + error: + error instanceof Error + ? `Answer failed: ${error.message}` + : "Answer failed", + }, + }; + } + + const votes: VoteRecord[] = []; + let votesA = 0; + let votesB = 0; + + await Promise.all( + voters.map(async (voter) => { + const showAFirst = rng() > 0.5; + const first = showAFirst ? answerA : answerB; + const second = showAFirst ? answerB : answerA; + + try { + const decision = await retry( + () => ai.vote(voter, prompt, first, second), + (value) => value === "A" || value === "B", + 3, + `QB:R${roundNum}:vote:${voter.name}`, + ); + + const votedForA = showAFirst ? decision === "A" : decision === "B"; + if (votedForA) votesA += 1; + else votesB += 1; + + votes.push({ + voterId: voter.id, + voterName: voter.name, + votedFor: votedForA ? "A" : "B", + showAFirst, + }); + } catch (error) { + votes.push({ + voterId: voter.id, + voterName: voter.name, + votedFor: null, + showAFirst, + error: error instanceof Error ? error.message : String(error), + }); + } + }), + ); + + let winner: MatchRecord["winner"] = "TIE"; + if (votesA > votesB) winner = "A"; + else if (votesB > votesA) winner = "B"; + + return { + match: { + ...baseMatch, + prompt, + answerA, + answerB, + votesA, + votesB, + winner, + votes, + }, + }; +} + +export type RunQuipbenchOptions = { + rounds?: number; + concurrency?: number; + eloK?: number; + initialElo?: number; + seed?: number; + outputDir?: string; + dbPath?: string; + models?: BenchModel[]; + ai?: QuipbenchAi; +}; + +export async function runQuipbench( + options: RunQuipbenchOptions = {}, +): Promise { + const rounds = options.rounds ?? DEFAULT_ROUNDS; + const concurrency = options.concurrency ?? DEFAULT_CONCURRENCY; + const eloK = options.eloK ?? DEFAULT_ELO_K; + const initialElo = options.initialElo ?? DEFAULT_INITIAL_ELO; + const models = options.models ?? QUIPBENCH_MODELS; + const outputDir = options.outputDir ?? DEFAULT_OUTPUT_DIR; + const dbPath = options.dbPath ?? DEFAULT_DB_PATH; + const seed = options.seed ?? Math.floor(Math.random() * 2_000_000_000); + + if (!options.ai && !process.env.OPENROUTER_API_KEY) { + throw new Error("OPENROUTER_API_KEY is required for live Quipbench runs"); + } + + if (models.length < 3) { + throw new Error("Quipbench requires at least 3 models"); + } + + if (rounds <= 0 || concurrency <= 0 || eloK <= 0 || initialElo <= 0) { + throw new Error("rounds, concurrency, k, and initialElo must be positive"); + } + + let ai: QuipbenchAi = options.ai ?? { + async generatePrompt() { + throw new Error("Live AI is not loaded"); + }, + async generateAnswer() { + throw new Error("Live AI is not loaded"); + }, + async vote() { + throw new Error("Live AI is not loaded"); + }, + }; + let retry: RetryFn = defaultWithRetry; + let isRealStringFn = defaultIsRealString; + + if (!options.ai) { + const live = await loadLiveAi(); + ai = live.ai; + retry = live.retry; + isRealStringFn = live.isRealStringFn; + } + + mkdirSync(outputDir, { recursive: true }); + + const db = openBenchDb(dbPath); + const runId = createRunId(); + const startedAt = new Date().toISOString(); + + insertRunStart(db, { + id: runId, + startedAt, + roundsRequested: rounds, + concurrency, + eloK, + initialElo, + seed, + outputDir, + }); + + const ratings = new Map(); + for (const model of models) { + ratings.set(model.id, { + model, + elo: initialElo, + wins: 0, + games: 0, + }); + } + + let nextRound = 1; + let roundsCompleted = 0; + let failures = 0; + + let writeLock: Promise = Promise.resolve(); + async function serializeWrite(fn: () => void | Promise) { + writeLock = writeLock.then(fn, fn); + await writeLock; + } + + async function worker() { + while (true) { + const roundNum = nextRound; + nextRound += 1; + if (roundNum > rounds) break; + + const { match } = await runRound({ + runId, + roundNum, + models, + ai, + seed, + retry, + isRealStringFn, + }); + + await serializeWrite(() => { + insertMatch(db, match); + + if (match.winner === "ERROR") { + failures += 1; + } else { + roundsCompleted += 1; + + const ratingA = ratings.get(match.contestantA.id); + const ratingB = ratings.get(match.contestantB.id); + if (!ratingA || !ratingB) { + throw new Error("Contestant rating state missing"); + } + + ratingA.games += 1; + ratingB.games += 1; + + let scoreA = 0.5; + if (match.winner === "A") { + scoreA = 1; + ratingA.wins += 1; + } else if (match.winner === "B") { + scoreA = 0; + ratingB.wins += 1; + } + + const updated = updatePairElo(ratingA.elo, ratingB.elo, scoreA, eloK); + ratingA.elo = updated.nextA; + ratingB.elo = updated.nextB; + } + + updateRunProgress(db, runId, { roundsCompleted, failures }); + process.stdout.write( + `\rQuipbench progress: ${roundsCompleted + failures}/${rounds} (ok=${roundsCompleted}, failed=${failures})`, + ); + }); + } + } + + try { + const workers = Array.from( + { length: Math.min(concurrency, rounds) }, + () => worker(), + ); + await Promise.all(workers); + await writeLock; + + const leaderboard = buildLeaderboard(Array.from(ratings.values())); + replaceRatings(db, runId, leaderboard); + + const endedAt = new Date().toISOString(); + finalizeRun(db, runId, "completed", endedAt); + + const snapshotPaths = await exportLatestSnapshot({ + dbPath, + outputDir, + runId, + }); + + process.stdout.write("\n"); + + return { + runMeta: { + runId, + startedAt, + endedAt, + roundsRequested: rounds, + roundsCompleted, + failures, + concurrency, + eloK, + initialElo, + seed, + }, + leaderboard, + snapshotPathJson: snapshotPaths.latestJsonPath, + snapshotPathJs: snapshotPaths.latestJsPath, + }; + } catch (error) { + finalizeRun(db, runId, "failed", new Date().toISOString()); + throw error; + } finally { + db.close(); + } +} + +async function main() { + const args = parseArgs(process.argv.slice(2)); + + const rounds = parsePositiveInt(args.rounds, DEFAULT_ROUNDS); + const concurrency = parsePositiveInt(args.concurrency, DEFAULT_CONCURRENCY); + const eloK = parsePositiveNumber(args.k, DEFAULT_ELO_K); + const initialElo = parsePositiveNumber(args.initialElo, DEFAULT_INITIAL_ELO); + const seedArg = args.seed ? Number.parseInt(args.seed, 10) : undefined; + const seed = Number.isFinite(seedArg) ? seedArg : undefined; + + const outputDir = args.out ? join(process.cwd(), args.out) : DEFAULT_OUTPUT_DIR; + const dbPath = args.db ? join(process.cwd(), args.db) : DEFAULT_DB_PATH; + + const result = await runQuipbench({ + rounds, + concurrency, + eloK, + initialElo, + seed, + outputDir, + dbPath, + }); + + console.log("Quipbench complete"); + console.log(`Run ID: ${result.runMeta.runId}`); + console.log( + `Rounds: ${result.runMeta.roundsCompleted}/${result.runMeta.roundsRequested} (failures=${result.runMeta.failures})`, + ); + console.log(`Snapshot JSON: ${result.snapshotPathJson}`); + console.log(`Snapshot JS: ${result.snapshotPathJs}`); + + const preview = result.leaderboard.slice(0, 10); + for (const row of preview) { + console.log( + `${String(row.rank).padStart(2)}. ${row.modelName.padEnd(20)} Elo ${row.elo.toFixed(2).padStart(8)} | ${row.wins}/${row.games} (${row.winRate.toFixed(2)}%)`, + ); + } +} + +if (import.meta.main) { + main().catch((error) => { + console.error(error instanceof Error ? error.message : String(error)); + process.exit(1); + }); +} diff --git a/bench/types.ts b/bench/types.ts new file mode 100644 index 0000000..1cf5333 --- /dev/null +++ b/bench/types.ts @@ -0,0 +1,85 @@ +export type BenchModel = { + id: string; + name: string; +}; + +export type VoteRecord = { + voterId: string; + voterName: string; + votedFor: "A" | "B" | null; + showAFirst: boolean; + error?: string; +}; + +export type MatchRecord = { + runId: string; + roundNum: number; + prompter: BenchModel; + contestantA: BenchModel; + contestantB: BenchModel; + prompt?: string; + answerA?: string; + answerB?: string; + votesA: number; + votesB: number; + winner: "A" | "B" | "TIE" | "ERROR"; + votes: VoteRecord[]; + error?: string; +}; + +export type RatingState = { + model: BenchModel; + elo: number; + wins: number; + games: number; +}; + +export type LeaderboardRow = { + rank: number; + modelId: string; + modelName: string; + elo: number; + wins: number; + games: number; + winRate: number; +}; + +export type RunMeta = { + runId: string; + startedAt: string; + endedAt: string; + roundsRequested: number; + roundsCompleted: number; + failures: number; + concurrency: number; + eloK: number; + initialElo: number; + seed: number; +}; + +export type QuipbenchSnapshot = { + runMeta: RunMeta; + leaderboard: LeaderboardRow[]; + chart: Array<{ + modelName: string; + elo: number; + }>; +}; + +export type QuipbenchRunResult = { + runMeta: RunMeta; + leaderboard: LeaderboardRow[]; + snapshotPathJson: string; + snapshotPathJs: string; +}; + +export type QuipbenchAi = { + generatePrompt: (model: BenchModel) => Promise; + generateAnswer: (model: BenchModel, prompt: string) => Promise; + vote: ( + voter: BenchModel, + prompt: string, + answerA: string, + answerB: string, + ) => Promise<"A" | "B">; +}; diff --git a/bun.lock b/bun.lock index ae913d8..d6f2a45 100644 --- a/bun.lock +++ b/bun.lock @@ -1,6 +1,5 @@ { "lockfileVersion": 1, - "configVersion": 1, "workspaces": { "": { "name": "quipslop", @@ -11,6 +10,7 @@ "puppeteer": "^24.2.0", "react": "^19.2.4", "react-dom": "^19.2.4", + "zod": "^4.3.6", }, "devDependencies": { "@types/bun": "latest", diff --git a/package.json b/package.json index 50f949b..208af5c 100644 --- a/package.json +++ b/package.json @@ -8,7 +8,10 @@ "start:cli": "bun quipslop.tsx", "start:web": "bun --hot server.ts", "start:stream": "bun ./scripts/stream-browser.ts live", - "start:stream:dryrun": "bun ./scripts/stream-browser.ts dryrun" + "start:stream:dryrun": "bun ./scripts/stream-browser.ts dryrun", + "quipbench:run": "bun bench/run.ts", + "quipbench:export": "bun bench/export.ts", + "quipbench:open": "bun bench/open.ts" }, "devDependencies": { "@types/bun": "latest", @@ -24,6 +27,7 @@ "ink": "^6.8.0", "puppeteer": "^24.2.0", "react": "^19.2.4", - "react-dom": "^19.2.4" + "react-dom": "^19.2.4", + "zod": "^4.3.6" } }