diff --git a/bench/README.md b/bench/README.md
new file mode 100644
index 0000000..737d30b
--- /dev/null
+++ b/bench/README.md
@@ -0,0 +1,64 @@
+# Quipbench
+
+Standalone benchmark runner + static dashboard for Quipslop models.
+
+## What it does
+
+- Runs live OpenRouter self-play rounds (same mechanics as the main game)
+- Computes a leaderboard ranked by Elo (ties broken by wins, then win rate, then model name) and reports wins, games, and win rate per model
+- Stores run + match + rating records in `bench/quipbench.sqlite`
+- Exports latest snapshot to `bench/out/latest.json` and `bench/out/latest.js`
+- Renders a standalone dashboard at `bench/dashboard/index.html`
+
+## Prerequisites
+
+- Bun
+- `OPENROUTER_API_KEY` set in environment for live runs
+
+## Commands
+
+From repo root:
+
+- `bun run quipbench:run`
+- `bun run quipbench:export`
+- `bun run quipbench:open`
+
+### Run options
+
+`quipbench:run` supports CLI flags:
+
+- `--rounds=100`
+- `--concurrency=4`
+- `--k=24`
+- `--initialElo=1500`
+- `--seed=12345`
+- `--out=bench/out`
+- `--db=bench/quipbench.sqlite`
+
+Example:
+
+```bash
+bun bench/run.ts --rounds=150 --concurrency=6 --seed=42
+```
+
+## Output contract (`latest` snapshot)
+
+`bench/out/latest.json` and `bench/out/latest.js` contain:
+
+- `runMeta`: `runId`, `startedAt`, `endedAt`, `roundsRequested`, `roundsCompleted`, `failures`, `concurrency`, `eloK`, `initialElo`, `seed`
+- `leaderboard[]`: `rank`, `modelId`, `modelName`, `elo`, `wins`, `games`, `winRate`
+- `chart[]`: `{ modelName, elo }`
+
+## Dashboard
+
+Open `bench/dashboard/index.html` directly or via:
+
+```bash
+bun run quipbench:open
+```
+
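+The dashboard reads `../out/latest.js` (relative to `bench/dashboard/`, i.e. `bench/out/latest.js`, which assigns the snapshot to `window.__QUIPBENCH_LATEST__`) and shows: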
+
+- run metadata summary
+- vertical Elo bar chart with model names under each bar
+- leaderboard table
diff --git a/bench/config.ts b/bench/config.ts
new file mode 100644
index 0000000..01b15ab
--- /dev/null
+++ b/bench/config.ts
@@ -0,0 +1,29 @@
+import { join } from "node:path";
+
+// Directory of this module, taken from `import.meta.dir`; every default path below is built from it.
+export const BENCH_DIR = import.meta.dir;
+
+// Default run parameters (the README lists matching `--rounds`, `--concurrency`, `--k`, `--initialElo` flags).
+export const DEFAULT_ROUNDS = 100;
+export const DEFAULT_CONCURRENCY = 4;
+export const DEFAULT_ELO_K = 24;
+export const DEFAULT_INITIAL_ELO = 1500;
+
+// Default locations of the SQLite database and of the exported snapshot files.
+export const DEFAULT_DB_PATH = join(BENCH_DIR, "quipbench.sqlite");
+export const DEFAULT_OUTPUT_DIR = join(BENCH_DIR, "out");
+export const DEFAULT_LATEST_JSON_PATH = join(DEFAULT_OUTPUT_DIR, "latest.json");
+export const DEFAULT_LATEST_JS_PATH = join(DEFAULT_OUTPUT_DIR, "latest.js");
+
+/**
+ * Reads `value` as a base-10 integer.
+ *
+ * @param value - Raw text to parse; `undefined` is treated as an empty string.
+ * @param fallback - Returned when parsing does not yield a finite number greater than zero.
+ * @returns The parsed integer, or `fallback`.
+ */
+export function parsePositiveInt(
+  value: string | undefined,
+  fallback: number,
+): number {
+  const candidate = Number.parseInt(value ?? "", 10);
+  if (!Number.isFinite(candidate) || candidate <= 0) {
+    return fallback;
+  }
+  return candidate;
+}
+
+/**
+ * Reads `value` as a floating-point number.
+ *
+ * @param value - Raw text to parse; `undefined` is treated as an empty string.
+ * @param fallback - Returned when parsing does not yield a finite number greater than zero.
+ * @returns The parsed number, or `fallback`.
+ */
+export function parsePositiveNumber(
+  value: string | undefined,
+  fallback: number,
+): number {
+  const candidate = Number.parseFloat(value ?? "");
+  if (!Number.isFinite(candidate) || candidate <= 0) {
+    return fallback;
+  }
+  return candidate;
+}
diff --git a/bench/dashboard/app.js b/bench/dashboard/app.js
new file mode 100644
index 0000000..42fa1f8
--- /dev/null
+++ b/bench/dashboard/app.js
@@ -0,0 +1,277 @@
+// Mount point that render() and renderEmpty() write into.
+const root = document.getElementById("app");
+// Snapshot global; export.ts writes `window.__QUIPBENCH_LATEST__ = ...` into latest.js.
+const snapshot = window.__QUIPBENCH_LATEST__;
+// Logo URL -> Image element, reused by the chart's icon plugin across redraws.
+const logoImageCache = new Map();
+
+function logoFor(name) {
+ if (name.includes("Gemini")) return "./assets/logos/gemini.svg";
+ if (name.includes("Kimi")) return "./assets/logos/kimi.svg";
+ if (name.includes("DeepSeek")) return "./assets/logos/deepseek.svg";
+ if (name.includes("GLM")) return "./assets/logos/glm.svg";
+ if (name.includes("GPT")) return "./assets/logos/openai.svg";
+ if (name.includes("Opus") || name.includes("Sonnet")) return "./assets/logos/claude.svg";
+ if (name.includes("Grok")) return "./assets/logos/grok.svg";
+ if (name.includes("MiniMax")) return "./assets/logos/minimax.svg";
+ return null;
+}
+
+function formatDate(ts) {
+ const date = new Date(ts);
+ if (Number.isNaN(date.getTime())) return ts;
+ return date.toLocaleString();
+}
+
+// Builds the markup string for one leaderboard row: rank, optional logo, model name,
+// Elo and win rate (both fixed to two decimals), wins and games.
+// Values are interpolated into the template as-is, without HTML escaping.
+// NOTE(review): the HTML tags inside this template appear to have been stripped from this view of the diff — confirm against the real file.
+function rowHtml(row) {
+ const logo = logoFor(row.modelName);
+ return `
+
+ | ${row.rank} |
+
+
+ ${logo ? `  ` : ""}
+ ${row.modelName}
+
+ |
+ ${row.elo.toFixed(2)} |
+ ${row.wins} |
+ ${row.games} |
+ ${row.winRate.toFixed(2)}% |
+
+ `;
+}
+
+// Draws the Elo bar chart into the #elo-chart canvas using the global `Chart` constructor.
+// Does nothing when the canvas is missing, `Chart` is not defined, or `rows` is empty.
+function renderChart(rows) {
+ const chartCanvas = document.getElementById("elo-chart");
+ if (!chartCanvas || typeof Chart === "undefined" || rows.length === 0) return;
+
+ // Sort a copy (highest Elo first) so the caller's array is left untouched.
+ const sorted = [...rows].sort((a, b) => b.elo - a.elo);
+ const labels = sorted.map((row) => row.modelName);
+ const data = sorted.map((row) => Number(row.elo.toFixed(2)));
+ const max = Math.max(...data);
+ const min = Math.min(...data);
+ // Pad the y-axis range: 20 below the lowest Elo, 10 above the highest.
+ const yMin = Math.floor(min - 20);
+ const yMax = Math.ceil(max + 10);
+
+ // Inline plugin that paints each model's logo just above its bar after the bars are drawn.
+ const iconPlugin = {
+ id: "barIcons",
+ afterDatasetsDraw(chart) {
+ const { ctx } = chart;
+ const meta = chart.getDatasetMeta(0);
+ const topY = chart.scales.y.getPixelForValue(yMax);
+
+ meta.data.forEach((bar, index) => {
+ const modelName = labels[index];
+ const iconUrl = logoFor(modelName);
+ if (!iconUrl) return;
+
+ // Create each logo image once; when it finishes loading, redraw so it shows up.
+ let img = logoImageCache.get(iconUrl);
+ if (!img) {
+ img = new Image();
+ img.src = iconUrl;
+ img.onload = () => chart.draw();
+ logoImageCache.set(iconUrl, img);
+ }
+ // Skip images that are not loaded yet or have no usable width.
+ if (!img.complete || !img.naturalWidth) return;
+
+ const iconSize = 18;
+ const x = bar.x - iconSize / 2;
+ // Place the icon above the bar, but never higher than 4px below the pixel row of yMax.
+ const y = Math.max(topY + 4, bar.y - iconSize - 6);
+
+ ctx.save();
+ ctx.fillStyle = "#0a0a0a";
+ ctx.strokeStyle = "#2a2a2a";
+ ctx.lineWidth = 1;
+ // Rounded backdrop when the canvas supports roundRect, plain rectangle otherwise.
+ if (typeof ctx.roundRect === "function") {
+ ctx.beginPath();
+ ctx.roundRect(x - 3, y - 3, iconSize + 6, iconSize + 6, 6);
+ ctx.fill();
+ ctx.stroke();
+ } else {
+ ctx.fillRect(x - 3, y - 3, iconSize + 6, iconSize + 6);
+ ctx.strokeRect(x - 3, y - 3, iconSize + 6, iconSize + 6);
+ }
+ ctx.drawImage(img, x, y, iconSize, iconSize);
+ ctx.restore();
+ });
+ },
+ };
+
+ new Chart(chartCanvas, {
+ type: "bar",
+ data: {
+ labels,
+ datasets: [
+ {
+ label: "Elo",
+ data,
+ borderWidth: 1,
+ borderColor: "#3c2018",
+ backgroundColor: [
+ "#e8ab97",
+ "#e09a81",
+ "#d98367",
+ "#d97757",
+ "#ca6b4b",
+ "#bc6141",
+ "#ae5637",
+ "#9f4b2d",
+ ],
+ borderRadius: 6,
+ maxBarThickness: 72,
+ },
+ ],
+ },
+ options: {
+ responsive: true,
+ maintainAspectRatio: false,
+ plugins: {
+ legend: { display: false },
+ tooltip: {
+ displayColors: false,
+ backgroundColor: "#101010",
+ borderColor: "#2d2d2d",
+ borderWidth: 1,
+ titleColor: "#f0f0f0",
+ bodyColor: "#d4d4d4",
+ callbacks: {
+ label(context) {
+ return `Elo ${Number(context.raw).toFixed(2)}`;
+ },
+ },
+ },
+ },
+ scales: {
+ x: {
+ ticks: {
+ color: "#b8b8b8",
+ maxRotation: 0,
+ autoSkip: false,
+ font: { family: "JetBrains Mono", size: 11 },
+ },
+ grid: { color: "rgba(255,255,255,0.04)" },
+ },
+ y: {
+ min: yMin,
+ max: yMax,
+ ticks: {
+ color: "#8b8b8b",
+ font: { family: "JetBrains Mono", size: 11 },
+ },
+ grid: { color: "rgba(255,255,255,0.07)" },
+ },
+ },
+ },
+ plugins: [iconPlugin],
+ });
+}
+
+// Replaces the app root with a "no snapshot" notice telling the user to run the benchmark and refresh.
+// NOTE(review): the HTML tags inside this template appear to have been stripped from this view of the diff — confirm against the real file.
+function renderEmpty() {
+ root.innerHTML = `
+
+
+
+ No snapshot found
+ Run a benchmark first: bun run quipbench:run
+ Then refresh this page. Snapshot expected at bench/out/latest.js.
+
+
+ `;
+}
+
+// Writes the dashboard into the app root: start/end timestamps, the champion
+// (first leaderboard entry, or "-" when the list is empty) and one table row per leaderboard entry.
+// NOTE(review): the HTML tags inside this template appear to have been stripped from this view of the diff,
+// and several lines lost their leading "+" marker — confirm against the real file.
+function render(snapshotData) {
+ const meta = snapshotData.runMeta;
+ const leaderboard = snapshotData.leaderboard;
+ const champion = leaderboard[0];
+
+ root.innerHTML = `
+
+
+
+
+
+
Started
+
${formatDate(meta.startedAt)}
+
+
+
Ended
+
${formatDate(meta.endedAt)}
+
+
+
Champion
+
${champion ? champion.modelName : "-"}
+
+
+
+
+
+
+
Elo Leaderboard
+
+
+
+
+
+
+
+
+
Leaderboard Table
+
+
+
+
+
+ | # |
+ Model |
+ Elo |
+ Wins |
+ Games |
+ Win Rate |
+
+
+
+ ${leaderboard.map(rowHtml).join("")}
+
+
+
+
+
+ `;
+}
+
+if (!snapshot || !snapshot.leaderboard) {
+ renderEmpty();
+} else {
+ render(snapshot);
+ renderChart(snapshot.leaderboard);
+}
diff --git a/bench/dashboard/app.ts b/bench/dashboard/app.ts
new file mode 100644
index 0000000..8997c7e
--- /dev/null
+++ b/bench/dashboard/app.ts
@@ -0,0 +1,3 @@
+// Source shim to satisfy TypeScript-based workflows.
+// The standalone dashboard intentionally runs plain JS for direct browser compatibility.
+import "./app.js";
diff --git a/bench/dashboard/assets/logo.svg b/bench/dashboard/assets/logo.svg
new file mode 100644
index 0000000..719d15b
--- /dev/null
+++ b/bench/dashboard/assets/logo.svg
@@ -0,0 +1,9 @@
+
+
\ No newline at end of file
diff --git a/bench/dashboard/assets/logos/claude.svg b/bench/dashboard/assets/logos/claude.svg
new file mode 100644
index 0000000..d300701
--- /dev/null
+++ b/bench/dashboard/assets/logos/claude.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/bench/dashboard/assets/logos/deepseek.svg b/bench/dashboard/assets/logos/deepseek.svg
new file mode 100644
index 0000000..1401c17
--- /dev/null
+++ b/bench/dashboard/assets/logos/deepseek.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/bench/dashboard/assets/logos/gemini.svg b/bench/dashboard/assets/logos/gemini.svg
new file mode 100644
index 0000000..87cce06
--- /dev/null
+++ b/bench/dashboard/assets/logos/gemini.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/bench/dashboard/assets/logos/glm.svg b/bench/dashboard/assets/logos/glm.svg
new file mode 100644
index 0000000..28ca728
--- /dev/null
+++ b/bench/dashboard/assets/logos/glm.svg
@@ -0,0 +1,215 @@
+
\ No newline at end of file
diff --git a/bench/dashboard/assets/logos/grok.svg b/bench/dashboard/assets/logos/grok.svg
new file mode 100644
index 0000000..06ab179
--- /dev/null
+++ b/bench/dashboard/assets/logos/grok.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/bench/dashboard/assets/logos/kimi.svg b/bench/dashboard/assets/logos/kimi.svg
new file mode 100644
index 0000000..db43fce
--- /dev/null
+++ b/bench/dashboard/assets/logos/kimi.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/bench/dashboard/assets/logos/minimax.svg b/bench/dashboard/assets/logos/minimax.svg
new file mode 100644
index 0000000..2a60bd4
--- /dev/null
+++ b/bench/dashboard/assets/logos/minimax.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/bench/dashboard/assets/logos/openai.svg b/bench/dashboard/assets/logos/openai.svg
new file mode 100644
index 0000000..b6d542d
--- /dev/null
+++ b/bench/dashboard/assets/logos/openai.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/bench/dashboard/index.html b/bench/dashboard/index.html
new file mode 100644
index 0000000..edcf297
--- /dev/null
+++ b/bench/dashboard/index.html
@@ -0,0 +1,21 @@
+
+
+
+
+
+ Quipbench Dashboard
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/bench/dashboard/styles.css b/bench/dashboard/styles.css
new file mode 100644
index 0000000..31a5699
--- /dev/null
+++ b/bench/dashboard/styles.css
@@ -0,0 +1,251 @@
+* {
+ margin: 0;
+ padding: 0;
+ box-sizing: border-box;
+}
+
+/* Design tokens: colours and font stacks referenced through var(--…) by the rules below. */
+:root {
+ --bg: #050505;
+ --surface: #0a0a0a;
+ --surface-2: #111111;
+ --border: #212121;
+ --border-light: #2d2d2d;
+ --text: #ededed;
+ --text-dim: #a2a2a2;
+ --text-muted: #6a6a6a;
+ --accent: #d97757;
+ --mono: "JetBrains Mono", "SF Mono", monospace;
+ --sans: "Inter", -apple-system, sans-serif;
+ --serif: "DM Serif Display", Georgia, serif;
+}
+
+body {
+ background: var(--bg);
+ color: var(--text);
+ font-family: var(--sans);
+ min-height: 100vh;
+ -webkit-font-smoothing: antialiased;
+}
+
+.app {
+ min-height: 100vh;
+}
+
+.shell {
+ max-width: 1200px;
+ margin: 0 auto;
+ padding: 32px 20px 64px;
+ display: flex;
+ flex-direction: column;
+ gap: 20px;
+}
+
+.header {
+ display: flex;
+ flex-wrap: wrap;
+ justify-content: space-between;
+ gap: 12px;
+ align-items: center;
+}
+
+.brand {
+ display: inline-flex;
+ align-items: center;
+ gap: 12px;
+}
+
+.brand img {
+ width: 28px;
+ height: 28px;
+}
+
+.brand h1 {
+ font-family: var(--serif);
+ font-size: clamp(26px, 3.5vw, 36px);
+ letter-spacing: -0.4px;
+}
+
+.header-links {
+ display: inline-flex;
+ align-items: center;
+ gap: 10px;
+}
+
+.header-links a {
+ color: var(--text-dim);
+ text-decoration: none;
+ border: 1px solid var(--border);
+ background: rgba(255, 255, 255, 0.02);
+ border-radius: 999px;
+ padding: 6px 10px;
+ font-family: var(--mono);
+ font-size: 11px;
+ display: inline-flex;
+ align-items: center;
+ gap: 6px;
+}
+
+.header-links a:hover {
+ color: var(--text);
+ border-color: var(--border-light);
+}
+
+.header-links a svg {
+ width: 13px;
+ height: 13px;
+ fill: currentColor;
+}
+
+.meta-pills {
+ display: flex;
+ flex-wrap: wrap;
+ gap: 8px;
+}
+
+.pill {
+ border: 1px solid var(--border);
+ background: rgba(255, 255, 255, 0.02);
+ color: var(--text-dim);
+ border-radius: 999px;
+ padding: 6px 10px;
+ font-size: 11px;
+}
+
+.panel {
+ border: 1px solid var(--border);
+ background: var(--surface);
+ border-radius: 12px;
+ padding: 18px;
+}
+
+.panel--summary {
+ display: grid;
+ grid-template-columns: repeat(4, minmax(0, 1fr));
+ gap: 12px;
+}
+
+.summary-block {
+ border: 1px solid var(--border);
+ background: var(--surface-2);
+ border-radius: 8px;
+ padding: 10px 12px;
+ min-width: 0;
+}
+
+.summary-label {
+ color: var(--text-muted);
+ font-size: 11px;
+ letter-spacing: 0.5px;
+ text-transform: uppercase;
+ margin-bottom: 6px;
+}
+
+.summary-value {
+ font-size: 14px;
+ color: var(--text);
+ overflow: hidden;
+ text-overflow: ellipsis;
+ white-space: nowrap;
+}
+
+.panel-head {
+ margin-bottom: 16px;
+ display: flex;
+ flex-wrap: wrap;
+ align-items: baseline;
+ justify-content: space-between;
+ gap: 8px;
+}
+
+.panel-head h2 {
+ font-size: 18px;
+ letter-spacing: -0.2px;
+}
+
+.panel-head p {
+ color: var(--text-muted);
+ font-size: 11px;
+}
+
+.chart-shell {
+ height: min(46vw, 420px);
+ min-height: 280px;
+ border: 1px solid var(--border);
+ border-radius: 10px;
+ background: linear-gradient(to bottom, #121212, #0d0d0d);
+ padding: 12px;
+}
+
+.table-wrap {
+ overflow-x: auto;
+}
+
+table {
+ width: 100%;
+ border-collapse: collapse;
+ min-width: 680px;
+}
+
+th,
+td {
+ text-align: left;
+ padding: 10px;
+ border-bottom: 1px solid var(--border);
+ font-size: 13px;
+}
+
+th {
+ color: var(--text-muted);
+ font-weight: 600;
+}
+
+.model-cell {
+ display: inline-flex;
+ align-items: center;
+ gap: 8px;
+}
+
+.model-cell img {
+ width: 16px;
+ height: 16px;
+}
+
+.rank {
+ color: var(--text-dim);
+ width: 36px;
+}
+
+.mono {
+ font-family: var(--mono);
+}
+
+.empty {
+ font-family: var(--mono);
+ font-size: 14px;
+ color: var(--text-muted);
+}
+
+code {
+ font-family: var(--mono);
+ font-size: 12px;
+ border: 1px solid var(--border);
+ border-radius: 6px;
+ padding: 2px 6px;
+ background: var(--surface-2);
+}
+
+/* At 900px and below: the summary grid drops from four columns to two. */
+@media (max-width: 900px) {
+ .panel--summary {
+ grid-template-columns: repeat(2, minmax(0, 1fr));
+ }
+}
+
+/* At 560px and below: tighter shell padding and a single-column summary grid. */
+@media (max-width: 560px) {
+ .shell {
+ padding: 22px 12px 40px;
+ }
+
+ .panel--summary {
+ grid-template-columns: 1fr;
+ }
+}
diff --git a/bench/db.ts b/bench/db.ts
new file mode 100644
index 0000000..d01b196
--- /dev/null
+++ b/bench/db.ts
@@ -0,0 +1,372 @@
+import { Database } from "bun:sqlite";
+import type { LeaderboardRow, MatchRecord, RunMeta } from "./types";
+
+/**
+ * One row of the `runs` table, with the column names used by the schema in this file.
+ * `ended_at` is nullable in the schema and is set by `finalizeRun`.
+ */
+export type RunRow = {
+ id: string;
+ started_at: string;
+ ended_at: string | null;
+ rounds_requested: number;
+ rounds_completed: number;
+ failures: number;
+ concurrency: number;
+ elo_k: number;
+ initial_elo: number;
+ seed: number;
+ status: string;
+ output_dir: string;
+};
+
+/**
+ * Opens the benchmark database at `path`, creating the file if needed, and makes sure the schema exists.
+ *
+ * @param path - Location of the SQLite database file.
+ * @returns An open `Database` handle; the caller is responsible for closing it.
+ * @throws Rethrows any schema-initialisation error after closing the handle, so a failed open does not leak it.
+ */
+export function openBenchDb(path: string): Database {
+  const db = new Database(path, { create: true });
+  try {
+    initSchema(db);
+  } catch (error) {
+    // The handle never reaches the caller on this path, so release it here.
+    db.close();
+    throw error;
+  }
+  return db;
+}
+
+/**
+ * Creates the `runs`, `matches` and `ratings` tables and their indexes if they do not exist yet.
+ * Every statement uses IF NOT EXISTS, so this can run on each open.
+ * `matches` and `ratings` reference `runs(id)` through `run_id`.
+ */
+function initSchema(db: Database) {
+ db.exec(`
+ CREATE TABLE IF NOT EXISTS runs (
+ id TEXT PRIMARY KEY,
+ started_at TEXT NOT NULL,
+ ended_at TEXT,
+ rounds_requested INTEGER NOT NULL,
+ rounds_completed INTEGER NOT NULL DEFAULT 0,
+ failures INTEGER NOT NULL DEFAULT 0,
+ concurrency INTEGER NOT NULL,
+ elo_k REAL NOT NULL,
+ initial_elo REAL NOT NULL,
+ seed INTEGER NOT NULL,
+ status TEXT NOT NULL,
+ output_dir TEXT NOT NULL
+ );
+
+ CREATE TABLE IF NOT EXISTS matches (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ run_id TEXT NOT NULL,
+ round_num INTEGER NOT NULL,
+ created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+ prompter_id TEXT NOT NULL,
+ prompter_name TEXT NOT NULL,
+ contestant_a_id TEXT NOT NULL,
+ contestant_a_name TEXT NOT NULL,
+ contestant_b_id TEXT NOT NULL,
+ contestant_b_name TEXT NOT NULL,
+ prompt TEXT,
+ answer_a TEXT,
+ answer_b TEXT,
+ votes_a INTEGER NOT NULL DEFAULT 0,
+ votes_b INTEGER NOT NULL DEFAULT 0,
+ winner TEXT NOT NULL,
+ error TEXT,
+ payload_json TEXT,
+ FOREIGN KEY(run_id) REFERENCES runs(id)
+ );
+
+ CREATE TABLE IF NOT EXISTS ratings (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ run_id TEXT NOT NULL,
+ model_id TEXT NOT NULL,
+ model_name TEXT NOT NULL,
+ elo REAL NOT NULL,
+ wins INTEGER NOT NULL,
+ games INTEGER NOT NULL,
+ win_rate REAL NOT NULL,
+ rank INTEGER NOT NULL,
+ FOREIGN KEY(run_id) REFERENCES runs(id)
+ );
+
+ CREATE INDEX IF NOT EXISTS idx_matches_run_id ON matches(run_id);
+ CREATE INDEX IF NOT EXISTS idx_ratings_run_id ON ratings(run_id);
+ CREATE INDEX IF NOT EXISTS idx_runs_started_at ON runs(started_at);
+ `);
+}
+
+/**
+ * Inserts a new `runs` row with status 'running'.
+ * `ended_at` is not set here, and `rounds_completed` / `failures` take their schema default of 0.
+ *
+ * @param db - Open benchmark database.
+ * @param row - Run identifier, start time and run parameters, bound to the named SQL parameters below.
+ */
+export function insertRunStart(
+ db: Database,
+ row: {
+ id: string;
+ startedAt: string;
+ roundsRequested: number;
+ concurrency: number;
+ eloK: number;
+ initialElo: number;
+ seed: number;
+ outputDir: string;
+ },
+) {
+ const stmt = db.prepare(`
+ INSERT INTO runs (
+ id,
+ started_at,
+ rounds_requested,
+ concurrency,
+ elo_k,
+ initial_elo,
+ seed,
+ status,
+ output_dir
+ ) VALUES (
+ $id,
+ $started_at,
+ $rounds_requested,
+ $concurrency,
+ $elo_k,
+ $initial_elo,
+ $seed,
+ 'running',
+ $output_dir
+ )
+ `);
+
+ stmt.run({
+ $id: row.id,
+ $started_at: row.startedAt,
+ $rounds_requested: row.roundsRequested,
+ $concurrency: row.concurrency,
+ $elo_k: row.eloK,
+ $initial_elo: row.initialElo,
+ $seed: row.seed,
+ $output_dir: row.outputDir,
+ });
+}
+
+/**
+ * Overwrites the `rounds_completed` and `failures` counters of one run.
+ *
+ * @param db - Open benchmark database.
+ * @param runId - `runs.id` of the row to update.
+ * @param progress - Absolute counter values to store (not increments).
+ */
+export function updateRunProgress(
+  db: Database,
+  runId: string,
+  progress: { roundsCompleted: number; failures: number },
+) {
+  const { roundsCompleted, failures } = progress;
+  db.prepare(`
+ UPDATE runs
+ SET rounds_completed = $rounds_completed,
+ failures = $failures
+ WHERE id = $id
+ `).run({
+    $id: runId,
+    $rounds_completed: roundsCompleted,
+    $failures: failures,
+  });
+}
+
+/**
+ * Stores a run's terminal status and end timestamp.
+ *
+ * @param db - Open benchmark database.
+ * @param runId - `runs.id` of the row to update.
+ * @param status - Terminal status to store.
+ * @param endedAt - End timestamp, stored as given in `ended_at`.
+ */
+export function finalizeRun(
+  db: Database,
+  runId: string,
+  status: "completed" | "failed",
+  endedAt: string,
+) {
+  const params = {
+    $id: runId,
+    $status: status,
+    $ended_at: endedAt,
+  };
+  db.prepare(`
+ UPDATE runs
+ SET status = $status,
+ ended_at = $ended_at
+ WHERE id = $id
+ `).run(params);
+}
+
+/**
+ * Inserts one `matches` row for a played (or failed) round.
+ * Missing `prompt`, `answerA`, `answerB` and `error` are stored as NULL, and the whole
+ * match record is also serialized into `payload_json`.
+ *
+ * @param db - Open benchmark database.
+ * @param match - Match record to persist.
+ */
+export function insertMatch(db: Database, match: MatchRecord) {
+ const stmt = db.prepare(`
+ INSERT INTO matches (
+ run_id,
+ round_num,
+ prompter_id,
+ prompter_name,
+ contestant_a_id,
+ contestant_a_name,
+ contestant_b_id,
+ contestant_b_name,
+ prompt,
+ answer_a,
+ answer_b,
+ votes_a,
+ votes_b,
+ winner,
+ error,
+ payload_json
+ ) VALUES (
+ $run_id,
+ $round_num,
+ $prompter_id,
+ $prompter_name,
+ $contestant_a_id,
+ $contestant_a_name,
+ $contestant_b_id,
+ $contestant_b_name,
+ $prompt,
+ $answer_a,
+ $answer_b,
+ $votes_a,
+ $votes_b,
+ $winner,
+ $error,
+ $payload_json
+ )
+ `);
+
+ stmt.run({
+ $run_id: match.runId,
+ $round_num: match.roundNum,
+ $prompter_id: match.prompter.id,
+ $prompter_name: match.prompter.name,
+ $contestant_a_id: match.contestantA.id,
+ $contestant_a_name: match.contestantA.name,
+ $contestant_b_id: match.contestantB.id,
+ $contestant_b_name: match.contestantB.name,
+ $prompt: match.prompt ?? null,
+ $answer_a: match.answerA ?? null,
+ $answer_b: match.answerB ?? null,
+ $votes_a: match.votesA,
+ $votes_b: match.votesB,
+ $winner: match.winner,
+ $error: match.error ?? null,
+ $payload_json: JSON.stringify(match),
+ });
+}
+
+/**
+ * Replaces every `ratings` row of a run with the given leaderboard.
+ *
+ * The DELETE and all INSERTs run inside a single transaction, so a failure part-way
+ * through leaves the run's previous ratings in place instead of an empty or partial set.
+ *
+ * @param db - Open benchmark database.
+ * @param runId - Run whose ratings are replaced.
+ * @param leaderboard - Rows to store; an empty array simply clears the run's ratings.
+ */
+export function replaceRatings(
+  db: Database,
+  runId: string,
+  leaderboard: LeaderboardRow[],
+) {
+  const deleteStmt = db.prepare("DELETE FROM ratings WHERE run_id = $run_id");
+
+  const insertStmt = db.prepare(`
+ INSERT INTO ratings (
+ run_id,
+ model_id,
+ model_name,
+ elo,
+ wins,
+ games,
+ win_rate,
+ rank
+ ) VALUES (
+ $run_id,
+ $model_id,
+ $model_name,
+ $elo,
+ $wins,
+ $games,
+ $win_rate,
+ $rank
+ )
+ `);
+
+  const tx = db.transaction((rows: LeaderboardRow[]) => {
+    // Delete inside the transaction so it is undone if any insert fails.
+    deleteStmt.run({ $run_id: runId });
+    for (const row of rows) {
+      insertStmt.run({
+        $run_id: runId,
+        $model_id: row.modelId,
+        $model_name: row.modelName,
+        $elo: row.elo,
+        $wins: row.wins,
+        $games: row.games,
+        $win_rate: row.winRate,
+        $rank: row.rank,
+      });
+    }
+  });
+
+  tx(leaderboard);
+}
+
+/**
+ * Finds the id of the newest run whose status is 'completed'.
+ * Runs are ordered by `ended_at`, then `started_at`, newest first.
+ *
+ * @param db - Open benchmark database.
+ * @returns The run id, or `null` when there is no completed run.
+ */
+export function getLatestCompletedRunId(db: Database): string | null {
+  const sql =
+ `
+ SELECT id
+ FROM runs
+ WHERE status = 'completed'
+ ORDER BY datetime(ended_at) DESC, datetime(started_at) DESC
+ LIMIT 1
+ `;
+  const latest = db.query(sql).get() as { id: string } | null;
+  if (!latest) {
+    return null;
+  }
+  return latest.id ?? null;
+}
+
+/**
+ * Loads one `runs` row by id.
+ *
+ * @param db - Open benchmark database.
+ * @param runId - `runs.id` to look up.
+ * @returns The row with its column names, or `null` when no run has that id.
+ */
+export function getRunRow(db: Database, runId: string): RunRow | null {
+  const found = db
+    .query(`
+ SELECT
+ id,
+ started_at,
+ ended_at,
+ rounds_requested,
+ rounds_completed,
+ failures,
+ concurrency,
+ elo_k,
+ initial_elo,
+ seed,
+ status,
+ output_dir
+ FROM runs
+ WHERE id = $run_id
+ LIMIT 1
+ `)
+    .get({ $run_id: runId }) as RunRow | null;
+
+  return found ?? null;
+}
+
+/**
+ * Reads a run's stored ratings, ordered by rank ascending, as camelCase leaderboard rows.
+ *
+ * @param db - Open benchmark database.
+ * @param runId - Run whose ratings are read.
+ * @returns The leaderboard rows; empty when the run has no ratings.
+ */
+export function getRatingsForRun(
+  db: Database,
+  runId: string,
+): LeaderboardRow[] {
+  type RatingRecord = {
+    rank: number;
+    model_id: string;
+    model_name: string;
+    elo: number;
+    wins: number;
+    games: number;
+    win_rate: number;
+  };
+
+  const records = db
+    .query(`
+ SELECT
+ rank,
+ model_id,
+ model_name,
+ elo,
+ wins,
+ games,
+ win_rate
+ FROM ratings
+ WHERE run_id = $run_id
+ ORDER BY rank ASC
+ `)
+    .all({ $run_id: runId }) as RatingRecord[];
+
+  const leaderboard: LeaderboardRow[] = [];
+  for (const record of records) {
+    leaderboard.push({
+      rank: record.rank,
+      modelId: record.model_id,
+      modelName: record.model_name,
+      elo: record.elo,
+      wins: record.wins,
+      games: record.games,
+      winRate: record.win_rate,
+    });
+  }
+  return leaderboard;
+}
+
+/**
+ * Converts a `runs` row into the `RunMeta` shape used by snapshots.
+ *
+ * @param row - Database row of a finished run.
+ * @returns The same values under camelCase keys.
+ * @throws Error when `ended_at` is empty, i.e. the run has not been finalized.
+ */
+export function runRowToMeta(row: RunRow): RunMeta {
+  const { id, ended_at: endedAt } = row;
+  if (!endedAt) {
+    throw new Error(`Run ${id} is not finalized`);
+  }
+
+  const meta: RunMeta = {
+    runId: id,
+    startedAt: row.started_at,
+    endedAt,
+    roundsRequested: row.rounds_requested,
+    roundsCompleted: row.rounds_completed,
+    failures: row.failures,
+    concurrency: row.concurrency,
+    eloK: row.elo_k,
+    initialElo: row.initial_elo,
+    seed: row.seed,
+  };
+  return meta;
+}
diff --git a/bench/elo.test.ts b/bench/elo.test.ts
new file mode 100644
index 0000000..080581b
--- /dev/null
+++ b/bench/elo.test.ts
@@ -0,0 +1,21 @@
+import { expect, test } from "bun:test";
+import { expectedScore, updatePairElo } from "./elo";
+
+// The two expected scores of a pairing add up to 1, and the higher-rated side is the favourite.
+test("expected score is symmetric", () => {
+  const favourite = expectedScore(1600, 1500);
+  const underdog = expectedScore(1500, 1600);
+  const total = Number((favourite + underdog).toFixed(8));
+  expect(total).toBe(1);
+  expect(favourite).toBeGreaterThan(0.5);
+});
+
+// A win between equally rated players raises the winner and lowers the loser.
+test("winner gains rating and loser drops", () => {
+  const outcome = updatePairElo(1500, 1500, 1, 24);
+  expect(outcome.nextA).toBeGreaterThan(1500);
+  expect(outcome.nextB).toBeLessThan(1500);
+});
+
+// A draw costs the higher-rated player points and rewards the lower-rated one.
+test("tie moves ratings toward each other", () => {
+  const outcome = updatePairElo(1700, 1500, 0.5, 24);
+  expect(outcome.nextA).toBeLessThan(1700);
+  expect(outcome.nextB).toBeGreaterThan(1500);
+});
diff --git a/bench/elo.ts b/bench/elo.ts
new file mode 100644
index 0000000..f20dd3f
--- /dev/null
+++ b/bench/elo.ts
@@ -0,0 +1,26 @@
+/**
+ * Elo expected score of a player against an opponent, using the 400-point logistic curve.
+ *
+ * @param playerElo - Rating of the player whose expectation is computed.
+ * @param opponentElo - Rating of the opponent.
+ * @returns A value between 0 and 1; 0.5 when the ratings are equal.
+ */
+export function expectedScore(playerElo: number, opponentElo: number): number {
+  const ratingGap = opponentElo - playerElo;
+  const odds = Math.pow(10, ratingGap / 400);
+  return 1 / (1 + odds);
+}
+
+/**
+ * Computes a player's rating after one game.
+ *
+ * @param playerElo - Rating before the game.
+ * @param opponentElo - Opponent's rating before the game.
+ * @param actualScore - Achieved score (the tests use 1 for a win and 0.5 for a tie).
+ * @param k - K-factor scaling how far the rating moves.
+ * @returns `playerElo` shifted by `k` times the difference between actual and expected score.
+ */
+export function nextElo(
+  playerElo: number,
+  opponentElo: number,
+  actualScore: number,
+  k: number,
+): number {
+  const surprise = actualScore - expectedScore(playerElo, opponentElo);
+  return playerElo + k * surprise;
+}
+
+/**
+ * Updates both players' ratings after a single game between them.
+ *
+ * @param eloA - Rating of player A before the game.
+ * @param eloB - Rating of player B before the game.
+ * @param actualA - Score achieved by A; B's score is taken as `1 - actualA`.
+ * @param k - K-factor applied to both players.
+ * @returns The new ratings as `{ nextA, nextB }`.
+ */
+export function updatePairElo(
+  eloA: number,
+  eloB: number,
+  actualA: number,
+  k: number,
+): { nextA: number; nextB: number } {
+  const nextA = nextElo(eloA, eloB, actualA, k);
+  const nextB = nextElo(eloB, eloA, 1 - actualA, k);
+  return { nextA, nextB };
+}
diff --git a/bench/export.ts b/bench/export.ts
new file mode 100644
index 0000000..394bc51
--- /dev/null
+++ b/bench/export.ts
@@ -0,0 +1,96 @@
+import { mkdirSync, writeFileSync } from "node:fs";
+import { join, resolve } from "node:path";
+import { DEFAULT_DB_PATH, DEFAULT_OUTPUT_DIR } from "./config";
+import {
+ getLatestCompletedRunId,
+ getRatingsForRun,
+ getRunRow,
+ openBenchDb,
+ runRowToMeta,
+} from "./db";
+import type { QuipbenchSnapshot } from "./types";
+
+/**
+ * Parses `--key=value` style command-line flags into a string map.
+ *
+ * - Arguments that do not start with `--` are ignored.
+ * - A flag without `=` (e.g. `--verbose`) maps to the string "true".
+ * - Only the first `=` separates key from value, so values may contain `=`.
+ * - Flags with an empty key (e.g. `--=x`) are ignored.
+ *
+ * @param argv - Raw arguments, typically `process.argv.slice(2)`.
+ * @returns Map from flag name (without the leading dashes) to its value.
+ */
+function parseArgs(argv: string[]): Record<string, string> {
+  const args: Record<string, string> = {};
+  for (const raw of argv) {
+    if (!raw.startsWith("--")) continue;
+    const body = raw.slice(2);
+    const eq = body.indexOf("=");
+    const key = eq === -1 ? body : body.slice(0, eq);
+    if (!key) continue;
+    args[key] = eq === -1 ? "true" : body.slice(eq + 1);
+  }
+  return args;
+}
+
+/**
+ * Writes the snapshot of one run to `latest.json` and `latest.js` inside the output directory.
+ *
+ * The output directory is created first. The run is `options.runId` when given, otherwise
+ * the latest completed run. `latest.js` assigns the same JSON to `window.__QUIPBENCH_LATEST__`.
+ * The database handle is always closed before returning or throwing.
+ *
+ * @param options - Optional database path, output directory and run id; paths default to the config values.
+ * @returns The snapshot and the two file paths that were written.
+ * @throws Error when no completed run exists, the run id is unknown, or the run has no ratings.
+ */
+export async function exportLatestSnapshot(options?: {
+  dbPath?: string;
+  outputDir?: string;
+  runId?: string;
+}): Promise<{
+  snapshot: QuipbenchSnapshot;
+  latestJsonPath: string;
+  latestJsPath: string;
+}> {
+  const dbPath = options?.dbPath ?? DEFAULT_DB_PATH;
+  const outputDir = options?.outputDir ?? DEFAULT_OUTPUT_DIR;
+
+  mkdirSync(outputDir, { recursive: true });
+
+  const db = openBenchDb(dbPath);
+  try {
+    const runId = options?.runId ?? getLatestCompletedRunId(db);
+    if (!runId) {
+      throw new Error("No completed Quipbench runs found");
+    }
+
+    const runRow = getRunRow(db, runId);
+    if (!runRow) {
+      throw new Error(`Run not found: ${runId}`);
+    }
+
+    const leaderboard = getRatingsForRun(db, runId);
+    if (leaderboard.length === 0) {
+      throw new Error(`Run ${runId} has no ratings`);
+    }
+
+    const chart = leaderboard.map(({ modelName, elo }) => ({ modelName, elo }));
+    const snapshot: QuipbenchSnapshot = {
+      runMeta: runRowToMeta(runRow),
+      leaderboard,
+      chart,
+    };
+
+    // Serialize once; both files carry exactly the same JSON text.
+    const serialized = JSON.stringify(snapshot, null, 2);
+    const latestJsonPath = join(outputDir, "latest.json");
+    const latestJsPath = join(outputDir, "latest.js");
+
+    writeFileSync(latestJsonPath, serialized);
+    writeFileSync(latestJsPath, `window.__QUIPBENCH_LATEST__ = ${serialized};\n`);
+
+    return { snapshot, latestJsonPath, latestJsPath };
+  } finally {
+    db.close();
+  }
+}
+
+/**
+ * CLI entry point: exports the snapshot of `--runId` (or of the latest completed run).
+ *
+ * `--db` and `--out` are resolved against the current working directory. `resolve` is used
+ * instead of `join` so an absolute path such as `--db=/tmp/bench.sqlite` is kept as given
+ * rather than being appended to the working directory.
+ */
+async function main() {
+  const args = parseArgs(process.argv.slice(2));
+  const dbPath = args.db ? resolve(process.cwd(), args.db) : DEFAULT_DB_PATH;
+  const outputDir = args.out ? resolve(process.cwd(), args.out) : DEFAULT_OUTPUT_DIR;
+  const runId = args.runId;
+
+  const result = await exportLatestSnapshot({ dbPath, outputDir, runId });
+  console.log(`Exported snapshot: ${result.latestJsonPath}`);
+  console.log(`Exported snapshot script: ${result.latestJsPath}`);
+}
+
+// Run the CLI only when this file is the entry module; on failure print the message and exit with status 1.
+if (import.meta.main) {
+  main().catch((error) => {
+    const message = error instanceof Error ? error.message : String(error);
+    console.error(message);
+    process.exit(1);
+  });
+}
diff --git a/bench/finalize-partial.ts b/bench/finalize-partial.ts
new file mode 100644
index 0000000..9a4e46c
--- /dev/null
+++ b/bench/finalize-partial.ts
@@ -0,0 +1,105 @@
+import { QUIPBENCH_MODELS } from "./models";
+import { updatePairElo } from "./elo";
+import { buildLeaderboard } from "./leaderboard";
+import { openBenchDb, replaceRatings } from "./db";
+import { DEFAULT_DB_PATH, DEFAULT_OUTPUT_DIR } from "./config";
+import { exportLatestSnapshot } from "./export";
+import type { RatingState } from "./types";
+
+// Recovery script: takes the most recently started run still marked 'running', replays its
+// stored matches to rebuild Elo ratings, marks the run completed, and exports its snapshot.
+const db = openBenchDb(DEFAULT_DB_PATH);
+
+try {
+  const run = db.query(`
+ SELECT id, initial_elo, elo_k, rounds_requested
+ FROM runs
+ WHERE status = 'running'
+ ORDER BY datetime(started_at) DESC
+ LIMIT 1
+ `).get() as { id: string; initial_elo: number; elo_k: number; rounds_requested: number } | null;
+
+  if (!run) {
+    throw new Error("No running Quipbench run found to finalize");
+  }
+
+  // Typed map so the rating arithmetic below is checked by the compiler.
+  // Every roster model starts at the run's initial Elo with no games played.
+  const ratings = new Map<string, RatingState>();
+  for (const model of QUIPBENCH_MODELS) {
+    ratings.set(model.id, {
+      model,
+      elo: run.initial_elo,
+      wins: 0,
+      games: 0,
+    });
+  }
+
+  // Replay in round order (insertion order within a round) so the Elo updates are applied in sequence.
+  const matches = db.query(`
+ SELECT contestant_a_id, contestant_b_id, winner
+ FROM matches
+ WHERE run_id = $run_id
+ ORDER BY round_num ASC, id ASC
+ `).all({ $run_id: run.id }) as Array<{
+    contestant_a_id: string;
+    contestant_b_id: string;
+    winner: "A" | "B" | "TIE" | "ERROR";
+  }>;
+
+  let completed = 0;
+  let failures = 0;
+
+  for (const match of matches) {
+    const a = ratings.get(match.contestant_a_id);
+    const b = ratings.get(match.contestant_b_id);
+    // Matches involving a model that is not in the current roster are skipped entirely.
+    if (!a || !b) continue;
+
+    if (match.winner === "ERROR") {
+      failures += 1;
+      continue;
+    }
+
+    completed += 1;
+    a.games += 1;
+    b.games += 1;
+
+    // A tie scores 0.5 for both sides and adds no win to either.
+    let scoreA = 0.5;
+    if (match.winner === "A") {
+      scoreA = 1;
+      a.wins += 1;
+    } else if (match.winner === "B") {
+      scoreA = 0;
+      b.wins += 1;
+    }
+
+    const next = updatePairElo(a.elo, b.elo, scoreA, run.elo_k);
+    a.elo = next.nextA;
+    b.elo = next.nextB;
+  }
+
+  const leaderboard = buildLeaderboard(Array.from(ratings.values()));
+  replaceRatings(db, run.id, leaderboard);
+
+  db.prepare(`
+ UPDATE runs
+ SET status = 'completed',
+ ended_at = $ended_at,
+ rounds_completed = $rounds_completed,
+ failures = $failures
+ WHERE id = $run_id
+ `).run({
+    $run_id: run.id,
+    $ended_at: new Date().toISOString(),
+    $rounds_completed: completed,
+    $failures: failures,
+  });
+
+  const out = await exportLatestSnapshot({
+    dbPath: DEFAULT_DB_PATH,
+    outputDir: DEFAULT_OUTPUT_DIR,
+    runId: run.id,
+  });
+
+  console.log(`Finalized partial run: ${run.id}`);
+  console.log(`Completed rounds: ${completed}`);
+  console.log(`Failures: ${failures}`);
+  console.log(`Snapshot JS: ${out.latestJsPath}`);
+} finally {
+  db.close();
+}
diff --git a/bench/integration.test.ts b/bench/integration.test.ts
new file mode 100644
index 0000000..bbe31cb
--- /dev/null
+++ b/bench/integration.test.ts
@@ -0,0 +1,120 @@
+import { expect, test } from "bun:test";
+import { mkdtempSync, readFileSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import { Database } from "bun:sqlite";
+import { runQuipbench } from "./run";
+import type { BenchModel, QuipbenchAi } from "./types";
+
+// Four stand-in models used by the integration tests instead of the real roster.
+const TEST_MODELS: BenchModel[] = [
+ { id: "model/a", name: "Model A" },
+ { id: "model/b", name: "Model B" },
+ { id: "model/c", name: "Model C" },
+ { id: "model/d", name: "Model D" },
+];
+
+// Stub AI with no external calls: prompts and answers are derived from the model name,
+// and the vote goes to the longer answer (A on equal length).
+const STUB_AI: QuipbenchAi = {
+ async generatePrompt(model) {
+ return `Prompt by ${model.name}`;
+ },
+ async generateAnswer(model, prompt) {
+ return `${model.name} answer for ${prompt}`;
+ },
+ async vote(_voter, _prompt, answerA, answerB) {
+ return answerA.length >= answerB.length ? "A" : "B";
+ },
+};
+
+// End-to-end run with the stub AI: checks the returned metadata, the rows written to each table,
+// and the contents of the exported latest.json.
+test("run writes DB rows and latest snapshot", async () => {
+  const workDir = mkdtempSync(join(tmpdir(), "quipbench-it-"));
+  const dbPath = join(workDir, "bench.sqlite");
+  const outDir = join(workDir, "out");
+
+  const result = await runQuipbench({
+    rounds: 8,
+    concurrency: 2,
+    seed: 123,
+    models: TEST_MODELS,
+    ai: STUB_AI,
+    dbPath,
+    outputDir: outDir,
+  });
+  const { runMeta } = result;
+
+  expect(runMeta.roundsRequested).toBe(8);
+  expect(runMeta.roundsCompleted + runMeta.failures).toBe(8);
+  expect(result.leaderboard.length).toBe(TEST_MODELS.length);
+
+  // Read all counts first, then close the handle before asserting on them.
+  const db = new Database(dbPath);
+  const runsRow = db.query("SELECT COUNT(*) as c FROM runs").get() as { c: number };
+  const matchesRow = db
+    .query("SELECT COUNT(*) as c FROM matches WHERE run_id = $run_id")
+    .get({ $run_id: runMeta.runId }) as { c: number };
+  const ratingsRow = db
+    .query("SELECT COUNT(*) as c FROM ratings WHERE run_id = $run_id")
+    .get({ $run_id: runMeta.runId }) as { c: number };
+  db.close();
+
+  expect(runsRow.c).toBe(1);
+  expect(matchesRow.c).toBe(8);
+  expect(ratingsRow.c).toBe(TEST_MODELS.length);
+
+  const latestText = readFileSync(result.snapshotPathJson, "utf8");
+  const latest = JSON.parse(latestText) as {
+    runMeta: { runId: string };
+    leaderboard: unknown[];
+    chart: unknown[];
+  };
+
+  expect(latest.runMeta.runId).toBe(runMeta.runId);
+  expect(latest.leaderboard.length).toBe(TEST_MODELS.length);
+  expect(latest.chart.length).toBe(TEST_MODELS.length);
+});
+
+// When prompt generation always throws, every round is stored as an ERROR match,
+// the run still finishes, and the exported snapshot reports the failures.
+test("round failures are persisted and run still finalizes", async () => {
+  const workDir = mkdtempSync(join(tmpdir(), "quipbench-it-fail-"));
+  const dbPath = join(workDir, "bench.sqlite");
+  const outDir = join(workDir, "out");
+
+  const failingAi: QuipbenchAi = {
+    async generatePrompt() {
+      throw new Error("forced prompt failure");
+    },
+    async generateAnswer() {
+      return "unused";
+    },
+    async vote() {
+      return "A";
+    },
+  };
+
+  const result = await runQuipbench({
+    rounds: 3,
+    concurrency: 2,
+    seed: 7,
+    models: TEST_MODELS,
+    ai: failingAi,
+    dbPath,
+    outputDir: outDir,
+  });
+  const { runMeta } = result;
+
+  expect(runMeta.roundsCompleted).toBe(0);
+  expect(runMeta.failures).toBe(3);
+
+  const db = new Database(dbPath);
+  const errorRow = db
+    .query("SELECT COUNT(*) as c FROM matches WHERE run_id = $run_id AND winner = 'ERROR'")
+    .get({ $run_id: runMeta.runId }) as { c: number };
+  db.close();
+
+  expect(errorRow.c).toBe(3);
+
+  const latestText = readFileSync(result.snapshotPathJson, "utf8");
+  const latest = JSON.parse(latestText) as {
+    runMeta: { failures: number };
+  };
+  expect(latest.runMeta.failures).toBe(3);
+});
diff --git a/bench/leaderboard.test.ts b/bench/leaderboard.test.ts
new file mode 100644
index 0000000..27cfa0b
--- /dev/null
+++ b/bench/leaderboard.test.ts
@@ -0,0 +1,25 @@
+import { expect, test } from "bun:test";
+import { buildLeaderboard } from "./leaderboard";
+
+ // Minimal model stub: only the id and display name matter to the leaderboard.
+ function model(id: string, name: string) {
+   return { id, name };
+ }
+
+ // Exercises every tie-break level: Elo, then wins, then win rate, then name.
+ test("leaderboard sorts by elo, then wins, then win rate, then name", () => {
+   const input = [
+     { model: model("a", "Alpha"), elo: 1500, wins: 5, games: 10 },
+     { model: model("b", "Beta"), elo: 1600, wins: 1, games: 1 },
+     { model: model("c", "Charlie"), elo: 1500, wins: 6, games: 12 },
+     { model: model("d", "Delta"), elo: 1500, wins: 6, games: 8 },
+     { model: model("e", "Echo"), elo: 1500, wins: 6, games: 8 },
+   ];
+   const expectedOrder = ["Beta", "Delta", "Echo", "Charlie", "Alpha"];
+
+   const rows = buildLeaderboard(input);
+   const actualOrder = rows.map((row) => row.modelName);
+
+   expect(actualOrder).toEqual(expectedOrder);
+   expect(rows[0]?.rank).toBe(1);
+   expect(rows[4]?.rank).toBe(5);
+   // Delta: 6 wins out of 8 games.
+   expect(rows[1]?.winRate).toBe(75);
+ });
diff --git a/bench/leaderboard.ts b/bench/leaderboard.ts
new file mode 100644
index 0000000..6e73f81
--- /dev/null
+++ b/bench/leaderboard.ts
@@ -0,0 +1,25 @@
+import type { LeaderboardRow, RatingState } from "./types";
+
+ /**
+  * Turns raw rating state into ranked leaderboard rows.
+  *
+  * Ordering: higher Elo first, then more wins, then higher win rate, then
+  * model name (locale comparison). The input array is not mutated.
+  *
+  * @param rows Rating state for every model in the run.
+  * @returns Rows ranked from 1, with Elo and win rate (a percentage) rounded
+  *   to two decimals.
+  */
+ export function buildLeaderboard(rows: RatingState[]): LeaderboardRow[] {
+   // Fraction of games won; models without games count as 0.
+   const winFraction = (state: RatingState): number =>
+     state.games > 0 ? state.wins / state.games : 0;
+
+   const ranked = rows.slice().sort((left, right) => {
+     if (left.elo !== right.elo) return right.elo - left.elo;
+     if (left.wins !== right.wins) return right.wins - left.wins;
+
+     const fractionLeft = winFraction(left);
+     const fractionRight = winFraction(right);
+     if (fractionLeft !== fractionRight) return fractionRight - fractionLeft;
+
+     return left.model.name.localeCompare(right.model.name);
+   });
+
+   const leaderboard: LeaderboardRow[] = [];
+   ranked.forEach((state, position) => {
+     const { model, elo, wins, games } = state;
+     const winRate = games > 0 ? Number(((wins / games) * 100).toFixed(2)) : 0;
+     leaderboard.push({
+       rank: position + 1,
+       modelId: model.id,
+       modelName: model.name,
+       elo: Number(elo.toFixed(2)),
+       wins,
+       games,
+       winRate,
+     });
+   });
+   return leaderboard;
+ }
diff --git a/bench/models.ts b/bench/models.ts
new file mode 100644
index 0000000..1b4115d
--- /dev/null
+++ b/bench/models.ts
@@ -0,0 +1,12 @@
+import type { BenchModel } from "./types";
+
+ // Quipbench source-of-truth model roster.
+ // Used by runQuipbench when the caller does not pass its own `models` list.
+ // `name` is the display label shown on the leaderboard; `id` is handed to the
+ // game module's call helpers for live runs — presumably an OpenRouter model
+ // slug (TODO confirm against game.ts).
+ export const QUIPBENCH_MODELS: BenchModel[] = [
+ { id: "google/gemini-3.1-pro-preview", name: "Gemini 3.1 Pro" },
+ { id: "moonshotai/kimi-k2", name: "Kimi K2" },
+ { id: "deepseek/deepseek-v3.2", name: "DeepSeek 3.2" },
+ { id: "openai/gpt-5.2", name: "GPT-5.2" },
+ { id: "anthropic/claude-opus-4.6", name: "Opus 4.6" },
+ { id: "anthropic/claude-sonnet-4.6", name: "Sonnet 4.6" },
+ { id: "x-ai/grok-4.1-fast", name: "Grok 4.1" },
+ ];
diff --git a/bench/open.ts b/bench/open.ts
new file mode 100644
index 0000000..4f14248
--- /dev/null
+++ b/bench/open.ts
@@ -0,0 +1,30 @@
+import { spawnSync } from "node:child_process";
+import { resolve } from "node:path";
+
+ // Absolute path of the static dashboard page, next to this script.
+ const dashboardPath = resolve(import.meta.dir, "dashboard", "index.html");
+
+ /**
+  * Opens `target` with the platform's default handler and waits for the
+  * launcher command to exit.
+  *
+  * @param target Path to open.
+  * @returns The `spawnSync` result of the launcher command.
+  */
+ function openPath(target: string) {
+   switch (process.platform) {
+     case "darwin":
+       return spawnSync("open", [target], { stdio: "inherit" });
+     case "win32":
+       // The empty string is the window-title argument `start` expects before the path.
+       return spawnSync("cmd", ["/c", "start", "", target], {
+         stdio: "inherit",
+         shell: false,
+       });
+     default:
+       return spawnSync("xdg-open", [target], { stdio: "inherit" });
+   }
+ }
+
+ // Launch the dashboard and mirror the launcher's outcome in this script's exit code.
+ const result = openPath(dashboardPath);
+ if (result.error) {
+   // The launcher could not be spawned at all (e.g. command not found).
+   console.error(`Could not open dashboard: ${result.error.message}`);
+   process.exit(1);
+ }
+ if (result.status !== 0) {
+   // `status` is null when the launcher was terminated by a signal; that is a
+   // failure too, so do not fall through to the success message.
+   if (result.signal) {
+     console.error(`Could not open dashboard: launcher terminated by ${result.signal}`);
+   }
+   process.exit(result.status ?? 1);
+ }
+
+ console.log(`Opened Quipbench dashboard: ${dashboardPath}`);
diff --git a/bench/run.ts b/bench/run.ts
new file mode 100644
index 0000000..0e1e9cd
--- /dev/null
+++ b/bench/run.ts
@@ -0,0 +1,525 @@
+ import { mkdirSync } from "node:fs";
+ import { join, resolve } from "node:path";
+import {
+ DEFAULT_CONCURRENCY,
+ DEFAULT_DB_PATH,
+ DEFAULT_ELO_K,
+ DEFAULT_INITIAL_ELO,
+ DEFAULT_OUTPUT_DIR,
+ DEFAULT_ROUNDS,
+ parsePositiveInt,
+ parsePositiveNumber,
+} from "./config";
+import { QUIPBENCH_MODELS } from "./models";
+import type {
+ BenchModel,
+ MatchRecord,
+ QuipbenchAi,
+ QuipbenchRunResult,
+ RatingState,
+ VoteRecord,
+} from "./types";
+import { updatePairElo } from "./elo";
+import { buildLeaderboard } from "./leaderboard";
+import {
+ finalizeRun,
+ insertMatch,
+ insertRunStart,
+ openBenchDb,
+ replaceRatings,
+ updateRunProgress,
+} from "./db";
+import { exportLatestSnapshot } from "./export";
+
+ /**
+  * Signature of a retry helper: calls `fn` up to `retries` times and resolves
+  * with the first result that `validate` accepts.
+  *
+  * Generic in the result type so the same helper can wrap string-producing
+  * calls (prompts, answers) and vote calls.
+  *
+  * @param fn Operation to attempt.
+  * @param validate Returns true when a result is acceptable.
+  * @param retries Maximum number of attempts.
+  * @param label Short tag identifying the call, used in error messages.
+  */
+ type RetryFn = <T>(
+   fn: () => Promise<T>,
+   validate: (result: T) => boolean,
+   retries: number,
+   label: string,
+ ) => Promise<T>;
+
+ /**
+  * Fallback content check used when the live game module is not loaded.
+  *
+  * @param value Text to check.
+  * @param minLength Minimum length after trimming surrounding whitespace.
+  * @returns True when the trimmed text has at least `minLength` characters.
+  */
+ function defaultIsRealString(value: string, minLength = 5): boolean {
+   const meaningful = value.trim();
+   return meaningful.length >= minLength;
+ }
+
+ /**
+  * Fallback retry helper used when the live game module is not loaded.
+  *
+  * Runs `fn` up to `retries` times. A result rejected by `validate` counts as
+  * a failed attempt. Between attempts it waits 250 ms times the attempt number.
+  * When all attempts fail it rethrows the last error, or a generic error if no
+  * attempt produced one (for example when `retries` is 0).
+  */
+ const defaultWithRetry: RetryFn = async (fn, validate, retries, label) => {
+   let failure: unknown;
+   let attempt = 0;
+
+   while (attempt < retries) {
+     attempt += 1;
+
+     try {
+       const value = await fn();
+       if (validate(value)) return value;
+       failure = new Error(`${label}: validation failed`);
+     } catch (caught) {
+       failure = caught;
+     }
+
+     // No pause after the final attempt.
+     if (attempt < retries) {
+       const backoffMs = 250 * attempt;
+       await new Promise((wake) => setTimeout(wake, backoffMs));
+     }
+   }
+
+   if (failure instanceof Error) throw failure;
+   throw new Error(`${label}: all retry attempts failed`);
+ };
+
+ /**
+  * Lazily imports the main game module and adapts it to the benchmark's
+  * interfaces.
+  *
+  * @returns The AI adapter plus the game module's own retry helper and
+  *   string validator.
+  */
+ async function loadLiveAi(): Promise<{
+   ai: QuipbenchAi;
+   retry: RetryFn;
+   isRealStringFn: (value: string, minLength?: number) => boolean;
+ }> {
+   const game = await import("../game.ts");
+   type GameModel = typeof game.MODELS[number];
+
+   // NOTE(review): unchecked cast — assumes BenchModel's { id, name } is all
+   // the game module reads from a model; confirm against game.ts.
+   const asGameModel = (model: BenchModel): GameModel =>
+     model as unknown as GameModel;
+
+   const ai: QuipbenchAi = {
+     generatePrompt: async (model) =>
+       game.callGeneratePrompt(asGameModel(model)),
+     generateAnswer: async (model, prompt) =>
+       game.callGenerateAnswer(asGameModel(model), prompt),
+     // The game's vote helper takes answer objects rather than plain strings.
+     vote: async (voter, prompt, answerA, answerB) =>
+       game.callVote(
+         asGameModel(voter),
+         prompt,
+         { answer: answerA },
+         { answer: answerB },
+       ),
+   };
+
+   return {
+     ai,
+     retry: game.withRetry as RetryFn,
+     isRealStringFn: game.isRealString,
+   };
+ }
+
+ /**
+  * Creates a small seeded pseudo-random generator.
+  *
+  * @param seed Seed value; coerced to an unsigned 32-bit integer.
+  * @returns A function yielding numbers in [0, 1). The same seed always
+  *   produces the same sequence.
+  */
+ function mulberry32(seed: number): () => number {
+   let state = seed >>> 0;
+
+   const next = (): number => {
+     state += 0x6d2b79f5;
+     const mixed = Math.imul(state ^ (state >>> 15), state | 1);
+     const scrambled =
+       mixed ^ (mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61));
+     // Force unsigned, then scale by 2^32 into [0, 1).
+     return ((scrambled ^ (scrambled >>> 14)) >>> 0) / 4294967296;
+   };
+
+   return next;
+ }
+
+ /**
+  * Derives an independent generator for one round from the run seed, so a
+  * round's randomness depends only on (seed, roundNum) and not on the order
+  * in which concurrent rounds execute.
+  *
+  * @param seed Run-level seed.
+  * @param roundNum Round number.
+  */
+ function roundRng(seed: number, roundNum: number): () => number {
+   const roundSalt = Math.imul(roundNum, 0x9e3779b1);
+   return mulberry32((seed ^ roundSalt) >>> 0);
+ }
+
+ /**
+  * Returns a shuffled copy of `items` (Fisher–Yates), drawing randomness from
+  * `rng`. The input array is left untouched.
+  *
+  * @param items Elements to shuffle.
+  * @param rng Source of numbers in [0, 1).
+  * @returns A new array holding the same elements in shuffled order.
+  */
+ function shuffleWithRng<T>(items: readonly T[], rng: () => number): T[] {
+   const arr = [...items];
+   for (let i = arr.length - 1; i > 0; i--) {
+     // Pick a swap partner from the not-yet-fixed prefix [0, i].
+     const j = Math.floor(rng() * (i + 1));
+     [arr[i], arr[j]] = [arr[j]!, arr[i]!];
+   }
+   return arr;
+ }
+
+ /**
+  * Parses `--key=value` style CLI arguments.
+  *
+  * Arguments that do not start with `--`, or that have an empty key, are
+  * ignored. A bare `--flag` maps to the string "true". Only the first `=`
+  * separates key from value, so values may themselves contain `=`.
+  *
+  * @param argv Raw argument list (without the runtime and script path).
+  * @returns Map from option name to its string value.
+  */
+ function parseArgs(argv: string[]): Record<string, string> {
+   const args: Record<string, string> = {};
+   for (const raw of argv) {
+     if (!raw.startsWith("--")) continue;
+     const body = raw.slice(2);
+     const eq = body.indexOf("=");
+     const key = eq === -1 ? body : body.slice(0, eq);
+     if (!key) continue;
+     args[key] = eq === -1 ? "true" : body.slice(eq + 1);
+   }
+   return args;
+ }
+
+ /**
+  * Builds a run identifier of the form `quipbench-<epoch ms>-<random suffix>`,
+  * where the suffix is up to six base-36 characters.
+  */
+ function createRunId(): string {
+   const timestamp = Date.now();
+   const suffix = Math.random().toString(36).slice(2, 8);
+   return ["quipbench", timestamp, suffix].join("-");
+ }
+
+ /** Outcome of a single round, wrapping the match record to persist. */
+ type RoundRunResult = {
+ match: MatchRecord;
+ };
+
+ /**
+  * Plays one benchmark round: a prompter writes a prompt, two contestants
+  * answer it, and the remaining models (plus the prompter) vote.
+  *
+  * Roles and answer presentation order are drawn from a generator derived
+  * from (seed, roundNum), so they are reproducible for a given seed.
+  *
+  * This function reports stage failures through the returned record instead
+  * of throwing: a failed prompt, a failed answer, or a round in which no voter
+  * produced a valid decision yields `winner: "ERROR"` with `error` set.
+  * Individual vote failures are recorded with `votedFor: null` and do not
+  * fail the round as long as at least one vote succeeded.
+  *
+  * @param options.runId Identifier of the enclosing run.
+  * @param options.roundNum Round number within the run.
+  * @param options.models Participating models; at least three are required.
+  * @param options.ai Backend used to generate prompts, answers and votes.
+  * @param options.seed Run-level seed.
+  * @param options.retry Retry helper wrapped around every AI call.
+  * @param options.isRealStringFn Validator for generated text.
+  * @returns The match record for this round.
+  * @throws Error if fewer than three models are supplied.
+  */
+ async function runRound(options: {
+   runId: string;
+   roundNum: number;
+   models: BenchModel[];
+   ai: QuipbenchAi;
+   seed: number;
+   retry: RetryFn;
+   isRealStringFn: (value: string, minLength?: number) => boolean;
+ }): Promise<RoundRunResult> {
+   const { runId, roundNum, models, ai, seed, retry, isRealStringFn } = options;
+   const rng = roundRng(seed, roundNum);
+   const shuffled = shuffleWithRng(models, rng);
+
+   const [prompter, contestantA, contestantB, ...others] = shuffled;
+   if (!prompter || !contestantA || !contestantB) {
+     throw new Error("Quipbench requires at least 3 models");
+   }
+   // The prompter did not answer, so it also votes.
+   const voters = [prompter, ...others];
+
+   const baseMatch: MatchRecord = {
+     runId,
+     roundNum,
+     prompter,
+     contestantA,
+     contestantB,
+     votesA: 0,
+     votesB: 0,
+     winner: "ERROR",
+     votes: [],
+   };
+
+   // Formats a stage failure as "<Stage> failed[: <message>]".
+   const describeFailure = (stage: string, error: unknown): string =>
+     error instanceof Error
+       ? `${stage} failed: ${error.message}`
+       : `${stage} failed`;
+
+   let prompt = "";
+   try {
+     prompt = await retry(
+       () => ai.generatePrompt(prompter),
+       (value) => isRealStringFn(value, 10),
+       3,
+       `QB:R${roundNum}:prompt:${prompter.name}`,
+     );
+   } catch (error) {
+     return {
+       match: {
+         ...baseMatch,
+         winner: "ERROR",
+         error: describeFailure("Prompt", error),
+       },
+     };
+   }
+
+   let answerA = "";
+   let answerB = "";
+   try {
+     [answerA, answerB] = await Promise.all([
+       retry(
+         () => ai.generateAnswer(contestantA, prompt),
+         (value) => isRealStringFn(value, 3),
+         3,
+         `QB:R${roundNum}:answer:${contestantA.name}`,
+       ),
+       retry(
+         () => ai.generateAnswer(contestantB, prompt),
+         (value) => isRealStringFn(value, 3),
+         3,
+         `QB:R${roundNum}:answer:${contestantB.name}`,
+       ),
+     ]);
+   } catch (error) {
+     return {
+       match: {
+         ...baseMatch,
+         prompt,
+         winner: "ERROR",
+         error: describeFailure("Answer", error),
+       },
+     };
+   }
+
+   // Votes run concurrently but are collected in voter order, so the persisted
+   // record does not depend on which call happens to finish first.
+   const votes: VoteRecord[] = await Promise.all(
+     voters.map(async (voter): Promise<VoteRecord> => {
+       // Drawn synchronously, before the first await, so the rng sequence
+       // follows voter order and stays reproducible.
+       const showAFirst = rng() > 0.5;
+       const first = showAFirst ? answerA : answerB;
+       const second = showAFirst ? answerB : answerA;
+
+       try {
+         const decision = await retry(
+           () => ai.vote(voter, prompt, first, second),
+           (value) => value === "A" || value === "B",
+           3,
+           `QB:R${roundNum}:vote:${voter.name}`,
+         );
+
+         // Map the positional decision back to the real contestant.
+         const votedForA = showAFirst ? decision === "A" : decision === "B";
+         return {
+           voterId: voter.id,
+           voterName: voter.name,
+           votedFor: votedForA ? "A" : "B",
+           showAFirst,
+         };
+       } catch (error) {
+         return {
+           voterId: voter.id,
+           voterName: voter.name,
+           votedFor: null,
+           showAFirst,
+           error: error instanceof Error ? error.message : String(error),
+         };
+       }
+     }),
+   );
+
+   const votesA = votes.filter((vote) => vote.votedFor === "A").length;
+   const votesB = votes.filter((vote) => vote.votedFor === "B").length;
+
+   // With no successful vote there is nothing to judge; scoring the round as
+   // a tie would shift ratings on no information.
+   if (votesA + votesB === 0) {
+     return {
+       match: {
+         ...baseMatch,
+         prompt,
+         answerA,
+         answerB,
+         votes,
+         winner: "ERROR",
+         error: "Vote failed: no voter returned a valid decision",
+       },
+     };
+   }
+
+   let winner: MatchRecord["winner"] = "TIE";
+   if (votesA > votesB) winner = "A";
+   else if (votesB > votesA) winner = "B";
+
+   return {
+     match: {
+       ...baseMatch,
+       prompt,
+       answerA,
+       answerB,
+       votesA,
+       votesB,
+       winner,
+       votes,
+     },
+   };
+ }
+
+ /**
+  * Options accepted by runQuipbench. Every field is optional; omitted numeric
+  * and path fields fall back to the DEFAULT_* constants from ./config.
+  */
+ export type RunQuipbenchOptions = {
+ // Number of rounds to attempt.
+ rounds?: number;
+ // Maximum number of rounds in flight at once.
+ concurrency?: number;
+ // Elo K-factor passed to updatePairElo.
+ eloK?: number;
+ // Rating every model starts from.
+ initialElo?: number;
+ // Seed for per-round randomness; a random seed is chosen when omitted.
+ seed?: number;
+ // Directory the latest snapshot is exported to (created if missing).
+ outputDir?: string;
+ // Path of the SQLite database file.
+ dbPath?: string;
+ // Models to benchmark; defaults to QUIPBENCH_MODELS.
+ models?: BenchModel[];
+ // AI backend override. When omitted, the live game module is loaded and
+ // OPENROUTER_API_KEY must be set.
+ ai?: QuipbenchAi;
+ };
+
+ /**
+  * Runs a full benchmark: plays `rounds` rounds with up to `concurrency` in
+  * flight, persists every match, updates Elo ratings, stores the final
+  * leaderboard and exports the latest snapshot.
+  *
+  * Rounds that end in ERROR are counted as failures and do not affect ratings.
+  * Database writes and rating updates are serialized through a promise chain
+  * so concurrent workers never interleave them.
+  *
+  * @param options See RunQuipbenchOptions; omitted fields use defaults.
+  * @returns Run metadata, the final leaderboard and the snapshot file paths.
+  * @throws Error when no AI backend is supplied and OPENROUTER_API_KEY is
+  *   unset, when fewer than three models are given, or when a numeric option
+  *   is not positive (rounds and concurrency must also be integers). Errors
+  *   raised while persisting or exporting are rethrown after the run has been
+  *   marked "failed".
+  */
+ export async function runQuipbench(
+   options: RunQuipbenchOptions = {},
+ ): Promise<QuipbenchRunResult> {
+   const rounds = options.rounds ?? DEFAULT_ROUNDS;
+   const concurrency = options.concurrency ?? DEFAULT_CONCURRENCY;
+   const eloK = options.eloK ?? DEFAULT_ELO_K;
+   const initialElo = options.initialElo ?? DEFAULT_INITIAL_ELO;
+   const models = options.models ?? QUIPBENCH_MODELS;
+   const outputDir = options.outputDir ?? DEFAULT_OUTPUT_DIR;
+   const dbPath = options.dbPath ?? DEFAULT_DB_PATH;
+   const seed = options.seed ?? Math.floor(Math.random() * 2_000_000_000);
+
+   if (!options.ai && !process.env.OPENROUTER_API_KEY) {
+     throw new Error("OPENROUTER_API_KEY is required for live Quipbench runs");
+   }
+
+   if (models.length < 3) {
+     throw new Error("Quipbench requires at least 3 models");
+   }
+
+   // A plain `<= 0` check lets NaN through (zero workers, empty run) and
+   // accepts Infinity (a run that never ends), so test explicitly.
+   const isPositiveInt = (value: number): boolean =>
+     Number.isInteger(value) && value > 0;
+   const isPositiveFinite = (value: number): boolean =>
+     Number.isFinite(value) && value > 0;
+   if (
+     !isPositiveInt(rounds) ||
+     !isPositiveInt(concurrency) ||
+     !isPositiveFinite(eloK) ||
+     !isPositiveFinite(initialElo)
+   ) {
+     throw new Error("rounds, concurrency, k, and initialElo must be positive");
+   }
+
+   // An injected backend uses the local retry/validation helpers; otherwise
+   // the live game module supplies all three.
+   const { ai, retry, isRealStringFn } = options.ai
+     ? {
+         ai: options.ai,
+         retry: defaultWithRetry,
+         isRealStringFn: defaultIsRealString,
+       }
+     : await loadLiveAi();
+
+   mkdirSync(outputDir, { recursive: true });
+
+   const db = openBenchDb(dbPath);
+   const runId = createRunId();
+   const startedAt = new Date().toISOString();
+
+   const ratings = new Map<string, RatingState>();
+   for (const model of models) {
+     ratings.set(model.id, {
+       model,
+       elo: initialElo,
+       wins: 0,
+       games: 0,
+     });
+   }
+
+   let nextRound = 1;
+   let roundsCompleted = 0;
+   let failures = 0;
+   // Set once any worker fails so the others stop picking up new rounds.
+   let aborted = false;
+
+   // Tail of the write chain; each write runs after the previous one settles.
+   let writeLock: Promise<void> = Promise.resolve();
+   async function serializeWrite(fn: () => void | Promise<void>): Promise<void> {
+     writeLock = writeLock.then(fn, fn);
+     await writeLock;
+   }
+
+   async function worker(): Promise<void> {
+     while (!aborted) {
+       // Claim the next round number; no await in between, so this is atomic.
+       const roundNum = nextRound;
+       nextRound += 1;
+       if (roundNum > rounds) break;
+
+       const { match } = await runRound({
+         runId,
+         roundNum,
+         models,
+         ai,
+         seed,
+         retry,
+         isRealStringFn,
+       });
+
+       await serializeWrite(() => {
+         insertMatch(db, match);
+
+         if (match.winner === "ERROR") {
+           failures += 1;
+         } else {
+           roundsCompleted += 1;
+
+           const ratingA = ratings.get(match.contestantA.id);
+           const ratingB = ratings.get(match.contestantB.id);
+           if (!ratingA || !ratingB) {
+             throw new Error("Contestant rating state missing");
+           }
+
+           ratingA.games += 1;
+           ratingB.games += 1;
+
+           // Score from A's point of view: win 1, loss 0, tie 0.5.
+           let scoreA = 0.5;
+           if (match.winner === "A") {
+             scoreA = 1;
+             ratingA.wins += 1;
+           } else if (match.winner === "B") {
+             scoreA = 0;
+             ratingB.wins += 1;
+           }
+
+           const updated = updatePairElo(ratingA.elo, ratingB.elo, scoreA, eloK);
+           ratingA.elo = updated.nextA;
+           ratingB.elo = updated.nextB;
+         }
+
+         updateRunProgress(db, runId, { roundsCompleted, failures });
+         process.stdout.write(
+           `\rQuipbench progress: ${roundsCompleted + failures}/${rounds} (ok=${roundsCompleted}, failed=${failures})`,
+         );
+       });
+     }
+   }
+
+   try {
+     // Inside the try so the database handle is closed even if this fails.
+     insertRunStart(db, {
+       id: runId,
+       startedAt,
+       roundsRequested: rounds,
+       concurrency,
+       eloK,
+       initialElo,
+       seed,
+       outputDir,
+     });
+
+     const workers = Array.from(
+       { length: Math.min(concurrency, rounds) },
+       () =>
+         worker().catch((error: unknown) => {
+           aborted = true;
+           throw error;
+         }),
+     );
+     // Wait for every worker to settle before acting on a failure, so the
+     // database is never closed underneath a round that is still in flight.
+     const settled = await Promise.allSettled(workers);
+     const firstFailure = settled.find(
+       (outcome): outcome is PromiseRejectedResult =>
+         outcome.status === "rejected",
+     );
+     if (firstFailure) throw firstFailure.reason;
+     await writeLock;
+
+     const leaderboard = buildLeaderboard(Array.from(ratings.values()));
+     replaceRatings(db, runId, leaderboard);
+
+     const endedAt = new Date().toISOString();
+     finalizeRun(db, runId, "completed", endedAt);
+
+     const snapshotPaths = await exportLatestSnapshot({
+       dbPath,
+       outputDir,
+       runId,
+     });
+
+     // Terminate the carriage-return progress line.
+     process.stdout.write("\n");
+
+     return {
+       runMeta: {
+         runId,
+         startedAt,
+         endedAt,
+         roundsRequested: rounds,
+         roundsCompleted,
+         failures,
+         concurrency,
+         eloK,
+         initialElo,
+         seed,
+       },
+       leaderboard,
+       snapshotPathJson: snapshotPaths.latestJsonPath,
+       snapshotPathJs: snapshotPaths.latestJsPath,
+     };
+   } catch (error) {
+     try {
+       finalizeRun(db, runId, "failed", new Date().toISOString());
+     } catch {
+       // Marking the run as failed is best-effort; the original error is
+       // what the caller needs to see.
+     }
+     throw error;
+   } finally {
+     db.close();
+   }
+ }
+
+ /**
+  * CLI entry point: parses flags, runs the benchmark and prints a summary
+  * with the top ten leaderboard rows.
+  *
+  * Recognized flags: --rounds, --concurrency, --k, --initialElo, --seed,
+  * --out, --db. Relative --out/--db paths are resolved against the current
+  * working directory; absolute paths are used as given.
+  */
+ async function main(): Promise<void> {
+   const args = parseArgs(process.argv.slice(2));
+
+   const rounds = parsePositiveInt(args.rounds, DEFAULT_ROUNDS);
+   const concurrency = parsePositiveInt(args.concurrency, DEFAULT_CONCURRENCY);
+   const eloK = parsePositiveNumber(args.k, DEFAULT_ELO_K);
+   const initialElo = parsePositiveNumber(args.initialElo, DEFAULT_INITIAL_ELO);
+   const seedArg = args.seed ? Number.parseInt(args.seed, 10) : undefined;
+   const seed = Number.isFinite(seedArg) ? seedArg : undefined;
+   if (args.seed !== undefined && seed === undefined) {
+     // A seed is a reproducibility request, so say when it is not honoured.
+     console.warn(`Ignoring invalid --seed value "${args.seed}"; using a random seed`);
+   }
+
+   // resolve() keeps absolute paths intact; join() would nest them under cwd.
+   const outputDir = args.out
+     ? resolve(process.cwd(), args.out)
+     : DEFAULT_OUTPUT_DIR;
+   const dbPath = args.db ? resolve(process.cwd(), args.db) : DEFAULT_DB_PATH;
+
+   const result = await runQuipbench({
+     rounds,
+     concurrency,
+     eloK,
+     initialElo,
+     seed,
+     outputDir,
+     dbPath,
+   });
+
+   console.log("Quipbench complete");
+   console.log(`Run ID: ${result.runMeta.runId}`);
+   console.log(
+     `Rounds: ${result.runMeta.roundsCompleted}/${result.runMeta.roundsRequested} (failures=${result.runMeta.failures})`,
+   );
+   console.log(`Snapshot JSON: ${result.snapshotPathJson}`);
+   console.log(`Snapshot JS: ${result.snapshotPathJs}`);
+
+   const preview = result.leaderboard.slice(0, 10);
+   for (const row of preview) {
+     console.log(
+       `${String(row.rank).padStart(2)}. ${row.modelName.padEnd(20)} Elo ${row.elo.toFixed(2).padStart(8)} | ${row.wins}/${row.games} (${row.winRate.toFixed(2)}%)`,
+     );
+   }
+ }
+
+ // Run the CLI only when this file is the entry script, not when it is imported.
+ if (import.meta.main) {
+   // Print just the message (no stack trace) and signal failure to the shell.
+   const reportAndExit = (error: unknown): void => {
+     const message = error instanceof Error ? error.message : String(error);
+     console.error(message);
+     process.exit(1);
+   };
+
+   main().catch(reportAndExit);
+ }
diff --git a/bench/types.ts b/bench/types.ts
new file mode 100644
index 0000000..1cf5333
--- /dev/null
+++ b/bench/types.ts
@@ -0,0 +1,85 @@
+ /** A model taking part in the benchmark. */
+ export type BenchModel = {
+ // Identifier; rating state is keyed by this value.
+ id: string;
+ // Human-readable label shown on the leaderboard and in log labels.
+ name: string;
+ };
+
+ /** One voter's ballot in a round. */
+ export type VoteRecord = {
+ voterId: string;
+ voterName: string;
+ // Contestant the voter chose; null when the vote call failed.
+ votedFor: "A" | "B" | null;
+ // True when contestant A's answer was presented first to this voter.
+ showAFirst: boolean;
+ // Failure message, present only when the vote call failed.
+ error?: string;
+ };
+
+ /** Persisted record of a single round. */
+ export type MatchRecord = {
+ runId: string;
+ roundNum: number;
+ // Model that wrote the prompt.
+ prompter: BenchModel;
+ contestantA: BenchModel;
+ contestantB: BenchModel;
+ // Absent when prompt generation failed.
+ prompt?: string;
+ // Absent when the round failed before both answers were produced.
+ answerA?: string;
+ answerB?: string;
+ // Number of successful votes for each contestant.
+ votesA: number;
+ votesB: number;
+ // "ERROR" marks a round that could not be judged; such rounds are counted
+ // as failures and do not change ratings.
+ winner: "A" | "B" | "TIE" | "ERROR";
+ votes: VoteRecord[];
+ // Description of the failing stage, set when winner is "ERROR".
+ error?: string;
+ };
+
+ /** Running rating and tally for one model during a run. */
+ export type RatingState = {
+ model: BenchModel;
+ // Current, unrounded Elo rating.
+ elo: number;
+ // Rounds won as a contestant.
+ wins: number;
+ // Non-error rounds played as a contestant (ties included).
+ games: number;
+ };
+
+ /** One ranked row of the final leaderboard. */
+ export type LeaderboardRow = {
+ // 1-based position.
+ rank: number;
+ modelId: string;
+ modelName: string;
+ // Elo rounded to two decimals.
+ elo: number;
+ wins: number;
+ games: number;
+ // Wins as a percentage of games, rounded to two decimals; 0 with no games.
+ winRate: number;
+ };
+
+ /** Summary of a benchmark run and the settings it was executed with. */
+ export type RunMeta = {
+ runId: string;
+ // ISO 8601 timestamps.
+ startedAt: string;
+ endedAt: string;
+ roundsRequested: number;
+ // Rounds that produced a judged result (win or tie).
+ roundsCompleted: number;
+ // Rounds that ended in ERROR.
+ failures: number;
+ concurrency: number;
+ eloK: number;
+ initialElo: number;
+ seed: number;
+ };
+
+ /** Shape of the exported `latest` snapshot consumed by the dashboard. */
+ export type QuipbenchSnapshot = {
+ runMeta: RunMeta;
+ leaderboard: LeaderboardRow[];
+ // Data points for the dashboard's Elo bar chart.
+ chart: Array<{
+ modelName: string;
+ elo: number;
+ }>;
+ };
+
+ /** Value returned by runQuipbench. */
+ export type QuipbenchRunResult = {
+ runMeta: RunMeta;
+ leaderboard: LeaderboardRow[];
+ // Paths of the exported snapshot files (JSON and JS variants).
+ snapshotPathJson: string;
+ snapshotPathJs: string;
+ };
+
+ /**
+  * Backend the benchmark uses to talk to models. Tests inject a fake; live
+  * runs adapt the main game module to this shape.
+  */
+ export type QuipbenchAi = {
+   /** Asks `model` to write a prompt for the round. */
+   generatePrompt: (model: BenchModel) => Promise<string>;
+   /** Asks `model` to answer `prompt`. */
+   generateAnswer: (model: BenchModel, prompt: string) => Promise<string>;
+   /**
+    * Asks `voter` to choose between two answers. "A" means the first answer
+    * passed in and "B" the second; callers may pass the contestants' answers
+    * in either order.
+    */
+   vote: (
+     voter: BenchModel,
+     prompt: string,
+     answerA: string,
+     answerB: string,
+   ) => Promise<"A" | "B">;
+ };
diff --git a/bun.lock b/bun.lock
index ae913d8..d6f2a45 100644
--- a/bun.lock
+++ b/bun.lock
@@ -1,6 +1,5 @@
{
"lockfileVersion": 1,
- "configVersion": 1,
"workspaces": {
"": {
"name": "quipslop",
@@ -11,6 +10,7 @@
"puppeteer": "^24.2.0",
"react": "^19.2.4",
"react-dom": "^19.2.4",
+ "zod": "^4.3.6",
},
"devDependencies": {
"@types/bun": "latest",
diff --git a/package.json b/package.json
index 50f949b..208af5c 100644
--- a/package.json
+++ b/package.json
@@ -8,7 +8,10 @@
"start:cli": "bun quipslop.tsx",
"start:web": "bun --hot server.ts",
"start:stream": "bun ./scripts/stream-browser.ts live",
- "start:stream:dryrun": "bun ./scripts/stream-browser.ts dryrun"
+ "start:stream:dryrun": "bun ./scripts/stream-browser.ts dryrun",
+ "quipbench:run": "bun bench/run.ts",
+ "quipbench:export": "bun bench/export.ts",
+ "quipbench:open": "bun bench/open.ts"
},
"devDependencies": {
"@types/bun": "latest",
@@ -24,6 +27,7 @@
"ink": "^6.8.0",
"puppeteer": "^24.2.0",
"react": "^19.2.4",
- "react-dom": "^19.2.4"
+ "react-dom": "^19.2.4",
+ "zod": "^4.3.6"
}
}