-
Notifications
You must be signed in to change notification settings - Fork 38
Add QuipBench :) #10
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Add QuipBench :) #10
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,64 @@ | ||
| # Quipbench | ||
|
|
||
| Standalone benchmark runner + static dashboard for Quipslop models. | ||
|
|
||
| ## What it does | ||
|
|
||
| - Runs live OpenRouter self-play rounds (same mechanics as the main game) | ||
| - Computes Elo-first leaderboard with wins/games/win-rate | ||
| - Stores run + match + rating records in `bench/quipbench.sqlite` | ||
| - Exports latest snapshot to `bench/out/latest.json` and `bench/out/latest.js` | ||
| - Renders a standalone dashboard at `bench/dashboard/index.html` | ||
|
|
||
| ## Prerequisites | ||
|
|
||
| - Bun | ||
| - `OPENROUTER_API_KEY` set in environment for live runs | ||
|
|
||
| ## Commands | ||
|
|
||
| From repo root: | ||
|
|
||
| - `bun run quipbench:run` | ||
| - `bun run quipbench:export` | ||
| - `bun run quipbench:open` | ||
|
|
||
| ### Run options | ||
|
|
||
| `quipbench:run` supports CLI flags: | ||
|
|
||
| - `--rounds=100` | ||
| - `--concurrency=4` | ||
| - `--k=24` | ||
| - `--initialElo=1500` | ||
| - `--seed=12345` | ||
| - `--out=bench/out` | ||
| - `--db=bench/quipbench.sqlite` | ||
|
|
||
| Example: | ||
|
|
||
| ```bash | ||
| bun bench/run.ts --rounds=150 --concurrency=6 --seed=42 | ||
| ``` | ||
|
|
||
| ## Output contract (`latest` snapshot) | ||
|
|
||
| `bench/out/latest.json` and `bench/out/latest.js` contain: | ||
|
|
||
| - `runMeta`: `runId`, `startedAt`, `endedAt`, `roundsRequested`, `roundsCompleted`, `failures`, `concurrency`, `eloK`, `initialElo`, `seed` | ||
| - `leaderboard[]`: `rank`, `modelId`, `modelName`, `elo`, `wins`, `games`, `winRate` | ||
| - `chart[]`: `{ modelName, elo }` | ||
|
|
||
| ## Dashboard | ||
|
|
||
| Open `bench/dashboard/index.html` directly or via: | ||
|
|
||
| ```bash | ||
| bun run quipbench:open | ||
| ``` | ||
|
|
||
| The dashboard reads `../out/latest.js` and shows: | ||
|
|
||
| - run metadata summary | ||
| - vertical Elo bar chart with model names under each bar | ||
| - leaderboard table |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,29 @@ | ||
| import { join } from "node:path"; | ||
|
|
||
| export const BENCH_DIR = import.meta.dir; | ||
|
|
||
| export const DEFAULT_ROUNDS = 100; | ||
| export const DEFAULT_CONCURRENCY = 4; | ||
| export const DEFAULT_ELO_K = 24; | ||
| export const DEFAULT_INITIAL_ELO = 1500; | ||
|
|
||
| export const DEFAULT_DB_PATH = join(BENCH_DIR, "quipbench.sqlite"); | ||
| export const DEFAULT_OUTPUT_DIR = join(BENCH_DIR, "out"); | ||
| export const DEFAULT_LATEST_JSON_PATH = join(DEFAULT_OUTPUT_DIR, "latest.json"); | ||
| export const DEFAULT_LATEST_JS_PATH = join(DEFAULT_OUTPUT_DIR, "latest.js"); | ||
|
|
||
/**
 * Parses a base-10 integer from an optional string.
 *
 * @param value - Raw text to parse; `undefined` is treated as an empty string.
 * @param fallback - Value returned when parsing does not yield a finite integer
 *   greater than zero.
 * @returns The parsed integer, or `fallback`.
 */
export function parsePositiveInt(
  value: string | undefined,
  fallback: number,
): number {
  const candidate = Number.parseInt(value ?? "", 10);
  // NaN (unparseable input) is rejected by the finiteness check.
  if (!Number.isFinite(candidate) || candidate <= 0) {
    return fallback;
  }
  return candidate;
}
|
|
||
/**
 * Parses a floating-point number from an optional string.
 *
 * @param value - Raw text to parse; `undefined` is treated as an empty string.
 * @param fallback - Value returned when parsing does not yield a finite number
 *   greater than zero.
 * @returns The parsed number, or `fallback`.
 */
export function parsePositiveNumber(
  value: string | undefined,
  fallback: number,
): number {
  const candidate = Number.parseFloat(value ?? "");
  // Rejects NaN as well as "Infinity", which parseFloat accepts.
  if (!Number.isFinite(candidate) || candidate <= 0) {
    return fallback;
  }
  return candidate;
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,277 @@ | ||
| const root = document.getElementById("app"); | ||
| const snapshot = window.__QUIPBENCH_LATEST__; | ||
| const logoImageCache = new Map(); | ||
|
|
||
/**
 * Ordered keyword → logo rules. The first rule with a matching keyword wins,
 * so the order below is significant.
 */
const LOGO_RULES = Object.freeze([
  { keywords: ["Gemini"], path: "./assets/logos/gemini.svg" },
  { keywords: ["Kimi"], path: "./assets/logos/kimi.svg" },
  { keywords: ["DeepSeek"], path: "./assets/logos/deepseek.svg" },
  { keywords: ["GLM"], path: "./assets/logos/glm.svg" },
  { keywords: ["GPT"], path: "./assets/logos/openai.svg" },
  { keywords: ["Opus", "Sonnet"], path: "./assets/logos/claude.svg" },
  { keywords: ["Grok"], path: "./assets/logos/grok.svg" },
  { keywords: ["MiniMax"], path: "./assets/logos/minimax.svg" },
]);

/**
 * Resolves the logo asset for a model display name.
 *
 * Matching is a case-sensitive substring test against each rule's keywords.
 *
 * @param {string} name - Model display name, e.g. a leaderboard row's `modelName`.
 * @returns {string | null} Relative path to the logo SVG, or `null` when no
 *   rule matches or `name` is not a string.
 */
function logoFor(name) {
  // A missing or malformed name must not crash rendering; treat it as "no logo".
  if (typeof name !== "string") return null;

  const rule = LOGO_RULES.find(({ keywords }) =>
    keywords.some((keyword) => name.includes(keyword)),
  );
  return rule ? rule.path : null;
}
|
|
||
/**
 * Formats a timestamp for display using the viewer's locale.
 *
 * @param {string | number | Date | null | undefined} ts - Timestamp to format.
 * @returns {*} The locale-formatted date string; `"-"` when `ts` is `null` or
 *   `undefined`; or `ts` itself, unchanged, when it cannot be parsed as a date.
 */
function formatDate(ts) {
  // `new Date(null)` is the Unix epoch, which would render a bogus 1970 date,
  // and `undefined` would be echoed as the text "undefined". Show a placeholder.
  if (ts == null) return "-";

  const date = new Date(ts);
  return Number.isNaN(date.getTime()) ? ts : date.toLocaleString();
}
|
|
||
/** Characters that are significant in HTML text/attribute context, with their entities. */
const HTML_ESCAPES = Object.freeze({
  "&": "&amp;",
  "<": "&lt;",
  ">": "&gt;",
  '"': "&quot;",
  "'": "&#39;",
});

/**
 * Escapes a value for safe interpolation into HTML text or a quoted attribute.
 *
 * @param {*} value - Any value; it is converted with `String()` first.
 * @returns {string} The escaped string.
 */
function escapeHtml(value) {
  return String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);
}

/**
 * Builds the `<tr>` markup for one leaderboard row.
 *
 * The result is meant to be assigned through `innerHTML`, so every value taken
 * from the snapshot is escaped before interpolation.
 *
 * @param {{rank: *, modelName: *, elo: *, wins: *, games: *, winRate: *}} row -
 *   Leaderboard entry.
 * @returns {string} HTML for a single table row.
 */
function rowHtml(row) {
  const logo = logoFor(row.modelName);
  const logoImg = logo ? `<img src="${escapeHtml(logo)}" alt="" />` : "";
  // Number(...) keeps toFixed from throwing if a numeric field arrives as a string.
  const elo = Number(row.elo).toFixed(2);
  const winRate = Number(row.winRate).toFixed(2);

  return `
    <tr>
      <td class="mono rank">${escapeHtml(row.rank)}</td>
      <td>
        <div class="model-cell">
          ${logoImg}
          <span>${escapeHtml(row.modelName)}</span>
        </div>
      </td>
      <td class="mono">${elo}</td>
      <td class="mono">${escapeHtml(row.wins)}</td>
      <td class="mono">${escapeHtml(row.games)}</td>
      <td class="mono">${winRate}%</td>
    </tr>
  `;
}
|
Comment on lines
+23
to
+40
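There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

Unescaped strings are interpolated into `innerHTML` (for example `row.modelName` in `rowHtml`), so any markup contained in the snapshot data is parsed as HTML.

♻️ Proposed escape helper:

```js
function esc(str) {
  return String(str)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;");
}
```

Then wrap all string interpolations with `esc(...)`, e.g. `<span>${esc(row.modelName)}</span>`. Also applies to: 190-269

🤖 Prompt for AI Agents |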
||
|
|
||
| function renderChart(rows) { | ||
| const chartCanvas = document.getElementById("elo-chart"); | ||
| if (!chartCanvas || typeof Chart === "undefined" || rows.length === 0) return; | ||
|
|
||
| const sorted = [...rows].sort((a, b) => b.elo - a.elo); | ||
| const labels = sorted.map((row) => row.modelName); | ||
| const data = sorted.map((row) => Number(row.elo.toFixed(2))); | ||
| const max = Math.max(...data); | ||
| const min = Math.min(...data); | ||
| const yMin = Math.floor(min - 20); | ||
| const yMax = Math.ceil(max + 10); | ||
|
|
||
| const iconPlugin = { | ||
| id: "barIcons", | ||
| afterDatasetsDraw(chart) { | ||
| const { ctx } = chart; | ||
| const meta = chart.getDatasetMeta(0); | ||
| const topY = chart.scales.y.getPixelForValue(yMax); | ||
|
|
||
| meta.data.forEach((bar, index) => { | ||
| const modelName = labels[index]; | ||
| const iconUrl = logoFor(modelName); | ||
| if (!iconUrl) return; | ||
|
|
||
| let img = logoImageCache.get(iconUrl); | ||
| if (!img) { | ||
| img = new Image(); | ||
| img.src = iconUrl; | ||
| img.onload = () => chart.draw(); | ||
| logoImageCache.set(iconUrl, img); | ||
| } | ||
| if (!img.complete || !img.naturalWidth) return; | ||
|
|
||
| const iconSize = 18; | ||
| const x = bar.x - iconSize / 2; | ||
| const y = Math.max(topY + 4, bar.y - iconSize - 6); | ||
|
|
||
| ctx.save(); | ||
| ctx.fillStyle = "#0a0a0a"; | ||
| ctx.strokeStyle = "#2a2a2a"; | ||
| ctx.lineWidth = 1; | ||
| if (typeof ctx.roundRect === "function") { | ||
| ctx.beginPath(); | ||
| ctx.roundRect(x - 3, y - 3, iconSize + 6, iconSize + 6, 6); | ||
| ctx.fill(); | ||
| ctx.stroke(); | ||
| } else { | ||
| ctx.fillRect(x - 3, y - 3, iconSize + 6, iconSize + 6); | ||
| ctx.strokeRect(x - 3, y - 3, iconSize + 6, iconSize + 6); | ||
| } | ||
| ctx.drawImage(img, x, y, iconSize, iconSize); | ||
| ctx.restore(); | ||
| }); | ||
| }, | ||
| }; | ||
|
|
||
| new Chart(chartCanvas, { | ||
| type: "bar", | ||
| data: { | ||
| labels, | ||
| datasets: [ | ||
| { | ||
| label: "Elo", | ||
| data, | ||
| borderWidth: 1, | ||
| borderColor: "#3c2018", | ||
| backgroundColor: [ | ||
| "#e8ab97", | ||
| "#e09a81", | ||
| "#d98367", | ||
| "#d97757", | ||
| "#ca6b4b", | ||
| "#bc6141", | ||
| "#ae5637", | ||
| "#9f4b2d", | ||
| ], | ||
|
Comment on lines
+108
to
+117
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

Hardcoded 8-color `backgroundColor` array. There are 8 hardcoded colors but Chart.js won't cycle them for bar charts — additional bars beyond index 7 get no color from this list.

♻️ Proposed fix:

```diff
- backgroundColor: [
-   "#e8ab97",
-   "#e09a81",
-   "#d98367",
-   "#d97757",
-   "#ca6b4b",
-   "#bc6141",
-   "#ae5637",
-   "#9f4b2d",
- ],
+ backgroundColor: sorted.map((_, i) => {
+   const t = sorted.length > 1 ? i / (sorted.length - 1) : 0;
+   // interpolate from light (`#e8ab97`) to dark (`#9f4b2d`)
+   const r = Math.round(0xe8 + t * (0x9f - 0xe8));
+   const g = Math.round(0xab + t * (0x4b - 0xab));
+   const b = Math.round(0x97 + t * (0x2d - 0x97));
+   return `rgb(${r},${g},${b})`;
+ }),
```

🤖 Prompt for AI Agents |
||
| borderRadius: 6, | ||
| maxBarThickness: 72, | ||
| }, | ||
| ], | ||
| }, | ||
| options: { | ||
| responsive: true, | ||
| maintainAspectRatio: false, | ||
| plugins: { | ||
| legend: { display: false }, | ||
| tooltip: { | ||
| displayColors: false, | ||
| backgroundColor: "#101010", | ||
| borderColor: "#2d2d2d", | ||
| borderWidth: 1, | ||
| titleColor: "#f0f0f0", | ||
| bodyColor: "#d4d4d4", | ||
| callbacks: { | ||
| label(context) { | ||
| return `Elo ${Number(context.raw).toFixed(2)}`; | ||
| }, | ||
| }, | ||
| }, | ||
| }, | ||
| scales: { | ||
| x: { | ||
| ticks: { | ||
| color: "#b8b8b8", | ||
| maxRotation: 0, | ||
| autoSkip: false, | ||
| font: { family: "JetBrains Mono", size: 11 }, | ||
| }, | ||
| grid: { color: "rgba(255,255,255,0.04)" }, | ||
| }, | ||
| y: { | ||
| min: yMin, | ||
| max: yMax, | ||
| ticks: { | ||
| color: "#8b8b8b", | ||
| font: { family: "JetBrains Mono", size: 11 }, | ||
| }, | ||
| grid: { color: "rgba(255,255,255,0.07)" }, | ||
| }, | ||
| }, | ||
| }, | ||
| plugins: [iconPlugin], | ||
| }); | ||
| } | ||
|
|
||
/**
 * Renders the "no snapshot" placeholder page into the `root` mount element.
 *
 * Does nothing except log an error when `root` is missing, instead of throwing
 * a TypeError on the `innerHTML` assignment.
 *
 * @returns {void}
 */
function renderEmpty() {
  if (!root) {
    console.error('Quipbench dashboard: mount element with id="app" was not found.');
    return;
  }

  root.innerHTML = `
    <main class="shell">
      <header class="header">
        <div class="brand">
          <img src="./assets/logo.svg" alt="Quipbench" />
          <h1>Quipbench</h1>
        </div>
      </header>
      <section class="panel">
        <h2>No snapshot found</h2>
        <p>Run a benchmark first: <code>bun run quipbench:run</code></p>
        <p>Then refresh this page. Snapshot expected at <code>bench/out/latest.js</code>.</p>
      </section>
    </main>
  `;
}
|
|
||
| function render(snapshotData) { | ||
| const meta = snapshotData.runMeta; | ||
|
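There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

🟡 Medium: The guard at line 272 only checks `snapshot` and `snapshot.leaderboard`, but `render` also reads `snapshotData.runMeta` (for example `meta.runId`), so a snapshot without `runMeta` would throw here.

🚀 Reply "fix it for me" or copy this AI Prompt for your agent: |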
||
| const leaderboard = snapshotData.leaderboard; | ||
| const champion = leaderboard[0]; | ||
|
|
||
| root.innerHTML = ` | ||
| <main class="shell"> | ||
| <header class="header"> | ||
| <div class="brand"> | ||
| <img src="./assets/logo.svg" alt="Quipbench" /> | ||
| <h1>Quipbench</h1> | ||
| </div> | ||
| <div class="header-links"> | ||
| <a href="https://github.com/T3-Content/quipslop" target="_blank" rel="noreferrer"> | ||
| <svg viewBox="0 0 24 24" aria-hidden="true"> | ||
| <path d="M12 .5a12 12 0 0 0-3.79 23.39c.6.11.82-.26.82-.58v-2.05c-3.34.73-4.04-1.41-4.04-1.41-.55-1.36-1.33-1.72-1.33-1.72-1.09-.73.08-.72.08-.72 1.2.09 1.83 1.2 1.83 1.2 1.07 1.79 2.81 1.27 3.49.97.11-.75.42-1.27.76-1.56-2.67-.3-5.47-1.31-5.47-5.84 0-1.29.47-2.35 1.24-3.18-.13-.3-.54-1.52.12-3.16 0 0 1.01-.32 3.3 1.21a11.63 11.63 0 0 1 6 0c2.28-1.53 3.29-1.21 3.29-1.21.66 1.64.25 2.86.12 3.16.77.83 1.24 1.89 1.24 3.18 0 4.54-2.8 5.54-5.48 5.84.43.37.81 1.09.81 2.21v3.27c0 .32.22.7.83.58A12 12 0 0 0 12 .5Z"/> | ||
| </svg> | ||
| <span>GitHub</span> | ||
| </a> | ||
| <a href="https://quipslop.com/" target="_blank" rel="noreferrer"> | ||
| <svg viewBox="0 0 24 24" aria-hidden="true"> | ||
| <path d="M12 2a10 10 0 1 0 10 10A10 10 0 0 0 12 2Zm7.88 9h-3.07a15.9 15.9 0 0 0-1.18-5A8.04 8.04 0 0 1 19.88 11ZM12 4.04c1.04 1.17 1.92 3.24 2.35 5.96H9.65C10.08 7.28 10.96 5.21 12 4.04ZM4.12 13h3.07a15.9 15.9 0 0 0 1.18 5A8.04 8.04 0 0 1 4.12 13Zm3.07-2H4.12a8.04 8.04 0 0 1 4.25-5 15.9 15.9 0 0 0-1.18 5ZM12 19.96c-1.04-1.17-1.92-3.24-2.35-5.96h4.7c-.43 2.72-1.31 4.79-2.35 5.96ZM14.57 13H9.43a14.4 14.4 0 0 1 0-2h5.14a14.4 14.4 0 0 1 0 2Zm1.06 5a15.9 15.9 0 0 0 1.18-5h3.07a8.04 8.04 0 0 1-4.25 5Z"/> | ||
| </svg> | ||
| <span>Website</span> | ||
| </a> | ||
| </div> | ||
| <div class="meta-pills mono"> | ||
| <span class="pill">Run ${meta.runId}</span> | ||
| <span class="pill">${meta.roundsCompleted}/${meta.roundsRequested} rounds</span> | ||
| <span class="pill">${meta.failures} failures</span> | ||
| </div> | ||
| </header> | ||
|
|
||
| <section class="panel panel--summary"> | ||
| <div class="summary-block"> | ||
| <div class="summary-label mono">Started</div> | ||
| <div class="summary-value">${formatDate(meta.startedAt)}</div> | ||
| </div> | ||
| <div class="summary-block"> | ||
| <div class="summary-label mono">Ended</div> | ||
| <div class="summary-value">${formatDate(meta.endedAt)}</div> | ||
| </div> | ||
| <div class="summary-block"> | ||
| <div class="summary-label mono">Champion</div> | ||
| <div class="summary-value">${champion ? champion.modelName : "-"}</div> | ||
| </div> | ||
| <div class="summary-block"> | ||
| <div class="summary-label mono">Seed</div> | ||
| <div class="summary-value mono">${meta.seed}</div> | ||
| </div> | ||
| </section> | ||
|
|
||
| <section class="panel"> | ||
| <div class="panel-head"> | ||
| <h2>Elo Leaderboard</h2> | ||
| </div> | ||
| <div class="chart-shell"> | ||
| <canvas id="elo-chart" aria-label="Elo leaderboard bar chart"></canvas> | ||
| </div> | ||
| </section> | ||
|
|
||
| <section class="panel"> | ||
| <div class="panel-head"> | ||
| <h2>Leaderboard Table</h2> | ||
| </div> | ||
| <div class="table-wrap"> | ||
| <table> | ||
| <thead> | ||
| <tr> | ||
| <th class="mono">#</th> | ||
| <th>Model</th> | ||
| <th class="mono">Elo</th> | ||
| <th class="mono">Wins</th> | ||
| <th class="mono">Games</th> | ||
| <th class="mono">Win Rate</th> | ||
| </tr> | ||
| </thead> | ||
| <tbody> | ||
| ${leaderboard.map(rowHtml).join("")} | ||
| </tbody> | ||
| </table> | ||
| </div> | ||
| </section> | ||
| </main> | ||
| `; | ||
| } | ||
|
|
||
// Entry point: choose between the placeholder page and the full dashboard.
// `render` dereferences both `runMeta` and `leaderboard` (as an array), so a
// snapshot is only considered renderable when both are present.
const hasRenderableSnapshot =
  snapshot != null &&
  snapshot.runMeta != null &&
  Array.isArray(snapshot.leaderboard);

if (!root) {
  // Without the mount element every render path would throw on `innerHTML`.
  console.error('Quipbench dashboard: mount element with id="app" was not found.');
} else if (!hasRenderableSnapshot) {
  renderEmpty();
} else {
  render(snapshot);
  renderChart(snapshot.leaderboard);
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| // Source shim to satisfy TypeScript-based workflows. | ||
| // The standalone dashboard intentionally runs plain JS for direct browser compatibility. | ||
| import "./app.js"; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🟢 Low
dashboard/app.js:1 — Consider adding a null check for `root` before using `innerHTML`, or document that the HTML file is expected to always contain an element with `id="app"`.
🚀 Reply "fix it for me" or copy this AI Prompt for your agent: