AllenInstitute · pgarrison · Jun 4, 2026 · May 20, 2026 · May 20, 2026 · May 21, 2026
@@ -1,10 +1,8 @@
 import { BENCHMARK_TASKS, createServices } from "./tasks";
 import { BenchmarkConfig, BenchmarkResults, QueryResult, SourceResult } from "./types";
+import { DEFAULT_ITERATIONS, DEFAULT_WARMUP_ROUNDS, buildQueryResult } from "./stats";
 import DatabaseServiceWebWorker from "../../src/services/DatabaseServiceWeb/duckdb-worker.worker";
 
-const DEFAULT_ITERATIONS = 5;
-const DEFAULT_WARMUP_ROUNDS = 1;
-
 // Updates the #status element in the benchmark HTML page and mirrors to console.
 // The page can run headlessly in CI (Playwright), so the console log is the
 // only visible progress signal when there is no browser UI to observe.
@@ -14,14 +12,6 @@ function setStatus(msg: string) {
     console.log("[benchmark]", msg);
 }
 
-// Nearest-rank percentile over a pre-sorted array. Used to report p50 and p95
-// across timed iterations — p95 surfaces occasional slow outliers (GC pauses,
-// DuckDB cache misses) that the median would hide.
-function percentile(sorted: number[], p: number): number {
-    const idx = Math.ceil((p / 100) * sorted.length) - 1;
-    return sorted[Math.max(0, idx)];
-}
-
 // Fisher-Yates shuffle — randomizes task order each timed iteration so that a
 // consistently slow task doesn't inflate the times of everything that follows it
 // (DuckDB buffer pool and OS page cache warm up over repeated runs).
@@ -53,7 +43,8 @@ async function benchmarkSource(
     service: DatabaseServiceWebWorker,
     sourceNames: string[],
     iterations: number,
-    warmupRounds: number
+    warmupRounds: number,
+    tasks: typeof BENCHMARK_TASKS
 ): Promise<QueryResult[]> {
     const { annotationSvc, fileSvc } = createServices(service, sourceNames);
 
@@ -64,17 +55,17 @@ async function benchmarkSource(
     // of every task reflect cold-start overhead rather than steady-state cost.
     setStatus(`Warming up ${sourceNames.join(", ")} (${warmupRounds} rounds)...`);
     for (let w = 0; w < warmupRounds; w++) {
-        for (const task of BENCHMARK_TASKS) {
+        for (const task of tasks) {
             service.clearTimings();
             await task.run(annotationSvc, fileSvc);
         }
     }
 
-    const timingsMap = new Map<string, number[]>(BENCHMARK_TASKS.map(({ name }) => [name, []]));
+    const timingsMap = new Map<string, number[]>(tasks.map(({ name }) => [name, []]));
 
     for (let i = 0; i < iterations; i++) {
         setStatus(`Timing ${sourceNames.join(", ")} — iteration ${i + 1}/${iterations}...`);
-        for (const task of shuffle(BENCHMARK_TASKS)) {
+        for (const task of shuffle(tasks)) {
             if (task.resetAnnotationCache) {
                 for (const sourceName of sourceNames) {
                     service.clearAnnotationCache(sourceName);
@@ -94,16 +85,7 @@ async function benchmarkSource(
         }
     }
 
-    return BENCHMARK_TASKS.map(({ name }) => {
-        const timings = [...(timingsMap.get(name) ?? [])].sort((a, b) => a - b);
-        return {
-            name,
-            timings,
-            p50: percentile(timings, 50),
-            p95: percentile(timings, 95),
-            p99: percentile(timings, 99),
-        };
-    });
+    return tasks.map(({ name }) => buildQueryResult(name, timingsMap.get(name) ?? []));
 }
 
 async function main() {
@@ -113,6 +95,16 @@ async function main() {
     }
     const iterations = config.iterations ?? DEFAULT_ITERATIONS;
     const warmupRounds = config.warmupRounds ?? DEFAULT_WARMUP_ROUNDS;
+    const taskFilter = config.taskFilter;
+
+    // Fail fast on unknown names in taskFilter; the actual filtering happens further down.
+    if (taskFilter) {
+        const validNames = new Set(BENCHMARK_TASKS.map((t) => t.name));
+        const invalid = taskFilter.filter((n) => !validNames.has(n));
+        if (invalid.length) {
+            throw new Error(`Unknown task(s) in taskFilter: ${invalid.join(", ")}`);
+        }
+    }
 
     setStatus("Initializing DuckDB-WASM...");
     const initStart = performance.now();
@@ -149,6 +141,10 @@ async function main() {
         await service.execute('DROP VIEW IF EXISTS "__bff_warmup__"');
     }
 
+    const activeTasks = taskFilter
+        ? BENCHMARK_TASKS.filter((t) => taskFilter.includes(t.name))
+        : BENCHMARK_TASKS;
+
     const sourceResults: SourceResult[] = [];
 
     for (const sources of config.testCases) {
@@ -171,7 +167,13 @@ async function main() {
         const registrationMs = performance.now() - regStart;
 
         const labels = sources.map((source) => source.label);
-        const queries = await benchmarkSource(service, labels, iterations, warmupRounds);
+        const queries = await benchmarkSource(
+            service,
+            labels,
+            iterations,
+            warmupRounds,
+            activeTasks
+        );
         sourceResults.push({ labels, registrationMs, queries });
 
         for (const source of sources) {

@@ -0,0 +1,49 @@
+import { QueryResult } from "./types";
+
+/** Number of timed iterations used when the benchmark config does not set `iterations`. */
+export const DEFAULT_ITERATIONS = 5;
+
+/** Number of warm-up rounds used when the benchmark config does not set `warmupRounds`. */
+export const DEFAULT_WARMUP_ROUNDS = 1;
+
+/**
+ * Nearest-rank percentile over an array that is already sorted ascending.
+ *
+ * Used to report p50/p95/p99 across timed iterations; the high percentiles
+ * surface occasional slow outliers that the median would hide.
+ *
+ * @param sorted - Samples in ascending order; not modified.
+ * @param p - Percentile in the range 0–100. Values outside the range are
+ *     clamped to the smallest / largest sample.
+ * @returns The sample at the nearest rank, or `NaN` when `sorted` is empty
+ *     (or `p` is `NaN`), so the declared `number` return type always holds.
+ */
+export function percentile(sorted: number[], p: number): number {
+    // No samples: report NaN instead of leaking `undefined` through a
+    // `number`-typed field.
+    if (sorted.length === 0) return NaN;
+    const rank = Math.ceil((p / 100) * sorted.length);
+    // Clamp the 1-based rank into [1, length] so p <= 0 and p > 100 stay in bounds.
+    const idx = Math.min(Math.max(rank, 1), sorted.length) - 1;
+    return sorted[idx] ?? NaN;
+}
+
+/**
+ * Builds the per-task result record from the raw timings of one task.
+ *
+ * @param name - Task name stored on the result.
+ * @param rawTimings - Timings in collection order; copied before sorting, so
+ *     the caller's array is left untouched.
+ * @returns The name, the ascending-sorted timings, and their p50/p95/p99.
+ *     With no timings, the percentiles are `NaN`.
+ */
+export function buildQueryResult(name: string, rawTimings: number[]): QueryResult {
+    const timings = [...rawTimings].sort((a, b) => a - b);
+    return {
+        name,
+        timings,
+        p50: percentile(timings, 50),
+        p95: percentile(timings, 95),
+        p99: percentile(timings, 99),
+    };
+}
@@ -15,6 +15,7 @@ export interface BenchmarkConfig {
     testCases: TestCase[];
     iterations?: number;
     warmupRounds?: number;
+    taskFilter?: string[];
 }
 
 export interface QueryResult {

@@ -26,8 +26,9 @@
 import fs from "fs";
 import path from "path";
 import { execSync } from "child_process";
-import { BenchmarkResults, TestCase } from "../../benchmark/src/types";
-
+import { BenchmarkResults, QueryResult, SourceResult, TestCase } from "../../benchmark/src/types";
+import { BENCHMARK_TASKS } from "../../benchmark/src/tasks";
+import { DEFAULT_ITERATIONS, buildQueryResult } from "../../benchmark/src/stats";
 const DIST_DIR = path.join(__dirname, "..", "..", "benchmark", "dist");
 const FIXTURES_DIR = path.join(__dirname, "..", "..", "fixtures");
 const PORT = 18765;
@@ -157,7 +158,7 @@
     iterations?: number;
     warmupRounds?: number;
     channel?: string;
-}): Promise<BenchmarkResults> {
+}) {
     if (!skipBuild) buildBenchmark();
 
     if (!fs.existsSync(path.join(DIST_DIR, "index.html"))) {
@@ -167,6 +168,109 @@
     }
 
     const server = await startServer();
+
+    try {
+        const allTaskNames = BENCHMARK_TASKS.map((task) => task.name);
+        const iterationCount = iterations ?? DEFAULT_ITERATIONS;
+
+        let initTimeMs = 0;
+        const sourceResults: SourceResult[] = [];
+
+        for (const testCase of testCases) {
+            let registrationMs = 0;
+            let queries: QueryResult[];
+
+            if (warmupRounds === 0) {
+                // Each (task, iteration) pair gets a fresh browser so every
+                // measurement is a cold start.
+                console.log(
+                    `[playwright] warmupRounds=0: running ${allTaskNames.length} task(s) × ` +
+                        `${iterationCount} iteration(s) in separate browser instances`
+                );
+
+                const timingsMap = new Map<string, number[]>(
+                    allTaskNames.map((name) => [name, []])
+                );
+
+                for (const taskName of allTaskNames) {
+                    for (let i = 0; i < iterationCount; i++) {
+                        console.log(
+                            `[playwright] Launching browser for "${taskName}" ` +
+                                `iteration ${i + 1}/${iterationCount} ` +
+                                `(${testCase.map((source) => source.label).join(", ")})`
+                        );
+                        const run = await runSingleBenchmark({
+                            testCase,
+                            iterations: 1,
+                            warmupRounds: 0,
+                            channel,
+                            taskFilter: [taskName],
+                        });
+
+                        if (initTimeMs === 0) initTimeMs = run.initTimeMs;
+                        if (registrationMs === 0) registrationMs = run.registrationMs;
+
+                        const timing = run.queries[0]?.timings[0];
+                        if (timing !== undefined) {
+                            timingsMap.get(taskName)!.push(timing);
+                        }
+                    }
+                }
+
+                queries = allTaskNames.map((name) =>
+                    buildQueryResult(name, timingsMap.get(name) ?? [])
+                );
+            } else {
+                // Warmups > 0: all tasks share a single browser instance.
+                console.log(
+                    `[playwright] Launching browser for task(s): ${allTaskNames.join(", ")} ` +
+                        `(${testCase.map((source) => source.label).join(", ")})`
+                );
+                const run = await runSingleBenchmark({
+                    testCase,
+                    iterations: iterationCount,
+                    warmupRounds,
+                    channel,
+                    taskFilter: allTaskNames,
+                });
+
+                initTimeMs = initTimeMs || run.initTimeMs;
+                registrationMs = run.registrationMs;
+                queries = run.queries;
+            }
+
+            sourceResults.push({
+                labels: testCase.map((source) => source.label),
+                registrationMs,
+                queries,
+            });
+        }
+
+        return {
+            timestamp: new Date().toISOString(),
+            commit: "unknown",
+            branch: "unknown",
+            initTimeMs,
+            results: sourceResults,
+        } as BenchmarkResults;
+    } finally {
+        await new Promise((res) => server.close(res));
+    }
+}
+
+async function runSingleBenchmark({
+    testCase,
+    iterations,
+    warmupRounds,
+    channel,
+    taskFilter,
+}: {
+    testCase: TestCase;
+    iterations: number;
+    warmupRounds?: number;
+    channel?: string;
+    taskFilter: string[];
+}): Promise<{ initTimeMs: number; registrationMs: number; queries: QueryResult[] }> {
     const browser = await chromium.launch({
         channel,
         headless: true,
@@ -185,13 +289,14 @@
         // synchronously on startup — no callback handshake needed.
         await page.addInitScript({
             content: `window.__benchmarkConfig = ${JSON.stringify({
-                testCases,
+                testCases: [testCase],
                 iterations,
                 warmupRounds,
+                taskFilter,
             })};`,
         });
 
-        console.log(`[playwright] Starting benchmark (${testCases.length} test case(s))...`);
+        console.log(`[playwright] Starting benchmark...`);
         await page.goto(`http://localhost:${PORT}/`, { waitUntil: "domcontentloaded" });
 
         // Wait for the benchmark to signal it's ready for file injection
@@ -204,7 +309,7 @@
         // which is identical to how the real app loads files via the file picker —
         // no HTTP range-request overhead, so DuckDB sort performance matches real-user timing.
         const loaded = new Set();
-        for (const source of testCases.flat()) {
+        for (const source of testCase) {
             if (source.label in loaded) continue; // Don't add duplicate sources
             const localMatch = source.url.match(
                 new RegExp(`^http://localhost:${PORT}/fixtures/(.+)$`)
@@ -254,10 +359,17 @@
         const error = await page.evaluate(() => window.__benchmarkError ?? null);
         if (error) throw new Error(`Benchmark failed in browser: ${error}`);
 
-        return await page.evaluate(() => window.__benchmarkResults);
+        const benchmarkResults: BenchmarkResults = await page.evaluate(
+            () => window.__benchmarkResults
+        );
+        const result = benchmarkResults.results[0];
+        return {
+            initTimeMs: benchmarkResults.initTimeMs,
+            registrationMs: result.registrationMs,
+            queries: result.queries,
+        };
     } finally {
         await browser.close();
-        await new Promise((res) => server.close(res));
     }
 }