fix: bmj list now counts finished scenarios from in-progress runs (#168)

jrvb-rl · claude · web-flow · commit 123f1d44f846 · 2026-03-11T08:37:38.000-07:00
Previously, bmj list only counted scenarios from completed benchmark
runs (benchmark_outcomes), so finished scenarios in still-running
benchmark runs were missed. It now fetches scenario run details for
in-progress runs to get accurate done/error/score counts.

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/src/commands/benchmark-job/list.ts b/src/commands/benchmark-job/list.ts
@@ -5,6 +5,7 @@
 import chalk from "chalk";
 import {
   listBenchmarkJobs,
+  listBenchmarkRunScenarioRuns,
   type BenchmarkJob,
 } from "../../services/benchmarkJobService.js";
 import { output, outputError } from "../../utils/output.js";
@@ -54,7 +55,16 @@ interface JobStats {
   avgScore: number | null;
 }
 
-function aggregateJobStats(job: BenchmarkJob): JobStats {
+// Scenario run states that count as finished
+const SCENARIO_DONE_STATES = new Set([
+  "completed",
+  "failed",
+  "canceled",
+  "timeout",
+  "error",
+]);
+
+async function aggregateJobStats(job: BenchmarkJob): Promise<JobStats> {
   const outcomes = job.benchmark_outcomes || [];
   const scenarioCount = job.job_spec?.scenario_ids?.length || 0;
   const agentCount = job.job_spec?.agent_configs?.length || 1;
@@ -65,6 +75,7 @@ function aggregateJobStats(job: BenchmarkJob): JobStats {
   let scoreSum = 0;
   let scoreCount = 0;
 
+  // Count from completed benchmark runs
   for (const outcome of outcomes) {
     done += outcome.n_completed + outcome.n_failed + outcome.n_timeout;
     errors += outcome.n_failed + outcome.n_timeout;
@@ -74,6 +85,38 @@ function aggregateJobStats(job: BenchmarkJob): JobStats {
     }
   }
 
+  // Count finished scenarios from in-progress benchmark runs
+  const inProgressRuns = job.in_progress_runs || [];
+  if (inProgressRuns.length > 0) {
+    const runResults = await Promise.all(
+      inProgressRuns.map((run) =>
+        listBenchmarkRunScenarioRuns(run.benchmark_run_id),
+      ),
+    );
+    for (const scenarioRuns of runResults) {
+      let runScoreSum = 0;
+      let runScoreCount = 0;
+      for (const sr of scenarioRuns) {
+        const state = sr.state?.toLowerCase() || "";
+        if (SCENARIO_DONE_STATES.has(state)) {
+          done++;
+          if (state !== "completed") {
+            errors++;
+          }
+          const score = sr.scoring_contract_result?.score;
+          if (score !== undefined && score !== null) {
+            runScoreSum += score;
+            runScoreCount++;
+          }
+        }
+      }
+      if (runScoreCount > 0) {
+        scoreSum += runScoreSum / runScoreCount;
+        scoreCount++;
+      }
+    }
+  }
+
   return {
     done,
     total: total || done,
@@ -120,7 +163,7 @@ function truncate(str: string, maxLen: number): string {
   return str.slice(0, maxLen - 1) + "…";
 }
 
-function printTable(jobs: BenchmarkJob[]): void {
+async function printTable(jobs: BenchmarkJob[]): Promise<void> {
   if (jobs.length === 0) {
     console.log(chalk.dim("No benchmark jobs found"));
     return;
@@ -149,7 +192,7 @@ function printTable(jobs: BenchmarkJob[]): void {
 
   // Rows
   for (const job of jobs) {
-    const stats = aggregateJobStats(job);
+    const stats = await aggregateJobStats(job);
 
     const id = truncate(job.id, COL_ID).padEnd(COL_ID);
     const name = truncate(job.name || "", nameWidth).padEnd(nameWidth);
@@ -262,7 +305,7 @@ export async function listBenchmarkJobsCommand(
     if (format !== "text") {
       output(jobs, { format, defaultFormat: "json" });
     } else {
-      printTable(jobs);
+      await printTable(jobs);
     }
   } catch (error) {
     outputError("Failed to list benchmark jobs", error);