Skip to content

Commit 123f1d4

Browse files
jrvb-rl and claude authored
fix: bmj list now counts finished scenarios from in-progress runs (#168)
Previously, `bmj list` only counted scenarios from completed benchmark runs (benchmark_outcomes), so it missed finished scenarios in benchmark runs that were still running. It now fetches scenario run details for in-progress runs to get accurate done/error/score counts.

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 9b1deed commit 123f1d4

File tree

1 file changed

+47
-4
lines changed

1 file changed

+47
-4
lines changed

src/commands/benchmark-job/list.ts

Lines changed: 47 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import chalk from "chalk";
66
import {
77
listBenchmarkJobs,
8+
listBenchmarkRunScenarioRuns,
89
type BenchmarkJob,
910
} from "../../services/benchmarkJobService.js";
1011
import { output, outputError } from "../../utils/output.js";
@@ -54,7 +55,16 @@ interface JobStats {
5455
avgScore: number | null;
5556
}
5657

57-
function aggregateJobStats(job: BenchmarkJob): JobStats {
58+
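// Scenario run states that count as finished (matched against the lower-cased state; every state here other than "completed" is also counted as an error)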
59+
const SCENARIO_DONE_STATES = new Set([
60+
"completed",
61+
"failed",
62+
"canceled",
63+
"timeout",
64+
"error",
65+
]);
66+
67+
async function aggregateJobStats(job: BenchmarkJob): Promise<JobStats> {
5868
const outcomes = job.benchmark_outcomes || [];
5969
const scenarioCount = job.job_spec?.scenario_ids?.length || 0;
6070
const agentCount = job.job_spec?.agent_configs?.length || 1;
@@ -65,6 +75,7 @@ function aggregateJobStats(job: BenchmarkJob): JobStats {
6575
let scoreSum = 0;
6676
let scoreCount = 0;
6777

78+
// Count from completed benchmark runs
6879
for (const outcome of outcomes) {
6980
done += outcome.n_completed + outcome.n_failed + outcome.n_timeout;
7081
errors += outcome.n_failed + outcome.n_timeout;
@@ -74,6 +85,38 @@ function aggregateJobStats(job: BenchmarkJob): JobStats {
7485
}
7586
}
7687

88+
// Count finished scenarios from in-progress benchmark runs
89+
const inProgressRuns = job.in_progress_runs || [];
90+
if (inProgressRuns.length > 0) {
91+
const runResults = await Promise.all(
92+
inProgressRuns.map((run) =>
93+
listBenchmarkRunScenarioRuns(run.benchmark_run_id),
94+
),
95+
);
96+
for (const scenarioRuns of runResults) {
97+
let runScoreSum = 0;
98+
let runScoreCount = 0;
99+
for (const sr of scenarioRuns) {
100+
const state = sr.state?.toLowerCase() || "";
101+
if (SCENARIO_DONE_STATES.has(state)) {
102+
done++;
103+
if (state !== "completed") {
104+
errors++;
105+
}
106+
const score = sr.scoring_contract_result?.score;
107+
if (score !== undefined && score !== null) {
108+
runScoreSum += score;
109+
runScoreCount++;
110+
}
111+
}
112+
}
113+
if (runScoreCount > 0) {
114+
scoreSum += runScoreSum / runScoreCount;
115+
scoreCount++;
116+
}
117+
}
118+
}
119+
77120
return {
78121
done,
79122
total: total || done,
@@ -120,7 +163,7 @@ function truncate(str: string, maxLen: number): string {
120163
return str.slice(0, maxLen - 1) + "…";
121164
}
122165

123-
function printTable(jobs: BenchmarkJob[]): void {
166+
async function printTable(jobs: BenchmarkJob[]): Promise<void> {
124167
if (jobs.length === 0) {
125168
console.log(chalk.dim("No benchmark jobs found"));
126169
return;
@@ -149,7 +192,7 @@ function printTable(jobs: BenchmarkJob[]): void {
149192

150193
// Rows
151194
for (const job of jobs) {
152-
const stats = aggregateJobStats(job);
195+
const stats = await aggregateJobStats(job);
153196

154197
const id = truncate(job.id, COL_ID).padEnd(COL_ID);
155198
const name = truncate(job.name || "", nameWidth).padEnd(nameWidth);
@@ -262,7 +305,7 @@ export async function listBenchmarkJobsCommand(
262305
if (format !== "text") {
263306
output(jobs, { format, defaultFormat: "json" });
264307
} else {
265-
printTable(jobs);
308+
await printTable(jobs);
266309
}
267310
} catch (error) {
268311
outputError("Failed to list benchmark jobs", error);

0 commit comments

Comments
 (0)