55import chalk from "chalk" ;
66import {
77 listBenchmarkJobs ,
8+ listBenchmarkRunScenarioRuns ,
89 type BenchmarkJob ,
910} from "../../services/benchmarkJobService.js" ;
1011import { output , outputError } from "../../utils/output.js" ;
@@ -54,7 +55,16 @@ interface JobStats {
5455 avgScore : number | null ;
5556}
5657
57- function aggregateJobStats ( job : BenchmarkJob ) : JobStats {
58+ // Terminal scenario-run states: a scenario run whose lower-cased state is in this set is counted as done when tallying in-progress benchmark runs.
59+ const SCENARIO_DONE_STATES = new Set ( [
60+ "completed" , // the only terminal state that is not also tallied as an error
61+ "failed" ,
62+ "canceled" ,
63+ "timeout" ,
64+ "error" ,
65+ ] ) ;
66+
67+ async function aggregateJobStats ( job : BenchmarkJob ) : Promise < JobStats > {
5868 const outcomes = job . benchmark_outcomes || [ ] ;
5969 const scenarioCount = job . job_spec ?. scenario_ids ?. length || 0 ;
6070 const agentCount = job . job_spec ?. agent_configs ?. length || 1 ;
@@ -65,6 +75,7 @@ function aggregateJobStats(job: BenchmarkJob): JobStats {
6575 let scoreSum = 0 ;
6676 let scoreCount = 0 ;
6777
78+ // Count from completed benchmark runs
6879 for ( const outcome of outcomes ) {
6980 done += outcome . n_completed + outcome . n_failed + outcome . n_timeout ;
7081 errors += outcome . n_failed + outcome . n_timeout ;
@@ -74,6 +85,38 @@ function aggregateJobStats(job: BenchmarkJob): JobStats {
7485 }
7586 }
7687
88+ // Count finished scenarios from in-progress benchmark runs
89+ const inProgressRuns = job . in_progress_runs || [ ] ;
90+ if ( inProgressRuns . length > 0 ) {
91+ const runResults = await Promise . all (
92+ inProgressRuns . map ( ( run ) =>
93+ listBenchmarkRunScenarioRuns ( run . benchmark_run_id ) ,
94+ ) ,
95+ ) ;
96+ for ( const scenarioRuns of runResults ) {
97+ let runScoreSum = 0 ;
98+ let runScoreCount = 0 ;
99+ for ( const sr of scenarioRuns ) {
100+ const state = sr . state ?. toLowerCase ( ) || "" ;
101+ if ( SCENARIO_DONE_STATES . has ( state ) ) {
102+ done ++ ;
103+ if ( state !== "completed" ) {
104+ errors ++ ;
105+ }
106+ const score = sr . scoring_contract_result ?. score ;
107+ if ( score !== undefined && score !== null ) {
108+ runScoreSum += score ;
109+ runScoreCount ++ ;
110+ }
111+ }
112+ }
113+ if ( runScoreCount > 0 ) {
114+ scoreSum += runScoreSum / runScoreCount ;
115+ scoreCount ++ ;
116+ }
117+ }
118+ }
119+
77120 return {
78121 done,
79122 total : total || done ,
@@ -120,7 +163,7 @@ function truncate(str: string, maxLen: number): string {
120163 return str . slice ( 0 , maxLen - 1 ) + "…" ;
121164}
122165
123- function printTable ( jobs : BenchmarkJob [ ] ) : void {
166+ async function printTable ( jobs : BenchmarkJob [ ] ) : Promise < void > {
124167 if ( jobs . length === 0 ) {
125168 console . log ( chalk . dim ( "No benchmark jobs found" ) ) ;
126169 return ;
@@ -149,7 +192,7 @@ function printTable(jobs: BenchmarkJob[]): void {
149192
150193 // Rows
151194 for ( const job of jobs ) {
152- const stats = aggregateJobStats ( job ) ;
195+ const stats = await aggregateJobStats ( job ) ;
153196
154197 const id = truncate ( job . id , COL_ID ) . padEnd ( COL_ID ) ;
155198 const name = truncate ( job . name || "" , nameWidth ) . padEnd ( nameWidth ) ;
@@ -262,7 +305,7 @@ export async function listBenchmarkJobsCommand(
262305 if ( format !== "text" ) {
263306 output ( jobs , { format, defaultFormat : "json" } ) ;
264307 } else {
265- printTable ( jobs ) ;
308+ await printTable ( jobs ) ;
266309 }
267310 } catch ( error ) {
268311 outputError ( "Failed to list benchmark jobs" , error ) ;
0 commit comments