From fb7f21206b670593d4339fc5db1ba00554f11780 Mon Sep 17 00:00:00 2001 From: James Chainey Date: Fri, 13 Mar 2026 14:02:47 -0700 Subject: [PATCH 1/4] james/krafton-bmj-preview --- src/components/MainMenu.tsx | 4 ---- src/store/betaFeatureStore.tsx | 12 ++++++------ tests/__tests__/components/MainMenu.test.tsx | 12 ++++++++++++ tests/__tests__/integration/navigationFlows.test.tsx | 1 + tests/__tests__/router/Router.test.tsx | 8 ++++++++ tests/__tests__/screens/MenuScreen.test.tsx | 1 + 6 files changed, 28 insertions(+), 10 deletions(-) diff --git a/src/components/MainMenu.tsx b/src/components/MainMenu.tsx index 78a5a1c0..8b37037e 100644 --- a/src/components/MainMenu.tsx +++ b/src/components/MainMenu.tsx @@ -24,7 +24,6 @@ interface MenuItem { } const allMenuItems: MenuItem[] = [ - /** { key: "benchmarks", label: "Benchmarks", @@ -32,7 +31,6 @@ const allMenuItems: MenuItem[] = [ icon: "▷", color: colors.success, }, - */ { key: "devboxes", label: "Devboxes", @@ -190,10 +188,8 @@ export const MainMenu = ({ onSelect }: MainMenuProps) => { selectByKey("snapshots"); } else if (input === "o") { selectByKey("objects"); - /** } else if (input === "e") { selectByKey("benchmarks"); - */ } else if (input === "n") { selectByKey("settings"); } else if (input >= "1" && input <= "9") { diff --git a/src/store/betaFeatureStore.tsx b/src/store/betaFeatureStore.tsx index 40e0bb56..93a9d10f 100644 --- a/src/store/betaFeatureStore.tsx +++ b/src/store/betaFeatureStore.tsx @@ -16,14 +16,15 @@ interface BetaFeatureContextValue { /** * Check if a specific feature flag is enabled - * Currently all beta features are controlled by the single RL_CLI_BETA flag, - * but this allows for future granular control + * This stays in place so future beta-only features can be wired through + * the shared RL_CLI_BETA gate without changing consuming components. 
*/ isFeatureEnabled: (feature: BetaFeature) => boolean; } /** - * Known beta features that can be enabled + * Known beta features that can be enabled. + * There are currently no named beta-only features configured. */ export type BetaFeature = never; @@ -41,9 +42,8 @@ export function BetaFeatureProvider({ children }: BetaFeatureProviderProps) { const isFeatureEnabled = React.useCallback( (feature: BetaFeature): boolean => { - // Currently all beta features are gated by the same flag - // This can be extended to support per-feature flags in the future - // Add cases here when new beta features are introduced + // No named beta features exist yet, but keep the shared env-based + // gate so future feature flags can opt into it centrally. void feature; return betaEnabled; }, diff --git a/tests/__tests__/components/MainMenu.test.tsx b/tests/__tests__/components/MainMenu.test.tsx index a59c5b48..9d2c15b3 100644 --- a/tests/__tests__/components/MainMenu.test.tsx +++ b/tests/__tests__/components/MainMenu.test.tsx @@ -25,6 +25,7 @@ describe('MainMenu', () => { const { lastFrame } = renderMainMenu(); const frame = lastFrame() || ''; + expect(frame).toContain('Benchmarks'); expect(frame).toContain('Devboxes'); expect(frame).toContain('Blueprints'); expect(frame).toContain('Snapshots'); @@ -48,6 +49,17 @@ describe('MainMenu', () => { expect(frame).toContain('[3]'); }); + it('selects benchmarks with the e shortcut', () => { + let selectedKey = ''; + const { stdin } = renderMainMenu((key: string) => { + selectedKey = key; + }); + + stdin.write('e'); + + expect(selectedKey).toBe('benchmarks'); + }); + it('shows navigation help', () => { const { lastFrame } = renderMainMenu(); diff --git a/tests/__tests__/integration/navigationFlows.test.tsx b/tests/__tests__/integration/navigationFlows.test.tsx index c65dfb40..4908fdca 100644 --- a/tests/__tests__/integration/navigationFlows.test.tsx +++ b/tests/__tests__/integration/navigationFlows.test.tsx @@ -24,6 +24,7 @@ describe("navigation 
flows", () => { const { lastFrame, stdin } = renderApp("menu"); let frame = lastFrame() ?? ""; + expect(frame).toContain("Benchmarks"); expect(frame).toContain("Devboxes"); expect(frame).toContain("Blueprints"); diff --git a/tests/__tests__/router/Router.test.tsx b/tests/__tests__/router/Router.test.tsx index a9dec66e..df03b99c 100644 --- a/tests/__tests__/router/Router.test.tsx +++ b/tests/__tests__/router/Router.test.tsx @@ -26,10 +26,18 @@ describe("Router", () => { it("renders menu screen when initialScreen is menu", () => { const { lastFrame } = renderWithApp("menu"); const frame = lastFrame() ?? ""; + expect(frame).toContain("Benchmarks"); expect(frame).toContain("Devboxes"); expect(frame).toContain("Blueprints"); }); + it("renders benchmark-menu screen when initialScreen is benchmark-menu", () => { + const { lastFrame } = renderWithApp("benchmark-menu"); + const frame = lastFrame() ?? ""; + expect(frame).toContain("Benchmarks"); + expect(frame).toContain("Benchmark Defs"); + }); + it("renders devbox-list screen when initialScreen is devbox-list", () => { const { lastFrame } = renderWithApp("devbox-list"); const frame = lastFrame() ?? ""; diff --git a/tests/__tests__/screens/MenuScreen.test.tsx b/tests/__tests__/screens/MenuScreen.test.tsx index c1f858dd..729bbc85 100644 --- a/tests/__tests__/screens/MenuScreen.test.tsx +++ b/tests/__tests__/screens/MenuScreen.test.tsx @@ -26,6 +26,7 @@ describe("MenuScreen", () => { it("displays main menu items", () => { const { lastFrame } = renderMenuScreen(); const frame = lastFrame() ?? 
""; + expect(frame).toContain("Benchmarks"); expect(frame).toContain("Devboxes"); expect(frame).toContain("Blueprints"); expect(frame).toContain("Snapshots"); From 3d45525bcae60ab08d34fe615d613eeca14fcb89 Mon Sep 17 00:00:00 2001 From: James Chainey Date: Mon, 16 Mar 2026 17:12:01 -0700 Subject: [PATCH 2/4] refactored progress bar and fixed a dup bug that made it look like a scenario inside a job was stuck running --- src/commands/benchmark-job/progress.ts | 237 ++++++++++++++ src/commands/benchmark-job/summary.ts | 158 +-------- src/commands/benchmark-job/watch.ts | 196 ++---------- src/screens/BenchmarkJobDetailScreen.tsx | 29 +- .../commands/benchmark-job/progress.test.ts | 301 ++++++++++++++++++ 5 files changed, 591 insertions(+), 330 deletions(-) create mode 100644 src/commands/benchmark-job/progress.ts create mode 100644 tests/__tests__/commands/benchmark-job/progress.test.ts diff --git a/src/commands/benchmark-job/progress.ts b/src/commands/benchmark-job/progress.ts new file mode 100644 index 00000000..a086e1c2 --- /dev/null +++ b/src/commands/benchmark-job/progress.ts @@ -0,0 +1,237 @@ +/** + * Shared benchmark job progress helpers + * + * Provides terminal-state classification and run reconciliation logic + * used by watch, summary, and detail views. 
+ */ + +import type { + BenchmarkJob, + ScenarioRun, +} from "../../services/benchmarkJobService.js"; + +// --------------------------------------------------------------------------- +// State Classification +// --------------------------------------------------------------------------- + +/** Job states that indicate completion */ +export const JOB_COMPLETED_STATES = [ + "completed", + "failed", + "canceled", + "cancelled", + "timeout", +]; + +/** Scenario run states that count as finished (no longer running) */ +export const SCENARIO_COMPLETED_STATES = [ + "completed", + "failed", + "canceled", + "cancelled", + "timeout", + "error", + "scored", // treat scored as complete, consistent with run detail screen +]; + +/** Check if a job is in a terminal state */ +export function isJobCompleted(state: string | undefined | null): boolean { + if (!state) return false; + return JOB_COMPLETED_STATES.includes(state.toLowerCase()); +} + +/** Check if a scenario is in a terminal state */ +export function isScenarioCompleted(state: string | undefined | null): boolean { + if (!state) return false; + return SCENARIO_COMPLETED_STATES.includes(state.toLowerCase()); +} + +// --------------------------------------------------------------------------- +// Progress Stats +// --------------------------------------------------------------------------- + +/** In-progress scenario info for display */ +export interface InProgressScenario { + name: string; + state: string; + startTimeMs?: number; +} + +/** Progress stats for a benchmark run */ +export interface RunProgress { + benchmarkRunId: string; + agentName: string; + modelName?: string; + state: string; + expectedTotal: number; + started: number; + running: number; + scoring: number; + finished: number; + avgScore: number | null; + inProgressScenarios: InProgressScenario[]; +} + +// --------------------------------------------------------------------------- +// Agent Info Extraction +// 
--------------------------------------------------------------------------- + +type InProgressRun = NonNullable[number]; + +/** Get agent info from in_progress_run */ +export function getAgentInfo(run: InProgressRun): { + name: string; + model?: string; +} { + const agentConfig = run.agent_config; + if (agentConfig && agentConfig.type === "job_agent") { + return { + name: agentConfig.name, + model: agentConfig.model_name ?? undefined, + }; + } + return { name: "unknown" }; +} + +// --------------------------------------------------------------------------- +// Progress Calculation +// --------------------------------------------------------------------------- + +/** Calculate progress from scenario runs */ +export function calculateRunProgress( + benchmarkRunId: string, + agentName: string, + modelName: string | undefined, + state: string, + expectedTotal: number, + scenarioRuns: ScenarioRun[], +): RunProgress { + let running = 0; + let scoring = 0; + let finished = 0; + let scoreSum = 0; + let scoreCount = 0; + const inProgressScenarios: InProgressScenario[] = []; + + for (const scenario of scenarioRuns) { + const scenarioState = scenario.state?.toLowerCase() || ""; + + if (isScenarioCompleted(scenarioState)) { + finished++; + const score = scenario.scoring_contract_result?.score; + if (score !== undefined && score !== null) { + scoreSum += score; + scoreCount++; + } + } else if (scenarioState === "scoring") { + scoring++; + inProgressScenarios.push({ + name: scenario.name || scenario.scenario_id || "unknown", + state: scenarioState, + startTimeMs: scenario.start_time_ms, + }); + } else if (scenarioState === "running") { + running++; + inProgressScenarios.push({ + name: scenario.name || scenario.scenario_id || "unknown", + state: scenarioState, + startTimeMs: scenario.start_time_ms, + }); + } else if (scenarioState && scenarioState !== "pending") { + inProgressScenarios.push({ + name: scenario.name || scenario.scenario_id || "unknown", + state: scenarioState, + 
startTimeMs: scenario.start_time_ms, + }); + } + } + + return { + benchmarkRunId, + agentName, + modelName, + state, + expectedTotal, + started: scenarioRuns.length, + running, + scoring, + finished, + avgScore: scoreCount > 0 ? scoreSum / scoreCount : null, + inProgressScenarios, + }; +} + +// --------------------------------------------------------------------------- +// Run Reconciliation +// --------------------------------------------------------------------------- + +/** + * Build progress for all runs in a job, preferring completed outcomes over + * stale in-progress data for the same benchmark_run_id. + * + * @param job The benchmark job + * @param fetchScenarioRuns Callback to fetch scenario runs for a benchmark run + */ +export async function fetchAllRunsProgress( + job: BenchmarkJob, + fetchScenarioRuns: (benchmarkRunId: string) => Promise, +): Promise { + const results: RunProgress[] = []; + + // Get expected scenario count from job spec + const expectedTotal = job.job_spec?.scenario_ids?.length || 0; + + // Track which runs we've already added from completed outcomes + const completedRunIds = new Set(); + + // First, add completed runs from benchmark_outcomes (authoritative) + const completedOutcomes = job.benchmark_outcomes || []; + for (const outcome of completedOutcomes) { + completedRunIds.add(outcome.benchmark_run_id); + + const scenarioOutcomes = outcome.scenario_outcomes || []; + let scoreSum = 0; + let scoreCount = 0; + for (const s of scenarioOutcomes) { + if (s.score !== undefined && s.score !== null) { + scoreSum += s.score; + scoreCount++; + } + } + results.push({ + benchmarkRunId: outcome.benchmark_run_id, + agentName: outcome.agent_name, + modelName: outcome.model_name ?? undefined, + state: "completed", + expectedTotal: expectedTotal || scenarioOutcomes.length, + started: scenarioOutcomes.length, + running: 0, + scoring: 0, + finished: scenarioOutcomes.length, + avgScore: scoreCount > 0 ? 
scoreSum / scoreCount : null, + inProgressScenarios: [], + }); + } + + // Then, fetch progress for in-progress runs that are NOT already in outcomes + const inProgressRuns = job.in_progress_runs || []; + const progressPromises = inProgressRuns + .filter((run) => !completedRunIds.has(run.benchmark_run_id)) + .map(async (run) => { + const agentInfo = getAgentInfo(run); + const scenarioRuns = await fetchScenarioRuns(run.benchmark_run_id); + return calculateRunProgress( + run.benchmark_run_id, + agentInfo.name, + agentInfo.model, + run.state, + expectedTotal, + scenarioRuns, + ); + }); + + const inProgressResults = await Promise.all(progressPromises); + results.push(...inProgressResults); + + return results; +} diff --git a/src/commands/benchmark-job/summary.ts b/src/commands/benchmark-job/summary.ts index 3ab04aa9..b492fbb9 100644 --- a/src/commands/benchmark-job/summary.ts +++ b/src/commands/benchmark-job/summary.ts @@ -7,167 +7,25 @@ import { getBenchmarkJob, listBenchmarkRunScenarioRuns, type BenchmarkJob, - type ScenarioRun, } from "../../services/benchmarkJobService.js"; import { output, outputError } from "../../utils/output.js"; +import { + isJobCompleted, + fetchAllRunsProgress, + type RunProgress, +} from "./progress.js"; interface SummaryOptions { output?: string; extended?: boolean; } -// Job states that indicate completion -const COMPLETED_STATES = ["completed", "failed", "canceled", "timeout"]; - -// Scenario run states that indicate completion -const SCENARIO_COMPLETED_STATES = [ - "completed", - "failed", - "canceled", - "timeout", - "error", -]; - // Format percentage function formatPercent(count: number, total: number): string { if (total === 0) return "0.0%"; return ((count / total) * 100).toFixed(1) + "%"; } -// Progress stats for a benchmark run -interface RunProgress { - benchmarkRunId: string; - agentName: string; - modelName?: string; - state: string; - expectedTotal: number; - started: number; - running: number; - scoring: number; - finished: 
number; - avgScore: number | null; -} - -// Calculate progress from scenario runs -function calculateRunProgress( - benchmarkRunId: string, - agentName: string, - modelName: string | undefined, - state: string, - expectedTotal: number, - scenarioRuns: ScenarioRun[], -): RunProgress { - let running = 0; - let scoring = 0; - let finished = 0; - let scoreSum = 0; - let scoreCount = 0; - - for (const scenario of scenarioRuns) { - const scenarioState = scenario.state?.toLowerCase() || ""; - - if (SCENARIO_COMPLETED_STATES.includes(scenarioState)) { - finished++; - const score = scenario.scoring_contract_result?.score; - if (score !== undefined && score !== null) { - scoreSum += score; - scoreCount++; - } - } else if (scenarioState === "scoring" || scenarioState === "scored") { - scoring++; - } else if (scenarioState === "running") { - running++; - } - } - - return { - benchmarkRunId, - agentName, - modelName, - state, - expectedTotal, - started: scenarioRuns.length, - running, - scoring, - finished, - avgScore: scoreCount > 0 ? scoreSum / scoreCount : null, - }; -} - -// In-progress run type -type InProgressRun = NonNullable[number]; - -// Get agent info from in_progress_run -function getAgentInfo(run: InProgressRun): { - name: string; - model?: string; -} { - const agentConfig = run.agent_config; - if (agentConfig && agentConfig.type === "job_agent") { - return { - name: agentConfig.name, - model: agentConfig.model_name ?? 
undefined, - }; - } - return { name: "unknown" }; -} - -// Fetch progress for all runs (in-progress and completed) -async function fetchAllRunsProgress(job: BenchmarkJob): Promise { - const results: RunProgress[] = []; - - // Get expected scenario count from job spec - const expectedTotal = job.job_spec?.scenario_ids?.length || 0; - - // First, add completed runs from benchmark_outcomes - const completedOutcomes = job.benchmark_outcomes || []; - for (const outcome of completedOutcomes) { - const scenarioOutcomes = outcome.scenario_outcomes || []; - let scoreSum = 0; - let scoreCount = 0; - for (const s of scenarioOutcomes) { - if (s.score !== undefined && s.score !== null) { - scoreSum += s.score; - scoreCount++; - } - } - results.push({ - benchmarkRunId: outcome.benchmark_run_id, - agentName: outcome.agent_name, - modelName: outcome.model_name ?? undefined, - state: "completed", - expectedTotal: expectedTotal || scenarioOutcomes.length, - started: scenarioOutcomes.length, - running: 0, - scoring: 0, - finished: scenarioOutcomes.length, - avgScore: scoreCount > 0 ? 
scoreSum / scoreCount : null, - }); - } - - // Then, fetch progress for in-progress runs - const inProgressRuns = job.in_progress_runs || []; - const progressPromises = inProgressRuns.map(async (run) => { - const agentInfo = getAgentInfo(run); - const scenarioRuns = await listBenchmarkRunScenarioRuns( - run.benchmark_run_id, - ); - return calculateRunProgress( - run.benchmark_run_id, - agentInfo.name, - agentInfo.model, - run.state, - expectedTotal, - scenarioRuns, - ); - }); - - const inProgressResults = await Promise.all(progressPromises); - results.push(...inProgressResults); - - return results; -} - // Format a single run's progress line function formatRunProgressLine(progress: RunProgress): string { // Format agent/model label @@ -237,10 +95,10 @@ async function printStatus(job: BenchmarkJob): Promise { console.log(`ID: ${job.id}`); console.log(`State: ${state}`); - if (!COMPLETED_STATES.includes(state)) { + if (!isJobCompleted(state)) { // Fetch and show progress for in-progress runs console.log(); - const progressList = await fetchAllRunsProgress(job); + const progressList = await fetchAllRunsProgress(job, listBenchmarkRunScenarioRuns); printProgress(progressList); } } @@ -420,7 +278,7 @@ export async function summaryBenchmarkJob( ) { try { const job = await getBenchmarkJob(id); - const isComplete = COMPLETED_STATES.includes(job.state || ""); + const isComplete = isJobCompleted(job.state); if (options.output && options.output !== "text") { output(job, { format: options.output, defaultFormat: "json" }); diff --git a/src/commands/benchmark-job/watch.ts b/src/commands/benchmark-job/watch.ts index 8755bead..c0bc1e0e 100644 --- a/src/commands/benchmark-job/watch.ts +++ b/src/commands/benchmark-job/watch.ts @@ -7,21 +7,14 @@ import { getBenchmarkJob, listBenchmarkRunScenarioRuns, type BenchmarkJob, - type ScenarioRun, } from "../../services/benchmarkJobService.js"; import { outputError } from "../../utils/output.js"; - -// Job states that indicate completion 
-const COMPLETED_STATES = ["completed", "failed", "canceled", "timeout"]; - -// Scenario run states that indicate completion -const SCENARIO_COMPLETED_STATES = [ - "completed", - "failed", - "canceled", - "timeout", - "error", -]; +import { + isJobCompleted, + fetchAllRunsProgress, + type RunProgress, + type InProgressScenario, +} from "./progress.js"; // Polling config const POLL_INTERVAL_MS = 5 * 1000; // 5 seconds @@ -100,27 +93,8 @@ function formatDuration(ms: number): string { return `${seconds}s`; } -// In-progress scenario info for display -interface InProgressScenario { - name: string; - state: string; - startTimeMs?: number; -} - -// Progress stats for a benchmark run -interface RunProgress { - benchmarkRunId: string; - agentName: string; - modelName?: string; - state: string; - expectedTotal: number; - started: number; - running: number; - scoring: number; - finished: number; - avgScore: number | null; - inProgressScenarios: InProgressScenario[]; -} +// Re-export types from shared module for internal use +export type { RunProgress, InProgressScenario }; // Spinner frames for running indicators const SPINNER_FRAMES = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]; @@ -140,144 +114,6 @@ function getMaxScenariosPerRun(numRuns: number): number { return Math.max(Math.floor(availableLines / Math.max(numRuns, 1)), 3); } -// Calculate progress from scenario runs -function calculateRunProgress( - benchmarkRunId: string, - agentName: string, - modelName: string | undefined, - state: string, - expectedTotal: number, - scenarioRuns: ScenarioRun[], -): RunProgress { - let running = 0; - let scoring = 0; - let finished = 0; - let scoreSum = 0; - let scoreCount = 0; - const inProgressScenarios: InProgressScenario[] = []; - - for (const scenario of scenarioRuns) { - const scenarioState = scenario.state?.toLowerCase() || ""; - - if (SCENARIO_COMPLETED_STATES.includes(scenarioState)) { - finished++; - const score = scenario.scoring_contract_result?.score; - if (score 
!== undefined && score !== null) { - scoreSum += score; - scoreCount++; - } - } else if (scenarioState === "scoring" || scenarioState === "scored") { - scoring++; - inProgressScenarios.push({ - name: scenario.name || scenario.scenario_id || "unknown", - state: scenarioState, - startTimeMs: scenario.start_time_ms, - }); - } else if (scenarioState === "running") { - running++; - inProgressScenarios.push({ - name: scenario.name || scenario.scenario_id || "unknown", - state: scenarioState, - startTimeMs: scenario.start_time_ms, - }); - } else if (scenarioState && scenarioState !== "pending") { - inProgressScenarios.push({ - name: scenario.name || scenario.scenario_id || "unknown", - state: scenarioState, - startTimeMs: scenario.start_time_ms, - }); - } - } - - return { - benchmarkRunId, - agentName, - modelName, - state, - expectedTotal, - started: scenarioRuns.length, - running, - scoring, - finished, - avgScore: scoreCount > 0 ? scoreSum / scoreCount : null, - inProgressScenarios, - }; -} - -// In-progress run type -type InProgressRun = NonNullable[number]; - -// Get agent info from in_progress_run -function getAgentInfo(run: InProgressRun): { - name: string; - model?: string; -} { - const agentConfig = run.agent_config; - if (agentConfig && agentConfig.type === "job_agent") { - return { - name: agentConfig.name, - model: agentConfig.model_name ?? 
undefined, - }; - } - return { name: "unknown" }; -} - -// Fetch progress for all runs (in-progress and completed) -async function fetchAllRunsProgress(job: BenchmarkJob): Promise { - const results: RunProgress[] = []; - - // Get expected scenario count from job spec - const expectedTotal = job.job_spec?.scenario_ids?.length || 0; - - // First, add completed runs from benchmark_outcomes - const completedOutcomes = job.benchmark_outcomes || []; - for (const outcome of completedOutcomes) { - const scenarioOutcomes = outcome.scenario_outcomes || []; - let scoreSum = 0; - let scoreCount = 0; - for (const s of scenarioOutcomes) { - if (s.score !== undefined && s.score !== null) { - scoreSum += s.score; - scoreCount++; - } - } - results.push({ - benchmarkRunId: outcome.benchmark_run_id, - agentName: outcome.agent_name, - modelName: outcome.model_name ?? undefined, - state: "completed", - expectedTotal: expectedTotal || scenarioOutcomes.length, - started: scenarioOutcomes.length, - running: 0, - scoring: 0, - finished: scenarioOutcomes.length, - avgScore: scoreCount > 0 ? 
scoreSum / scoreCount : null, - inProgressScenarios: [], - }); - } - - // Then, fetch progress for in-progress runs - const inProgressRuns = job.in_progress_runs || []; - const progressPromises = inProgressRuns.map(async (run) => { - const agentInfo = getAgentInfo(run); - const scenarioRuns = await listBenchmarkRunScenarioRuns( - run.benchmark_run_id, - ); - return calculateRunProgress( - run.benchmark_run_id, - agentInfo.name, - agentInfo.model, - run.state, - expectedTotal, - scenarioRuns, - ); - }); - - const inProgressResults = await Promise.all(progressPromises); - results.push(...inProgressResults); - - return results; -} // Format a single run's progress line function formatRunProgressLine(progress: RunProgress): string { @@ -514,7 +350,7 @@ export async function watchBenchmarkJob(id: string) { let job = await getBenchmarkJob(id); // If job is already complete, just show results - if (COMPLETED_STATES.includes(job.state || "")) { + if (isJobCompleted(job.state)) { printResultsTable(job); return; } @@ -562,9 +398,9 @@ export async function watchBenchmarkJob(id: string) { try { let tick = 0; - let progressList = await fetchAllRunsProgress(job); + let progressList = await fetchAllRunsProgress(job, listBenchmarkRunScenarioRuns); - while (!COMPLETED_STATES.includes(job.state || "")) { + while (!isJobCompleted(job.state)) { // Check timeout if (Date.now() - jobStartMs > MAX_WAIT_MS) { cleanup(); @@ -612,13 +448,19 @@ export async function watchBenchmarkJob(id: string) { // Every UPDATES_PER_POLL ticks, poll the API for fresh data if (tick % UPDATES_PER_POLL === 0) { job = await getBenchmarkJob(id); - if (!COMPLETED_STATES.includes(job.state || "")) { - progressList = await fetchAllRunsProgress(job); + if (!isJobCompleted(job.state)) { + progressList = await fetchAllRunsProgress(job, listBenchmarkRunScenarioRuns); } } await sleep(SPINNER_INTERVAL_MS); } + + // Final reconciliation: refresh job and progress one more time to ensure + // we have the most up-to-date 
data before printing results. This prevents + // stale in-progress state from persisting after the job completes. + job = await getBenchmarkJob(id); + progressList = await fetchAllRunsProgress(job, listBenchmarkRunScenarioRuns); } finally { process.stdout.off("resize", handleResize); cleanup(); diff --git a/src/screens/BenchmarkJobDetailScreen.tsx b/src/screens/BenchmarkJobDetailScreen.tsx index d71ce721..26eac88c 100644 --- a/src/screens/BenchmarkJobDetailScreen.tsx +++ b/src/screens/BenchmarkJobDetailScreen.tsx @@ -243,6 +243,8 @@ export function BenchmarkJobDetailScreen({ // Build a unified view of benchmark runs per agent // Collect all agents from job_spec, in_progress_runs, and benchmark_outcomes + // NOTE: We dedupe by benchmark_run_id, preferring completed outcomes over + // stale in-progress data to avoid showing mixed completed/running state. interface AgentRunInfo { agentName: string; modelName?: string; @@ -258,9 +260,14 @@ export function BenchmarkJobDetailScreen({ const agentRuns: AgentRunInfo[] = []; - // First, add completed runs from benchmark_outcomes + // Track which benchmark_run_ids we've already added from completed outcomes + const completedRunIds = new Set(); + + // First, add completed runs from benchmark_outcomes (authoritative) if (job.benchmark_outcomes) { job.benchmark_outcomes.forEach((outcome) => { + completedRunIds.add(outcome.benchmark_run_id); + const total = outcome.n_completed + outcome.n_failed + outcome.n_timeout; const status = outcome.n_failed > 0 || outcome.n_timeout > 0 @@ -283,9 +290,14 @@ export function BenchmarkJobDetailScreen({ }); } - // Add in-progress runs + // Add in-progress runs that are NOT already in completed outcomes if (job.in_progress_runs) { job.in_progress_runs.forEach((run) => { + // Skip if we already have this run from benchmark_outcomes + if (completedRunIds.has(run.benchmark_run_id)) { + return; + } + // Get agent name from agent_config if available let agentName = "Unknown Agent"; if 
(run.agent_config && "name" in run.agent_config) { @@ -805,9 +817,20 @@ export function BenchmarkJobDetailScreen({ }); } - // In-progress runs + // In-progress runs (skip any that are already in completed outcomes) if (j.in_progress_runs && j.in_progress_runs.length > 0) { + // Build set of completed run IDs to skip + const detailCompletedRunIds = new Set(); + j.benchmark_outcomes?.forEach((o) => + detailCompletedRunIds.add(o.benchmark_run_id), + ); + j.in_progress_runs.forEach((run, idx) => { + // Skip if already in completed outcomes + if (detailCompletedRunIds.has(run.benchmark_run_id)) { + return; + } + let agentName = "Unknown Agent"; if (run.agent_config && "name" in run.agent_config) { agentName = (run.agent_config as any).name; diff --git a/tests/__tests__/commands/benchmark-job/progress.test.ts b/tests/__tests__/commands/benchmark-job/progress.test.ts new file mode 100644 index 00000000..915efdc3 --- /dev/null +++ b/tests/__tests__/commands/benchmark-job/progress.test.ts @@ -0,0 +1,301 @@ +/** + * Tests for benchmark job progress helpers + * + * These tests verify that the progress reconciliation logic correctly + * handles the case where a job is completed but the prior progress + * snapshot still had scenario runs marked as running. 
+ */ + +import { describe, it, expect } from "@jest/globals"; +import { + isJobCompleted, + isScenarioCompleted, + JOB_COMPLETED_STATES, + SCENARIO_COMPLETED_STATES, + fetchAllRunsProgress, + calculateRunProgress, + type RunProgress, +} from "@/commands/benchmark-job/progress.js"; +import type { BenchmarkJob } from "@/services/benchmarkJobService.js"; + +describe("isJobCompleted", () => { + it("should return true for completed states", () => { + expect(isJobCompleted("completed")).toBe(true); + expect(isJobCompleted("failed")).toBe(true); + expect(isJobCompleted("canceled")).toBe(true); + expect(isJobCompleted("cancelled")).toBe(true); + expect(isJobCompleted("timeout")).toBe(true); + }); + + it("should return false for in-progress states", () => { + expect(isJobCompleted("running")).toBe(false); + expect(isJobCompleted("queued")).toBe(false); + expect(isJobCompleted("initializing")).toBe(false); + }); + + it("should return false for null/undefined", () => { + expect(isJobCompleted(null)).toBe(false); + expect(isJobCompleted(undefined)).toBe(false); + expect(isJobCompleted("")).toBe(false); + }); + + it("should be case insensitive", () => { + expect(isJobCompleted("COMPLETED")).toBe(true); + expect(isJobCompleted("Completed")).toBe(true); + expect(isJobCompleted("FAILED")).toBe(true); + }); +}); + +describe("isScenarioCompleted", () => { + it("should return true for completed states", () => { + expect(isScenarioCompleted("completed")).toBe(true); + expect(isScenarioCompleted("failed")).toBe(true); + expect(isScenarioCompleted("canceled")).toBe(true); + expect(isScenarioCompleted("cancelled")).toBe(true); + expect(isScenarioCompleted("timeout")).toBe(true); + expect(isScenarioCompleted("error")).toBe(true); + }); + + it("should treat scored as completed", () => { + expect(isScenarioCompleted("scored")).toBe(true); + }); + + it("should return false for in-progress states", () => { + expect(isScenarioCompleted("running")).toBe(false); + 
expect(isScenarioCompleted("scoring")).toBe(false); + expect(isScenarioCompleted("pending")).toBe(false); + }); + + it("should return false for null/undefined", () => { + expect(isScenarioCompleted(null)).toBe(false); + expect(isScenarioCompleted(undefined)).toBe(false); + expect(isScenarioCompleted("")).toBe(false); + }); +}); + +describe("calculateRunProgress", () => { + it("should count scored scenarios as finished", () => { + const scenarioRuns = [ + { state: "completed", scoring_contract_result: { score: 1.0 } }, + { state: "scored", scoring_contract_result: { score: 0.5 } }, + { state: "running" }, + ] as any[]; + + const progress = calculateRunProgress( + "run_123", + "test-agent", + undefined, + "running", + 3, + scenarioRuns, + ); + + expect(progress.finished).toBe(2); + expect(progress.running).toBe(1); + expect(progress.scoring).toBe(0); + }); + + it("should count scoring scenarios separately", () => { + const scenarioRuns = [ + { state: "completed", scoring_contract_result: { score: 1.0 } }, + { state: "scoring" }, + { state: "running" }, + ] as any[]; + + const progress = calculateRunProgress( + "run_123", + "test-agent", + undefined, + "running", + 3, + scenarioRuns, + ); + + expect(progress.finished).toBe(1); + expect(progress.scoring).toBe(1); + expect(progress.running).toBe(1); + }); + + it("should calculate average score from completed scenarios", () => { + const scenarioRuns = [ + { state: "completed", scoring_contract_result: { score: 1.0 } }, + { state: "completed", scoring_contract_result: { score: 0.5 } }, + { state: "scored", scoring_contract_result: { score: 0.0 } }, + ] as any[]; + + const progress = calculateRunProgress( + "run_123", + "test-agent", + undefined, + "completed", + 3, + scenarioRuns, + ); + + expect(progress.avgScore).toBe(0.5); + }); +}); + +describe("fetchAllRunsProgress", () => { + it("should prefer completed outcomes over in-progress runs for same benchmark_run_id", async () => { + const job: Partial = { + job_spec: { + 
scenario_ids: ["s1", "s2", "s3"], + } as any, + benchmark_outcomes: [ + { + benchmark_run_id: "run_123", + agent_name: "test-agent", + model_name: "gpt-4", + average_score: 0.8, + n_completed: 3, + n_failed: 0, + n_timeout: 0, + scenario_outcomes: [ + { score: 1.0 }, + { score: 0.7 }, + { score: 0.7 }, + ] as any[], + }, + ], + in_progress_runs: [ + { + benchmark_run_id: "run_123", + state: "running", + agent_config: { type: "job_agent", name: "test-agent" }, + } as any, + ], + }; + + const mockFetchScenarioRuns = async () => []; + const progress = await fetchAllRunsProgress( + job as BenchmarkJob, + mockFetchScenarioRuns, + ); + + expect(progress).toHaveLength(1); + expect(progress[0].state).toBe("completed"); + expect(progress[0].running).toBe(0); + expect(progress[0].finished).toBe(3); + }); + + it("should include in-progress runs that are not in completed outcomes", async () => { + const job: Partial<BenchmarkJob> = { + job_spec: { + scenario_ids: ["s1", "s2", "s3"], + } as any, + benchmark_outcomes: [ + { + benchmark_run_id: "run_completed", + agent_name: "agent-1", + average_score: 0.8, + n_completed: 3, + n_failed: 0, + n_timeout: 0, + scenario_outcomes: [{ score: 0.8 }] as any[], + }, + ], + in_progress_runs: [ + { + benchmark_run_id: "run_in_progress", + state: "running", + agent_config: { type: "job_agent", name: "agent-2" }, + } as any, + ], + }; + + const mockFetchScenarioRuns = async () => [ + { state: "completed", scoring_contract_result: { score: 1.0 } }, + { state: "running" }, + ] as any[]; + + const progress = await fetchAllRunsProgress( + job as BenchmarkJob, + mockFetchScenarioRuns, + ); + + expect(progress).toHaveLength(2); + expect(progress[0].benchmarkRunId).toBe("run_completed"); + expect(progress[0].state).toBe("completed"); + expect(progress[1].benchmarkRunId).toBe("run_in_progress"); + expect(progress[1].running).toBe(1); + expect(progress[1].finished).toBe(1); + }); + + it("should handle job with no runs", async () => { + const job: Partial<BenchmarkJob> = { +
benchmark_outcomes: [], + in_progress_runs: [], + }; + + const mockFetchScenarioRuns = async () => []; + const progress = await fetchAllRunsProgress( + job as BenchmarkJob, + mockFetchScenarioRuns, + ); + + expect(progress).toHaveLength(0); + }); + + it("should handle the bug case: completed job with stale in-progress snapshot", async () => { + // This is the specific bug case from the user report: + // - Job state is completed + // - benchmark_outcomes shows all scenarios finished + // - in_progress_runs still contains a run (stale data from previous poll) + const job: Partial<BenchmarkJob> = { + state: "completed", + job_spec: { + scenario_ids: Array(60).fill("s"), + } as any, + benchmark_outcomes: [ + { + benchmark_run_id: "run_123", + agent_name: "claude-code:claude-haiku-4", + average_score: 0.88, + n_completed: 60, + n_failed: 0, + n_timeout: 0, + scenario_outcomes: Array(60).fill({ + score: 0.88, + state: "completed", + }), + }, + ], + in_progress_runs: [ + { + benchmark_run_id: "run_123", + state: "running", + agent_config: { + type: "job_agent", + name: "claude-code:claude-haiku-4", + }, + } as any, + ], + }; + + const mockFetchScenarioRuns = async () => { + // Even if the scenario runs endpoint returns stale data...
+ return [ + ...Array(59).fill({ + state: "completed", + scoring_contract_result: { score: 0.88 }, + }), + { state: "running" }, + ] as any[]; + }; + + const progress = await fetchAllRunsProgress( + job as BenchmarkJob, + mockFetchScenarioRuns, + ); + + // Should only have 1 run (from completed outcomes) + expect(progress).toHaveLength(1); + + // Should show as completed, not running + expect(progress[0].state).toBe("completed"); + expect(progress[0].finished).toBe(60); + expect(progress[0].running).toBe(0); + expect(progress[0].avgScore).toBeCloseTo(0.88); + }); +}); From 1fb2c177c0933b7c886e1ff0aae2a7f2752b3f8a Mon Sep 17 00:00:00 2001 From: James Chainey Date: Mon, 16 Mar 2026 17:15:00 -0700 Subject: [PATCH 3/4] re-gated bmjs --- src/components/MainMenu.tsx | 4 ++++ src/store/betaFeatureStore.tsx | 12 ++++++------ tests/__tests__/components/MainMenu.test.tsx | 12 ------------ tests/__tests__/integration/navigationFlows.test.tsx | 1 - tests/__tests__/router/Router.test.tsx | 8 -------- tests/__tests__/screens/MenuScreen.test.tsx | 1 - 6 files changed, 10 insertions(+), 28 deletions(-) diff --git a/src/components/MainMenu.tsx b/src/components/MainMenu.tsx index 8b37037e..78a5a1c0 100644 --- a/src/components/MainMenu.tsx +++ b/src/components/MainMenu.tsx @@ -24,6 +24,7 @@ interface MenuItem { } const allMenuItems: MenuItem[] = [ + /** { key: "benchmarks", label: "Benchmarks", @@ -31,6 +32,7 @@ const allMenuItems: MenuItem[] = [ icon: "▷", color: colors.success, }, + */ { key: "devboxes", label: "Devboxes", @@ -188,8 +190,10 @@ export const MainMenu = ({ onSelect }: MainMenuProps) => { selectByKey("snapshots"); } else if (input === "o") { selectByKey("objects"); + /** } else if (input === "e") { selectByKey("benchmarks"); + */ } else if (input === "n") { selectByKey("settings"); } else if (input >= "1" && input <= "9") { diff --git a/src/store/betaFeatureStore.tsx b/src/store/betaFeatureStore.tsx index 93a9d10f..40e0bb56 100644 --- a/src/store/betaFeatureStore.tsx 
+++ b/src/store/betaFeatureStore.tsx @@ -16,15 +16,14 @@ interface BetaFeatureContextValue { /** * Check if a specific feature flag is enabled - * This stays in place so future beta-only features can be wired through - * the shared RL_CLI_BETA gate without changing consuming components. + * Currently all beta features are controlled by the single RL_CLI_BETA flag, + * but this allows for future granular control */ isFeatureEnabled: (feature: BetaFeature) => boolean; } /** - * Known beta features that can be enabled. - * There are currently no named beta-only features configured. + * Known beta features that can be enabled */ export type BetaFeature = never; @@ -42,8 +41,9 @@ export function BetaFeatureProvider({ children }: BetaFeatureProviderProps) { const isFeatureEnabled = React.useCallback( (feature: BetaFeature): boolean => { - // No named beta features exist yet, but keep the shared env-based - // gate so future feature flags can opt into it centrally. + // Currently all beta features are gated by the same flag + // This can be extended to support per-feature flags in the future + // Add cases here when new beta features are introduced void feature; return betaEnabled; }, diff --git a/tests/__tests__/components/MainMenu.test.tsx b/tests/__tests__/components/MainMenu.test.tsx index 9d2c15b3..a59c5b48 100644 --- a/tests/__tests__/components/MainMenu.test.tsx +++ b/tests/__tests__/components/MainMenu.test.tsx @@ -25,7 +25,6 @@ describe('MainMenu', () => { const { lastFrame } = renderMainMenu(); const frame = lastFrame() || ''; - expect(frame).toContain('Benchmarks'); expect(frame).toContain('Devboxes'); expect(frame).toContain('Blueprints'); expect(frame).toContain('Snapshots'); @@ -49,17 +48,6 @@ describe('MainMenu', () => { expect(frame).toContain('[3]'); }); - it('selects benchmarks with the e shortcut', () => { - let selectedKey = ''; - const { stdin } = renderMainMenu((key: string) => { - selectedKey = key; - }); - - stdin.write('e'); - - 
expect(selectedKey).toBe('benchmarks'); - }); - it('shows navigation help', () => { const { lastFrame } = renderMainMenu(); diff --git a/tests/__tests__/integration/navigationFlows.test.tsx b/tests/__tests__/integration/navigationFlows.test.tsx index 4908fdca..c65dfb40 100644 --- a/tests/__tests__/integration/navigationFlows.test.tsx +++ b/tests/__tests__/integration/navigationFlows.test.tsx @@ -24,7 +24,6 @@ describe("navigation flows", () => { const { lastFrame, stdin } = renderApp("menu"); let frame = lastFrame() ?? ""; - expect(frame).toContain("Benchmarks"); expect(frame).toContain("Devboxes"); expect(frame).toContain("Blueprints"); diff --git a/tests/__tests__/router/Router.test.tsx b/tests/__tests__/router/Router.test.tsx index df03b99c..a9dec66e 100644 --- a/tests/__tests__/router/Router.test.tsx +++ b/tests/__tests__/router/Router.test.tsx @@ -26,18 +26,10 @@ describe("Router", () => { it("renders menu screen when initialScreen is menu", () => { const { lastFrame } = renderWithApp("menu"); const frame = lastFrame() ?? ""; - expect(frame).toContain("Benchmarks"); expect(frame).toContain("Devboxes"); expect(frame).toContain("Blueprints"); }); - it("renders benchmark-menu screen when initialScreen is benchmark-menu", () => { - const { lastFrame } = renderWithApp("benchmark-menu"); - const frame = lastFrame() ?? ""; - expect(frame).toContain("Benchmarks"); - expect(frame).toContain("Benchmark Defs"); - }); - it("renders devbox-list screen when initialScreen is devbox-list", () => { const { lastFrame } = renderWithApp("devbox-list"); const frame = lastFrame() ?? ""; diff --git a/tests/__tests__/screens/MenuScreen.test.tsx b/tests/__tests__/screens/MenuScreen.test.tsx index 729bbc85..c1f858dd 100644 --- a/tests/__tests__/screens/MenuScreen.test.tsx +++ b/tests/__tests__/screens/MenuScreen.test.tsx @@ -26,7 +26,6 @@ describe("MenuScreen", () => { it("displays main menu items", () => { const { lastFrame } = renderMenuScreen(); const frame = lastFrame() ?? 
""; - expect(frame).toContain("Benchmarks"); expect(frame).toContain("Devboxes"); expect(frame).toContain("Blueprints"); expect(frame).toContain("Snapshots"); From 51a6d14f9014a88b76ce067be5dcba90fd5b0deb Mon Sep 17 00:00:00 2001 From: James Chainey Date: Mon, 16 Mar 2026 17:29:31 -0700 Subject: [PATCH 4/4] fmt --- src/commands/benchmark-job/summary.ts | 5 ++++- src/commands/benchmark-job/watch.ts | 16 ++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/commands/benchmark-job/summary.ts b/src/commands/benchmark-job/summary.ts index b492fbb9..a144e3e4 100644 --- a/src/commands/benchmark-job/summary.ts +++ b/src/commands/benchmark-job/summary.ts @@ -98,7 +98,10 @@ async function printStatus(job: BenchmarkJob): Promise { if (!isJobCompleted(state)) { // Fetch and show progress for in-progress runs console.log(); - const progressList = await fetchAllRunsProgress(job, listBenchmarkRunScenarioRuns); + const progressList = await fetchAllRunsProgress( + job, + listBenchmarkRunScenarioRuns, + ); printProgress(progressList); } } diff --git a/src/commands/benchmark-job/watch.ts b/src/commands/benchmark-job/watch.ts index c0bc1e0e..fe005782 100644 --- a/src/commands/benchmark-job/watch.ts +++ b/src/commands/benchmark-job/watch.ts @@ -114,7 +114,6 @@ function getMaxScenariosPerRun(numRuns: number): number { return Math.max(Math.floor(availableLines / Math.max(numRuns, 1)), 3); } - // Format a single run's progress line function formatRunProgressLine(progress: RunProgress): string { let label = progress.agentName; @@ -398,7 +397,10 @@ export async function watchBenchmarkJob(id: string) { try { let tick = 0; - let progressList = await fetchAllRunsProgress(job, listBenchmarkRunScenarioRuns); + let progressList = await fetchAllRunsProgress( + job, + listBenchmarkRunScenarioRuns, + ); while (!isJobCompleted(job.state)) { // Check timeout @@ -449,7 +451,10 @@ export async function watchBenchmarkJob(id: string) { if (tick % UPDATES_PER_POLL === 0) { job 
= await getBenchmarkJob(id); if (!isJobCompleted(job.state)) { - progressList = await fetchAllRunsProgress(job, listBenchmarkRunScenarioRuns); + progressList = await fetchAllRunsProgress( + job, + listBenchmarkRunScenarioRuns, + ); } } @@ -460,7 +465,10 @@ export async function watchBenchmarkJob(id: string) { // we have the most up-to-date data before printing results. This prevents // stale in-progress state from persisting after the job completes. job = await getBenchmarkJob(id); - progressList = await fetchAllRunsProgress(job, listBenchmarkRunScenarioRuns); + progressList = await fetchAllRunsProgress( + job, + listBenchmarkRunScenarioRuns, + ); } finally { process.stdout.off("resize", handleResize); cleanup();