From fb7f21206b670593d4339fc5db1ba00554f11780 Mon Sep 17 00:00:00 2001
From: James Chainey <james@runloop.ai>
Date: Fri, 13 Mar 2026 14:02:47 -0700
Subject: [PATCH 1/4] james/krafton-bmj-preview: enable Benchmarks menu entry and shortcut

---
 src/components/MainMenu.tsx                          |  4 ----
 src/store/betaFeatureStore.tsx                       | 12 ++++++------
 tests/__tests__/components/MainMenu.test.tsx         | 12 ++++++++++++
 tests/__tests__/integration/navigationFlows.test.tsx |  1 +
 tests/__tests__/router/Router.test.tsx               |  8 ++++++++
 tests/__tests__/screens/MenuScreen.test.tsx          |  1 +
 6 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/src/components/MainMenu.tsx b/src/components/MainMenu.tsx
index 78a5a1c0..8b37037e 100644
--- a/src/components/MainMenu.tsx
+++ b/src/components/MainMenu.tsx
@@ -24,7 +24,6 @@ interface MenuItem {
 }
 
 const allMenuItems: MenuItem[] = [
-  /**
   {
     key: "benchmarks",
     label: "Benchmarks",
@@ -32,7 +31,6 @@ const allMenuItems: MenuItem[] = [
     icon: "▷",
     color: colors.success,
   },
-  */
   {
     key: "devboxes",
     label: "Devboxes",
@@ -190,10 +188,8 @@ export const MainMenu = ({ onSelect }: MainMenuProps) => {
       selectByKey("snapshots");
     } else if (input === "o") {
       selectByKey("objects");
-      /**
     } else if (input === "e") {
       selectByKey("benchmarks");
-    */
     } else if (input === "n") {
       selectByKey("settings");
     } else if (input >= "1" && input <= "9") {
diff --git a/src/store/betaFeatureStore.tsx b/src/store/betaFeatureStore.tsx
index 40e0bb56..93a9d10f 100644
--- a/src/store/betaFeatureStore.tsx
+++ b/src/store/betaFeatureStore.tsx
@@ -16,14 +16,15 @@ interface BetaFeatureContextValue {
 
   /**
    * Check if a specific feature flag is enabled
-   * Currently all beta features are controlled by the single RL_CLI_BETA flag,
-   * but this allows for future granular control
+   * This stays in place so future beta-only features can be wired through
+   * the shared RL_CLI_BETA gate without changing consuming components.
    */
   isFeatureEnabled: (feature: BetaFeature) => boolean;
 }
 
 /**
- * Known beta features that can be enabled
+ * Known beta features that can be enabled.
+ * There are currently no named beta-only features configured.
  */
 export type BetaFeature = never;
 
@@ -41,9 +42,8 @@ export function BetaFeatureProvider({ children }: BetaFeatureProviderProps) {
 
   const isFeatureEnabled = React.useCallback(
     (feature: BetaFeature): boolean => {
-      // Currently all beta features are gated by the same flag
-      // This can be extended to support per-feature flags in the future
-      // Add cases here when new beta features are introduced
+      // No named beta features exist yet, but keep the shared env-based
+      // gate so future feature flags can opt into it centrally.
       void feature;
       return betaEnabled;
     },
diff --git a/tests/__tests__/components/MainMenu.test.tsx b/tests/__tests__/components/MainMenu.test.tsx
index a59c5b48..9d2c15b3 100644
--- a/tests/__tests__/components/MainMenu.test.tsx
+++ b/tests/__tests__/components/MainMenu.test.tsx
@@ -25,6 +25,7 @@ describe('MainMenu', () => {
     const { lastFrame } = renderMainMenu();
     
     const frame = lastFrame() || '';
+    expect(frame).toContain('Benchmarks');
     expect(frame).toContain('Devboxes');
     expect(frame).toContain('Blueprints');
     expect(frame).toContain('Snapshots');
@@ -48,6 +49,17 @@ describe('MainMenu', () => {
     expect(frame).toContain('[3]');
   });
 
+  it('selects benchmarks with the e shortcut', () => {
+    let selectedKey = '';
+    const { stdin } = renderMainMenu((key: string) => {
+      selectedKey = key;
+    });
+
+    stdin.write('e');
+
+    expect(selectedKey).toBe('benchmarks');
+  });
+
   it('shows navigation help', () => {
     const { lastFrame } = renderMainMenu();
     
diff --git a/tests/__tests__/integration/navigationFlows.test.tsx b/tests/__tests__/integration/navigationFlows.test.tsx
index c65dfb40..4908fdca 100644
--- a/tests/__tests__/integration/navigationFlows.test.tsx
+++ b/tests/__tests__/integration/navigationFlows.test.tsx
@@ -24,6 +24,7 @@ describe("navigation flows", () => {
     const { lastFrame, stdin } = renderApp("menu");
 
     let frame = lastFrame() ?? "";
+    expect(frame).toContain("Benchmarks");
     expect(frame).toContain("Devboxes");
     expect(frame).toContain("Blueprints");
 
diff --git a/tests/__tests__/router/Router.test.tsx b/tests/__tests__/router/Router.test.tsx
index a9dec66e..df03b99c 100644
--- a/tests/__tests__/router/Router.test.tsx
+++ b/tests/__tests__/router/Router.test.tsx
@@ -26,10 +26,18 @@ describe("Router", () => {
   it("renders menu screen when initialScreen is menu", () => {
     const { lastFrame } = renderWithApp("menu");
     const frame = lastFrame() ?? "";
+    expect(frame).toContain("Benchmarks");
     expect(frame).toContain("Devboxes");
     expect(frame).toContain("Blueprints");
   });
 
+  it("renders benchmark-menu screen when initialScreen is benchmark-menu", () => {
+    const { lastFrame } = renderWithApp("benchmark-menu");
+    const frame = lastFrame() ?? "";
+    expect(frame).toContain("Benchmarks");
+    expect(frame).toContain("Benchmark Defs");
+  });
+
   it("renders devbox-list screen when initialScreen is devbox-list", () => {
     const { lastFrame } = renderWithApp("devbox-list");
     const frame = lastFrame() ?? "";
diff --git a/tests/__tests__/screens/MenuScreen.test.tsx b/tests/__tests__/screens/MenuScreen.test.tsx
index c1f858dd..729bbc85 100644
--- a/tests/__tests__/screens/MenuScreen.test.tsx
+++ b/tests/__tests__/screens/MenuScreen.test.tsx
@@ -26,6 +26,7 @@ describe("MenuScreen", () => {
   it("displays main menu items", () => {
     const { lastFrame } = renderMenuScreen();
     const frame = lastFrame() ?? "";
+    expect(frame).toContain("Benchmarks");
     expect(frame).toContain("Devboxes");
     expect(frame).toContain("Blueprints");
     expect(frame).toContain("Snapshots");

From 3d45525bcae60ab08d34fe615d613eeca14fcb89 Mon Sep 17 00:00:00 2001
From: James Chainey <james@runloop.ai>
Date: Mon, 16 Mar 2026 17:12:01 -0700
Subject: [PATCH 2/4] refactored progress bar and fixed a dup bug that made it
 look like a scenario inside a job was stuck running

---
 src/commands/benchmark-job/progress.ts        | 237 ++++++++++++++
 src/commands/benchmark-job/summary.ts         | 158 +--------
 src/commands/benchmark-job/watch.ts           | 196 ++----------
 src/screens/BenchmarkJobDetailScreen.tsx      |  29 +-
 .../commands/benchmark-job/progress.test.ts   | 301 ++++++++++++++++++
 5 files changed, 591 insertions(+), 330 deletions(-)
 create mode 100644 src/commands/benchmark-job/progress.ts
 create mode 100644 tests/__tests__/commands/benchmark-job/progress.test.ts

diff --git a/src/commands/benchmark-job/progress.ts b/src/commands/benchmark-job/progress.ts
new file mode 100644
index 00000000..a086e1c2
--- /dev/null
+++ b/src/commands/benchmark-job/progress.ts
@@ -0,0 +1,237 @@
+/**
+ * Shared benchmark job progress helpers
+ *
+ * Provides terminal-state classification and run reconciliation logic
+ * used by watch, summary, and detail views.
+ */
+
+import type {
+  BenchmarkJob,
+  ScenarioRun,
+} from "../../services/benchmarkJobService.js";
+
+// ---------------------------------------------------------------------------
+// State Classification
+// ---------------------------------------------------------------------------
+
+/** Job states that indicate completion */
+export const JOB_COMPLETED_STATES = [
+  "completed",
+  "failed",
+  "canceled",
+  "cancelled",
+  "timeout",
+];
+
+/** Scenario run states that count as finished (no longer running) */
+export const SCENARIO_COMPLETED_STATES = [
+  "completed",
+  "failed",
+  "canceled",
+  "cancelled",
+  "timeout",
+  "error",
+  "scored", // treat scored as complete, consistent with run detail screen
+];
+
+/** Check if a job is in a terminal state */
+export function isJobCompleted(state: string | undefined | null): boolean {
+  if (!state) return false;
+  return JOB_COMPLETED_STATES.includes(state.toLowerCase());
+}
+
+/** Check if a scenario is in a terminal state */
+export function isScenarioCompleted(state: string | undefined | null): boolean {
+  if (!state) return false;
+  return SCENARIO_COMPLETED_STATES.includes(state.toLowerCase());
+}
+
+// ---------------------------------------------------------------------------
+// Progress Stats
+// ---------------------------------------------------------------------------
+
+/** In-progress scenario info for display */
+export interface InProgressScenario {
+  name: string;
+  state: string;
+  startTimeMs?: number;
+}
+
+/** Progress stats for a benchmark run */
+export interface RunProgress {
+  benchmarkRunId: string;
+  agentName: string;
+  modelName?: string;
+  state: string;
+  expectedTotal: number;
+  started: number;
+  running: number;
+  scoring: number;
+  finished: number;
+  avgScore: number | null;
+  inProgressScenarios: InProgressScenario[];
+}
+
+// ---------------------------------------------------------------------------
+// Agent Info Extraction
+// ---------------------------------------------------------------------------
+
+type InProgressRun = NonNullable<BenchmarkJob["in_progress_runs"]>[number];
+
+/** Get agent info from in_progress_run */
+export function getAgentInfo(run: InProgressRun): {
+  name: string;
+  model?: string;
+} {
+  const agentConfig = run.agent_config;
+  if (agentConfig && agentConfig.type === "job_agent") {
+    return {
+      name: agentConfig.name,
+      model: agentConfig.model_name ?? undefined,
+    };
+  }
+  return { name: "unknown" };
+}
+
+// ---------------------------------------------------------------------------
+// Progress Calculation
+// ---------------------------------------------------------------------------
+
+/** Calculate progress from scenario runs */
+export function calculateRunProgress(
+  benchmarkRunId: string,
+  agentName: string,
+  modelName: string | undefined,
+  state: string,
+  expectedTotal: number,
+  scenarioRuns: ScenarioRun[],
+): RunProgress {
+  let running = 0;
+  let scoring = 0;
+  let finished = 0;
+  let scoreSum = 0;
+  let scoreCount = 0;
+  const inProgressScenarios: InProgressScenario[] = [];
+
+  for (const scenario of scenarioRuns) {
+    const scenarioState = scenario.state?.toLowerCase() || "";
+
+    if (isScenarioCompleted(scenarioState)) {
+      finished++;
+      const score = scenario.scoring_contract_result?.score;
+      if (score !== undefined && score !== null) {
+        scoreSum += score;
+        scoreCount++;
+      }
+    } else if (scenarioState === "scoring") {
+      scoring++;
+      inProgressScenarios.push({
+        name: scenario.name || scenario.scenario_id || "unknown",
+        state: scenarioState,
+        startTimeMs: scenario.start_time_ms,
+      });
+    } else if (scenarioState === "running") {
+      running++;
+      inProgressScenarios.push({
+        name: scenario.name || scenario.scenario_id || "unknown",
+        state: scenarioState,
+        startTimeMs: scenario.start_time_ms,
+      });
+    } else if (scenarioState && scenarioState !== "pending") {
+      inProgressScenarios.push({
+        name: scenario.name || scenario.scenario_id || "unknown",
+        state: scenarioState,
+        startTimeMs: scenario.start_time_ms,
+      });
+    }
+  }
+
+  return {
+    benchmarkRunId,
+    agentName,
+    modelName,
+    state,
+    expectedTotal,
+    started: scenarioRuns.length,
+    running,
+    scoring,
+    finished,
+    avgScore: scoreCount > 0 ? scoreSum / scoreCount : null,
+    inProgressScenarios,
+  };
+}
+
+// ---------------------------------------------------------------------------
+// Run Reconciliation
+// ---------------------------------------------------------------------------
+
+/**
+ * Build progress for all runs in a job, preferring completed outcomes over
+ * stale in-progress data for the same benchmark_run_id.
+ *
+ * @param job The benchmark job
+ * @param fetchScenarioRuns Callback to fetch scenario runs for a benchmark run
+ */
+export async function fetchAllRunsProgress(
+  job: BenchmarkJob,
+  fetchScenarioRuns: (benchmarkRunId: string) => Promise<ScenarioRun[]>,
+): Promise<RunProgress[]> {
+  const results: RunProgress[] = [];
+
+  // Get expected scenario count from job spec
+  const expectedTotal = job.job_spec?.scenario_ids?.length || 0;
+
+  // Track which runs we've already added from completed outcomes
+  const completedRunIds = new Set<string>();
+
+  // First, add completed runs from benchmark_outcomes (authoritative)
+  const completedOutcomes = job.benchmark_outcomes || [];
+  for (const outcome of completedOutcomes) {
+    completedRunIds.add(outcome.benchmark_run_id);
+
+    const scenarioOutcomes = outcome.scenario_outcomes || [];
+    let scoreSum = 0;
+    let scoreCount = 0;
+    for (const s of scenarioOutcomes) {
+      if (s.score !== undefined && s.score !== null) {
+        scoreSum += s.score;
+        scoreCount++;
+      }
+    }
+    results.push({
+      benchmarkRunId: outcome.benchmark_run_id,
+      agentName: outcome.agent_name,
+      modelName: outcome.model_name ?? undefined,
+      state: "completed",
+      expectedTotal: expectedTotal || scenarioOutcomes.length,
+      started: scenarioOutcomes.length,
+      running: 0,
+      scoring: 0,
+      finished: scenarioOutcomes.length,
+      avgScore: scoreCount > 0 ? scoreSum / scoreCount : null,
+      inProgressScenarios: [],
+    });
+  }
+
+  // Then, fetch progress for in-progress runs that are NOT already in outcomes
+  const inProgressRuns = job.in_progress_runs || [];
+  const progressPromises = inProgressRuns
+    .filter((run) => !completedRunIds.has(run.benchmark_run_id))
+    .map(async (run) => {
+      const agentInfo = getAgentInfo(run);
+      const scenarioRuns = await fetchScenarioRuns(run.benchmark_run_id);
+      return calculateRunProgress(
+        run.benchmark_run_id,
+        agentInfo.name,
+        agentInfo.model,
+        run.state,
+        expectedTotal,
+        scenarioRuns,
+      );
+    });
+
+  const inProgressResults = await Promise.all(progressPromises);
+  results.push(...inProgressResults);
+
+  return results;
+}
diff --git a/src/commands/benchmark-job/summary.ts b/src/commands/benchmark-job/summary.ts
index 3ab04aa9..b492fbb9 100644
--- a/src/commands/benchmark-job/summary.ts
+++ b/src/commands/benchmark-job/summary.ts
@@ -7,167 +7,25 @@ import {
   getBenchmarkJob,
   listBenchmarkRunScenarioRuns,
   type BenchmarkJob,
-  type ScenarioRun,
 } from "../../services/benchmarkJobService.js";
 import { output, outputError } from "../../utils/output.js";
+import {
+  isJobCompleted,
+  fetchAllRunsProgress,
+  type RunProgress,
+} from "./progress.js";
 
 interface SummaryOptions {
   output?: string;
   extended?: boolean;
 }
 
-// Job states that indicate completion
-const COMPLETED_STATES = ["completed", "failed", "canceled", "timeout"];
-
-// Scenario run states that indicate completion
-const SCENARIO_COMPLETED_STATES = [
-  "completed",
-  "failed",
-  "canceled",
-  "timeout",
-  "error",
-];
-
 // Format percentage
 function formatPercent(count: number, total: number): string {
   if (total === 0) return "0.0%";
   return ((count / total) * 100).toFixed(1) + "%";
 }
 
-// Progress stats for a benchmark run
-interface RunProgress {
-  benchmarkRunId: string;
-  agentName: string;
-  modelName?: string;
-  state: string;
-  expectedTotal: number;
-  started: number;
-  running: number;
-  scoring: number;
-  finished: number;
-  avgScore: number | null;
-}
-
-// Calculate progress from scenario runs
-function calculateRunProgress(
-  benchmarkRunId: string,
-  agentName: string,
-  modelName: string | undefined,
-  state: string,
-  expectedTotal: number,
-  scenarioRuns: ScenarioRun[],
-): RunProgress {
-  let running = 0;
-  let scoring = 0;
-  let finished = 0;
-  let scoreSum = 0;
-  let scoreCount = 0;
-
-  for (const scenario of scenarioRuns) {
-    const scenarioState = scenario.state?.toLowerCase() || "";
-
-    if (SCENARIO_COMPLETED_STATES.includes(scenarioState)) {
-      finished++;
-      const score = scenario.scoring_contract_result?.score;
-      if (score !== undefined && score !== null) {
-        scoreSum += score;
-        scoreCount++;
-      }
-    } else if (scenarioState === "scoring" || scenarioState === "scored") {
-      scoring++;
-    } else if (scenarioState === "running") {
-      running++;
-    }
-  }
-
-  return {
-    benchmarkRunId,
-    agentName,
-    modelName,
-    state,
-    expectedTotal,
-    started: scenarioRuns.length,
-    running,
-    scoring,
-    finished,
-    avgScore: scoreCount > 0 ? scoreSum / scoreCount : null,
-  };
-}
-
-// In-progress run type
-type InProgressRun = NonNullable<BenchmarkJob["in_progress_runs"]>[number];
-
-// Get agent info from in_progress_run
-function getAgentInfo(run: InProgressRun): {
-  name: string;
-  model?: string;
-} {
-  const agentConfig = run.agent_config;
-  if (agentConfig && agentConfig.type === "job_agent") {
-    return {
-      name: agentConfig.name,
-      model: agentConfig.model_name ?? undefined,
-    };
-  }
-  return { name: "unknown" };
-}
-
-// Fetch progress for all runs (in-progress and completed)
-async function fetchAllRunsProgress(job: BenchmarkJob): Promise<RunProgress[]> {
-  const results: RunProgress[] = [];
-
-  // Get expected scenario count from job spec
-  const expectedTotal = job.job_spec?.scenario_ids?.length || 0;
-
-  // First, add completed runs from benchmark_outcomes
-  const completedOutcomes = job.benchmark_outcomes || [];
-  for (const outcome of completedOutcomes) {
-    const scenarioOutcomes = outcome.scenario_outcomes || [];
-    let scoreSum = 0;
-    let scoreCount = 0;
-    for (const s of scenarioOutcomes) {
-      if (s.score !== undefined && s.score !== null) {
-        scoreSum += s.score;
-        scoreCount++;
-      }
-    }
-    results.push({
-      benchmarkRunId: outcome.benchmark_run_id,
-      agentName: outcome.agent_name,
-      modelName: outcome.model_name ?? undefined,
-      state: "completed",
-      expectedTotal: expectedTotal || scenarioOutcomes.length,
-      started: scenarioOutcomes.length,
-      running: 0,
-      scoring: 0,
-      finished: scenarioOutcomes.length,
-      avgScore: scoreCount > 0 ? scoreSum / scoreCount : null,
-    });
-  }
-
-  // Then, fetch progress for in-progress runs
-  const inProgressRuns = job.in_progress_runs || [];
-  const progressPromises = inProgressRuns.map(async (run) => {
-    const agentInfo = getAgentInfo(run);
-    const scenarioRuns = await listBenchmarkRunScenarioRuns(
-      run.benchmark_run_id,
-    );
-    return calculateRunProgress(
-      run.benchmark_run_id,
-      agentInfo.name,
-      agentInfo.model,
-      run.state,
-      expectedTotal,
-      scenarioRuns,
-    );
-  });
-
-  const inProgressResults = await Promise.all(progressPromises);
-  results.push(...inProgressResults);
-
-  return results;
-}
-
 // Format a single run's progress line
 function formatRunProgressLine(progress: RunProgress): string {
   // Format agent/model label
@@ -237,10 +95,10 @@ async function printStatus(job: BenchmarkJob): Promise<void> {
   console.log(`ID: ${job.id}`);
   console.log(`State: ${state}`);
 
-  if (!COMPLETED_STATES.includes(state)) {
+  if (!isJobCompleted(state)) {
     // Fetch and show progress for in-progress runs
     console.log();
-    const progressList = await fetchAllRunsProgress(job);
+    const progressList = await fetchAllRunsProgress(job, listBenchmarkRunScenarioRuns);
     printProgress(progressList);
   }
 }
@@ -420,7 +278,7 @@ export async function summaryBenchmarkJob(
 ) {
   try {
     const job = await getBenchmarkJob(id);
-    const isComplete = COMPLETED_STATES.includes(job.state || "");
+    const isComplete = isJobCompleted(job.state);
 
     if (options.output && options.output !== "text") {
       output(job, { format: options.output, defaultFormat: "json" });
diff --git a/src/commands/benchmark-job/watch.ts b/src/commands/benchmark-job/watch.ts
index 8755bead..c0bc1e0e 100644
--- a/src/commands/benchmark-job/watch.ts
+++ b/src/commands/benchmark-job/watch.ts
@@ -7,21 +7,14 @@ import {
   getBenchmarkJob,
   listBenchmarkRunScenarioRuns,
   type BenchmarkJob,
-  type ScenarioRun,
 } from "../../services/benchmarkJobService.js";
 import { outputError } from "../../utils/output.js";
-
-// Job states that indicate completion
-const COMPLETED_STATES = ["completed", "failed", "canceled", "timeout"];
-
-// Scenario run states that indicate completion
-const SCENARIO_COMPLETED_STATES = [
-  "completed",
-  "failed",
-  "canceled",
-  "timeout",
-  "error",
-];
+import {
+  isJobCompleted,
+  fetchAllRunsProgress,
+  type RunProgress,
+  type InProgressScenario,
+} from "./progress.js";
 
 // Polling config
 const POLL_INTERVAL_MS = 5 * 1000; // 5 seconds
@@ -100,27 +93,8 @@ function formatDuration(ms: number): string {
   return `${seconds}s`;
 }
 
-// In-progress scenario info for display
-interface InProgressScenario {
-  name: string;
-  state: string;
-  startTimeMs?: number;
-}
-
-// Progress stats for a benchmark run
-interface RunProgress {
-  benchmarkRunId: string;
-  agentName: string;
-  modelName?: string;
-  state: string;
-  expectedTotal: number;
-  started: number;
-  running: number;
-  scoring: number;
-  finished: number;
-  avgScore: number | null;
-  inProgressScenarios: InProgressScenario[];
-}
+// Re-export types from shared module for internal use
+export type { RunProgress, InProgressScenario };
 
 // Spinner frames for running indicators
 const SPINNER_FRAMES = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"];
@@ -140,144 +114,6 @@ function getMaxScenariosPerRun(numRuns: number): number {
   return Math.max(Math.floor(availableLines / Math.max(numRuns, 1)), 3);
 }
 
-// Calculate progress from scenario runs
-function calculateRunProgress(
-  benchmarkRunId: string,
-  agentName: string,
-  modelName: string | undefined,
-  state: string,
-  expectedTotal: number,
-  scenarioRuns: ScenarioRun[],
-): RunProgress {
-  let running = 0;
-  let scoring = 0;
-  let finished = 0;
-  let scoreSum = 0;
-  let scoreCount = 0;
-  const inProgressScenarios: InProgressScenario[] = [];
-
-  for (const scenario of scenarioRuns) {
-    const scenarioState = scenario.state?.toLowerCase() || "";
-
-    if (SCENARIO_COMPLETED_STATES.includes(scenarioState)) {
-      finished++;
-      const score = scenario.scoring_contract_result?.score;
-      if (score !== undefined && score !== null) {
-        scoreSum += score;
-        scoreCount++;
-      }
-    } else if (scenarioState === "scoring" || scenarioState === "scored") {
-      scoring++;
-      inProgressScenarios.push({
-        name: scenario.name || scenario.scenario_id || "unknown",
-        state: scenarioState,
-        startTimeMs: scenario.start_time_ms,
-      });
-    } else if (scenarioState === "running") {
-      running++;
-      inProgressScenarios.push({
-        name: scenario.name || scenario.scenario_id || "unknown",
-        state: scenarioState,
-        startTimeMs: scenario.start_time_ms,
-      });
-    } else if (scenarioState && scenarioState !== "pending") {
-      inProgressScenarios.push({
-        name: scenario.name || scenario.scenario_id || "unknown",
-        state: scenarioState,
-        startTimeMs: scenario.start_time_ms,
-      });
-    }
-  }
-
-  return {
-    benchmarkRunId,
-    agentName,
-    modelName,
-    state,
-    expectedTotal,
-    started: scenarioRuns.length,
-    running,
-    scoring,
-    finished,
-    avgScore: scoreCount > 0 ? scoreSum / scoreCount : null,
-    inProgressScenarios,
-  };
-}
-
-// In-progress run type
-type InProgressRun = NonNullable<BenchmarkJob["in_progress_runs"]>[number];
-
-// Get agent info from in_progress_run
-function getAgentInfo(run: InProgressRun): {
-  name: string;
-  model?: string;
-} {
-  const agentConfig = run.agent_config;
-  if (agentConfig && agentConfig.type === "job_agent") {
-    return {
-      name: agentConfig.name,
-      model: agentConfig.model_name ?? undefined,
-    };
-  }
-  return { name: "unknown" };
-}
-
-// Fetch progress for all runs (in-progress and completed)
-async function fetchAllRunsProgress(job: BenchmarkJob): Promise<RunProgress[]> {
-  const results: RunProgress[] = [];
-
-  // Get expected scenario count from job spec
-  const expectedTotal = job.job_spec?.scenario_ids?.length || 0;
-
-  // First, add completed runs from benchmark_outcomes
-  const completedOutcomes = job.benchmark_outcomes || [];
-  for (const outcome of completedOutcomes) {
-    const scenarioOutcomes = outcome.scenario_outcomes || [];
-    let scoreSum = 0;
-    let scoreCount = 0;
-    for (const s of scenarioOutcomes) {
-      if (s.score !== undefined && s.score !== null) {
-        scoreSum += s.score;
-        scoreCount++;
-      }
-    }
-    results.push({
-      benchmarkRunId: outcome.benchmark_run_id,
-      agentName: outcome.agent_name,
-      modelName: outcome.model_name ?? undefined,
-      state: "completed",
-      expectedTotal: expectedTotal || scenarioOutcomes.length,
-      started: scenarioOutcomes.length,
-      running: 0,
-      scoring: 0,
-      finished: scenarioOutcomes.length,
-      avgScore: scoreCount > 0 ? scoreSum / scoreCount : null,
-      inProgressScenarios: [],
-    });
-  }
-
-  // Then, fetch progress for in-progress runs
-  const inProgressRuns = job.in_progress_runs || [];
-  const progressPromises = inProgressRuns.map(async (run) => {
-    const agentInfo = getAgentInfo(run);
-    const scenarioRuns = await listBenchmarkRunScenarioRuns(
-      run.benchmark_run_id,
-    );
-    return calculateRunProgress(
-      run.benchmark_run_id,
-      agentInfo.name,
-      agentInfo.model,
-      run.state,
-      expectedTotal,
-      scenarioRuns,
-    );
-  });
-
-  const inProgressResults = await Promise.all(progressPromises);
-  results.push(...inProgressResults);
-
-  return results;
-}
 
 // Format a single run's progress line
 function formatRunProgressLine(progress: RunProgress): string {
@@ -514,7 +350,7 @@ export async function watchBenchmarkJob(id: string) {
     let job = await getBenchmarkJob(id);
 
     // If job is already complete, just show results
-    if (COMPLETED_STATES.includes(job.state || "")) {
+    if (isJobCompleted(job.state)) {
       printResultsTable(job);
       return;
     }
@@ -562,9 +398,9 @@ export async function watchBenchmarkJob(id: string) {
 
     try {
       let tick = 0;
-      let progressList = await fetchAllRunsProgress(job);
+      let progressList = await fetchAllRunsProgress(job, listBenchmarkRunScenarioRuns);
 
-      while (!COMPLETED_STATES.includes(job.state || "")) {
+      while (!isJobCompleted(job.state)) {
         // Check timeout
         if (Date.now() - jobStartMs > MAX_WAIT_MS) {
           cleanup();
@@ -612,13 +448,19 @@ export async function watchBenchmarkJob(id: string) {
         // Every UPDATES_PER_POLL ticks, poll the API for fresh data
         if (tick % UPDATES_PER_POLL === 0) {
           job = await getBenchmarkJob(id);
-          if (!COMPLETED_STATES.includes(job.state || "")) {
-            progressList = await fetchAllRunsProgress(job);
+          if (!isJobCompleted(job.state)) {
+            progressList = await fetchAllRunsProgress(job, listBenchmarkRunScenarioRuns);
           }
         }
 
         await sleep(SPINNER_INTERVAL_MS);
       }
+
+      // Final reconciliation: refresh job and progress one more time to ensure
+      // we have the most up-to-date data before printing results. This prevents
+      // stale in-progress state from persisting after the job completes.
+      job = await getBenchmarkJob(id);
+      progressList = await fetchAllRunsProgress(job, listBenchmarkRunScenarioRuns);
     } finally {
       process.stdout.off("resize", handleResize);
       cleanup();
diff --git a/src/screens/BenchmarkJobDetailScreen.tsx b/src/screens/BenchmarkJobDetailScreen.tsx
index d71ce721..26eac88c 100644
--- a/src/screens/BenchmarkJobDetailScreen.tsx
+++ b/src/screens/BenchmarkJobDetailScreen.tsx
@@ -243,6 +243,8 @@ export function BenchmarkJobDetailScreen({
 
   // Build a unified view of benchmark runs per agent
   // Collect all agents from job_spec, in_progress_runs, and benchmark_outcomes
+  // NOTE: We dedupe by benchmark_run_id, preferring completed outcomes over
+  // stale in-progress data to avoid showing mixed completed/running state.
   interface AgentRunInfo {
     agentName: string;
     modelName?: string;
@@ -258,9 +260,14 @@ export function BenchmarkJobDetailScreen({
 
   const agentRuns: AgentRunInfo[] = [];
 
-  // First, add completed runs from benchmark_outcomes
+  // Track which benchmark_run_ids we've already added from completed outcomes
+  const completedRunIds = new Set<string>();
+
+  // First, add completed runs from benchmark_outcomes (authoritative)
   if (job.benchmark_outcomes) {
     job.benchmark_outcomes.forEach((outcome) => {
+      completedRunIds.add(outcome.benchmark_run_id);
+
       const total = outcome.n_completed + outcome.n_failed + outcome.n_timeout;
       const status =
         outcome.n_failed > 0 || outcome.n_timeout > 0
@@ -283,9 +290,14 @@ export function BenchmarkJobDetailScreen({
     });
   }
 
-  // Add in-progress runs
+  // Add in-progress runs that are NOT already in completed outcomes
   if (job.in_progress_runs) {
     job.in_progress_runs.forEach((run) => {
+      // Skip if we already have this run from benchmark_outcomes
+      if (completedRunIds.has(run.benchmark_run_id)) {
+        return;
+      }
+
       // Get agent name from agent_config if available
       let agentName = "Unknown Agent";
       if (run.agent_config && "name" in run.agent_config) {
@@ -805,9 +817,20 @@ export function BenchmarkJobDetailScreen({
       });
     }
 
-    // In-progress runs
+    // In-progress runs (skip any that are already in completed outcomes)
     if (j.in_progress_runs && j.in_progress_runs.length > 0) {
+      // Build set of completed run IDs to skip
+      const detailCompletedRunIds = new Set<string>();
+      j.benchmark_outcomes?.forEach((o) =>
+        detailCompletedRunIds.add(o.benchmark_run_id),
+      );
+
       j.in_progress_runs.forEach((run, idx) => {
+        // Skip if already in completed outcomes
+        if (detailCompletedRunIds.has(run.benchmark_run_id)) {
+          return;
+        }
+
         let agentName = "Unknown Agent";
         if (run.agent_config && "name" in run.agent_config) {
           agentName = (run.agent_config as any).name;
diff --git a/tests/__tests__/commands/benchmark-job/progress.test.ts b/tests/__tests__/commands/benchmark-job/progress.test.ts
new file mode 100644
index 00000000..915efdc3
--- /dev/null
+++ b/tests/__tests__/commands/benchmark-job/progress.test.ts
@@ -0,0 +1,301 @@
+/**
+ * Tests for benchmark job progress helpers
+ *
+ * These tests verify that the progress reconciliation logic correctly
+ * handles the case where a job is completed but the prior progress
+ * snapshot still had scenario runs marked as running.
+ */
+
+import { describe, it, expect } from "@jest/globals";
+import {
+  isJobCompleted,
+  isScenarioCompleted,
+  JOB_COMPLETED_STATES,
+  SCENARIO_COMPLETED_STATES,
+  fetchAllRunsProgress,
+  calculateRunProgress,
+  type RunProgress,
+} from "@/commands/benchmark-job/progress.js";
+import type { BenchmarkJob } from "@/services/benchmarkJobService.js";
+
+describe("isJobCompleted", () => { // pins which job state strings count as finished
+  it("should return true for completed states", () => {
+    expect(isJobCompleted("completed")).toBe(true);
+    expect(isJobCompleted("failed")).toBe(true);
+    expect(isJobCompleted("canceled")).toBe(true); // US spelling
+    expect(isJobCompleted("cancelled")).toBe(true); // UK spelling must be accepted as well
+    expect(isJobCompleted("timeout")).toBe(true);
+  });
+
+  it("should return false for in-progress states", () => {
+    expect(isJobCompleted("running")).toBe(false);
+    expect(isJobCompleted("queued")).toBe(false);
+    expect(isJobCompleted("initializing")).toBe(false);
+  });
+
+  it("should return false for null/undefined", () => {
+    expect(isJobCompleted(null)).toBe(false);
+    expect(isJobCompleted(undefined)).toBe(false);
+    expect(isJobCompleted("")).toBe(false); // empty string is expected to behave like a missing state
+  });
+
+  it("should be case insensitive", () => {
+    expect(isJobCompleted("COMPLETED")).toBe(true);
+    expect(isJobCompleted("Completed")).toBe(true);
+    expect(isJobCompleted("FAILED")).toBe(true);
+  });
+});
+
+describe("isScenarioCompleted", () => { // pins which scenario-run state strings count as finished
+  it("should return true for completed states", () => {
+    expect(isScenarioCompleted("completed")).toBe(true);
+    expect(isScenarioCompleted("failed")).toBe(true);
+    expect(isScenarioCompleted("canceled")).toBe(true);
+    expect(isScenarioCompleted("cancelled")).toBe(true);
+    expect(isScenarioCompleted("timeout")).toBe(true);
+    expect(isScenarioCompleted("error")).toBe(true); // "error" is asserted for scenarios only; the isJobCompleted tests do not cover it
+  });
+
+  it("should treat scored as completed", () => {
+    expect(isScenarioCompleted("scored")).toBe(true);
+  });
+
+  it("should return false for in-progress states", () => {
+    expect(isScenarioCompleted("running")).toBe(false);
+    expect(isScenarioCompleted("scoring")).toBe(false); // "scoring" must still count as in flight, unlike "scored"
+    expect(isScenarioCompleted("pending")).toBe(false);
+  });
+
+  it("should return false for null/undefined", () => {
+    expect(isScenarioCompleted(null)).toBe(false);
+    expect(isScenarioCompleted(undefined)).toBe(false);
+    expect(isScenarioCompleted("")).toBe(false); // empty string is expected to behave like a missing state
+  });
+});
+
+describe("calculateRunProgress", () => { // checks the finished/running/scoring counts and avgScore derived from scenario runs
+  it("should count scored scenarios as finished", () => {
+    const scenarioRuns = [
+      { state: "completed", scoring_contract_result: { score: 1.0 } },
+      { state: "scored", scoring_contract_result: { score: 0.5 } },
+      { state: "running" },
+    ] as any[]; // partial fixtures, hence the any[] cast
+
+    const progress = calculateRunProgress(
+      "run_123",
+      "test-agent",
+      undefined, // NOTE(review): meaning of this argument is not visible here — presumably the model name
+      "running",
+      3, // presumably the expected scenario total — confirm against calculateRunProgress
+      scenarioRuns,
+    );
+
+    expect(progress.finished).toBe(2); // "completed" + "scored"
+    expect(progress.running).toBe(1);
+    expect(progress.scoring).toBe(0); // "scored" must not be counted as "scoring"
+  });
+
+  it("should count scoring scenarios separately", () => {
+    const scenarioRuns = [
+      { state: "completed", scoring_contract_result: { score: 1.0 } },
+      { state: "scoring" },
+      { state: "running" },
+    ] as any[];
+
+    const progress = calculateRunProgress(
+      "run_123",
+      "test-agent",
+      undefined,
+      "running",
+      3,
+      scenarioRuns,
+    );
+
+    expect(progress.finished).toBe(1);
+    expect(progress.scoring).toBe(1); // "scoring" is neither finished nor running
+    expect(progress.running).toBe(1);
+  });
+
+  it("should calculate average score from completed scenarios", () => {
+    const scenarioRuns = [
+      { state: "completed", scoring_contract_result: { score: 1.0 } },
+      { state: "completed", scoring_contract_result: { score: 0.5 } },
+      { state: "scored", scoring_contract_result: { score: 0.0 } },
+    ] as any[];
+
+    const progress = calculateRunProgress(
+      "run_123",
+      "test-agent",
+      undefined,
+      "completed",
+      3,
+      scenarioRuns,
+    );
+
+    expect(progress.avgScore).toBe(0.5); // (1.0 + 0.5 + 0.0) / 3
+  });
+});
+
+describe("fetchAllRunsProgress", () => { // covers how benchmark_outcomes and in_progress_runs are combined
+  it("should prefer completed outcomes over in-progress runs for same benchmark_run_id", async () => {
+    const job: Partial<BenchmarkJob> = {
+      job_spec: {
+        scenario_ids: ["s1", "s2", "s3"],
+      } as any,
+      benchmark_outcomes: [
+        {
+          benchmark_run_id: "run_123",
+          agent_name: "test-agent",
+          model_name: "gpt-4",
+          average_score: 0.8,
+          n_completed: 3,
+          n_failed: 0,
+          n_timeout: 0,
+          scenario_outcomes: [
+            { score: 1.0 },
+            { score: 0.7 },
+            { score: 0.7 },
+          ] as any[],
+        },
+      ],
+      in_progress_runs: [
+        {
+          benchmark_run_id: "run_123", // deliberately the same id as the completed outcome above
+          state: "running",
+          agent_config: { type: "job_agent", name: "test-agent" },
+        } as any,
+      ],
+    };
+
+    const mockFetchScenarioRuns = async () => []; // returns no scenario runs, so the counts asserted below cannot come from this fetcher
+    const progress = await fetchAllRunsProgress(
+      job as BenchmarkJob,
+      mockFetchScenarioRuns,
+    );
+
+    expect(progress).toHaveLength(1); // the duplicate in-progress entry must be dropped
+    expect(progress[0].state).toBe("completed");
+    expect(progress[0].running).toBe(0);
+    expect(progress[0].finished).toBe(3);
+  });
+
+  it("should include in-progress runs that are not in completed outcomes", async () => {
+    const job: Partial<BenchmarkJob> = {
+      job_spec: {
+        scenario_ids: ["s1", "s2", "s3"],
+      } as any,
+      benchmark_outcomes: [
+        {
+          benchmark_run_id: "run_completed",
+          agent_name: "agent-1",
+          average_score: 0.8,
+          n_completed: 3,
+          n_failed: 0,
+          n_timeout: 0,
+          scenario_outcomes: [{ score: 0.8 }] as any[],
+        },
+      ],
+      in_progress_runs: [
+        {
+          benchmark_run_id: "run_in_progress", // distinct id, so this run must be kept
+          state: "running",
+          agent_config: { type: "job_agent", name: "agent-2" },
+        } as any,
+      ],
+    };
+
+    const mockFetchScenarioRuns = async () => [
+      { state: "completed", scoring_contract_result: { score: 1.0 } },
+      { state: "running" },
+    ] as any[];
+
+    const progress = await fetchAllRunsProgress(
+      job as BenchmarkJob,
+      mockFetchScenarioRuns,
+    );
+
+    expect(progress).toHaveLength(2);
+    expect(progress[0].benchmarkRunId).toBe("run_completed"); // completed outcomes are expected before in-progress runs
+    expect(progress[0].state).toBe("completed");
+    expect(progress[1].benchmarkRunId).toBe("run_in_progress");
+    expect(progress[1].running).toBe(1);
+    expect(progress[1].finished).toBe(1);
+  });
+
+  it("should handle job with no runs", async () => {
+    const job: Partial<BenchmarkJob> = {
+      benchmark_outcomes: [],
+      in_progress_runs: [],
+    };
+
+    const mockFetchScenarioRuns = async () => [];
+    const progress = await fetchAllRunsProgress(
+      job as BenchmarkJob,
+      mockFetchScenarioRuns,
+    );
+
+    expect(progress).toHaveLength(0);
+  });
+
+  it("should handle the bug case: completed job with stale in-progress snapshot", async () => {
+    // This is the specific bug case from the user report:
+    // - Job state is completed
+    // - benchmark_outcomes shows all scenarios finished
+    // - in_progress_runs still contains a run (stale data from previous poll)
+    const job: Partial<BenchmarkJob> = {
+      state: "completed",
+      job_spec: {
+        scenario_ids: Array(60).fill("s"), // 60 identical placeholder ids; presumably only the count matters here
+      } as any,
+      benchmark_outcomes: [
+        {
+          benchmark_run_id: "run_123",
+          agent_name: "claude-code:claude-haiku-4",
+          average_score: 0.88,
+          n_completed: 60,
+          n_failed: 0,
+          n_timeout: 0,
+          scenario_outcomes: Array(60).fill({ // fill() reuses this one object for all 60 entries
+            score: 0.88,
+            state: "completed",
+          }),
+        },
+      ],
+      in_progress_runs: [
+        {
+          benchmark_run_id: "run_123", // same id as the completed outcome: the stale duplicate
+          state: "running",
+          agent_config: {
+            type: "job_agent",
+            name: "claude-code:claude-haiku-4",
+          },
+        } as any,
+      ],
+    };
+
+    const mockFetchScenarioRuns = async () => {
+      // Even if the scenario runs endpoint returns stale data...
+      return [
+        ...Array(59).fill({
+          state: "completed",
+          scoring_contract_result: { score: 0.88 },
+        }),
+        { state: "running" }, // the single stale scenario that must not show up in the result
+      ] as any[];
+    };
+
+    const progress = await fetchAllRunsProgress(
+      job as BenchmarkJob,
+      mockFetchScenarioRuns,
+    );
+
+    // Should only have 1 run (from completed outcomes)
+    expect(progress).toHaveLength(1);
+
+    // Should show as completed, not running
+    expect(progress[0].state).toBe("completed");
+    expect(progress[0].finished).toBe(60);
+    expect(progress[0].running).toBe(0);
+    expect(progress[0].avgScore).toBeCloseTo(0.88);
+  });
+});

From 1fb2c177c0933b7c886e1ff0aae2a7f2752b3f8a Mon Sep 17 00:00:00 2001
From: James Chainey <james@runloop.ai>
Date: Mon, 16 Mar 2026 17:15:00 -0700
Subject: [PATCH 3/4] Re-gate benchmark jobs: hide Benchmarks menu entry and shortcut

---
 src/components/MainMenu.tsx                          |  4 ++++
 src/store/betaFeatureStore.tsx                       | 12 ++++++------
 tests/__tests__/components/MainMenu.test.tsx         | 12 ------------
 tests/__tests__/integration/navigationFlows.test.tsx |  1 -
 tests/__tests__/router/Router.test.tsx               |  8 --------
 tests/__tests__/screens/MenuScreen.test.tsx          |  1 -
 6 files changed, 10 insertions(+), 28 deletions(-)

diff --git a/src/components/MainMenu.tsx b/src/components/MainMenu.tsx
index 8b37037e..78a5a1c0 100644
--- a/src/components/MainMenu.tsx
+++ b/src/components/MainMenu.tsx
@@ -24,6 +24,7 @@ interface MenuItem {
 }
 
 const allMenuItems: MenuItem[] = [
+  /**
   {
     key: "benchmarks",
     label: "Benchmarks",
@@ -31,6 +32,7 @@ const allMenuItems: MenuItem[] = [
     icon: "▷",
     color: colors.success,
   },
+  */
   {
     key: "devboxes",
     label: "Devboxes",
@@ -188,8 +190,10 @@ export const MainMenu = ({ onSelect }: MainMenuProps) => {
       selectByKey("snapshots");
     } else if (input === "o") {
       selectByKey("objects");
+      /**
     } else if (input === "e") {
       selectByKey("benchmarks");
+    */
     } else if (input === "n") {
       selectByKey("settings");
     } else if (input >= "1" && input <= "9") {
diff --git a/src/store/betaFeatureStore.tsx b/src/store/betaFeatureStore.tsx
index 93a9d10f..40e0bb56 100644
--- a/src/store/betaFeatureStore.tsx
+++ b/src/store/betaFeatureStore.tsx
@@ -16,15 +16,14 @@ interface BetaFeatureContextValue {
 
   /**
    * Check if a specific feature flag is enabled
-   * This stays in place so future beta-only features can be wired through
-   * the shared RL_CLI_BETA gate without changing consuming components.
+   * Currently all beta features are controlled by the single RL_CLI_BETA flag,
+   * but this allows for future granular control
    */
   isFeatureEnabled: (feature: BetaFeature) => boolean;
 }
 
 /**
- * Known beta features that can be enabled.
- * There are currently no named beta-only features configured.
+ * Known beta features that can be enabled (none are defined yet, hence `never`)
  */
 export type BetaFeature = never;
 
@@ -42,8 +41,9 @@ export function BetaFeatureProvider({ children }: BetaFeatureProviderProps) {
 
   const isFeatureEnabled = React.useCallback(
     (feature: BetaFeature): boolean => {
-      // No named beta features exist yet, but keep the shared env-based
-      // gate so future feature flags can opt into it centrally.
+      // Currently all beta features are gated by the same flag
+      // This can be extended to support per-feature flags in the future
+      // Add cases here when new beta features are introduced
       void feature;
       return betaEnabled;
     },
diff --git a/tests/__tests__/components/MainMenu.test.tsx b/tests/__tests__/components/MainMenu.test.tsx
index 9d2c15b3..a59c5b48 100644
--- a/tests/__tests__/components/MainMenu.test.tsx
+++ b/tests/__tests__/components/MainMenu.test.tsx
@@ -25,7 +25,6 @@ describe('MainMenu', () => {
     const { lastFrame } = renderMainMenu();
     
     const frame = lastFrame() || '';
-    expect(frame).toContain('Benchmarks');
     expect(frame).toContain('Devboxes');
     expect(frame).toContain('Blueprints');
     expect(frame).toContain('Snapshots');
@@ -49,17 +48,6 @@ describe('MainMenu', () => {
     expect(frame).toContain('[3]');
   });
 
-  it('selects benchmarks with the e shortcut', () => {
-    let selectedKey = '';
-    const { stdin } = renderMainMenu((key: string) => {
-      selectedKey = key;
-    });
-
-    stdin.write('e');
-
-    expect(selectedKey).toBe('benchmarks');
-  });
-
   it('shows navigation help', () => {
     const { lastFrame } = renderMainMenu();
     
diff --git a/tests/__tests__/integration/navigationFlows.test.tsx b/tests/__tests__/integration/navigationFlows.test.tsx
index 4908fdca..c65dfb40 100644
--- a/tests/__tests__/integration/navigationFlows.test.tsx
+++ b/tests/__tests__/integration/navigationFlows.test.tsx
@@ -24,7 +24,6 @@ describe("navigation flows", () => {
     const { lastFrame, stdin } = renderApp("menu");
 
     let frame = lastFrame() ?? "";
-    expect(frame).toContain("Benchmarks");
     expect(frame).toContain("Devboxes");
     expect(frame).toContain("Blueprints");
 
diff --git a/tests/__tests__/router/Router.test.tsx b/tests/__tests__/router/Router.test.tsx
index df03b99c..a9dec66e 100644
--- a/tests/__tests__/router/Router.test.tsx
+++ b/tests/__tests__/router/Router.test.tsx
@@ -26,18 +26,10 @@ describe("Router", () => {
   it("renders menu screen when initialScreen is menu", () => {
     const { lastFrame } = renderWithApp("menu");
     const frame = lastFrame() ?? "";
-    expect(frame).toContain("Benchmarks");
     expect(frame).toContain("Devboxes");
     expect(frame).toContain("Blueprints");
   });
 
-  it("renders benchmark-menu screen when initialScreen is benchmark-menu", () => {
-    const { lastFrame } = renderWithApp("benchmark-menu");
-    const frame = lastFrame() ?? "";
-    expect(frame).toContain("Benchmarks");
-    expect(frame).toContain("Benchmark Defs");
-  });
-
   it("renders devbox-list screen when initialScreen is devbox-list", () => {
     const { lastFrame } = renderWithApp("devbox-list");
     const frame = lastFrame() ?? "";
diff --git a/tests/__tests__/screens/MenuScreen.test.tsx b/tests/__tests__/screens/MenuScreen.test.tsx
index 729bbc85..c1f858dd 100644
--- a/tests/__tests__/screens/MenuScreen.test.tsx
+++ b/tests/__tests__/screens/MenuScreen.test.tsx
@@ -26,7 +26,6 @@ describe("MenuScreen", () => {
   it("displays main menu items", () => {
     const { lastFrame } = renderMenuScreen();
     const frame = lastFrame() ?? "";
-    expect(frame).toContain("Benchmarks");
     expect(frame).toContain("Devboxes");
     expect(frame).toContain("Blueprints");
     expect(frame).toContain("Snapshots");

From 51a6d14f9014a88b76ce067be5dcba90fd5b0deb Mon Sep 17 00:00:00 2001
From: James Chainey <james@runloop.ai>
Date: Mon, 16 Mar 2026 17:29:31 -0700
Subject: [PATCH 4/4] fmt: reformat benchmark-job summary and watch commands

---
 src/commands/benchmark-job/summary.ts |  5 ++++-
 src/commands/benchmark-job/watch.ts   | 16 ++++++++++++----
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/src/commands/benchmark-job/summary.ts b/src/commands/benchmark-job/summary.ts
index b492fbb9..a144e3e4 100644
--- a/src/commands/benchmark-job/summary.ts
+++ b/src/commands/benchmark-job/summary.ts
@@ -98,7 +98,10 @@ async function printStatus(job: BenchmarkJob): Promise<void> {
   if (!isJobCompleted(state)) {
     // Fetch and show progress for in-progress runs
     console.log();
-    const progressList = await fetchAllRunsProgress(job, listBenchmarkRunScenarioRuns);
+    const progressList = await fetchAllRunsProgress(
+      job,
+      listBenchmarkRunScenarioRuns,
+    );
     printProgress(progressList);
   }
 }
diff --git a/src/commands/benchmark-job/watch.ts b/src/commands/benchmark-job/watch.ts
index c0bc1e0e..fe005782 100644
--- a/src/commands/benchmark-job/watch.ts
+++ b/src/commands/benchmark-job/watch.ts
@@ -114,7 +114,6 @@ function getMaxScenariosPerRun(numRuns: number): number {
   return Math.max(Math.floor(availableLines / Math.max(numRuns, 1)), 3);
 }
 
-
 // Format a single run's progress line
 function formatRunProgressLine(progress: RunProgress): string {
   let label = progress.agentName;
@@ -398,7 +397,10 @@ export async function watchBenchmarkJob(id: string) {
 
     try {
       let tick = 0;
-      let progressList = await fetchAllRunsProgress(job, listBenchmarkRunScenarioRuns);
+      let progressList = await fetchAllRunsProgress(
+        job,
+        listBenchmarkRunScenarioRuns,
+      );
 
       while (!isJobCompleted(job.state)) {
         // Check timeout
@@ -449,7 +451,10 @@ export async function watchBenchmarkJob(id: string) {
         if (tick % UPDATES_PER_POLL === 0) {
           job = await getBenchmarkJob(id);
           if (!isJobCompleted(job.state)) {
-            progressList = await fetchAllRunsProgress(job, listBenchmarkRunScenarioRuns);
+            progressList = await fetchAllRunsProgress(
+              job,
+              listBenchmarkRunScenarioRuns,
+            );
           }
         }
 
@@ -460,7 +465,10 @@ export async function watchBenchmarkJob(id: string) {
       // we have the most up-to-date data before printing results. This prevents
       // stale in-progress state from persisting after the job completes.
       job = await getBenchmarkJob(id);
-      progressList = await fetchAllRunsProgress(job, listBenchmarkRunScenarioRuns);
+      progressList = await fetchAllRunsProgress(
+        job,
+        listBenchmarkRunScenarioRuns,
+      );
     } finally {
       process.stdout.off("resize", handleResize);
       cleanup();