diff --git a/.github/workflows/perf-probe-baseline.yml b/.github/workflows/perf-probe-baseline.yml
index 6798510b7..0744ed3bd 100644
--- a/.github/workflows/perf-probe-baseline.yml
+++ b/.github/workflows/perf-probe-baseline.yml
@@ -7,6 +7,7 @@ on:
       - "packages/app/e2e/perf/**"
       - "packages/app/e2e/fixtures.ts"
       - "packages/app/package.json"
+      - "packages/app/script/compare-perf.ts"
       - "packages/app/script/e2e-local.ts"
       - "packages/app/src/testing/perf-metrics*"
   workflow_dispatch:
@@ -18,11 +19,20 @@ jobs:
   perf-probe-baseline:
     runs-on: ubuntu-latest
     timeout-minutes: 30
+    env:
+      PERF_ARTIFACT_DIR: ${{ github.workspace }}/perf-artifacts
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # actions/checkout@v6
         with:
+          path: head
           persist-credentials: false

+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # actions/checkout@v6
+        with:
+          path: base
+          persist-credentials: false
+          ref: ${{ github.event.pull_request.base.sha || github.sha }}
+
       - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # actions/setup-node@v6.4.0
         with:
           node-version: "24"
@@ -34,25 +44,52 @@ jobs:
       - uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # actions/cache@v5
         with:
           path: ~/.bun/install/cache
-          key: bun-${{ runner.os }}-${{ hashFiles('bun.lock') }}
+          key: bun-${{ runner.os }}-${{ hashFiles('head/bun.lock', 'base/bun.lock') }}
           restore-keys: |
             bun-${{ runner.os }}-

       - uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # actions/cache@v5
         with:
           path: ${{ github.workspace }}/.playwright-browsers
-          key: playwright-${{ runner.os }}-${{ hashFiles('packages/app/package.json', 'bun.lock') }}
+          key: playwright-${{ runner.os }}-${{ hashFiles('head/packages/app/package.json', 'head/bun.lock', 'base/packages/app/package.json', 'base/bun.lock') }}
+
+      - name: Install head dependencies
+        working-directory: head
+        run: bun install --frozen-lockfile

-      - run: bun install --frozen-lockfile
+      - name: Install base dependencies
+        working-directory: base
+        run: bun install --frozen-lockfile

       - name: Install Playwright browsers
-        working-directory: packages/app
+        working-directory: head/packages/app
         run: bunx playwright install --with-deps chromium

-      - name: Run perf probe baseline
+      - name: Sync perf harness into base checkout
+        run: |
+          rsync -a --delete head/packages/app/e2e/perf/ base/packages/app/e2e/perf/
+          cp head/packages/app/src/testing/perf-metrics.ts base/packages/app/src/testing/perf-metrics.ts
+
+      - name: Run perf probe baseline (base)
         env:
           CI: "true"
-        run: bun --cwd packages/app test:e2e:local:perf
+          PAWWORK_PERF_BRANCH: base
+          PAWWORK_PERF_OUTPUT: ${{ github.workspace }}/perf-artifacts/perf-base.json
+        run: bun --cwd base/packages/app test:e2e:local:perf
+
+      - name: Run perf probe baseline (head)
+        env:
+          CI: "true"
+          PAWWORK_PERF_BRANCH: head
+          PAWWORK_PERF_OUTPUT: ${{ github.workspace }}/perf-artifacts/perf-head.json
+        run: bun --cwd head/packages/app test:e2e:local:perf
+
+      - name: Compare base and head perf baselines
+        run: >
+          bun head/packages/app/script/compare-perf.ts
+          --base "${PERF_ARTIFACT_DIR}/perf-base.json"
+          --head "${PERF_ARTIFACT_DIR}/perf-head.json"
+          --output "${PERF_ARTIFACT_DIR}/perf-compare.json"

       - name: Upload perf probe artifacts
         if: always()
@@ -62,6 +99,8 @@ jobs:
           if-no-files-found: ignore
           retention-days: 7
           path: |
-            packages/app/e2e/perf-results
-            packages/app/e2e/playwright-report
-            packages/app/e2e/test-results
+            perf-artifacts
+            base/packages/app/e2e/playwright-report
+            base/packages/app/e2e/test-results
+            head/packages/app/e2e/playwright-report
+            head/packages/app/e2e/test-results
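The compare step above writes `perf-compare.json` through `--output`. For orientation, a hedged sketch of that file's shape: the type is `PerfBaselineComparison`, defined in `perf-metrics.ts` later in this diff, and every value here is illustrative, not real output.

```ts
// Illustrative only: the shape follows PerfBaselineComparison (see perf-metrics.ts
// below); the values are made up.
const examplePerfCompare = {
  pass: false,
  failures: ["session-streaming-long:interaction_ms_median"],
  warnings: ["homepage-cold:lcp_ms"],
  // one PerfScenarioComparison per scenario, embedding the full base/head summaries
  scenarios: [],
}
```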
diff --git a/packages/app/e2e/perf/perf-probe.spec.ts b/packages/app/e2e/perf/perf-probe.spec.ts
index e44cfb0dd..f48174ad4 100644
--- a/packages/app/e2e/perf/perf-probe.spec.ts
+++ b/packages/app/e2e/perf/perf-probe.spec.ts
@@ -8,6 +8,7 @@ import { sessionPath } from "../utils"
 import { installPerfProbe, resetPerfProbe, snapshotPerfProbe, summarizeScenarioRuns } from "./probe"

 const outputPath = process.env.PAWWORK_PERF_OUTPUT ?? path.join(process.cwd(), "e2e", "perf-results", "pr0.1-baseline.json")
+const perfBranch = process.env.PAWWORK_PERF_BRANCH ?? "dev"

 const longMarkdown = [
   "# Baseline stream",
@@ -73,6 +74,11 @@ async function settleFrames(page: Parameters<typeof installPerfProbe>[0], count: number) {
   }, count)
 }

+async function cooldownAfterRun(page: Parameters<typeof installPerfProbe>[0]) {
+  await settleFrames(page, 6)
+  await page.waitForTimeout(250)
+}
+
 async function navigateProjectHome(page: Parameters<typeof installPerfProbe>[0], directory: string) {
   await page.goto(sessionPath(directory))
   await expect(page.locator('[data-component="session-new-home"]')).toBeVisible()
 }
@@ -148,9 +154,10 @@ test.describe("PR0.1 perf probe baseline", () => {
       await settleFrames(page, 3)
       await page.keyboard.press("Escape")
       runs.push(await snapshotPerfProbe(page))
+      if (run < 2) await cooldownAfterRun(page)
     }

-    scenarioResults.push(summarizeScenarioRuns({ branch: "dev", scenario: "homepage-cold", runs }))
+    scenarioResults.push(summarizeScenarioRuns({ branch: perfBranch, scenario: "homepage-cold", runs }))
   })

   test("session-streaming-long emits a 3-run JSON baseline", async ({ page, project, llm }) => {
@@ -192,9 +199,10 @@
       secondWave.resolve()
       await send
       runs.push(await snapshotPerfProbe(page))
+      if (run < 2) await cooldownAfterRun(page)
     }

-    scenarioResults.push(summarizeScenarioRuns({ branch: "dev", scenario: "session-streaming-long", runs }))
+    scenarioResults.push(summarizeScenarioRuns({ branch: perfBranch, scenario: "session-streaming-long", runs }))
   })

   test("tool-call-expand emits a 3-run JSON baseline", async ({ page, project, llm }) => {
@@ -221,9 +229,10 @@
       await expect(trigger).toHaveAttribute("aria-expanded", "true")
       await settleFrames(page, 4)
       runs.push(await snapshotPerfProbe(page))
+      if (run < 2) await cooldownAfterRun(page)
     }

-    scenarioResults.push(summarizeScenarioRuns({ branch: "dev", scenario: "tool-call-expand", runs }))
+    scenarioResults.push(summarizeScenarioRuns({ branch: perfBranch, scenario: "tool-call-expand", runs }))
   })

   test("session-scroll-reading emits a 3-run JSON baseline", async ({ page, project }) => {
@@ -258,9 +267,10 @@
         await page.mouse.wheel(0, 3600)
         await settleFrames(page, 4)
         runs.push(await snapshotPerfProbe(page))
+        if (run < 2) await cooldownAfterRun(page)
       })
     }

-    scenarioResults.push(summarizeScenarioRuns({ branch: "dev", scenario: "session-scroll-reading", runs }))
+    scenarioResults.push(summarizeScenarioRuns({ branch: perfBranch, scenario: "session-scroll-reading", runs }))
   })
 })
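Two changes carry the weight in this spec: summaries are tagged with `PAWWORK_PERF_BRANCH` instead of a hard-coded `"dev"`, and a cooldown runs between (but not after) the three runs so frames and timers from one run do not bleed into the next snapshot window. A hedged sketch of the per-scenario loop the tests repeat; `measureScenario` is a hypothetical name, `cooldownAfterRun` is the helper added above, and the rest comes from the spec's imports.

```ts
import { installPerfProbe, resetPerfProbe, snapshotPerfProbe, summarizeScenarioRuns } from "./probe"

const perfBranch = process.env.PAWWORK_PERF_BRANCH ?? "dev"

// Hypothetical wrapper illustrating the loop each test repeats:
// reset the probe, drive the scenario, snapshot, cool down between runs only.
async function measureScenario(
  page: Parameters<typeof installPerfProbe>[0],
  scenario: string,
  drive: () => Promise<void>,
) {
  const runs: Awaited<ReturnType<typeof snapshotPerfProbe>>[] = []
  for (let run = 0; run < 3; run++) {
    await resetPerfProbe(page)
    await drive()
    runs.push(await snapshotPerfProbe(page))
    if (run < 2) await cooldownAfterRun(page) // spec-local helper, defined above
  }
  return summarizeScenarioRuns({ branch: perfBranch, scenario, runs })
}
```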
diff --git a/packages/app/script/compare-perf.ts b/packages/app/script/compare-perf.ts
new file mode 100644
index 000000000..d4bb9d837
--- /dev/null
+++ b/packages/app/script/compare-perf.ts
@@ -0,0 +1,50 @@
+import fs from "node:fs/promises"
+import path from "node:path"
+import { comparePerfBaselines, type PerfScenarioSummary } from "../src/testing/perf-metrics"
+
+function readArg(flag: string) {
+  const index = process.argv.indexOf(flag)
+  if (index === -1) return undefined
+  const value = process.argv[index + 1]
+  if (!value || value.startsWith("--")) return undefined
+  return value
+}
+
+async function readPerfFile(filePath: string) {
+  const payload = JSON.parse(await fs.readFile(filePath, "utf8")) as PerfScenarioSummary[]
+  if (!Array.isArray(payload)) {
+    throw new Error(`Expected an array of perf scenarios in ${filePath}`)
+  }
+  return payload
+}
+
+async function main() {
+  const basePath = readArg("--base")
+  const headPath = readArg("--head")
+  const outputPath = readArg("--output")
+
+  if (!basePath || !headPath) {
+    throw new Error("Usage: bun script/compare-perf.ts --base <path> --head <path> [--output <path>]")
+  }
+
+  const [base, head] = await Promise.all([readPerfFile(basePath), readPerfFile(headPath)])
+  const comparison = comparePerfBaselines({ base, head })
+
+  if (outputPath) {
+    await fs.mkdir(path.dirname(outputPath), { recursive: true })
+    await fs.writeFile(outputPath, `${JSON.stringify(comparison, null, 2)}\n`)
+  }
+
+  const summary = {
+    pass: comparison.pass,
+    failures: comparison.failures,
+    warnings: comparison.warnings,
+  }
+  console.log(JSON.stringify(summary, null, 2))
+
+  if (!comparison.pass) {
+    process.exitCode = 1
+  }
+}
+
+await main()
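Note the failure contract: the script reports regressions through `process.exitCode` rather than throwing, so the comparison JSON is written and the summary printed before the CI step fails. A minimal programmatic sketch of the same entry point; the empty baselines are illustrative, and with no scenarios there is nothing to fail.

```ts
import { comparePerfBaselines } from "../src/testing/perf-metrics"

// Empty baselines compare clean: no scenarios means no failures.
const comparison = comparePerfBaselines({ base: [], head: [] })
console.log(comparison.pass) // true
if (!comparison.pass) process.exitCode = 1 // mirrors the CLI's failure signal
```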
diff --git a/packages/app/src/testing/perf-metrics.test.ts b/packages/app/src/testing/perf-metrics.test.ts
index 8ede5ba95..7852d47d2 100644
--- a/packages/app/src/testing/perf-metrics.test.ts
+++ b/packages/app/src/testing/perf-metrics.test.ts
@@ -1,5 +1,5 @@
 import { describe, expect, test } from "bun:test"
-import { aggregatePerfRuns, summarizePerfRun } from "./perf-metrics"
+import { aggregatePerfRuns, comparePerfBaselines, comparePerfScenarioSummaries, summarizePerfRun } from "./perf-metrics"

 describe("perf metrics", () => {
   test("summarizes a perf sample window", () => {
@@ -116,4 +116,192 @@ describe("perf metrics", () => {
     expect(summary.heap_used_mb).toBe(93)
     expect(summary.run_details).toHaveLength(3)
   })
+
+  test("fails a scenario when median regression breaks both the ms and percentage budgets", () => {
+    const result = comparePerfScenarioSummaries({
+      scenario: "session-streaming-long",
+      base: {
+        branch: "base",
+        scenario: "session-streaming-long",
+        runs: 3,
+        interaction_ms_median: 100,
+        interaction_ms_worst: 140,
+        interaction_ms: 100,
+        interaction_delay_ms: 12,
+        long_task_count: 1,
+        long_task_max_ms: 80,
+        tbt_ms: 30,
+        frame_gap_p95_ms: 32,
+        frame_gap_max_ms: 60,
+        jank_count_50ms: 1,
+        cls: 0.11,
+        window_ms: 1200,
+        run_details: [],
+      },
+      head: {
+        branch: "head",
+        scenario: "session-streaming-long",
+        runs: 3,
+        interaction_ms_median: 116,
+        interaction_ms_worst: 168,
+        interaction_ms: 116,
+        interaction_delay_ms: 14,
+        long_task_count: 1,
+        long_task_max_ms: 88,
+        tbt_ms: 34,
+        frame_gap_p95_ms: 35,
+        frame_gap_max_ms: 70,
+        jank_count_50ms: 1,
+        cls: 0.013,
+        window_ms: 1220,
+        run_details: [],
+      },
+    })
+
+    expect(result.pass).toBe(false)
+    expect(result.failures).toContain("interaction_ms_median")
+    expect(result.warnings).toHaveLength(0)
+  })
+
+  test("fails a scenario on catastrophic absolute thresholds even when there is no regression delta", () => {
+    const result = comparePerfScenarioSummaries({
+      scenario: "tool-call-expand",
+      base: {
+        branch: "base",
+        scenario: "tool-call-expand",
+        runs: 3,
+        interaction_ms_median: 120,
+        interaction_ms_worst: 510,
+        interaction_ms: 120,
+        interaction_delay_ms: 12,
+        long_task_count: 1,
+        long_task_max_ms: 90,
+        tbt_ms: 36,
+        frame_gap_p95_ms: 32,
+        frame_gap_max_ms: 90,
+        jank_count_50ms: 1,
+        cls: 0.01,
+        window_ms: 900,
+        run_details: [],
+      },
+      head: {
+        branch: "head",
+        scenario: "tool-call-expand",
+        runs: 3,
+        interaction_ms_median: 120,
+        interaction_ms_worst: 510,
+        interaction_ms: 120,
+        interaction_delay_ms: 12,
+        long_task_count: 1,
+        long_task_max_ms: 90,
+        tbt_ms: 36,
+        frame_gap_p95_ms: 32,
+        frame_gap_max_ms: 90,
+        jank_count_50ms: 1,
+        cls: 0.01,
+        window_ms: 900,
+        run_details: [],
+      },
+    })
+
+    expect(result.pass).toBe(false)
+    expect(result.failures).toContain("interaction_ms_worst")
+  })
+
+  test("keeps Web Vitals good lines warn-only in PR0.2", () => {
+    const result = comparePerfScenarioSummaries({
+      scenario: "homepage-cold",
+      base: {
+        branch: "base",
+        scenario: "homepage-cold",
+        runs: 3,
+        interaction_ms_median: 70,
+        interaction_ms_worst: 95,
+        interaction_ms: 70,
+        interaction_delay_ms: 9,
+        long_task_count: 0,
+        long_task_max_ms: 0,
+        tbt_ms: 0,
+        frame_gap_p95_ms: 18,
+        frame_gap_max_ms: 24,
+        jank_count_50ms: 0,
+        cls: 0.11,
+        window_ms: 600,
+        fcp_ms: 1500,
+        lcp_ms: 2300,
+        run_details: [],
+      },
+      head: {
+        branch: "head",
+        scenario: "homepage-cold",
+        runs: 3,
+        interaction_ms_median: 74,
+        interaction_ms_worst: 99,
+        interaction_ms: 74,
+        interaction_delay_ms: 11,
+        long_task_count: 0,
+        long_task_max_ms: 0,
+        tbt_ms: 0,
+        frame_gap_p95_ms: 19,
+        frame_gap_max_ms: 26,
+        jank_count_50ms: 0,
+        cls: 0.12,
+        window_ms: 620,
+        fcp_ms: 2400,
+        lcp_ms: 3100,
+        run_details: [],
+      },
+    })
+
+    expect(result.pass).toBe(true)
+    expect(result.failures).toHaveLength(0)
+    expect(result.warnings).toEqual(expect.arrayContaining(["cls", "fcp_ms", "lcp_ms"]))
+  })
+
+  test("compares baseline collections by scenario and fails when a head scenario is missing", () => {
+    const base = [
+      aggregatePerfRuns({
+        branch: "base",
+        scenario: "homepage-cold",
+        runs: [
+          {
+            interaction_ms: 60,
+            interaction_delay_ms: 8,
+            long_task_count: 0,
+            long_task_max_ms: 0,
+            tbt_ms: 0,
+            frame_gap_p95_ms: 18,
+            frame_gap_max_ms: 26,
+            jank_count_50ms: 0,
+            cls: 0.01,
+            window_ms: 800,
+          },
+        ],
+      }),
+      aggregatePerfRuns({
+        branch: "base",
+        scenario: "tool-call-expand",
+        runs: [
+          {
+            interaction_ms: 70,
+            interaction_delay_ms: 9,
+            long_task_count: 1,
+            long_task_max_ms: 75,
+            tbt_ms: 25,
+            frame_gap_p95_ms: 28,
+            frame_gap_max_ms: 40,
+            jank_count_50ms: 0,
+            cls: 0.002,
+            window_ms: 900,
+          },
+        ],
+      }),
+    ]
+    const head = [base[0]]
+
+    const result = comparePerfBaselines({ base, head })
+
+    expect(result.pass).toBe(false)
+    expect(result.failures).toContain("missing_head_scenario:tool-call-expand")
+  })
 })
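The tests above pin down a three-tier model: delta budgets that fail only when both the absolute and relative allowances are exceeded, catastrophic absolute ceilings that fail outright, and warn-only Web Vitals lines. To make the dual budget concrete, here is a standalone restatement of the median gate; `breachesMedianBudget` is a hypothetical name that mirrors `exceededByDelta` below, with the thresholds inlined.

```ts
// Hypothetical restatement of the dual-budget median gate; the numbers match
// perfDeltaThresholds below (+10ms delta budget, 1.05x ratio budget).
function breachesMedianBudget(base: number, head: number): boolean {
  const overDelta = head - base > 10
  const overRatio = base <= 0 || head > base * 1.05
  return overDelta && overRatio
}

console.log(breachesMedianBudget(100, 116)) // true: +16ms and 1.16x both over budget
console.log(breachesMedianBudget(100, 108)) // false: +8ms stays inside the delta budget
console.log(breachesMedianBudget(400, 415)) // false: +15ms over delta, but only 1.04x
```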
diff --git a/packages/app/src/testing/perf-metrics.ts b/packages/app/src/testing/perf-metrics.ts
index 59ba55ce3..e0b8805dd 100644
--- a/packages/app/src/testing/perf-metrics.ts
+++ b/packages/app/src/testing/perf-metrics.ts
@@ -51,6 +51,48 @@ export type PerfScenarioSummary = PerfRunSummary & {
   run_details: PerfRunSummary[]
 }

+export type PerfScenarioComparison = {
+  scenario: string
+  pass: boolean
+  failures: string[]
+  warnings: string[]
+  base: PerfScenarioSummary
+  head: PerfScenarioSummary
+}
+
+export type PerfBaselineComparison = {
+  pass: boolean
+  failures: string[]
+  warnings: string[]
+  scenarios: PerfScenarioComparison[]
+}
+
+const perfDeltaThresholds = {
+  interactionMedianMs: 10,
+  interactionMedianRatio: 1.05,
+  interactionWorstMs: 50,
+  longTaskMaxMs: 25,
+  tbtMs: 50,
+  frameGapP95Ms: 10,
+  frameGapMaxMs: 50,
+  jankCount: 2,
+  cls: 0.02,
+} as const
+
+const perfAbsoluteWarnings = {
+  interactionMsWorst: 200,
+  tbtMs: 200,
+  cls: 0.05,
+  fcpMs: 1800,
+  lcpMs: 2500,
+} as const
+
+const perfCatastrophicThresholds = {
+  interactionMsWorst: 500,
+  frameGapMaxMs: 500,
+  longTaskMaxMs: 250,
+} as const
+
 function round(input: number) {
   return Math.round(input * 1000) / 1000
 }
@@ -146,3 +188,121 @@ export function aggregatePerfRuns(input: {
     run_details: runs,
   }
 }
+
+function exceededByDelta(input: { base: number; head: number; maxDelta: number; maxRatio?: number }) {
+  const delta = input.head - input.base
+  if (delta <= input.maxDelta) return false
+  if (input.maxRatio === undefined) return true
+  if (input.base <= 0) return true
+  return input.head > input.base * input.maxRatio
+}
+
+function addAbsoluteWarning(target: string[], key: string, value: number | undefined, threshold: number) {
+  if (value === undefined) return
+  if (value > threshold) target.push(key)
+}
+
+export function comparePerfScenarioSummaries(input: {
+  scenario: string
+  base: PerfScenarioSummary
+  head: PerfScenarioSummary
+}): PerfScenarioComparison {
+  const failures: string[] = []
+  const warnings: string[] = []
+
+  if (
+    exceededByDelta({
+      base: input.base.interaction_ms_median,
+      head: input.head.interaction_ms_median,
+      maxDelta: perfDeltaThresholds.interactionMedianMs,
+      maxRatio: perfDeltaThresholds.interactionMedianRatio,
+    })
+  ) {
+    failures.push("interaction_ms_median")
+  }
+
+  if (input.head.interaction_ms_worst > input.base.interaction_ms_worst + perfDeltaThresholds.interactionWorstMs) {
+    failures.push("interaction_ms_worst_delta")
+  }
+  if (input.head.interaction_ms_worst >= perfCatastrophicThresholds.interactionMsWorst) {
+    failures.push("interaction_ms_worst")
+  }
+  if (input.head.long_task_max_ms > input.base.long_task_max_ms + perfDeltaThresholds.longTaskMaxMs) {
+    failures.push("long_task_max_ms_delta")
+  }
+  if (input.head.long_task_max_ms >= perfCatastrophicThresholds.longTaskMaxMs) {
+    failures.push("long_task_max_ms")
+  }
+  if (input.head.tbt_ms > input.base.tbt_ms + perfDeltaThresholds.tbtMs) {
+    failures.push("tbt_ms")
+  }
+  if (input.head.frame_gap_p95_ms > input.base.frame_gap_p95_ms + perfDeltaThresholds.frameGapP95Ms) {
+    failures.push("frame_gap_p95_ms")
+  }
+  if (input.head.frame_gap_max_ms > input.base.frame_gap_max_ms + perfDeltaThresholds.frameGapMaxMs) {
+    failures.push("frame_gap_max_ms_delta")
+  }
+  if (input.head.frame_gap_max_ms >= perfCatastrophicThresholds.frameGapMaxMs) {
+    failures.push("frame_gap_max_ms")
+  }
+  if (input.head.jank_count_50ms > input.base.jank_count_50ms + perfDeltaThresholds.jankCount) {
+    failures.push("jank_count_50ms")
+  }
+  if (input.head.cls > input.base.cls + perfDeltaThresholds.cls) {
+    failures.push("cls_delta")
+  }
+
+  addAbsoluteWarning(warnings, "interaction_ms_worst", input.head.interaction_ms_worst, perfAbsoluteWarnings.interactionMsWorst)
+  addAbsoluteWarning(warnings, "tbt_ms", input.head.tbt_ms, perfAbsoluteWarnings.tbtMs)
+  addAbsoluteWarning(warnings, "cls", input.head.cls, perfAbsoluteWarnings.cls)
+  addAbsoluteWarning(warnings, "fcp_ms", input.head.fcp_ms, perfAbsoluteWarnings.fcpMs)
+  addAbsoluteWarning(warnings, "lcp_ms", input.head.lcp_ms, perfAbsoluteWarnings.lcpMs)
+
+  return {
+    scenario: input.scenario,
+    pass: failures.length === 0,
+    failures: [...new Set(failures)],
+    warnings: [...new Set(warnings)],
+    base: input.base,
+    head: input.head,
+  }
+}
+
+export function comparePerfBaselines(input: {
+  base: PerfScenarioSummary[]
+  head: PerfScenarioSummary[]
+}): PerfBaselineComparison {
+  const failures: string[] = []
+  const warnings: string[] = []
+  const scenarios: PerfScenarioComparison[] = []
+  const headByScenario = new Map<string, PerfScenarioSummary>(input.head.map((scenario) => [scenario.scenario, scenario]))
+
+  for (const baseScenario of input.base) {
+    const headScenario = headByScenario.get(baseScenario.scenario)
+    if (!headScenario) {
+      failures.push(`missing_head_scenario:${baseScenario.scenario}`)
+      continue
+    }
+    const comparison = comparePerfScenarioSummaries({
+      scenario: baseScenario.scenario,
+      base: baseScenario,
+      head: headScenario,
+    })
+    scenarios.push(comparison)
+    for (const failure of comparison.failures) failures.push(`${baseScenario.scenario}:${failure}`)
+    for (const warning of comparison.warnings) warnings.push(`${baseScenario.scenario}:${warning}`)
+  }
+
+  for (const headScenario of input.head) {
+    if (!input.base.some((scenario) => scenario.scenario === headScenario.scenario)) {
+      failures.push(`missing_base_scenario:${headScenario.scenario}`)
+    }
+  }
+
+  return {
+    pass: failures.length === 0,
+    failures,
+    warnings,
+    scenarios,
+  }
+}
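One property worth calling out: `comparePerfBaselines` fails on scenario-set drift in either direction, so a renamed or deleted scenario cannot silently drop out of the gate. A hedged sketch, with illustrative run data and the helpers exported above:

```ts
import { aggregatePerfRuns, comparePerfBaselines } from "./perf-metrics"

// Minimal single-run sample; every number is illustrative.
const run = {
  interaction_ms: 60,
  interaction_delay_ms: 8,
  long_task_count: 0,
  long_task_max_ms: 0,
  tbt_ms: 0,
  frame_gap_p95_ms: 18,
  frame_gap_max_ms: 26,
  jank_count_50ms: 0,
  cls: 0.01,
  window_ms: 800,
}
const homepage = aggregatePerfRuns({ branch: "base", scenario: "homepage-cold", runs: [run] })
const tools = aggregatePerfRuns({ branch: "base", scenario: "tool-call-expand", runs: [run] })

// Head lacks "tool-call-expand", so the gate fails rather than shrinking coverage.
const result = comparePerfBaselines({ base: [homepage, tools], head: [homepage] })
console.log(result.failures) // ["missing_head_scenario:tool-call-expand"]
```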