diff --git a/.github/workflows/perf-probe-baseline.yml b/.github/workflows/perf-probe-baseline.yml
index 0744ed3b..6d1c2fde 100644
--- a/.github/workflows/perf-probe-baseline.yml
+++ b/.github/workflows/perf-probe-baseline.yml
@@ -4,14 +4,21 @@
 on:
   pull_request:
     paths:
       - ".github/workflows/perf-probe-baseline.yml"
+      - "packages/app/src/**"
+      - "packages/ui/src/**"
       - "packages/app/e2e/perf/**"
       - "packages/app/e2e/fixtures.ts"
       - "packages/app/package.json"
+      - "packages/app/playwright.config.ts"
       - "packages/app/script/compare-perf.ts"
       - "packages/app/script/e2e-local.ts"
       - "packages/app/src/testing/perf-metrics*"
   workflow_dispatch:
 
+permissions:
+  contents: read
+  pull-requests: write
+
 env:
   PLAYWRIGHT_BROWSERS_PATH: ${{ github.workspace }}/.playwright-browsers
@@ -68,6 +75,7 @@ jobs:
       - name: Sync perf harness into base checkout
         run: |
           rsync -a --delete head/packages/app/e2e/perf/ base/packages/app/e2e/perf/
+          cp head/packages/app/playwright.config.ts base/packages/app/playwright.config.ts
           cp head/packages/app/src/testing/perf-metrics.ts base/packages/app/src/testing/perf-metrics.ts
 
       - name: Run perf probe baseline (base)
@@ -85,11 +93,104 @@
         run: bun --cwd head/packages/app test:e2e:local:perf
 
       - name: Compare base and head perf baselines
+        id: compare
+        continue-on-error: true
         run: >
           bun head/packages/app/script/compare-perf.ts
           --base "${PERF_ARTIFACT_DIR}/perf-base.json"
           --head "${PERF_ARTIFACT_DIR}/perf-head.json"
           --output "${PERF_ARTIFACT_DIR}/perf-compare.json"
+          --comment-output "${PERF_ARTIFACT_DIR}/perf-comment.md"
+
+      - name: Confirm perf regression (base)
+        if: steps.compare.outcome == 'failure'
+        env:
+          CI: "true"
+          PAWWORK_PERF_BRANCH: base
+          PAWWORK_PERF_OUTPUT: ${{ github.workspace }}/perf-artifacts/perf-base-confirm.json
+        run: bun --cwd base/packages/app test:e2e:local:perf
+
+      - name: Confirm perf regression (head)
+        if: steps.compare.outcome == 'failure'
+        env:
+          CI: "true"
+          PAWWORK_PERF_BRANCH: head
+          PAWWORK_PERF_OUTPUT: ${{ github.workspace }}/perf-artifacts/perf-head-confirm.json
+        run: bun --cwd head/packages/app test:e2e:local:perf
+
+      - name: Compare confirmed perf baselines
+        id: compare_confirmed
+        if: steps.compare.outcome == 'failure'
+        continue-on-error: true
+        run: >
+          bun head/packages/app/script/compare-perf.ts
+          --base "${PERF_ARTIFACT_DIR}/perf-base-confirm.json"
+          --head "${PERF_ARTIFACT_DIR}/perf-head-confirm.json"
+          --output "${PERF_ARTIFACT_DIR}/perf-compare-confirm.json"
+          --comment-output "${PERF_ARTIFACT_DIR}/perf-comment.md"
+
+      - name: Comment perf deltas on pull request
+        if: github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository
+        uses: actions/github-script@f28e40c7f34bde8b3046d885e986cb6290c5673b # actions/github-script@v7
+        env:
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+        with:
+          script: |
+            const fs = require("node:fs")
+            const marker = "<!-- pawwork-perf-delta -->"
+            const body = fs.readFileSync(`${process.env.GITHUB_WORKSPACE}/perf-artifacts/perf-comment.md`, "utf8")
+            const pull_number = Number(process.env.PR_NUMBER)
+            const owner = context.repo.owner
+            const repo = context.repo.repo
+
+            const comments = await github.paginate(github.rest.issues.listComments, {
+              owner,
+              repo,
+              issue_number: pull_number,
+              per_page: 100,
+            })
+            const existing = comments.find((comment) => typeof comment.body === "string" && comment.body.includes(marker))
+
+            if (existing) {
+              await github.rest.issues.updateComment({
+                owner,
+                repo,
+                comment_id: existing.id,
+                body,
+              })
+              core.info(`Updated perf delta comment ${existing.id}`)
+              return
+            }
+
+            await github.rest.issues.createComment({
+              owner,
+              repo,
+              issue_number: pull_number,
+              body,
+            })
+            core.info("Created perf delta comment")
+
+      - name: Capture perf diagnostic trace (base)
+        if: steps.compare.outcome == 'failure' && steps.compare_confirmed.outcome == 'failure'
+        env:
+          CI: "true"
+          PAWWORK_PERF_BRANCH: base
+          PAWWORK_PERF_OUTPUT: ${{ github.workspace }}/perf-artifacts/perf-base-trace.json
+          PAWWORK_PERF_TRACE: "1"
+        run: bun --cwd base/packages/app test:e2e:local:perf
+
+      - name: Capture perf diagnostic trace (head)
+        if: steps.compare.outcome == 'failure' && steps.compare_confirmed.outcome == 'failure'
+        env:
+          CI: "true"
+          PAWWORK_PERF_BRANCH: head
+          PAWWORK_PERF_OUTPUT: ${{ github.workspace }}/perf-artifacts/perf-head-trace.json
+          PAWWORK_PERF_TRACE: "1"
+        run: bun --cwd head/packages/app test:e2e:local:perf
+
+      - name: Fail job on comparator regression
+        if: steps.compare.outcome == 'failure' && steps.compare_confirmed.outcome == 'failure'
+        run: exit 1
 
       - name: Upload perf probe artifacts
         if: always()
diff --git a/packages/app/e2e/perf/perf-probe.spec.ts b/packages/app/e2e/perf/perf-probe.spec.ts
index f48174ad..a34f0f9f 100644
--- a/packages/app/e2e/perf/perf-probe.spec.ts
+++ b/packages/app/e2e/perf/perf-probe.spec.ts
@@ -2,9 +2,15 @@ import fs from "node:fs/promises"
 import path from "node:path"
 import { raw } from "../../../opencode/test/lib/llm-server"
 import { test, expect } from "../fixtures"
-import { withSession } from "../actions"
-import { promptSelector, sessionMessageItemSelector, sessionTurnListSelector, scrollViewportSelector } from "../selectors"
-import { sessionPath } from "../utils"
+import { waitTerminalFocusIdle, withSession } from "../actions"
+import {
+  promptSelector,
+  sessionMessageItemSelector,
+  sessionTurnListSelector,
+  scrollViewportSelector,
+  terminalSelector,
+} from "../selectors"
+import { sessionPath, terminalToggleKey } from "../utils"
 import { installPerfProbe, resetPerfProbe, snapshotPerfProbe, summarizeScenarioRuns } from "./probe"
 
 const outputPath = process.env.PAWWORK_PERF_OUTPUT ?? path.join(process.cwd(), "e2e", "perf-results", "pr0.1-baseline.json")
@@ -79,6 +85,14 @@ async function cooldownAfterRun(page: Parameters<typeof installPerfProbe>[0]) {
   await page.waitForTimeout(250)
 }
 
+async function ensureTerminalClosed(page: Parameters<typeof installPerfProbe>[0]) {
+  const terminal = page.locator(terminalSelector)
+  const visible = await terminal.isVisible().catch(() => false)
+  if (!visible) return
+  await page.keyboard.press(terminalToggleKey)
+  await expect(terminal).toHaveCount(0)
+}
+
 async function navigateProjectHome(page: Parameters<typeof installPerfProbe>[0], directory: string) {
   await page.goto(sessionPath(directory))
   await expect(page.locator('[data-component="session-new-home"]')).toBeVisible()
@@ -235,6 +249,33 @@ test.describe("PR0.1 perf probe baseline", () => {
     scenarioResults.push(summarizeScenarioRuns({ branch: perfBranch, scenario: "tool-call-expand", runs }))
   })
 
+  test("terminal-side-panel-open emits a 3-run JSON baseline", async ({ page, project }) => {
+    await installPerfProbe(page)
+    await project.open()
+
+    const runs = []
+    for (let run = 0; run < 3; run += 1) {
+      await withSession(project.sdk, `perf terminal ${Date.now()}-${run}`, async (session) => {
+        await page.goto(sessionPath(project.directory, session.id))
+        await expect(page.locator(promptSelector).first()).toBeVisible({ timeout: 30_000 })
+
+        const terminal = page.locator(terminalSelector)
+
+        await ensureTerminalClosed(page)
+        await resetPerfProbe(page)
+        await page.keyboard.press(terminalToggleKey)
+        await waitTerminalFocusIdle(page, { term: terminal.first() })
+        await settleFrames(page, 4)
+        runs.push(await snapshotPerfProbe(page))
+        await page.keyboard.press(terminalToggleKey)
+        await expect(terminal).toHaveCount(0)
+        if (run < 2) await cooldownAfterRun(page)
+      })
+    }
+
+    scenarioResults.push(summarizeScenarioRuns({ branch: perfBranch, scenario: "terminal-side-panel-open", runs }))
+  })
+
   test("session-scroll-reading emits a 3-run JSON baseline", async ({ page, project }) => {
     await installPerfProbe(page)
     await project.open()
diff --git a/packages/app/playwright.config.ts b/packages/app/playwright.config.ts
index 421079ed..1e5c06e5 100644
--- a/packages/app/playwright.config.ts
+++ b/packages/app/playwright.config.ts
@@ -8,6 +8,7 @@ const command = `bun run dev -- --host 0.0.0.0 --port ${port}`
 const reuse = !process.env.CI
 const workers = Number(process.env.PLAYWRIGHT_WORKERS ?? (process.env.CI ? 5 : 0)) || undefined
 const reporter = [["html", { outputFolder: "e2e/playwright-report", open: "never" }], ["line"]] as const
+const trace = process.env.PAWWORK_PERF_TRACE === "1" ? "on" : "on-first-retry"
 
 if (process.env.PLAYWRIGHT_JUNIT_OUTPUT) {
"on" : "on-first-retry" if (process.env.PLAYWRIGHT_JUNIT_OUTPUT) { reporter.push(["junit", { outputFile: process.env.PLAYWRIGHT_JUNIT_OUTPUT }]) @@ -38,7 +39,7 @@ export default defineConfig({ }, use: { baseURL, - trace: "on-first-retry", + trace, screenshot: "only-on-failure", video: "retain-on-failure", }, diff --git a/packages/app/script/compare-perf.ts b/packages/app/script/compare-perf.ts index d4bb9d83..50bddefd 100644 --- a/packages/app/script/compare-perf.ts +++ b/packages/app/script/compare-perf.ts @@ -1,6 +1,6 @@ import fs from "node:fs/promises" import path from "node:path" -import { comparePerfBaselines, type PerfScenarioSummary } from "../src/testing/perf-metrics" +import { comparePerfBaselines, renderPerfBaselineComment, type PerfScenarioSummary } from "../src/testing/perf-metrics" function readArg(flag: string) { const index = process.argv.indexOf(flag) @@ -22,6 +22,7 @@ async function main() { const basePath = readArg("--base") const headPath = readArg("--head") const outputPath = readArg("--output") + const commentOutputPath = readArg("--comment-output") if (!basePath || !headPath) { throw new Error("Usage: bun script/compare-perf.ts --base --head [--output ]") @@ -34,6 +35,10 @@ async function main() { await fs.mkdir(path.dirname(outputPath), { recursive: true }) await fs.writeFile(outputPath, `${JSON.stringify(comparison, null, 2)}\n`) } + if (commentOutputPath) { + await fs.mkdir(path.dirname(commentOutputPath), { recursive: true }) + await fs.writeFile(commentOutputPath, renderPerfBaselineComment(comparison)) + } const summary = { pass: comparison.pass, diff --git a/packages/app/src/testing/perf-metrics.test.ts b/packages/app/src/testing/perf-metrics.test.ts index 7852d47d..c9d1502d 100644 --- a/packages/app/src/testing/perf-metrics.test.ts +++ b/packages/app/src/testing/perf-metrics.test.ts @@ -1,5 +1,5 @@ import { describe, expect, test } from "bun:test" -import { aggregatePerfRuns, comparePerfBaselines, comparePerfScenarioSummaries, summarizePerfRun } from "./perf-metrics" +import { aggregatePerfRuns, comparePerfBaselines, comparePerfScenarioSummaries, PERF_COMMENT_MARKER, renderPerfBaselineComment, summarizePerfRun } from "./perf-metrics" describe("perf metrics", () => { test("summarizes a perf sample window", () => { @@ -304,4 +304,96 @@ describe("perf metrics", () => { expect(result.pass).toBe(false) expect(result.failures).toContain("missing_head_scenario:tool-call-expand") }) + + test("renders a markdown comment with scenario deltas and fail or warn status", () => { + const comparison = comparePerfBaselines({ + base: [ + aggregatePerfRuns({ + branch: "base", + scenario: "homepage-cold", + runs: [ + { + interaction_ms: 40, + interaction_delay_ms: 2, + long_task_count: 0, + long_task_max_ms: 0, + tbt_ms: 0, + frame_gap_p95_ms: 16, + frame_gap_max_ms: 24, + jank_count_50ms: 0, + cls: 0, + window_ms: 900, + }, + ], + }), + aggregatePerfRuns({ + branch: "base", + scenario: "session-scroll-reading", + runs: [ + { + interaction_ms: 16, + interaction_delay_ms: 1, + long_task_count: 0, + long_task_max_ms: 0, + tbt_ms: 0, + frame_gap_p95_ms: 16, + frame_gap_max_ms: 16, + jank_count_50ms: 0, + cls: 0.01, + window_ms: 300, + }, + ], + }), + ], + head: [ + aggregatePerfRuns({ + branch: "head", + scenario: "homepage-cold", + runs: [ + { + interaction_ms: 40, + interaction_delay_ms: 2, + long_task_count: 0, + long_task_max_ms: 0, + tbt_ms: 0, + frame_gap_p95_ms: 16, + frame_gap_max_ms: 24, + jank_count_50ms: 0, + cls: 0, + window_ms: 900, + fcp_ms: 2400, + lcp_ms: 3100, + }, 
+ ], + }), + aggregatePerfRuns({ + branch: "head", + scenario: "session-scroll-reading", + runs: [ + { + interaction_ms: 32, + interaction_delay_ms: 1, + long_task_count: 0, + long_task_max_ms: 0, + tbt_ms: 0, + frame_gap_p95_ms: 16, + frame_gap_max_ms: 16, + jank_count_50ms: 0, + cls: 0.01, + window_ms: 300, + }, + ], + }), + ], + }) + + const comment = renderPerfBaselineComment(comparison) + + expect(comment).toContain(PERF_COMMENT_MARKER) + expect(comment).toContain("## Perf delta summary") + expect(comment).toContain("| homepage-cold |") + expect(comment).toContain("| session-scroll-reading |") + expect(comment).toContain("warn: fcp_ms, lcp_ms") + expect(comment).toContain("fail: interaction_ms_median") + }) }) diff --git a/packages/app/src/testing/perf-metrics.ts b/packages/app/src/testing/perf-metrics.ts index e0b8805d..bedb132c 100644 --- a/packages/app/src/testing/perf-metrics.ts +++ b/packages/app/src/testing/perf-metrics.ts @@ -67,6 +67,8 @@ export type PerfBaselineComparison = { scenarios: PerfScenarioComparison[] } +export const PERF_COMMENT_MARKER = "" + const perfDeltaThresholds = { interactionMedianMs: 10, interactionMedianRatio: 1.05, @@ -202,6 +204,56 @@ function addAbsoluteWarning(target: string[], key: string, value: number | undef if (value > threshold) target.push(key) } +function formatDelta(value: number) { + if (value === 0) return "0" + return `${value > 0 ? "+" : ""}${round(value)}` +} + +function formatMetricDelta(base: number, head: number) { + return `${base} -> ${head} (${formatDelta(head - base)})` +} + +function scenarioStatus(input: PerfScenarioComparison) { + if (input.failures.length > 0) return `fail: ${input.failures.join(", ")}` + if (input.warnings.length > 0) return `warn: ${input.warnings.join(", ")}` + return "pass" +} + +export function renderPerfBaselineComment(input: PerfBaselineComparison) { + const lines = [ + PERF_COMMENT_MARKER, + "## Perf delta summary", + "", + `Comparator: ${input.pass ? "pass" : "fail"}`, + "", + "| Scenario | interaction median | interaction worst | long task max | tbt | frame gap p95 | frame gap max | jank count | cls | status |", + "| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |", + ] + + for (const scenario of input.scenarios) { + const columns = [ + scenario.scenario, + formatMetricDelta(scenario.base.interaction_ms_median, scenario.head.interaction_ms_median), + formatMetricDelta(scenario.base.interaction_ms_worst, scenario.head.interaction_ms_worst), + formatMetricDelta(scenario.base.long_task_max_ms, scenario.head.long_task_max_ms), + formatMetricDelta(scenario.base.tbt_ms, scenario.head.tbt_ms), + formatMetricDelta(scenario.base.frame_gap_p95_ms, scenario.head.frame_gap_p95_ms), + formatMetricDelta(scenario.base.frame_gap_max_ms, scenario.head.frame_gap_max_ms), + formatMetricDelta(scenario.base.jank_count_50ms, scenario.head.jank_count_50ms), + formatMetricDelta(scenario.base.cls, scenario.head.cls), + scenarioStatus(scenario), + ] + lines.push(`| ${columns.join(" | ")} |`) + } + + if (input.failures.some((failure) => failure.startsWith("missing_"))) { + lines.push("") + lines.push(`Missing scenarios: ${input.failures.filter((failure) => failure.startsWith("missing_")).join(", ")}`) + } + + return `${lines.join("\n")}\n` +} + export function comparePerfScenarioSummaries(input: { scenario: string base: PerfScenarioSummary