Skip to content
73 changes: 72 additions & 1 deletion .github/workflows/perf-probe-baseline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,19 @@ on:
pull_request:
paths:
- ".github/workflows/perf-probe-baseline.yml"
- "packages/app/src/**"
- "packages/ui/src/**"
Comment thread
coderabbitai[bot] marked this conversation as resolved.
- "packages/app/e2e/perf/**"
- "packages/app/e2e/fixtures.ts"
- "packages/app/package.json"
- "packages/app/script/compare-perf.ts"
- "packages/app/script/e2e-local.ts"
- "packages/app/src/testing/perf-metrics*"
workflow_dispatch:

permissions:
contents: read
pull-requests: write

env:
PLAYWRIGHT_BROWSERS_PATH: ${{ github.workspace }}/.playwright-browsers

Expand Down Expand Up @@ -85,11 +90,77 @@ jobs:
run: bun --cwd head/packages/app test:e2e:local:perf

- name: Compare base and head perf baselines
id: compare
continue-on-error: true
run: >
bun head/packages/app/script/compare-perf.ts
--base "${PERF_ARTIFACT_DIR}/perf-base.json"
--head "${PERF_ARTIFACT_DIR}/perf-head.json"
--output "${PERF_ARTIFACT_DIR}/perf-compare.json"
--comment-output "${PERF_ARTIFACT_DIR}/perf-comment.md"

- name: Comment perf deltas on pull request
  # Only same-repo PRs get the sticky comment; forked PRs lack write token scope.
  if: github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository
  uses: actions/github-script@f28e40c7f34bde8b3046d885e986cb6290c5673b # actions/github-script@v7
  env:
    PR_NUMBER: ${{ github.event.pull_request.number }}
  with:
    script: |
      const fs = require("node:fs")
      // Hidden marker lets us find and update our own comment across runs.
      const marker = "<!-- pawwork-perf-probe-baseline -->"
      const commentPath = `${process.env.GITHUB_WORKSPACE}/perf-artifacts/perf-comment.md`
      // The compare step runs with continue-on-error; if it crashed before
      // writing the comment file, bail out gracefully instead of throwing.
      // A hard failure here would also skip the later diagnostic-trace steps.
      if (!fs.existsSync(commentPath)) {
        core.warning(`perf comment file missing: ${commentPath}`)
        return
      }
      const body = fs.readFileSync(commentPath, "utf8")
      const pull_number = Number(process.env.PR_NUMBER)
      const owner = context.repo.owner
      const repo = context.repo.repo

      // Update the existing marker comment when present; otherwise create one.
      const comments = await github.paginate(github.rest.issues.listComments, {
        owner,
        repo,
        issue_number: pull_number,
        per_page: 100,
      })
      const existing = comments.find((comment) => typeof comment.body === "string" && comment.body.includes(marker))

      if (existing) {
        await github.rest.issues.updateComment({
          owner,
          repo,
          comment_id: existing.id,
          body,
        })
        core.info(`Updated perf delta comment ${existing.id}`)
        return
      }

      await github.rest.issues.createComment({
        owner,
        repo,
        issue_number: pull_number,
        body,
      })
      core.info("Created perf delta comment")

- name: Capture perf diagnostic trace (base)
if: steps.compare.outcome == 'failure'
env:
CI: "true"
PAWWORK_PERF_BRANCH: base
PAWWORK_PERF_OUTPUT: ${{ github.workspace }}/perf-artifacts/perf-base-trace.json
PAWWORK_PERF_TRACE: "1"
run: bun --cwd base/packages/app test:e2e:local:perf

- name: Capture perf diagnostic trace (head)
if: steps.compare.outcome == 'failure'
env:
CI: "true"
PAWWORK_PERF_BRANCH: head
PAWWORK_PERF_OUTPUT: ${{ github.workspace }}/perf-artifacts/perf-head-trace.json
PAWWORK_PERF_TRACE: "1"
run: bun --cwd head/packages/app test:e2e:local:perf

- name: Fail job on comparator regression
if: steps.compare.outcome == 'failure'
run: exit 1

- name: Upload perf probe artifacts
if: always()
Expand Down
36 changes: 33 additions & 3 deletions packages/app/e2e/perf/perf-probe.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,15 @@ import fs from "node:fs/promises"
import path from "node:path"
import { raw } from "../../../opencode/test/lib/llm-server"
import { test, expect } from "../fixtures"
import { withSession } from "../actions"
import { promptSelector, sessionMessageItemSelector, sessionTurnListSelector, scrollViewportSelector } from "../selectors"
import { sessionPath } from "../utils"
import { waitTerminalFocusIdle, withSession } from "../actions"
import {
promptSelector,
sessionMessageItemSelector,
sessionTurnListSelector,
scrollViewportSelector,
terminalSelector,
} from "../selectors"
import { sessionPath, terminalToggleKey } from "../utils"
import { installPerfProbe, resetPerfProbe, snapshotPerfProbe, summarizeScenarioRuns } from "./probe"

const outputPath = process.env.PAWWORK_PERF_OUTPUT ?? path.join(process.cwd(), "e2e", "perf-results", "pr0.1-baseline.json")
Expand Down Expand Up @@ -235,6 +241,30 @@ test.describe("PR0.1 perf probe baseline", () => {
scenarioResults.push(summarizeScenarioRuns({ branch: perfBranch, scenario: "tool-call-expand", runs }))
})

test("terminal-side-panel-open emits a 3-run JSON baseline", async ({ page, project }) => {
await installPerfProbe(page)
await project.open()

const runs = []
for (let run = 0; run < 3; run += 1) {
await withSession(project.sdk, `perf terminal ${Date.now()}-${run}`, async (session) => {
await page.goto(sessionPath(project.directory, session.id))
await expect(page.locator(promptSelector).first()).toBeVisible({ timeout: 30_000 })

const terminal = page.locator(terminalSelector).first()

await resetPerfProbe(page)
await page.keyboard.press(terminalToggleKey)
await waitTerminalFocusIdle(page, { term: terminal })
await settleFrames(page, 4)
runs.push(await snapshotPerfProbe(page))
if (run < 2) await cooldownAfterRun(page)
Comment thread
Astro-Han marked this conversation as resolved.
})
}

scenarioResults.push(summarizeScenarioRuns({ branch: perfBranch, scenario: "terminal-side-panel-open", runs }))
})

test("session-scroll-reading emits a 3-run JSON baseline", async ({ page, project }) => {
await installPerfProbe(page)
await project.open()
Expand Down
3 changes: 2 additions & 1 deletion packages/app/playwright.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ const command = `bun run dev -- --host 0.0.0.0 --port ${port}`
const reuse = !process.env.CI
const workers = Number(process.env.PLAYWRIGHT_WORKERS ?? (process.env.CI ? 5 : 0)) || undefined
const reporter = [["html", { outputFolder: "e2e/playwright-report", open: "never" }], ["line"]] as const
const trace = process.env.PAWWORK_PERF_TRACE === "1" ? "on" : "on-first-retry"

if (process.env.PLAYWRIGHT_JUNIT_OUTPUT) {
reporter.push(["junit", { outputFile: process.env.PLAYWRIGHT_JUNIT_OUTPUT }])
Expand Down Expand Up @@ -38,7 +39,7 @@ export default defineConfig({
},
use: {
baseURL,
trace: "on-first-retry",
trace,
screenshot: "only-on-failure",
video: "retain-on-failure",
},
Expand Down
7 changes: 6 additions & 1 deletion packages/app/script/compare-perf.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import fs from "node:fs/promises"
import path from "node:path"
import { comparePerfBaselines, type PerfScenarioSummary } from "../src/testing/perf-metrics"
import { comparePerfBaselines, renderPerfBaselineComment, type PerfScenarioSummary } from "../src/testing/perf-metrics"

function readArg(flag: string) {
const index = process.argv.indexOf(flag)
Expand All @@ -22,6 +22,7 @@ async function main() {
const basePath = readArg("--base")
const headPath = readArg("--head")
const outputPath = readArg("--output")
const commentOutputPath = readArg("--comment-output")

if (!basePath || !headPath) {
throw new Error("Usage: bun script/compare-perf.ts --base <perf-base.json> --head <perf-head.json> [--output <path>]")
Expand All @@ -34,6 +35,10 @@ async function main() {
await fs.mkdir(path.dirname(outputPath), { recursive: true })
await fs.writeFile(outputPath, `${JSON.stringify(comparison, null, 2)}\n`)
}
if (commentOutputPath) {
await fs.mkdir(path.dirname(commentOutputPath), { recursive: true })
await fs.writeFile(commentOutputPath, renderPerfBaselineComment(comparison))
}

const summary = {
pass: comparison.pass,
Expand Down
94 changes: 93 additions & 1 deletion packages/app/src/testing/perf-metrics.test.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { describe, expect, test } from "bun:test"
import { aggregatePerfRuns, comparePerfBaselines, comparePerfScenarioSummaries, summarizePerfRun } from "./perf-metrics"
import { aggregatePerfRuns, comparePerfBaselines, comparePerfScenarioSummaries, PERF_COMMENT_MARKER, renderPerfBaselineComment, summarizePerfRun } from "./perf-metrics"

describe("perf metrics", () => {
test("summarizes a perf sample window", () => {
Expand Down Expand Up @@ -304,4 +304,96 @@ describe("perf metrics", () => {
expect(result.pass).toBe(false)
expect(result.failures).toContain("missing_head_scenario:tool-call-expand")
})

test("renders a markdown comment with scenario deltas and fail or warn status", () => {
  // One canonical sample; each case overrides only the fields that differ.
  const sample = (overrides: Record<string, number> = {}) => ({
    interaction_ms: 40,
    interaction_delay_ms: 2,
    long_task_count: 0,
    long_task_max_ms: 0,
    tbt_ms: 0,
    frame_gap_p95_ms: 16,
    frame_gap_max_ms: 24,
    jank_count_50ms: 0,
    cls: 0,
    window_ms: 900,
    ...overrides,
  })
  const summarize = (branch: string, scenario: string, overrides: Record<string, number> = {}) =>
    aggregatePerfRuns({ branch, scenario, runs: [sample(overrides)] })

  // Shared shape of the scroll scenario on both branches.
  const scrollSample = {
    interaction_ms: 16,
    interaction_delay_ms: 1,
    frame_gap_max_ms: 16,
    cls: 0.01,
    window_ms: 300,
  }

  const comparison = comparePerfBaselines({
    base: [
      summarize("base", "homepage-cold"),
      summarize("base", "session-scroll-reading", scrollSample),
    ],
    head: [
      // Head adds paint metrics absent on base -> expected warn.
      summarize("head", "homepage-cold", { fcp_ms: 2400, lcp_ms: 3100 }),
      // Head doubles the median interaction time -> expected fail.
      summarize("head", "session-scroll-reading", { ...scrollSample, interaction_ms: 32 }),
    ],
  })

  const comment = renderPerfBaselineComment(comparison)

  // Marker + heading make the comment findable and updatable on the PR.
  expect(comment).toContain(PERF_COMMENT_MARKER)
  expect(comment).toContain("## Perf delta summary")
  expect(comment).toContain("| homepage-cold |")
  expect(comment).toContain("| session-scroll-reading |")
  expect(comment).toContain("warn: fcp_ms, lcp_ms")
  expect(comment).toContain("fail: interaction_ms_median")
})
})
42 changes: 42 additions & 0 deletions packages/app/src/testing/perf-metrics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ export type PerfBaselineComparison = {
scenarios: PerfScenarioComparison[]
}

export const PERF_COMMENT_MARKER = "<!-- pawwork-perf-probe-baseline -->"

const perfDeltaThresholds = {
interactionMedianMs: 10,
interactionMedianRatio: 1.05,
Expand Down Expand Up @@ -202,6 +204,46 @@ function addAbsoluteWarning(target: string[], key: string, value: number | undef
if (value > threshold) target.push(key)
}

// Renders a numeric delta with an explicit leading sign, e.g. "+3" / "-2" / "0".
function formatDelta(value: number) {
  if (value === 0) return "0"
  const sign = value > 0 ? "+" : ""
  return `${sign}${round(value)}`
}

// Builds the "base -> head (+delta)" cell text for one metric in the table.
function formatMetricDelta(base: number, head: number) {
  const delta = formatDelta(head - base)
  return `${base} -> ${head} (${delta})`
}

// Collapses a scenario comparison into a single status cell:
// failures take precedence over warnings; otherwise the scenario passes.
function scenarioStatus(input: PerfScenarioComparison) {
  const { failures, warnings } = input
  if (failures.length > 0) return `fail: ${failures.join(", ")}`
  return warnings.length > 0 ? `warn: ${warnings.join(", ")}` : "pass"
}

export function renderPerfBaselineComment(input: PerfBaselineComparison) {
const lines = [
PERF_COMMENT_MARKER,
"## Perf delta summary",
"",
`Comparator: ${input.pass ? "pass" : "fail"}`,
"",
"| Scenario | interaction median | interaction worst | long task max | tbt | frame gap p95 | frame gap max | jank count | cls | status |",
"| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |",
]

for (const scenario of input.scenarios) {
lines.push(
`| ${scenario.scenario} | ${formatMetricDelta(scenario.base.interaction_ms_median, scenario.head.interaction_ms_median)} | ${formatMetricDelta(scenario.base.interaction_ms_worst, scenario.head.interaction_ms_worst)} | ${formatMetricDelta(scenario.base.long_task_max_ms, scenario.head.long_task_max_ms)} | ${formatMetricDelta(scenario.base.tbt_ms, scenario.head.tbt_ms)} | ${formatMetricDelta(scenario.base.frame_gap_p95_ms, scenario.head.frame_gap_p95_ms)} | ${formatMetricDelta(scenario.base.frame_gap_max_ms, scenario.head.frame_gap_max_ms)} | ${formatMetricDelta(scenario.base.jank_count_50ms, scenario.head.jank_count_50ms)} | ${formatMetricDelta(scenario.base.cls, scenario.head.cls)} | ${scenarioStatus(scenario)} |`,
)
Comment thread
Astro-Han marked this conversation as resolved.
Outdated
}

if (input.failures.some((failure) => failure.startsWith("missing_"))) {
lines.push("")
lines.push(`Missing scenarios: ${input.failures.filter((failure) => failure.startsWith("missing_")).join(", ")}`)
}

return `${lines.join("\n")}\n`
}

export function comparePerfScenarioSummaries(input: {
scenario: string
base: PerfScenarioSummary
Expand Down
Loading