diff --git a/.github/workflows/perf-probe-baseline.yml b/.github/workflows/perf-probe-baseline.yml
new file mode 100644
index 000000000..6798510b7
--- /dev/null
+++ b/.github/workflows/perf-probe-baseline.yml
@@ -0,0 +1,67 @@
+name: perf-probe-baseline
+
+on:
+  pull_request:
+    paths:
+      - ".github/workflows/perf-probe-baseline.yml"
+      - "packages/app/e2e/perf/**"
+      - "packages/app/e2e/fixtures.ts"
+      - "packages/app/package.json"
+      - "packages/app/script/e2e-local.ts"
+      - "packages/app/src/testing/perf-metrics*"
+  workflow_dispatch:
+
+env:
+  PLAYWRIGHT_BROWSERS_PATH: ${{ github.workspace }}/.playwright-browsers
+
+jobs:
+  perf-probe-baseline:
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # actions/checkout@v6
+        with:
+          persist-credentials: false
+
+      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # actions/setup-node@v6.4.0
+        with:
+          node-version: "24"
+
+      - uses: oven-sh/setup-bun@0c5077e51419868618aeaa5fe8019c62421857d6 # oven-sh/setup-bun@v2
+        with:
+          bun-version: "1.3.13"
+
+      - uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # actions/cache@v5
+        with:
+          path: ~/.bun/install/cache
+          key: bun-${{ runner.os }}-${{ hashFiles('bun.lock') }}
+          restore-keys: |
+            bun-${{ runner.os }}-
+
+      - uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # actions/cache@v5
+        with:
+          path: ${{ github.workspace }}/.playwright-browsers
+          key: playwright-${{ runner.os }}-${{ hashFiles('packages/app/package.json', 'bun.lock') }}
+
+      - run: bun install --frozen-lockfile
+
+      - name: Install Playwright browsers
+        working-directory: packages/app
+        run: bunx playwright install --with-deps chromium
+
+      - name: Run perf probe baseline
+        env:
+          CI: "true"
+        run: bun --cwd packages/app test:e2e:local:perf
+
+      - name: Upload perf probe artifacts
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # actions/upload-artifact@v7
+        with:
+          name: perf-probe-baseline-${{ github.run_attempt }}
+          if-no-files-found: ignore
+          retention-days: 7
+          path: |
+            packages/app/e2e/perf-results
+            packages/app/e2e/playwright-report
+            packages/app/e2e/test-results
diff --git a/packages/app/.gitignore b/packages/app/.gitignore
index d699efb38..12bee1874 100644
--- a/packages/app/.gitignore
+++ b/packages/app/.gitignore
@@ -1,3 +1,4 @@
 src/assets/theme.css
 e2e/test-results
 e2e/playwright-report
+e2e/perf-results
diff --git a/packages/app/e2e/perf/perf-probe.spec.ts b/packages/app/e2e/perf/perf-probe.spec.ts
new file mode 100644
index 000000000..e44cfb0dd
--- /dev/null
+++ b/packages/app/e2e/perf/perf-probe.spec.ts
@@ -0,0 +1,266 @@
+import fs from "node:fs/promises"
+import path from "node:path"
+import { raw } from "../../../opencode/test/lib/llm-server"
+import { test, expect } from "../fixtures"
+import { withSession } from "../actions"
+import { promptSelector, sessionMessageItemSelector, sessionTurnListSelector, scrollViewportSelector } from "../selectors"
+import { sessionPath } from "../utils"
+import { installPerfProbe, resetPerfProbe, snapshotPerfProbe, summarizeScenarioRuns } from "./probe"
+
+const outputPath =
+  process.env.PAWWORK_PERF_OUTPUT ??
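+  // Falls back to the gitignored e2e/perf-results directory that CI uploads as an artifact.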
+  path.join(process.cwd(), "e2e", "perf-results", "pr0.1-baseline.json")
+
+const longMarkdown = [
+  "# Baseline stream",
+  "",
+  "This stream exists to stress markdown rendering while the session remains interactive.",
+  "",
+  "- list item one",
+  "- list item two with a [link](https://example.com)",
+  "- 中英混排 content for layout and glyph coverage",
+  "",
+  "```ts",
+  "export function sample(input: number) {",
+  "  return input * 2",
+  "}",
+  "```",
+  "",
+  ...Array.from({ length: 80 }, (_, index) => `Paragraph ${index + 1}: ${"streaming markdown content ".repeat(8)}`),
+].join("\n")
+
+const scenarioResults: ReturnType<typeof summarizeScenarioRuns>[] = []
+
+const chatChunk = (delta: Record<string, unknown>, input?: { finish?: string; usage?: { input: number; output: number } }) => ({
+  id: "chatcmpl-test",
+  object: "chat.completion.chunk",
+  choices: [
+    {
+      delta,
+      ...(input?.finish ? { finish_reason: input.finish } : {}),
+    },
+  ],
+  ...(input?.usage
+    ? {
+        usage: {
+          prompt_tokens: input.usage.input,
+          completion_tokens: input.usage.output,
+          total_tokens: input.usage.input + input.usage.output,
+        },
+      }
+    : {}),
+})
+
+function splitText(value: string, size: number) {
+  const out: string[] = []
+  for (let index = 0; index < value.length; index += size) {
+    out.push(value.slice(index, index + size))
+  }
+  return out
+}
+
+function deferred() {
+  let resolve!: () => void
+  const promise = new Promise<void>((done) => {
+    resolve = done
+  })
+  return { promise, resolve }
+}
+
+async function settleFrames(page: Parameters<typeof installPerfProbe>[0], count = 2) {
+  await page.evaluate(async (frames) => {
+    for (let index = 0; index < frames; index += 1) {
+      await new Promise<void>((resolve) => requestAnimationFrame(() => resolve()))
+    }
+  }, count)
+}
+
+async function navigateProjectHome(page: Parameters<typeof installPerfProbe>[0], directory: string) {
+  await page.goto(sessionPath(directory))
+  await expect(page.locator('[data-component="session-new-home"]')).toBeVisible()
+}
+
+async function readPromptSend(page: Parameters<typeof installPerfProbe>[0]) {
+  return page.evaluate(() => {
+    const win = window as Window & {
+      __opencode_e2e?: {
+        prompt?: {
+          sent?: {
+            started?: number
+            count?: number
+            sessionID?: string
+          }
+        }
+      }
+    }
+    const sent = win.__opencode_e2e?.prompt?.sent
+    return {
+      started: sent?.started ?? 0,
+      count: sent?.count ??
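+        // The e2e prompt hook may not have recorded a send yet, so default to zero.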
+        0,
+      sessionID: sent?.sessionID,
+    }
+  })
+}
+
+async function submitVisiblePrompt(page: Parameters<typeof installPerfProbe>[0], text: string) {
+  const prompt = page.locator(promptSelector).first()
+  const previous = await readPromptSend(page)
+  await expect(prompt).toBeVisible()
+  await prompt.click()
+  await prompt.fill("")
+  await page.keyboard.type(text)
+  await page.keyboard.press("Enter")
+  await expect.poll(async () => (await readPromptSend(page)).started, { timeout: 10_000 }).toBeGreaterThan(previous.started)
+}
+
+async function scrollTimelineTo(page: Parameters<typeof installPerfProbe>[0], top: number) {
+  const found = await page.evaluate(
+    ({ top, scrollViewportSelector, turnListSelector }) => {
+      const list = document.querySelector(turnListSelector)
+      const viewport = list?.closest(scrollViewportSelector)
+      if (!(viewport instanceof HTMLElement)) return false
+      viewport.scrollTop = top
+      viewport.dispatchEvent(new Event("scroll", { bubbles: true }))
+      return true
+    },
+    { top, scrollViewportSelector, turnListSelector: sessionTurnListSelector },
+  )
+  expect(found).toBe(true)
+}
+
+test.describe("PR0.1 perf probe baseline", () => {
+  test.describe.configure({ mode: "serial" })
+
+  test.afterAll(async () => {
+    await fs.mkdir(path.dirname(outputPath), { recursive: true })
+    await fs.writeFile(outputPath, `${JSON.stringify(scenarioResults, null, 2)}\n`)
+  })
+
+  test("homepage-cold emits a 3-run JSON baseline", async ({ page, project }) => {
+    await installPerfProbe(page)
+    await project.open()
+
+    const runs = []
+    for (let run = 0; run < 3; run += 1) {
+      if (run > 0) await navigateProjectHome(page, project.directory)
+      const prompt = page.locator(promptSelector).first()
+      await expect(prompt).toBeVisible()
+      await prompt.click()
+      await page.getByRole("button", { name: /Switch workspace|切换工作目录/i }).click()
+      await settleFrames(page, 3)
+      await page.keyboard.press("Escape")
+      runs.push(await snapshotPerfProbe(page))
+    }
+
+    scenarioResults.push(summarizeScenarioRuns({ branch: "dev", scenario: "homepage-cold", runs }))
+  })
+
+  test("session-streaming-long emits a 3-run JSON baseline", async ({ page, project, llm }) => {
+    await installPerfProbe(page)
+    await project.open()
+
+    const runs = []
+    for (let run = 0; run < 3; run += 1) {
+      const firstWave = deferred()
+      const secondWave = deferred()
+      const chunks = splitText(longMarkdown, 320)
+      const headChunks = [chatChunk({ role: "assistant" }), ...chunks.slice(0, 3).map((chunk) => chatChunk({ content: chunk }))]
+      const stageOneChunks = chunks.slice(3, 9).map((chunk) => chatChunk({ content: chunk }))
+      const stageTwoChunks = chunks.slice(9).map((chunk) => chatChunk({ content: chunk }))
+
+      await navigateProjectHome(page, project.directory)
+      await llm.push(
+        raw({
+          head: headChunks,
+          stages: [
+            { wait: firstWave.promise, chunks: stageOneChunks },
+            { wait: secondWave.promise, chunks: stageTwoChunks },
+          ],
+          tail: [chatChunk({}, { finish: "stop", usage: { input: 120, output: 480 } })],
+        }),
+      )
+
+      const send = project.prompt(`Stream probe ${run + 1}`)
+      await expect.poll(() => page.url(), { timeout: 30_000 }).toContain("/session/")
+      await expect(page.getByText("This stream exists to stress markdown rendering while the session remains interactive.")).toBeVisible({
+        timeout: 30_000,
+      })
+
+      await resetPerfProbe(page)
+      const click = page.getByRole("button", { name: "Right utility panel" }).click()
+      firstWave.resolve()
+      await click
+      await settleFrames(page, 6)
+      secondWave.resolve()
+      await send
+      runs.push(await snapshotPerfProbe(page))
+    }
+
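+    // Fold the three streamed runs into one scenario row: per-metric medians plus the worst interaction.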
+    scenarioResults.push(summarizeScenarioRuns({ branch: "dev", scenario: "session-streaming-long", runs }))
+  })
+
+  test("tool-call-expand emits a 3-run JSON baseline", async ({ page, project, llm }) => {
+    await installPerfProbe(page)
+    await project.open()
+
+    const runs = []
+    for (let run = 0; run < 3; run += 1) {
+      await navigateProjectHome(page, project.directory)
+      const created = await project.sdk.worktree.create({ directory: project.directory }).then((result) => result.data)
+      if (!created?.directory) throw new Error("Failed to create worktree for perf probe")
+      project.trackDirectory(created.directory)
+      await llm.tool("enter-worktree", { path: created.directory })
+      await llm.text(`tool call baseline ${run + 1}`)
+      await project.prompt(`Create todos for perf probe run ${run + 1}.`)
+      const trigger = page.locator('[data-slot="collapsible-trigger"]').filter({ has: page.locator('[data-component="tool-trigger"]') }).first()
+      await expect(trigger).toBeVisible({ timeout: 30_000 })
+      await resetPerfProbe(page)
+      await trigger.click()
+      await expect(trigger).toHaveAttribute("aria-expanded", "true")
+      await trigger.click()
+      await expect(trigger).toHaveAttribute("aria-expanded", "false")
+      await trigger.click()
+      await expect(trigger).toHaveAttribute("aria-expanded", "true")
+      await settleFrames(page, 4)
+      runs.push(await snapshotPerfProbe(page))
+    }
+
+    scenarioResults.push(summarizeScenarioRuns({ branch: "dev", scenario: "tool-call-expand", runs }))
+  })
+
+  test("session-scroll-reading emits a 3-run JSON baseline", async ({ page, project }) => {
+    await installPerfProbe(page)
+    await project.open()
+
+    const runs = []
+    for (let run = 0; run < 3; run += 1) {
+      await withSession(project.sdk, `perf scroll ${Date.now()}-${run}`, async (session) => {
+        for (let index = 0; index < 18; index += 1) {
+          await project.sdk.session.promptAsync({
+            sessionID: session.id,
+            noReply: true,
+            parts: [
+              {
+                type: "text",
+                text: `scroll seed ${run}-${index}\n${Array.from({ length: 18 }, (_, line) => `line ${line} ${"content ".repeat(8)}`).join("\n")}`,
+              },
+            ],
+          })
+        }
+
+        await page.goto(sessionPath(project.directory, session.id))
+        await expect(page.locator(sessionMessageItemSelector).first()).toBeVisible({ timeout: 30_000 })
+        await expect.poll(async () => page.locator(sessionMessageItemSelector).count()).toBeGreaterThanOrEqual(8)
+        await resetPerfProbe(page)
+        await page.locator(scrollViewportSelector).first().hover()
+        await page.mouse.wheel(0, -3600)
+        await settleFrames(page, 2)
+        await scrollTimelineTo(page, 0)
+        await settleFrames(page, 2)
+        await page.mouse.wheel(0, 3600)
+        await settleFrames(page, 4)
+        runs.push(await snapshotPerfProbe(page))
+      })
+    }
+
+    scenarioResults.push(summarizeScenarioRuns({ branch: "dev", scenario: "session-scroll-reading", runs }))
+  })
+})
diff --git a/packages/app/e2e/perf/probe.ts b/packages/app/e2e/perf/probe.ts
new file mode 100644
index 000000000..ffd52f235
--- /dev/null
+++ b/packages/app/e2e/perf/probe.ts
@@ -0,0 +1,163 @@
+import type { Page } from "@playwright/test"
+import { aggregatePerfRuns, summarizePerfRun, type PerfRunSummary } from "../../src/testing/perf-metrics"
+
+type BrowserPerfSample = {
+  startedAt: number
+  endedAt: number
+  interactions: Array<{ at: number; delay: number; duration: number }>
+  longTasks: Array<{ at: number; duration: number }>
+  frames: Array<{ at: number; duration: number }>
+  shifts: Array<{ at: number; value: number }>
+  fcpMs?: number
+  lcpMs?: number
+  heapUsedMb?: number
+}
+
+type BrowserPerfWindow = Window & {
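+  // Installed by installPerfProbe's init script; undefined until that script has run in the page.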
+  __pawwork_perf_probe?: {
+    reset: () => void
+    snapshot: () => BrowserPerfSample
+  }
+}
+
+export async function installPerfProbe(page: Page) {
+  await page.addInitScript(() => {
+    const win = window as BrowserPerfWindow
+    if (win.__pawwork_perf_probe) return
+
+    const interactions: Array<{ at: number; delay: number; duration: number }> = []
+    const longTasks: Array<{ at: number; duration: number }> = []
+    const frames: Array<{ at: number; duration: number }> = []
+    const shifts: Array<{ at: number; value: number }> = []
+    const maxEntries = 4000
+    const supported = PerformanceObserver.supportedEntryTypes ?? []
+    let startedAt = 0
+    let raf = 0
+    let lastFrame = 0
+    let fcpMs: number | undefined
+    let lcpMs: number | undefined
+
+    const trim = <T>(list: T[]) => {
+      if (list.length <= maxEntries) return
+      list.splice(0, list.length - maxEntries)
+    }
+
+    const observe = (
+      type: string,
+      init: PerformanceObserverInit & { durationThreshold?: number },
+      fn: (entries: PerformanceEntry[]) => void,
+    ) => {
+      if (!supported.includes(type)) return
+      const observer = new PerformanceObserver((list) => fn(list.getEntries()))
+      try {
+        observer.observe(init)
+      } catch {
+        observer.disconnect()
+      }
+    }
+
+    observe("event", { buffered: true, durationThreshold: 16, type: "event" }, (entries) => {
+      for (const entry of entries as Array<PerformanceEntry & { processingStart?: number }>) {
+        if (entry.duration < 16) continue
+        interactions.push({
+          at: entry.startTime,
+          delay: Math.max(0, (entry.processingStart ?? entry.startTime) - entry.startTime),
+          duration: entry.duration,
+        })
+      }
+      trim(interactions)
+    })
+
+    observe("longtask", { buffered: true, type: "longtask" }, (entries) => {
+      for (const entry of entries) {
+        longTasks.push({ at: entry.startTime, duration: entry.duration })
+      }
+      trim(longTasks)
+    })
+
+    observe("layout-shift", { buffered: true, type: "layout-shift" }, (entries) => {
+      for (const entry of entries as Array<PerformanceEntry & { hadRecentInput?: boolean; value?: number }>) {
+        if (entry.hadRecentInput) continue
+        if (typeof entry.value !== "number") continue
+        shifts.push({ at: entry.startTime, value: entry.value })
+      }
+      trim(shifts)
+    })
+
+    observe("paint", { buffered: true, type: "paint" }, (entries) => {
+      for (const entry of entries) {
+        if (entry.name === "first-contentful-paint") {
+          fcpMs = entry.startTime
+        }
+      }
+    })
+
+    observe("largest-contentful-paint", { buffered: true, type: "largest-contentful-paint" }, (entries) => {
+      for (const entry of entries) {
+        lcpMs = entry.startTime
+      }
+    })
+
+    const loop = (at: number) => {
+      if (document.visibilityState === "visible") {
+        if (lastFrame !== 0) {
+          frames.push({ at, duration: at - lastFrame })
+          trim(frames)
+        }
+        lastFrame = at
+      } else {
+        lastFrame = 0
+      }
+      raf = requestAnimationFrame(loop)
+    }
+
+    raf = requestAnimationFrame(loop)
+    win.addEventListener("beforeunload", () => {
+      if (raf !== 0) cancelAnimationFrame(raf)
+    })
+
+    win.__pawwork_perf_probe = {
+      reset() {
+        startedAt = performance.now()
+      },
+      snapshot() {
+        const memory = performance as Performance & { memory?: { usedJSHeapSize?: number } }
+        return {
+          startedAt,
+          endedAt: performance.now(),
+          interactions: interactions.slice(),
+          longTasks: longTasks.slice(),
+          frames: frames.slice(),
+          shifts: shifts.slice(),
+          fcpMs,
+          lcpMs,
+          heapUsedMb:
+            typeof memory.memory?.usedJSHeapSize === "number"
+              ?
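+                // performance.memory is a non-standard Chromium-only API, hence the feature check; bytes to MB.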
+                memory.memory.usedJSHeapSize / 1024 / 1024
+              : undefined,
+        }
+      },
+    }
+  })
+}
+
+export async function resetPerfProbe(page: Page) {
+  await page.evaluate(() => {
+    const probe = (window as BrowserPerfWindow).__pawwork_perf_probe
+    if (!probe) throw new Error("Perf probe is not installed")
+    probe.reset()
+  })
+}
+
+export async function snapshotPerfProbe(page: Page) {
+  const sample = await page.evaluate(() => {
+    const probe = (window as BrowserPerfWindow).__pawwork_perf_probe
+    if (!probe) throw new Error("Perf probe is not installed")
+    return probe.snapshot()
+  })
+  return summarizePerfRun(sample)
+}
+
+export function summarizeScenarioRuns(input: { branch: string; scenario: string; runs: PerfRunSummary[] }) {
+  return aggregatePerfRuns(input)
+}
diff --git a/packages/app/package.json b/packages/app/package.json
index 64f8dbcce..250a75638 100644
--- a/packages/app/package.json
+++ b/packages/app/package.json
@@ -22,6 +22,8 @@
     "test:e2e": "playwright test",
     "test:e2e:smoke": "playwright test --grep @smoke",
     "test:e2e:local": "bun script/e2e-local.ts",
+    "test:e2e:local:perf": "bun script/e2e-local.ts -- e2e/perf/perf-probe.spec.ts",
+    "test:e2e:perf": "playwright test e2e/perf/perf-probe.spec.ts",
     "test:e2e:local:smoke": "bun script/e2e-local.ts -- --grep @smoke",
     "test:e2e:ui": "playwright test --ui",
     "test:e2e:report": "playwright show-report e2e/playwright-report"
diff --git a/packages/app/src/testing/perf-metrics.test.ts b/packages/app/src/testing/perf-metrics.test.ts
new file mode 100644
index 000000000..8ede5ba95
--- /dev/null
+++ b/packages/app/src/testing/perf-metrics.test.ts
@@ -0,0 +1,119 @@
+import { describe, expect, test } from "bun:test"
+import { aggregatePerfRuns, summarizePerfRun } from "./perf-metrics"
+
+describe("perf metrics", () => {
+  test("summarizes a perf sample window", () => {
+    const summary = summarizePerfRun({
+      startedAt: 100,
+      endedAt: 200,
+      interactions: [
+        { at: 90, delay: 1, duration: 10 },
+        { at: 120, delay: 8, duration: 72 },
+        { at: 160, delay: 4, duration: 41 },
+      ],
+      longTasks: [
+        { at: 80, duration: 120 },
+        { at: 130, duration: 80 },
+        { at: 170, duration: 55 },
+      ],
+      frames: [
+        { at: 110, duration: 16 },
+        { at: 120, duration: 18 },
+        { at: 130, duration: 22 },
+        { at: 140, duration: 55 },
+        { at: 150, duration: 80 },
+      ],
+      shifts: [
+        { at: 95, value: 0.3 },
+        { at: 135, value: 0.01 },
+        { at: 165, value: 0.02 },
+      ],
+      fcpMs: 456.4,
+      lcpMs: 789.2,
+      heapUsedMb: 123.6,
+    })
+
+    expect(summary).toEqual({
+      interaction_ms: 72,
+      interaction_delay_ms: 8,
+      long_task_count: 2,
+      long_task_max_ms: 80,
+      tbt_ms: 35,
+      frame_gap_p95_ms: 80,
+      frame_gap_max_ms: 80,
+      jank_count_50ms: 2,
+      cls: 0.03,
+      window_ms: 100,
+      fcp_ms: 456.4,
+      lcp_ms: 789.2,
+      heap_used_mb: 123.6,
+    })
+  })
+
+  test("aggregates scenario medians and worst interaction", () => {
+    const summary = aggregatePerfRuns({
+      branch: "dev",
+      scenario: "tool-call-expand",
+      runs: [
+        {
+          interaction_ms: 40,
+          interaction_delay_ms: 6,
+          long_task_count: 1,
+          long_task_max_ms: 61,
+          tbt_ms: 11,
+          frame_gap_p95_ms: 28,
+          frame_gap_max_ms: 47,
+          jank_count_50ms: 0,
+          cls: 0.002,
+          window_ms: 1200,
+          fcp_ms: undefined,
+          lcp_ms: undefined,
+          heap_used_mb: 91,
+        },
+        {
+          interaction_ms: 75,
+          interaction_delay_ms: 10,
+          long_task_count: 2,
+          long_task_max_ms: 74,
+          tbt_ms: 24,
+          frame_gap_p95_ms: 35,
+          frame_gap_max_ms: 58,
+          jank_count_50ms: 1,
+          cls: 0.004,
+          window_ms: 1190,
+          fcp_ms: undefined,
+          lcp_ms: undefined,
+          heap_used_mb: 93,
+        },
+        {
+          interaction_ms: 52,
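+          // Middle-of-the-road run: most of the medians asserted below come from this sample.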
+          interaction_delay_ms: 7,
+          long_task_count: 1,
+          long_task_max_ms: 68,
+          tbt_ms: 18,
+          frame_gap_p95_ms: 30,
+          frame_gap_max_ms: 54,
+          jank_count_50ms: 1,
+          cls: 0.003,
+          window_ms: 1210,
+          fcp_ms: undefined,
+          lcp_ms: undefined,
+          heap_used_mb: 95,
+        },
+      ],
+    })
+
+    expect(summary.branch).toBe("dev")
+    expect(summary.scenario).toBe("tool-call-expand")
+    expect(summary.runs).toBe(3)
+    expect(summary.interaction_ms_median).toBe(52)
+    expect(summary.interaction_ms_worst).toBe(75)
+    expect(summary.long_task_max_ms).toBe(68)
+    expect(summary.tbt_ms).toBe(18)
+    expect(summary.frame_gap_p95_ms).toBe(30)
+    expect(summary.frame_gap_max_ms).toBe(54)
+    expect(summary.cls).toBe(0.003)
+    expect(summary.heap_used_mb).toBe(93)
+    expect(summary.run_details).toHaveLength(3)
+  })
+})
diff --git a/packages/app/src/testing/perf-metrics.ts b/packages/app/src/testing/perf-metrics.ts
new file mode 100644
index 000000000..59ba55ce3
--- /dev/null
+++ b/packages/app/src/testing/perf-metrics.ts
@@ -0,0 +1,148 @@
+export type PerfInteractionSample = {
+  at: number
+  delay: number
+  duration: number
+}
+
+export type PerfDurationSample = {
+  at: number
+  duration: number
+}
+
+export type PerfShiftSample = {
+  at: number
+  value: number
+}
+
+export type PerfRunSample = {
+  startedAt: number
+  endedAt: number
+  interactions: PerfInteractionSample[]
+  longTasks: PerfDurationSample[]
+  frames: PerfDurationSample[]
+  shifts: PerfShiftSample[]
+  fcpMs?: number
+  lcpMs?: number
+  heapUsedMb?: number
+}
+
+export type PerfRunSummary = {
+  interaction_ms: number
+  interaction_delay_ms: number
+  long_task_count: number
+  long_task_max_ms: number
+  tbt_ms: number
+  frame_gap_p95_ms: number
+  frame_gap_max_ms: number
+  jank_count_50ms: number
+  cls: number
+  window_ms: number
+  fcp_ms?: number
+  lcp_ms?: number
+  heap_used_mb?: number
+}
+
+export type PerfScenarioSummary = PerfRunSummary & {
+  branch: string
+  scenario: string
+  runs: number
+  interaction_ms_median: number
+  interaction_ms_worst: number
+  run_details: PerfRunSummary[]
+}
+
+function round(input: number) {
+  return Math.round(input * 1000) / 1000
+}
+
+function median(values: number[]) {
+  if (values.length === 0) return 0
+  const sorted = [...values].sort((a, b) => a - b)
+  const mid = Math.floor(sorted.length / 2)
+  return sorted.length % 2 === 0
+    ?
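+      // Even-length list: average the two middle values; odd-length: take the middle value.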
+      (sorted[mid - 1] + sorted[mid]) / 2
+    : sorted[mid]
+}
+
+function percentile(values: number[], p: number) {
+  if (values.length === 0) return 0
+  const sorted = [...values].sort((a, b) => a - b)
+  const index = Math.max(0, Math.ceil(sorted.length * p) - 1)
+  return sorted[index]
+}
+
+function pickWindow<T extends { at: number }>(items: T[], start: number, end: number) {
+  return items.filter((item) => item.at >= start && item.at <= end)
+}
+
+function optionalMedian(values: Array<number | undefined>) {
+  const filtered = values.filter((value): value is number => typeof value === "number")
+  if (filtered.length === 0) return undefined
+  return round(median(filtered))
+}
+
+export function summarizePerfRun(input: PerfRunSample): PerfRunSummary {
+  const startedAt = input.startedAt
+  const endedAt = Math.max(input.endedAt, startedAt)
+  const interactions = pickWindow(input.interactions, startedAt, endedAt)
+  const longTasks = pickWindow(input.longTasks, startedAt, endedAt)
+  const frames = pickWindow(input.frames, startedAt, endedAt)
+  const shifts = pickWindow(input.shifts, startedAt, endedAt)
+
+  const interaction = interactions.reduce((max, entry) => Math.max(max, entry.duration), 0)
+  const interactionDelay = interactions.reduce((max, entry) => Math.max(max, entry.delay), 0)
+  const longTaskMax = longTasks.reduce((max, entry) => Math.max(max, entry.duration), 0)
+  const tbt = longTasks.reduce((sum, entry) => sum + Math.max(0, entry.duration - 50), 0)
+  const frameDurations = frames.map((entry) => entry.duration)
+  const frameGapP95 = percentile(frameDurations, 0.95)
+  const frameGapMax = frameDurations.reduce((max, value) => Math.max(max, value), 0)
+  const jankCount = frameDurations.filter((value) => value > 50).length
+  const cls = shifts.reduce((sum, entry) => sum + entry.value, 0)
+
+  return {
+    interaction_ms: round(interaction),
+    interaction_delay_ms: round(interactionDelay),
+    long_task_count: longTasks.length,
+    long_task_max_ms: round(longTaskMax),
+    tbt_ms: round(tbt),
+    frame_gap_p95_ms: round(frameGapP95),
+    frame_gap_max_ms: round(frameGapMax),
+    jank_count_50ms: jankCount,
+    cls: round(cls),
+    window_ms: round(endedAt - startedAt),
+    fcp_ms: input.fcpMs === undefined ? undefined : round(input.fcpMs),
+    lcp_ms: input.lcpMs === undefined ? undefined : round(input.lcpMs),
+    heap_used_mb: input.heapUsedMb === undefined
+      ?
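+        // Optional metrics pass through as undefined rather than defaulting to zero.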
+        undefined
+      : round(input.heapUsedMb),
+  }
+}
+
+export function aggregatePerfRuns(input: {
+  branch: string
+  scenario: string
+  runs: PerfRunSummary[]
+}): PerfScenarioSummary {
+  const runs = input.runs
+  if (runs.length === 0) {
+    throw new Error(`Cannot aggregate perf runs for ${input.scenario} without samples`)
+  }
+
+  return {
+    branch: input.branch,
+    scenario: input.scenario,
+    runs: runs.length,
+    interaction_ms_median: round(median(runs.map((run) => run.interaction_ms))),
+    interaction_ms_worst: round(runs.reduce((max, run) => Math.max(max, run.interaction_ms), 0)),
+    interaction_ms: round(median(runs.map((run) => run.interaction_ms))),
+    interaction_delay_ms: round(median(runs.map((run) => run.interaction_delay_ms))),
+    long_task_count: round(median(runs.map((run) => run.long_task_count))),
+    long_task_max_ms: round(median(runs.map((run) => run.long_task_max_ms))),
+    tbt_ms: round(median(runs.map((run) => run.tbt_ms))),
+    frame_gap_p95_ms: round(median(runs.map((run) => run.frame_gap_p95_ms))),
+    frame_gap_max_ms: round(median(runs.map((run) => run.frame_gap_max_ms))),
+    jank_count_50ms: round(median(runs.map((run) => run.jank_count_50ms))),
+    cls: round(median(runs.map((run) => run.cls))),
+    window_ms: round(median(runs.map((run) => run.window_ms))),
+    fcp_ms: optionalMedian(runs.map((run) => run.fcp_ms)),
+    lcp_ms: optionalMedian(runs.map((run) => run.lcp_ms)),
+    heap_used_mb: optionalMedian(runs.map((run) => run.heap_used_mb)),
+    run_details: runs,
+  }
+}
diff --git a/packages/opencode/test/lib/llm-server.ts b/packages/opencode/test/lib/llm-server.ts
index 2e2a2ea89..6c7814576 100644
--- a/packages/opencode/test/lib/llm-server.ts
+++ b/packages/opencode/test/lib/llm-server.ts
@@ -35,6 +35,10 @@ type Wait = {
 type Sse = {
   type: "sse"
   head: unknown[]
+  stages?: Array<{
+    wait?: PromiseLike<void>
+    chunks: unknown[]
+  }>
   tail: unknown[]
   wait?: PromiseLike<void>
   hang?: boolean
@@ -288,9 +292,9 @@ function choices(part: unknown) {
   return choice
 }
 
-function flow(item: Sse) {
+function flowParts(parts: Iterable<unknown>) {
   const out: Flow[] = []
-  for (const part of [...item.head, ...item.tail]) {
+  for (const part of parts) {
     const choice = choices(part)
     const delta = choice && "delta" in choice && choice.delta && typeof choice.delta === "object" ? choice.delta : undefined
@@ -344,85 +348,100 @@ function responses(item: Sse, model: string) {
   } | undefined
   let usage: Usage | undefined
 
-  const lines: unknown[] = [responseCreated(model)]
-
-  for (const part of flow(item)) {
-    if (part.type === "text") {
-      msg ??= "msg_1"
-      if (!hasMsg) {
-        hasMsg = true
+  const pushFlow = (target: unknown[], parts: Iterable<unknown>) => {
+    for (const part of flowParts(parts)) {
+      if (part.type === "text") {
+        msg ??= "msg_1"
+        if (!hasMsg) {
+          hasMsg = true
+          seq += 1
+          target.push(responseMessage(msg, seq))
+        }
         seq += 1
-        lines.push(responseMessage(msg, seq))
+        target.push(responseText(msg, part.text, seq))
+        continue
       }
-      seq += 1
-      lines.push(responseText(msg, part.text, seq))
-      continue
-    }
-    if (part.type === "reason") {
-      reason ||= "rs_1"
-      if (!hasReason) {
-        hasReason = true
+      if (part.type === "reason") {
+        reason ||= "rs_1"
+        if (!hasReason) {
+          hasReason = true
+          seq += 1
+          target.push(responseReason(reason, seq))
+          seq += 1
+          target.push(responseReasonPart(reason, seq))
+        }
         seq += 1
-        lines.push(responseReason(reason, seq))
+        target.push(responseReasonText(reason, part.text, seq))
+        continue
+      }
+
+      if (part.type === "tool-start") {
+        call ||= { id: part.id, item: "fc_1", name: part.name, args: "" }
         seq += 1
-        lines.push(responseReasonPart(reason, seq))
+        target.push(responseTool(call.id, call.item, call.name, seq))
+        continue
       }
-      seq += 1
-      lines.push(responseReasonText(reason, part.text, seq))
-      continue
-    }
-    if (part.type === "tool-start") {
-      call ||= { id: part.id, item: "fc_1", name: part.name, args: "" }
-      seq += 1
-      lines.push(responseTool(call.id, call.item, call.name, seq))
-      continue
-    }
+      if (part.type === "tool-args") {
+        if (!call) continue
+        call.args += part.text
+        seq += 1
+        target.push(responseToolArgs(call.item, part.text, seq))
+        continue
+      }
-    if (part.type === "tool-args") {
-      if (!call) continue
-      call.args += part.text
-      seq += 1
-      lines.push(responseToolArgs(call.item, part.text, seq))
-      continue
+      usage = part.usage
     }
-
-    usage = part.usage
   }
+  const head = [responseCreated(model)]
+  pushFlow(head, item.head)
+  const stages =
+    item.stages?.map((stage) => {
+      const chunks: unknown[] = []
+      pushFlow(chunks, stage.chunks)
+      return {
+        wait: stage.wait,
+        chunks,
+      }
+    }) ?? []
+  const tail: unknown[] = []
+  pushFlow(tail, item.tail)
   if (msg) {
     seq += 1
-    lines.push(responseMessageDone(msg, seq))
+    tail.push(responseMessageDone(msg, seq))
   }
   if (reason) {
     seq += 1
-    lines.push(responseReasonDone(reason, seq))
+    tail.push(responseReasonDone(reason, seq))
   }
   if (call && !item.hang && !item.error) {
    seq += 1
-    lines.push(responseToolArgsDone(call.item, call.args, seq))
+    tail.push(responseToolArgsDone(call.item, call.args, seq))
     seq += 1
-    lines.push(responseToolDone(call, seq))
+    tail.push(responseToolDone(call, seq))
   }
-  if (!item.hang && !item.error) lines.push(responseCompleted({ seq: seq + 1, usage }))
-  return { ...item, head: lines, tail: [] } satisfies Sse
-}
-
-function modelFrom(body: unknown) {
-  if (!body || typeof body !== "object") return "test-model"
-  if (!("model" in body) || typeof body.model !== "string") return "test-model"
-  return body.model
+  if (!item.hang && !item.error) tail.push(responseCompleted({ seq: seq + 1, usage }))
+  return { ...item, head, stages, tail } satisfies Sse
 }
 
 function send(item: Sse) {
   const head = bytes(item.head)
+  const segments = item.stages ?? []
   const tail = bytes([...item.tail, ...(item.hang || item.error ? [] : [done])])
   const empty = Stream.fromIterable([])
+
+  let body: Stream.Stream<Uint8Array> = head
+  for (const stage of segments) {
+    const chunkStream = bytes(stage.chunks)
+    const wait = stage.wait
+    const segment = wait ? Stream.fromEffect(Effect.promise(() => wait)).pipe(Stream.flatMap(() => chunkStream)) : chunkStream
+    body = Stream.concat(body, segment)
+  }
   const wait = item.wait
-  const body: Stream.Stream<Uint8Array> = wait
-    ? Stream.concat(head, Stream.fromEffect(Effect.promise(() => wait)).pipe(Stream.flatMap(() => tail)))
-    : Stream.concat(head, tail)
+  body = Stream.concat(body, wait ? Stream.fromEffect(Effect.promise(() => wait)).pipe(Stream.flatMap(() => tail)) : tail)
+
   let end: Stream.Stream<Uint8Array, unknown> = empty
   if (item.error) end = Stream.concat(empty, Stream.fail(item.error))
   else if (item.hang) end = Stream.concat(empty, Stream.never)
@@ -430,23 +449,32 @@ function send(item: Sse) {
   return HttpServerResponse.stream(Stream.concat(body, end), { contentType: "text/event-stream" })
 }
 
+function fail(item: HttpError) {
+  return HttpServerResponse.text(JSON.stringify(item.body), {
+    status: item.status,
+    contentType: "application/json",
+  })
+}
+
 const reset = Effect.fn("TestLLMServer.reset")(function* (item: Sse) {
   const req = yield* HttpServerRequest.HttpServerRequest
   const res = NodeHttpServerRequest.toServerResponse(req)
   yield* Effect.sync(() => {
     res.writeHead(200, { "content-type": "text/event-stream" })
     for (const part of item.head) res.write(line(part))
+    for (const stage of item.stages ?? []) {
+      for (const part of stage.chunks) res.write(line(part))
+    }
     for (const part of item.tail) res.write(line(part))
     res.destroy(new Error("connection reset"))
   })
   return yield* Effect.never
 })
 
-function fail(item: HttpError) {
-  return HttpServerResponse.text(JSON.stringify(item.body), {
-    status: item.status,
-    contentType: "application/json",
-  })
+function modelFrom(body: unknown) {
+  if (!body || typeof body !== "object") return "test-model"
+  if (!("model" in body) || typeof body.model !== "string") return "test-model"
+  return body.model
 }
@@ -568,6 +596,10 @@ export function httpError(status: number, body: unknown): Item {
 export function raw(input: {
   chunks?: unknown[]
   head?: unknown[]
+  stages?: Array<{
+    wait?: PromiseLike<void>
+    chunks: unknown[]
+  }>
   tail?: unknown[]
   wait?: PromiseLike<void>
   hang?: boolean
@@ -577,6 +609,7 @@ export function raw(input: {
   return {
     type: "sse",
     head: input.head ?? input.chunks ?? [],
+    stages: input.stages,
     tail: input.tail ?? [],
     wait: input.wait,
     hang: input.hang,