diff --git a/.github/workflows/perf-probe-baseline.yml b/.github/workflows/perf-probe-baseline.yml
index 6798510b7..0744ed3bd 100644
--- a/.github/workflows/perf-probe-baseline.yml
+++ b/.github/workflows/perf-probe-baseline.yml
@@ -7,6 +7,7 @@ on:
       - "packages/app/e2e/perf/**"
       - "packages/app/e2e/fixtures.ts"
       - "packages/app/package.json"
+      - "packages/app/script/compare-perf.ts"
       - "packages/app/script/e2e-local.ts"
       - "packages/app/src/testing/perf-metrics*"
   workflow_dispatch:
@@ -18,11 +19,20 @@ jobs:
   perf-probe-baseline:
     runs-on: ubuntu-latest
     timeout-minutes: 30
+    env:
+      PERF_ARTIFACT_DIR: ${{ github.workspace }}/perf-artifacts
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # actions/checkout@v6
         with:
+          path: head
           persist-credentials: false

+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # actions/checkout@v6
+        with:
+          path: base
+          persist-credentials: false
+          ref: ${{ github.event.pull_request.base.sha || github.sha }}
+
       - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # actions/setup-node@v6.4.0
         with:
           node-version: "24"
@@ -34,25 +44,52 @@ jobs:
       - uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # actions/cache@v5
         with:
           path: ~/.bun/install/cache
-          key: bun-${{ runner.os }}-${{ hashFiles('bun.lock') }}
+          key: bun-${{ runner.os }}-${{ hashFiles('head/bun.lock', 'base/bun.lock') }}
           restore-keys: |
             bun-${{ runner.os }}-

       - uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # actions/cache@v5
         with:
           path: ${{ github.workspace }}/.playwright-browsers
-          key: playwright-${{ runner.os }}-${{ hashFiles('packages/app/package.json', 'bun.lock') }}
+          key: playwright-${{ runner.os }}-${{ hashFiles('head/packages/app/package.json', 'head/bun.lock', 'base/packages/app/package.json', 'base/bun.lock') }}
+
+      - name: Install head dependencies
+        working-directory: head
+        run: bun install --frozen-lockfile

-      - run: bun install --frozen-lockfile
+      - name: Install base dependencies
+        working-directory: base
+        run: bun install --frozen-lockfile

       - name: Install Playwright browsers
-        working-directory: packages/app
+        working-directory: head/packages/app
         run: bunx playwright install --with-deps chromium

-      - name: Run perf probe baseline
+      - name: Sync perf harness into base checkout
+        run: |
+          rsync -a --delete head/packages/app/e2e/perf/ base/packages/app/e2e/perf/
+          cp head/packages/app/src/testing/perf-metrics.ts base/packages/app/src/testing/perf-metrics.ts
+
+      - name: Run perf probe baseline (base)
         env:
           CI: "true"
-        run: bun --cwd packages/app test:e2e:local:perf
+          PAWWORK_PERF_BRANCH: base
+          PAWWORK_PERF_OUTPUT: ${{ github.workspace }}/perf-artifacts/perf-base.json
+        run: bun --cwd base/packages/app test:e2e:local:perf
+
+      - name: Run perf probe baseline (head)
+        env:
+          CI: "true"
+          PAWWORK_PERF_BRANCH: head
+          PAWWORK_PERF_OUTPUT: ${{ github.workspace }}/perf-artifacts/perf-head.json
+        run: bun --cwd head/packages/app test:e2e:local:perf
+
+      - name: Compare base and head perf baselines
+        run: >
+          bun head/packages/app/script/compare-perf.ts
+          --base "${PERF_ARTIFACT_DIR}/perf-base.json"
+          --head "${PERF_ARTIFACT_DIR}/perf-head.json"
+          --output "${PERF_ARTIFACT_DIR}/perf-compare.json"

       - name: Upload perf probe artifacts
         if: always()
@@ -62,6 +99,8 @@ jobs:
           if-no-files-found: ignore
           retention-days: 7
           path: |
-            packages/app/e2e/perf-results
-            packages/app/e2e/playwright-report
-            packages/app/e2e/test-results
+            perf-artifacts
+            base/packages/app/e2e/playwright-report
+            base/packages/app/e2e/test-results
+            head/packages/app/e2e/playwright-report
+            head/packages/app/e2e/test-results
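The compare step above writes `perf-compare.json` through `--output`. For orientation, a hedged sketch of that file's shape: the type is `PerfBaselineComparison`, defined in `perf-metrics.ts` later in this diff, and every value here is illustrative, not real output.

```ts
// Illustrative only: the shape follows PerfBaselineComparison (see perf-metrics.ts
// below); the values are made up.
const examplePerfCompare = {
  pass: false,
  failures: ["session-streaming-long:interaction_ms_median"],
  warnings: ["homepage-cold:lcp_ms"],
  // one PerfScenarioComparison per scenario, embedding the full base/head summaries
  scenarios: [],
}
```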
diff --git a/packages/app/e2e/perf/perf-probe.spec.ts b/packages/app/e2e/perf/perf-probe.spec.ts
index e44cfb0dd..f48174ad4 100644
--- a/packages/app/e2e/perf/perf-probe.spec.ts
+++ b/packages/app/e2e/perf/perf-probe.spec.ts
@@ -8,6 +8,7 @@ import { sessionPath } from "../utils"
 import { installPerfProbe, resetPerfProbe, snapshotPerfProbe, summarizeScenarioRuns } from "./probe"

 const outputPath = process.env.PAWWORK_PERF_OUTPUT ?? path.join(process.cwd(), "e2e", "perf-results", "pr0.1-baseline.json")
+const perfBranch = process.env.PAWWORK_PERF_BRANCH ?? "dev"

 const longMarkdown = [
   "# Baseline stream",
@@ -73,6 +74,11 @@ async function settleFrames(page: Parameters<typeof installPerfProbe>[0], count: number) {
   }, count)
 }

+async function cooldownAfterRun(page: Parameters<typeof installPerfProbe>[0]) {
+  await settleFrames(page, 6)
+  await page.waitForTimeout(250)
+}
+
 async function navigateProjectHome(page: Parameters<typeof installPerfProbe>[0], directory: string) {
   await page.goto(sessionPath(directory))
   await expect(page.locator('[data-component="session-new-home"]')).toBeVisible()
 }
@@ -148,9 +154,10 @@ test.describe("PR0.1 perf probe baseline", () => {
       await settleFrames(page, 3)
       await page.keyboard.press("Escape")
       runs.push(await snapshotPerfProbe(page))
+      if (run < 2) await cooldownAfterRun(page)
     }

-    scenarioResults.push(summarizeScenarioRuns({ branch: "dev", scenario: "homepage-cold", runs }))
+    scenarioResults.push(summarizeScenarioRuns({ branch: perfBranch, scenario: "homepage-cold", runs }))
   })

   test("session-streaming-long emits a 3-run JSON baseline", async ({ page, project, llm }) => {
@@ -192,9 +199,10 @@
       secondWave.resolve()
       await send
       runs.push(await snapshotPerfProbe(page))
+      if (run < 2) await cooldownAfterRun(page)
     }

-    scenarioResults.push(summarizeScenarioRuns({ branch: "dev", scenario: "session-streaming-long", runs }))
+    scenarioResults.push(summarizeScenarioRuns({ branch: perfBranch, scenario: "session-streaming-long", runs }))
   })

   test("tool-call-expand emits a 3-run JSON baseline", async ({ page, project, llm }) => {
@@ -221,9 +229,10 @@
       await expect(trigger).toHaveAttribute("aria-expanded", "true")
       await settleFrames(page, 4)
       runs.push(await snapshotPerfProbe(page))
+      if (run < 2) await cooldownAfterRun(page)
     }

-    scenarioResults.push(summarizeScenarioRuns({ branch: "dev", scenario: "tool-call-expand", runs }))
+    scenarioResults.push(summarizeScenarioRuns({ branch: perfBranch, scenario: "tool-call-expand", runs }))
   })

   test("session-scroll-reading emits a 3-run JSON baseline", async ({ page, project }) => {
@@ -258,9 +267,10 @@
         await page.mouse.wheel(0, 3600)
         await settleFrames(page, 4)
         runs.push(await snapshotPerfProbe(page))
+        if (run < 2) await cooldownAfterRun(page)
       })
     }

-    scenarioResults.push(summarizeScenarioRuns({ branch: "dev", scenario: "session-scroll-reading", runs }))
+    scenarioResults.push(summarizeScenarioRuns({ branch: perfBranch, scenario: "session-scroll-reading", runs }))
   })
 })
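Two changes carry the weight in this spec: summaries are tagged with `PAWWORK_PERF_BRANCH` instead of a hard-coded `"dev"`, and a cooldown runs between (but not after) the three runs so frames and timers from one run do not bleed into the next snapshot window. A hedged sketch of the per-scenario loop the tests repeat; `measureScenario` is a hypothetical name, `cooldownAfterRun` is the helper added above, and the rest comes from the spec's imports.

```ts
import { installPerfProbe, resetPerfProbe, snapshotPerfProbe, summarizeScenarioRuns } from "./probe"

const perfBranch = process.env.PAWWORK_PERF_BRANCH ?? "dev"

// Hypothetical wrapper illustrating the loop each test repeats:
// reset the probe, drive the scenario, snapshot, cool down between runs only.
async function measureScenario(
  page: Parameters<typeof installPerfProbe>[0],
  scenario: string,
  drive: () => Promise<void>,
) {
  const runs: Awaited<ReturnType<typeof snapshotPerfProbe>>[] = []
  for (let run = 0; run < 3; run++) {
    await resetPerfProbe(page)
    await drive()
    runs.push(await snapshotPerfProbe(page))
    if (run < 2) await cooldownAfterRun(page) // spec-local helper, defined above
  }
  return summarizeScenarioRuns({ branch: perfBranch, scenario, runs })
}
```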
diff --git a/packages/app/script/compare-perf.ts b/packages/app/script/compare-perf.ts
new file mode 100644
index 000000000..d4bb9d837
--- /dev/null
+++ b/packages/app/script/compare-perf.ts
@@ -0,0 +1,50 @@
+import fs from "node:fs/promises"
+import path from "node:path"
+import { comparePerfBaselines, type PerfScenarioSummary } from "../src/testing/perf-metrics"
+
+function readArg(flag: string) {
+  const index = process.argv.indexOf(flag)
+  if (index === -1) return undefined
+  const value = process.argv[index + 1]
+  if (!value || value.startsWith("--")) return undefined
+  return value
+}
+
+async function readPerfFile(filePath: string) {
+  const payload = JSON.parse(await fs.readFile(filePath, "utf8")) as PerfScenarioSummary[]
+  if (!Array.isArray(payload)) {
+    throw new Error(`Expected an array of perf scenarios in ${filePath}`)
+  }
+  return payload
+}
+
+async function main() {
+  const basePath = readArg("--base")
+  const headPath = readArg("--head")
+  const outputPath = readArg("--output")
+
+  if (!basePath || !headPath) {
+    throw new Error("Usage: bun script/compare-perf.ts --base <path> --head <path> [--output <path>]")
+  }
+
+  const [base, head] = await Promise.all([readPerfFile(basePath), readPerfFile(headPath)])
+  const comparison = comparePerfBaselines({ base, head })
+
+  if (outputPath) {
+    await fs.mkdir(path.dirname(outputPath), { recursive: true })
+    await fs.writeFile(outputPath, `${JSON.stringify(comparison, null, 2)}\n`)
+  }
+
+  const summary = {
+    pass: comparison.pass,
+    failures: comparison.failures,
+    warnings: comparison.warnings,
+  }
+  console.log(JSON.stringify(summary, null, 2))
+
+  if (!comparison.pass) {
+    process.exitCode = 1
+  }
+}
+
+await main()
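Note the failure contract: the script reports regressions through `process.exitCode` rather than throwing, so the comparison JSON is written and the summary printed before the CI step fails. A minimal programmatic sketch of the same entry point; the empty baselines are illustrative, and with no scenarios there is nothing to fail.

```ts
import { comparePerfBaselines } from "../src/testing/perf-metrics"

// Empty baselines compare clean: no scenarios means no failures.
const comparison = comparePerfBaselines({ base: [], head: [] })
console.log(comparison.pass) // true
if (!comparison.pass) process.exitCode = 1 // mirrors the CLI's failure signal
```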
diff --git a/packages/app/src/testing/perf-metrics.test.ts b/packages/app/src/testing/perf-metrics.test.ts
index 8ede5ba95..7852d47d2 100644
--- a/packages/app/src/testing/perf-metrics.test.ts
+++ b/packages/app/src/testing/perf-metrics.test.ts
@@ -1,5 +1,5 @@
 import { describe, expect, test } from "bun:test"
-import { aggregatePerfRuns, summarizePerfRun } from "./perf-metrics"
+import { aggregatePerfRuns, comparePerfBaselines, comparePerfScenarioSummaries, summarizePerfRun } from "./perf-metrics"

 describe("perf metrics", () => {
   test("summarizes a perf sample window", () => {
@@ -116,4 +116,192 @@ describe("perf metrics", () => {
     expect(summary.heap_used_mb).toBe(93)
     expect(summary.run_details).toHaveLength(3)
   })
+
+  test("fails a scenario when median regression breaks both the ms and percentage budgets", () => {
+    const result = comparePerfScenarioSummaries({
+      scenario: "session-streaming-long",
+      base: {
+        branch: "base",
+        scenario: "session-streaming-long",
+        runs: 3,
+        interaction_ms_median: 100,
+        interaction_ms_worst: 140,
+        interaction_ms: 100,
+        interaction_delay_ms: 12,
+        long_task_count: 1,
+        long_task_max_ms: 80,
+        tbt_ms: 30,
+        frame_gap_p95_ms: 32,
+        frame_gap_max_ms: 60,
+        jank_count_50ms: 1,
+        cls: 0.11,
+        window_ms: 1200,
+        run_details: [],
+      },
+      head: {
+        branch: "head",
+        scenario: "session-streaming-long",
+        runs: 3,
+        interaction_ms_median: 116,
+        interaction_ms_worst: 168,
+        interaction_ms: 116,
+        interaction_delay_ms: 14,
+        long_task_count: 1,
+        long_task_max_ms: 88,
+        tbt_ms: 34,
+        frame_gap_p95_ms: 35,
+        frame_gap_max_ms: 70,
+        jank_count_50ms: 1,
+        cls: 0.013,
+        window_ms: 1220,
+        run_details: [],
+      },
+    })
+
+    expect(result.pass).toBe(false)
+    expect(result.failures).toContain("interaction_ms_median")
+    expect(result.warnings).toHaveLength(0)
+  })
+
+  test("fails a scenario on catastrophic absolute thresholds even when there is no regression delta", () => {
+    const result = comparePerfScenarioSummaries({
+      scenario: "tool-call-expand",
+      base: {
+        branch: "base",
+        scenario: "tool-call-expand",
+        runs: 3,
+        interaction_ms_median: 120,
+        interaction_ms_worst: 510,
+        interaction_ms: 120,
+        interaction_delay_ms: 12,
+        long_task_count: 1,
+        long_task_max_ms: 90,
+        tbt_ms: 36,
+        frame_gap_p95_ms: 32,
+        frame_gap_max_ms: 90,
+        jank_count_50ms: 1,
+        cls: 0.01,
+        window_ms: 900,
+        run_details: [],
+      },
+      head: {
+        branch: "head",
+        scenario: "tool-call-expand",
+        runs: 3,
+        interaction_ms_median: 120,
+        interaction_ms_worst: 510,
+        interaction_ms: 120,
+        interaction_delay_ms: 12,
+        long_task_count: 1,
+        long_task_max_ms: 90,
+        tbt_ms: 36,
+        frame_gap_p95_ms: 32,
+        frame_gap_max_ms: 90,
+        jank_count_50ms: 1,
+        cls: 0.01,
+        window_ms: 900,
+        run_details: [],
+      },
+    })
+
+    expect(result.pass).toBe(false)
+    expect(result.failures).toContain("interaction_ms_worst")
+  })
+
+  test("keeps Web Vitals good lines warn-only in PR0.2", () => {
+    const result = comparePerfScenarioSummaries({
+      scenario: "homepage-cold",
+      base: {
+        branch: "base",
+        scenario: "homepage-cold",
+        runs: 3,
+        interaction_ms_median: 70,
+        interaction_ms_worst: 95,
+        interaction_ms: 70,
+        interaction_delay_ms: 9,
+        long_task_count: 0,
+        long_task_max_ms: 0,
+        tbt_ms: 0,
+        frame_gap_p95_ms: 18,
+        frame_gap_max_ms: 24,
+        jank_count_50ms: 0,
+        cls: 0.11,
+        window_ms: 600,
+        fcp_ms: 1500,
+        lcp_ms: 2300,
+        run_details: [],
+      },
+      head: {
+        branch: "head",
+        scenario: "homepage-cold",
+        runs: 3,
+        interaction_ms_median: 74,
+        interaction_ms_worst: 99,
+        interaction_ms: 74,
+        interaction_delay_ms: 11,
+        long_task_count: 0,
+        long_task_max_ms: 0,
+        tbt_ms: 0,
+        frame_gap_p95_ms: 19,
+        frame_gap_max_ms: 26,
+        jank_count_50ms: 0,
+        cls: 0.12,
+        window_ms: 620,
+        fcp_ms: 2400,
+        lcp_ms: 3100,
+        run_details: [],
+      },
+    })
+
+    expect(result.pass).toBe(true)
+    expect(result.failures).toHaveLength(0)
+    expect(result.warnings).toEqual(expect.arrayContaining(["cls", "fcp_ms", "lcp_ms"]))
+  })
+
+  test("compares baseline collections by scenario and fails when a head scenario is missing", () => {
+    const base = [
+      aggregatePerfRuns({
+        branch: "base",
+        scenario: "homepage-cold",
+        runs: [
+          {
+            interaction_ms: 60,
+            interaction_delay_ms: 8,
+            long_task_count: 0,
+            long_task_max_ms: 0,
+            tbt_ms: 0,
+            frame_gap_p95_ms: 18,
+            frame_gap_max_ms: 26,
+            jank_count_50ms: 0,
+            cls: 0.01,
+            window_ms: 800,
+          },
+        ],
+      }),
+      aggregatePerfRuns({
+        branch: "base",
+        scenario: "tool-call-expand",
+        runs: [
+          {
+            interaction_ms: 70,
+            interaction_delay_ms: 9,
+            long_task_count: 1,
+            long_task_max_ms: 75,
+            tbt_ms: 25,
+            frame_gap_p95_ms: 28,
+            frame_gap_max_ms: 40,
+            jank_count_50ms: 0,
+            cls: 0.002,
+            window_ms: 900,
+          },
+        ],
+      }),
+    ]
+    const head = [base[0]]
+
+    const result = comparePerfBaselines({ base, head })
+
+    expect(result.pass).toBe(false)
+    expect(result.failures).toContain("missing_head_scenario:tool-call-expand")
+  })
 })
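The tests above pin down a three-tier model: delta budgets that fail only when both the absolute and relative allowances are exceeded, catastrophic absolute ceilings that fail outright, and warn-only Web Vitals lines. To make the dual budget concrete, here is a standalone restatement of the median gate; `breachesMedianBudget` is a hypothetical name that mirrors `exceededByDelta` below, with the thresholds inlined.

```ts
// Hypothetical restatement of the dual-budget median gate; the numbers match
// perfDeltaThresholds below (+10ms delta budget, 1.05x ratio budget).
function breachesMedianBudget(base: number, head: number): boolean {
  const overDelta = head - base > 10
  const overRatio = base <= 0 || head > base * 1.05
  return overDelta && overRatio
}

console.log(breachesMedianBudget(100, 116)) // true: +16ms and 1.16x both over budget
console.log(breachesMedianBudget(100, 108)) // false: +8ms stays inside the delta budget
console.log(breachesMedianBudget(400, 415)) // false: +15ms over delta, but only 1.04x
```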
diff --git a/packages/app/src/testing/perf-metrics.ts b/packages/app/src/testing/perf-metrics.ts
index 59ba55ce3..e0b8805dd 100644
--- a/packages/app/src/testing/perf-metrics.ts
+++ b/packages/app/src/testing/perf-metrics.ts
@@ -51,6 +51,48 @@ export type PerfScenarioSummary = PerfRunSummary & {
   run_details: PerfRunSummary[]
 }

+export type PerfScenarioComparison = {
+  scenario: string
+  pass: boolean
+  failures: string[]
+  warnings: string[]
+  base: PerfScenarioSummary
+  head: PerfScenarioSummary
+}
+
+export type PerfBaselineComparison = {
+  pass: boolean
+  failures: string[]
+  warnings: string[]
+  scenarios: PerfScenarioComparison[]
+}
+
+const perfDeltaThresholds = {
+  interactionMedianMs: 10,
+  interactionMedianRatio: 1.05,
+  interactionWorstMs: 50,
+  longTaskMaxMs: 25,
+  tbtMs: 50,
+  frameGapP95Ms: 10,
+  frameGapMaxMs: 50,
+  jankCount: 2,
+  cls: 0.02,
+} as const
+
+const perfAbsoluteWarnings = {
+  interactionMsWorst: 200,
+  tbtMs: 200,
+  cls: 0.05,
+  fcpMs: 1800,
+  lcpMs: 2500,
+} as const
+
+const perfCatastrophicThresholds = {
+  interactionMsWorst: 500,
+  frameGapMaxMs: 500,
+  longTaskMaxMs: 250,
+} as const
+
 function round(input: number) {
   return Math.round(input * 1000) / 1000
 }
@@ -146,3 +188,121 @@ export function aggregatePerfRuns(input: {
     run_details: runs,
   }
 }
+
+function exceededByDelta(input: { base: number; head: number; maxDelta: number; maxRatio?: number }) {
+  const delta = input.head - input.base
+  if (delta <= input.maxDelta) return false
+  if (input.maxRatio === undefined) return true
+  if (input.base <= 0) return true
+  return input.head > input.base * input.maxRatio
+}
+
+function addAbsoluteWarning(target: string[], key: string, value: number | undefined, threshold: number) {
+  if (value === undefined) return
+  if (value > threshold) target.push(key)
+}
+
+export function comparePerfScenarioSummaries(input: {
+  scenario: string
+  base: PerfScenarioSummary
+  head: PerfScenarioSummary
+}): PerfScenarioComparison {
+  const failures: string[] = []
+  const warnings: string[] = []
+
+  if (
+    exceededByDelta({
+      base: input.base.interaction_ms_median,
+      head: input.head.interaction_ms_median,
+      maxDelta: perfDeltaThresholds.interactionMedianMs,
+      maxRatio: perfDeltaThresholds.interactionMedianRatio,
+    })
+  ) {
+    failures.push("interaction_ms_median")
+  }
+
+  if (input.head.interaction_ms_worst > input.base.interaction_ms_worst + perfDeltaThresholds.interactionWorstMs) {
+    failures.push("interaction_ms_worst_delta")
+  }
+  if (input.head.interaction_ms_worst >= perfCatastrophicThresholds.interactionMsWorst) {
+    failures.push("interaction_ms_worst")
+  }
+  if (input.head.long_task_max_ms > input.base.long_task_max_ms + perfDeltaThresholds.longTaskMaxMs) {
+    failures.push("long_task_max_ms_delta")
+  }
+  if (input.head.long_task_max_ms >= perfCatastrophicThresholds.longTaskMaxMs) {
+    failures.push("long_task_max_ms")
+  }
+  if (input.head.tbt_ms > input.base.tbt_ms + perfDeltaThresholds.tbtMs) {
+    failures.push("tbt_ms")
+  }
+  if (input.head.frame_gap_p95_ms > input.base.frame_gap_p95_ms + perfDeltaThresholds.frameGapP95Ms) {
+    failures.push("frame_gap_p95_ms")
+  }
+  if (input.head.frame_gap_max_ms > input.base.frame_gap_max_ms + perfDeltaThresholds.frameGapMaxMs) {
+    failures.push("frame_gap_max_ms_delta")
+  }
+  if (input.head.frame_gap_max_ms >= perfCatastrophicThresholds.frameGapMaxMs) {
+    failures.push("frame_gap_max_ms")
+  }
+  if (input.head.jank_count_50ms > input.base.jank_count_50ms + perfDeltaThresholds.jankCount) {
+    failures.push("jank_count_50ms")
+  }
+  if (input.head.cls > input.base.cls + perfDeltaThresholds.cls) {
+    failures.push("cls_delta")
+  }
+
+  addAbsoluteWarning(warnings, "interaction_ms_worst", input.head.interaction_ms_worst, perfAbsoluteWarnings.interactionMsWorst)
+  addAbsoluteWarning(warnings, "tbt_ms", input.head.tbt_ms, perfAbsoluteWarnings.tbtMs)
+  addAbsoluteWarning(warnings, "cls", input.head.cls, perfAbsoluteWarnings.cls)
+  addAbsoluteWarning(warnings, "fcp_ms", input.head.fcp_ms, perfAbsoluteWarnings.fcpMs)
+  addAbsoluteWarning(warnings, "lcp_ms", input.head.lcp_ms, perfAbsoluteWarnings.lcpMs)
+
+  return {
+    scenario: input.scenario,
+    pass: failures.length === 0,
+    failures: [...new Set(failures)],
+    warnings: [...new Set(warnings)],
+    base: input.base,
+    head: input.head,
+  }
+}
+
+export function comparePerfBaselines(input: {
+  base: PerfScenarioSummary[]
+  head: PerfScenarioSummary[]
+}): PerfBaselineComparison {
+  const failures: string[] = []
+  const warnings: string[] = []
+  const scenarios: PerfScenarioComparison[] = []
+  const headByScenario = new Map<string, PerfScenarioSummary>(input.head.map((scenario) => [scenario.scenario, scenario]))
+
+  for (const baseScenario of input.base) {
+    const headScenario = headByScenario.get(baseScenario.scenario)
+    if (!headScenario) {
+      failures.push(`missing_head_scenario:${baseScenario.scenario}`)
+      continue
+    }
+    const comparison = comparePerfScenarioSummaries({
+      scenario: baseScenario.scenario,
+      base: baseScenario,
+      head: headScenario,
+    })
+    scenarios.push(comparison)
+    for (const failure of comparison.failures) failures.push(`${baseScenario.scenario}:${failure}`)
+    for (const warning of comparison.warnings) warnings.push(`${baseScenario.scenario}:${warning}`)
+  }
+
+  for (const headScenario of input.head) {
+    if (!input.base.some((scenario) => scenario.scenario === headScenario.scenario)) {
+      failures.push(`missing_base_scenario:${headScenario.scenario}`)
+    }
+  }
+
+  return {
+    pass: failures.length === 0,
+    failures,
+    warnings,
+    scenarios,
+  }
+}
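One property worth calling out: `comparePerfBaselines` fails on scenario-set drift in either direction, so a renamed or deleted scenario cannot silently drop out of the gate. A hedged sketch, with illustrative run data and the helpers exported above:

```ts
import { aggregatePerfRuns, comparePerfBaselines } from "./perf-metrics"

// Minimal single-run sample; every number is illustrative.
const run = {
  interaction_ms: 60,
  interaction_delay_ms: 8,
  long_task_count: 0,
  long_task_max_ms: 0,
  tbt_ms: 0,
  frame_gap_p95_ms: 18,
  frame_gap_max_ms: 26,
  jank_count_50ms: 0,
  cls: 0.01,
  window_ms: 800,
}
const homepage = aggregatePerfRuns({ branch: "base", scenario: "homepage-cold", runs: [run] })
const tools = aggregatePerfRuns({ branch: "base", scenario: "tool-call-expand", runs: [run] })

// Head lacks "tool-call-expand", so the gate fails rather than shrinking coverage.
const result = comparePerfBaselines({ base: [homepage, tools], head: [homepage] })
console.log(result.failures) // ["missing_head_scenario:tool-call-expand"]
```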