diff --git a/.github/workflows/perf-probe-baseline.yml b/.github/workflows/perf-probe-baseline.yml
new file mode 100644
index 000000000..6798510b7
--- /dev/null
+++ b/.github/workflows/perf-probe-baseline.yml
@@ -0,0 +1,67 @@
+name: perf-probe-baseline
+
+on:
+  pull_request:
+    paths:
+      - ".github/workflows/perf-probe-baseline.yml"
+      - "packages/app/e2e/perf/**"
+      - "packages/app/e2e/fixtures.ts"
+      - "packages/app/package.json"
+      - "packages/app/script/e2e-local.ts"
+      - "packages/app/src/testing/perf-metrics*"
+  workflow_dispatch:
+
+env:
+  PLAYWRIGHT_BROWSERS_PATH: ${{ github.workspace }}/.playwright-browsers
+
+jobs:
+  perf-probe-baseline:
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # actions/checkout@v6
+        with:
+          persist-credentials: false
+
+      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # actions/setup-node@v6.4.0
+        with:
+          node-version: "24"
+
+      - uses: oven-sh/setup-bun@0c5077e51419868618aeaa5fe8019c62421857d6 # oven-sh/setup-bun@v2
+        with:
+          bun-version: "1.3.13"
+
+      - uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # actions/cache@v5
+        with:
+          path: ~/.bun/install/cache
+          key: bun-${{ runner.os }}-${{ hashFiles('bun.lock') }}
+          restore-keys: |
+            bun-${{ runner.os }}-
+
+      - uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # actions/cache@v5
+        with:
+          path: ${{ github.workspace }}/.playwright-browsers
+          key: playwright-${{ runner.os }}-${{ hashFiles('packages/app/package.json', 'bun.lock') }}
+
+      - run: bun install --frozen-lockfile
+
+      - name: Install Playwright browsers
+        working-directory: packages/app
+        run: bunx playwright install --with-deps chromium
+
+      - name: Run perf probe baseline
+        env:
+          CI: "true"
+        run: bun --cwd packages/app test:e2e:local:perf
+
+      - name: Upload perf probe artifacts
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # actions/upload-artifact@v7
+        with:
+          name: perf-probe-baseline-${{ github.run_attempt }}
+          if-no-files-found: ignore
+          retention-days: 7
+          path: |
+            packages/app/e2e/perf-results
+            packages/app/e2e/playwright-report
+            packages/app/e2e/test-results
diff --git a/packages/app/.gitignore b/packages/app/.gitignore
index d699efb38..12bee1874 100644
--- a/packages/app/.gitignore
+++ b/packages/app/.gitignore
@@ -1,3 +1,4 @@
 src/assets/theme.css
 e2e/test-results
 e2e/playwright-report
+e2e/perf-results
diff --git a/packages/app/e2e/perf/perf-probe.spec.ts b/packages/app/e2e/perf/perf-probe.spec.ts
new file mode 100644
index 000000000..e44cfb0dd
--- /dev/null
+++ b/packages/app/e2e/perf/perf-probe.spec.ts
@@ -0,0 +1,266 @@
+import fs from "node:fs/promises"
+import path from "node:path"
+import { raw } from "../../../opencode/test/lib/llm-server"
+import { test, expect } from "../fixtures"
+import { withSession } from "../actions"
+import { promptSelector, sessionMessageItemSelector, sessionTurnListSelector, scrollViewportSelector } from "../selectors"
+import { sessionPath } from "../utils"
+import { installPerfProbe, resetPerfProbe, snapshotPerfProbe, summarizeScenarioRuns } from "./probe"
+
+const outputPath =
+  process.env.PAWWORK_PERF_OUTPUT ??
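+  // Falls back to the gitignored e2e/perf-results directory that CI uploads as an artifact.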
+  path.join(process.cwd(), "e2e", "perf-results", "pr0.1-baseline.json")
+
+const longMarkdown = [
+  "# Baseline stream",
+  "",
+  "This stream exists to stress markdown rendering while the session remains interactive.",
+  "",
+  "- list item one",
+  "- list item two with a [link](https://example.com)",
+  "- 中英混排 content for layout and glyph coverage",
+  "",
+  "```ts",
+  "export function sample(input: number) {",
+  "  return input * 2",
+  "}",
+  "```",
+  "",
+  ...Array.from({ length: 80 }, (_, index) => `Paragraph ${index + 1}: ${"streaming markdown content ".repeat(8)}`),
+].join("\n")
+
+const scenarioResults: ReturnType<typeof summarizeScenarioRuns>[] = []
+
+const chatChunk = (delta: Record<string, unknown>, input?: { finish?: string; usage?: { input: number; output: number } }) => ({
+  id: "chatcmpl-test",
+  object: "chat.completion.chunk",
+  choices: [
+    {
+      delta,
+      ...(input?.finish ? { finish_reason: input.finish } : {}),
+    },
+  ],
+  ...(input?.usage
+    ? {
+        usage: {
+          prompt_tokens: input.usage.input,
+          completion_tokens: input.usage.output,
+          total_tokens: input.usage.input + input.usage.output,
+        },
+      }
+    : {}),
+})
+
+function splitText(value: string, size: number) {
+  const out: string[] = []
+  for (let index = 0; index < value.length; index += size) {
+    out.push(value.slice(index, index + size))
+  }
+  return out
+}
+
+function deferred() {
+  let resolve!: () => void
+  const promise = new Promise<void>((done) => {
+    resolve = done
+  })
+  return { promise, resolve }
+}
+
+async function settleFrames(page: Parameters<typeof installPerfProbe>[0], count = 2) {
+  await page.evaluate(async (frames) => {
+    for (let index = 0; index < frames; index += 1) {
+      await new Promise<void>((resolve) => requestAnimationFrame(() => resolve()))
+    }
+  }, count)
+}
+
+async function navigateProjectHome(page: Parameters<typeof installPerfProbe>[0], directory: string) {
+  await page.goto(sessionPath(directory))
+  await expect(page.locator('[data-component="session-new-home"]')).toBeVisible()
+}
+
+async function readPromptSend(page: Parameters<typeof installPerfProbe>[0]) {
+  return page.evaluate(() => {
+    const win = window as Window & {
+      __opencode_e2e?: {
+        prompt?: {
+          sent?: {
+            started?: number
+            count?: number
+            sessionID?: string
+          }
+        }
+      }
+    }
+    const sent = win.__opencode_e2e?.prompt?.sent
+    return {
+      started: sent?.started ?? 0,
+      count: sent?.count ??
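+        // The e2e prompt hook may not have recorded a send yet, so default to zero.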
+        0,
+      sessionID: sent?.sessionID,
+    }
+  })
+}
+
+async function submitVisiblePrompt(page: Parameters<typeof installPerfProbe>[0], text: string) {
+  const prompt = page.locator(promptSelector).first()
+  const previous = await readPromptSend(page)
+  await expect(prompt).toBeVisible()
+  await prompt.click()
+  await prompt.fill("")
+  await page.keyboard.type(text)
+  await page.keyboard.press("Enter")
+  await expect.poll(async () => (await readPromptSend(page)).started, { timeout: 10_000 }).toBeGreaterThan(previous.started)
+}
+
+async function scrollTimelineTo(page: Parameters<typeof installPerfProbe>[0], top: number) {
+  const found = await page.evaluate(
+    ({ top, scrollViewportSelector, turnListSelector }) => {
+      const list = document.querySelector(turnListSelector)
+      const viewport = list?.closest(scrollViewportSelector)
+      if (!(viewport instanceof HTMLElement)) return false
+      viewport.scrollTop = top
+      viewport.dispatchEvent(new Event("scroll", { bubbles: true }))
+      return true
+    },
+    { top, scrollViewportSelector, turnListSelector: sessionTurnListSelector },
+  )
+  expect(found).toBe(true)
+}
+
+test.describe("PR0.1 perf probe baseline", () => {
+  test.describe.configure({ mode: "serial" })
+
+  test.afterAll(async () => {
+    await fs.mkdir(path.dirname(outputPath), { recursive: true })
+    await fs.writeFile(outputPath, `${JSON.stringify(scenarioResults, null, 2)}\n`)
+  })
+
+  test("homepage-cold emits a 3-run JSON baseline", async ({ page, project }) => {
+    await installPerfProbe(page)
+    await project.open()
+
+    const runs = []
+    for (let run = 0; run < 3; run += 1) {
+      if (run > 0) await navigateProjectHome(page, project.directory)
+      const prompt = page.locator(promptSelector).first()
+      await expect(prompt).toBeVisible()
+      await prompt.click()
+      await page.getByRole("button", { name: /Switch workspace|切换工作目录/i }).click()
+      await settleFrames(page, 3)
+      await page.keyboard.press("Escape")
+      runs.push(await snapshotPerfProbe(page))
+    }
+
+    scenarioResults.push(summarizeScenarioRuns({ branch: "dev", scenario: "homepage-cold", runs }))
+  })
+
+  test("session-streaming-long emits a 3-run JSON baseline", async ({ page, project, llm }) => {
+    await installPerfProbe(page)
+    await project.open()
+
+    const runs = []
+    for (let run = 0; run < 3; run += 1) {
+      const firstWave = deferred()
+      const secondWave = deferred()
+      const chunks = splitText(longMarkdown, 320)
+      const headChunks = [chatChunk({ role: "assistant" }), ...chunks.slice(0, 3).map((chunk) => chatChunk({ content: chunk }))]
+      const stageOneChunks = chunks.slice(3, 9).map((chunk) => chatChunk({ content: chunk }))
+      const stageTwoChunks = chunks.slice(9).map((chunk) => chatChunk({ content: chunk }))
+
+      await navigateProjectHome(page, project.directory)
+      await llm.push(
+        raw({
+          head: headChunks,
+          stages: [
+            { wait: firstWave.promise, chunks: stageOneChunks },
+            { wait: secondWave.promise, chunks: stageTwoChunks },
+          ],
+          tail: [chatChunk({}, { finish: "stop", usage: { input: 120, output: 480 } })],
+        }),
+      )
+
+      const send = project.prompt(`Stream probe ${run + 1}`)
+      await expect.poll(() => page.url(), { timeout: 30_000 }).toContain("/session/")
+      await expect(page.getByText("This stream exists to stress markdown rendering while the session remains interactive.")).toBeVisible({
+        timeout: 30_000,
+      })
+
+      await resetPerfProbe(page)
+      const click = page.getByRole("button", { name: "Right utility panel" }).click()
+      firstWave.resolve()
+      await click
+      await settleFrames(page, 6)
+      secondWave.resolve()
+      await send
+      runs.push(await snapshotPerfProbe(page))
+    }
+
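+    // Fold the three streamed runs into one scenario row: per-metric medians plus the worst interaction.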
+    scenarioResults.push(summarizeScenarioRuns({ branch: "dev", scenario: "session-streaming-long", runs }))
+  })
+
+  test("tool-call-expand emits a 3-run JSON baseline", async ({ page, project, llm }) => {
+    await installPerfProbe(page)
+    await project.open()
+
+    const runs = []
+    for (let run = 0; run < 3; run += 1) {
+      await navigateProjectHome(page, project.directory)
+      const created = await project.sdk.worktree.create({ directory: project.directory }).then((result) => result.data)
+      if (!created?.directory) throw new Error("Failed to create worktree for perf probe")
+      project.trackDirectory(created.directory)
+      await llm.tool("enter-worktree", { path: created.directory })
+      await llm.text(`tool call baseline ${run + 1}`)
+      await project.prompt(`Create todos for perf probe run ${run + 1}.`)
+      const trigger = page.locator('[data-slot="collapsible-trigger"]').filter({ has: page.locator('[data-component="tool-trigger"]') }).first()
+      await expect(trigger).toBeVisible({ timeout: 30_000 })
+      await resetPerfProbe(page)
+      await trigger.click()
+      await expect(trigger).toHaveAttribute("aria-expanded", "true")
+      await trigger.click()
+      await expect(trigger).toHaveAttribute("aria-expanded", "false")
+      await trigger.click()
+      await expect(trigger).toHaveAttribute("aria-expanded", "true")
+      await settleFrames(page, 4)
+      runs.push(await snapshotPerfProbe(page))
+    }
+
+    scenarioResults.push(summarizeScenarioRuns({ branch: "dev", scenario: "tool-call-expand", runs }))
+  })
+
+  test("session-scroll-reading emits a 3-run JSON baseline", async ({ page, project }) => {
+    await installPerfProbe(page)
+    await project.open()
+
+    const runs = []
+    for (let run = 0; run < 3; run += 1) {
+      await withSession(project.sdk, `perf scroll ${Date.now()}-${run}`, async (session) => {
+        for (let index = 0; index < 18; index += 1) {
+          await project.sdk.session.promptAsync({
+            sessionID: session.id,
+            noReply: true,
+            parts: [
+              {
+                type: "text",
+                text: `scroll seed ${run}-${index}\n${Array.from({ length: 18 }, (_, line) => `line ${line} ${"content ".repeat(8)}`).join("\n")}`,
+              },
+            ],
+          })
+        }
+
+        await page.goto(sessionPath(project.directory, session.id))
+        await expect(page.locator(sessionMessageItemSelector).first()).toBeVisible({ timeout: 30_000 })
+        await expect.poll(async () => page.locator(sessionMessageItemSelector).count()).toBeGreaterThanOrEqual(8)
+        await resetPerfProbe(page)
+        await page.locator(scrollViewportSelector).first().hover()
+        await page.mouse.wheel(0, -3600)
+        await settleFrames(page, 2)
+        await scrollTimelineTo(page, 0)
+        await settleFrames(page, 2)
+        await page.mouse.wheel(0, 3600)
+        await settleFrames(page, 4)
+        runs.push(await snapshotPerfProbe(page))
+      })
+    }
+
+    scenarioResults.push(summarizeScenarioRuns({ branch: "dev", scenario: "session-scroll-reading", runs }))
+  })
+})
diff --git a/packages/app/e2e/perf/probe.ts b/packages/app/e2e/perf/probe.ts
new file mode 100644
index 000000000..ffd52f235
--- /dev/null
+++ b/packages/app/e2e/perf/probe.ts
@@ -0,0 +1,163 @@
+import type { Page } from "@playwright/test"
+import { aggregatePerfRuns, summarizePerfRun, type PerfRunSummary } from "../../src/testing/perf-metrics"
+
+type BrowserPerfSample = {
+  startedAt: number
+  endedAt: number
+  interactions: Array<{ at: number; delay: number; duration: number }>
+  longTasks: Array<{ at: number; duration: number }>
+  frames: Array<{ at: number; duration: number }>
+  shifts: Array<{ at: number; value: number }>
+  fcpMs?: number
+  lcpMs?: number
+  heapUsedMb?: number
+}
+
+type BrowserPerfWindow = Window & {
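+  // Installed by installPerfProbe's init script; undefined until that script has run in the page.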
+  __pawwork_perf_probe?: {
+    reset: () => void
+    snapshot: () => BrowserPerfSample
+  }
+}
+
+export async function installPerfProbe(page: Page) {
+  await page.addInitScript(() => {
+    const win = window as BrowserPerfWindow
+    if (win.__pawwork_perf_probe) return
+
+    const interactions: Array<{ at: number; delay: number; duration: number }> = []
+    const longTasks: Array<{ at: number; duration: number }> = []
+    const frames: Array<{ at: number; duration: number }> = []
+    const shifts: Array<{ at: number; value: number }> = []
+    const maxEntries = 4000
+    const supported = PerformanceObserver.supportedEntryTypes ?? []
+    let startedAt = 0
+    let raf = 0
+    let lastFrame = 0
+    let fcpMs: number | undefined
+    let lcpMs: number | undefined
+
+    const trim = <T>(list: T[]) => {
+      if (list.length <= maxEntries) return
+      list.splice(0, list.length - maxEntries)
+    }
+
+    const observe = (
+      type: string,
+      init: PerformanceObserverInit & { durationThreshold?: number },
+      fn: (entries: PerformanceEntry[]) => void,
+    ) => {
+      if (!supported.includes(type)) return
+      const observer = new PerformanceObserver((list) => fn(list.getEntries()))
+      try {
+        observer.observe(init)
+      } catch {
+        observer.disconnect()
+      }
+    }
+
+    observe("event", { buffered: true, durationThreshold: 16, type: "event" }, (entries) => {
+      for (const entry of entries as Array<PerformanceEntry & { processingStart?: number }>) {
+        if (entry.duration < 16) continue
+        interactions.push({
+          at: entry.startTime,
+          delay: Math.max(0, (entry.processingStart ?? entry.startTime) - entry.startTime),
+          duration: entry.duration,
+        })
+      }
+      trim(interactions)
+    })
+
+    observe("longtask", { buffered: true, type: "longtask" }, (entries) => {
+      for (const entry of entries) {
+        longTasks.push({ at: entry.startTime, duration: entry.duration })
+      }
+      trim(longTasks)
+    })
+
+    observe("layout-shift", { buffered: true, type: "layout-shift" }, (entries) => {
+      for (const entry of entries as Array<PerformanceEntry & { hadRecentInput?: boolean; value?: number }>) {
+        if (entry.hadRecentInput) continue
+        if (typeof entry.value !== "number") continue
+        shifts.push({ at: entry.startTime, value: entry.value })
+      }
+      trim(shifts)
+    })
+
+    observe("paint", { buffered: true, type: "paint" }, (entries) => {
+      for (const entry of entries) {
+        if (entry.name === "first-contentful-paint") {
+          fcpMs = entry.startTime
+        }
+      }
+    })
+
+    observe("largest-contentful-paint", { buffered: true, type: "largest-contentful-paint" }, (entries) => {
+      for (const entry of entries) {
+        lcpMs = entry.startTime
+      }
+    })
+
+    const loop = (at: number) => {
+      if (document.visibilityState === "visible") {
+        if (lastFrame !== 0) {
+          frames.push({ at, duration: at - lastFrame })
+          trim(frames)
+        }
+        lastFrame = at
+      } else {
+        lastFrame = 0
+      }
+      raf = requestAnimationFrame(loop)
+    }
+
+    raf = requestAnimationFrame(loop)
+    win.addEventListener("beforeunload", () => {
+      if (raf !== 0) cancelAnimationFrame(raf)
+    })
+
+    win.__pawwork_perf_probe = {
+      reset() {
+        startedAt = performance.now()
+      },
+      snapshot() {
+        const memory = performance as Performance & { memory?: { usedJSHeapSize?: number } }
+        return {
+          startedAt,
+          endedAt: performance.now(),
+          interactions: interactions.slice(),
+          longTasks: longTasks.slice(),
+          frames: frames.slice(),
+          shifts: shifts.slice(),
+          fcpMs,
+          lcpMs,
+          heapUsedMb:
+            typeof memory.memory?.usedJSHeapSize === "number"
+              ?
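+                // performance.memory is a non-standard Chromium-only API, hence the feature check; bytes to MB.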
+                memory.memory.usedJSHeapSize / 1024 / 1024
+              : undefined,
+        }
+      },
+    }
+  })
+}
+
+export async function resetPerfProbe(page: Page) {
+  await page.evaluate(() => {
+    const probe = (window as BrowserPerfWindow).__pawwork_perf_probe
+    if (!probe) throw new Error("Perf probe is not installed")
+    probe.reset()
+  })
+}
+
+export async function snapshotPerfProbe(page: Page) {
+  const sample = await page.evaluate(() => {
+    const probe = (window as BrowserPerfWindow).__pawwork_perf_probe
+    if (!probe) throw new Error("Perf probe is not installed")
+    return probe.snapshot()
+  })
+  return summarizePerfRun(sample)
+}
+
+export function summarizeScenarioRuns(input: { branch: string; scenario: string; runs: PerfRunSummary[] }) {
+  return aggregatePerfRuns(input)
+}
diff --git a/packages/app/package.json b/packages/app/package.json
index 64f8dbcce..250a75638 100644
--- a/packages/app/package.json
+++ b/packages/app/package.json
@@ -22,6 +22,8 @@
     "test:e2e": "playwright test",
     "test:e2e:smoke": "playwright test --grep @smoke",
     "test:e2e:local": "bun script/e2e-local.ts",
+    "test:e2e:local:perf": "bun script/e2e-local.ts -- e2e/perf/perf-probe.spec.ts",
+    "test:e2e:perf": "playwright test e2e/perf/perf-probe.spec.ts",
     "test:e2e:local:smoke": "bun script/e2e-local.ts -- --grep @smoke",
     "test:e2e:ui": "playwright test --ui",
     "test:e2e:report": "playwright show-report e2e/playwright-report"
diff --git a/packages/app/src/testing/perf-metrics.test.ts b/packages/app/src/testing/perf-metrics.test.ts
new file mode 100644
index 000000000..8ede5ba95
--- /dev/null
+++ b/packages/app/src/testing/perf-metrics.test.ts
@@ -0,0 +1,119 @@
+import { describe, expect, test } from "bun:test"
+import { aggregatePerfRuns, summarizePerfRun } from "./perf-metrics"
+
+describe("perf metrics", () => {
+  test("summarizes a perf sample window", () => {
+    const summary = summarizePerfRun({
+      startedAt: 100,
+      endedAt: 200,
+      interactions: [
+        { at: 90, delay: 1, duration: 10 },
+        { at: 120, delay: 8, duration: 72 },
+        { at: 160, delay: 4, duration: 41 },
+      ],
+      longTasks: [
+        { at: 80, duration: 120 },
+        { at: 130, duration: 80 },
+        { at: 170, duration: 55 },
+      ],
+      frames: [
+        { at: 110, duration: 16 },
+        { at: 120, duration: 18 },
+        { at: 130, duration: 22 },
+        { at: 140, duration: 55 },
+        { at: 150, duration: 80 },
+      ],
+      shifts: [
+        { at: 95, value: 0.3 },
+        { at: 135, value: 0.01 },
+        { at: 165, value: 0.02 },
+      ],
+      fcpMs: 456.4,
+      lcpMs: 789.2,
+      heapUsedMb: 123.6,
+    })
+
+    expect(summary).toEqual({
+      interaction_ms: 72,
+      interaction_delay_ms: 8,
+      long_task_count: 2,
+      long_task_max_ms: 80,
+      tbt_ms: 35,
+      frame_gap_p95_ms: 80,
+      frame_gap_max_ms: 80,
+      jank_count_50ms: 2,
+      cls: 0.03,
+      window_ms: 100,
+      fcp_ms: 456.4,
+      lcp_ms: 789.2,
+      heap_used_mb: 123.6,
+    })
+  })
+
+  test("aggregates scenario medians and worst interaction", () => {
+    const summary = aggregatePerfRuns({
+      branch: "dev",
+      scenario: "tool-call-expand",
+      runs: [
+        {
+          interaction_ms: 40,
+          interaction_delay_ms: 6,
+          long_task_count: 1,
+          long_task_max_ms: 61,
+          tbt_ms: 11,
+          frame_gap_p95_ms: 28,
+          frame_gap_max_ms: 47,
+          jank_count_50ms: 0,
+          cls: 0.002,
+          window_ms: 1200,
+          fcp_ms: undefined,
+          lcp_ms: undefined,
+          heap_used_mb: 91,
+        },
+        {
+          interaction_ms: 75,
+          interaction_delay_ms: 10,
+          long_task_count: 2,
+          long_task_max_ms: 74,
+          tbt_ms: 24,
+          frame_gap_p95_ms: 35,
+          frame_gap_max_ms: 58,
+          jank_count_50ms: 1,
+          cls: 0.004,
+          window_ms: 1190,
+          fcp_ms: undefined,
+          lcp_ms: undefined,
+          heap_used_mb: 93,
+        },
+        {
+          interaction_ms: 52,
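+          // Middle-of-the-road run: most of the medians asserted below come from this sample.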
+          interaction_delay_ms: 7,
+          long_task_count: 1,
+          long_task_max_ms: 68,
+          tbt_ms: 18,
+          frame_gap_p95_ms: 30,
+          frame_gap_max_ms: 54,
+          jank_count_50ms: 1,
+          cls: 0.003,
+          window_ms: 1210,
+          fcp_ms: undefined,
+          lcp_ms: undefined,
+          heap_used_mb: 95,
+        },
+      ],
+    })
+
+    expect(summary.branch).toBe("dev")
+    expect(summary.scenario).toBe("tool-call-expand")
+    expect(summary.runs).toBe(3)
+    expect(summary.interaction_ms_median).toBe(52)
+    expect(summary.interaction_ms_worst).toBe(75)
+    expect(summary.long_task_max_ms).toBe(68)
+    expect(summary.tbt_ms).toBe(18)
+    expect(summary.frame_gap_p95_ms).toBe(30)
+    expect(summary.frame_gap_max_ms).toBe(54)
+    expect(summary.cls).toBe(0.003)
+    expect(summary.heap_used_mb).toBe(93)
+    expect(summary.run_details).toHaveLength(3)
+  })
+})
diff --git a/packages/app/src/testing/perf-metrics.ts b/packages/app/src/testing/perf-metrics.ts
new file mode 100644
index 000000000..59ba55ce3
--- /dev/null
+++ b/packages/app/src/testing/perf-metrics.ts
@@ -0,0 +1,148 @@
+export type PerfInteractionSample = {
+  at: number
+  delay: number
+  duration: number
+}
+
+export type PerfDurationSample = {
+  at: number
+  duration: number
+}
+
+export type PerfShiftSample = {
+  at: number
+  value: number
+}
+
+export type PerfRunSample = {
+  startedAt: number
+  endedAt: number
+  interactions: PerfInteractionSample[]
+  longTasks: PerfDurationSample[]
+  frames: PerfDurationSample[]
+  shifts: PerfShiftSample[]
+  fcpMs?: number
+  lcpMs?: number
+  heapUsedMb?: number
+}
+
+export type PerfRunSummary = {
+  interaction_ms: number
+  interaction_delay_ms: number
+  long_task_count: number
+  long_task_max_ms: number
+  tbt_ms: number
+  frame_gap_p95_ms: number
+  frame_gap_max_ms: number
+  jank_count_50ms: number
+  cls: number
+  window_ms: number
+  fcp_ms?: number
+  lcp_ms?: number
+  heap_used_mb?: number
+}
+
+export type PerfScenarioSummary = PerfRunSummary & {
+  branch: string
+  scenario: string
+  runs: number
+  interaction_ms_median: number
+  interaction_ms_worst: number
+  run_details: PerfRunSummary[]
+}
+
+function round(input: number) {
+  return Math.round(input * 1000) / 1000
+}
+
+function median(values: number[]) {
+  if (values.length === 0) return 0
+  const sorted = [...values].sort((a, b) => a - b)
+  const mid = Math.floor(sorted.length / 2)
+  return sorted.length % 2 === 0
+    ?
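+      // Even-length list: average the two middle values; odd-length: take the middle value.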
+      (sorted[mid - 1] + sorted[mid]) / 2
+    : sorted[mid]
+}
+
+function percentile(values: number[], p: number) {
+  if (values.length === 0) return 0
+  const sorted = [...values].sort((a, b) => a - b)
+  const index = Math.max(0, Math.ceil(sorted.length * p) - 1)
+  return sorted[index]
+}
+
+function pickWindow<T extends { at: number }>(items: T[], start: number, end: number) {
+  return items.filter((item) => item.at >= start && item.at <= end)
+}
+
+function optionalMedian(values: Array<number | undefined>) {
+  const filtered = values.filter((value): value is number => typeof value === "number")
+  if (filtered.length === 0) return undefined
+  return round(median(filtered))
+}
+
+export function summarizePerfRun(input: PerfRunSample): PerfRunSummary {
+  const startedAt = input.startedAt
+  const endedAt = Math.max(input.endedAt, startedAt)
+  const interactions = pickWindow(input.interactions, startedAt, endedAt)
+  const longTasks = pickWindow(input.longTasks, startedAt, endedAt)
+  const frames = pickWindow(input.frames, startedAt, endedAt)
+  const shifts = pickWindow(input.shifts, startedAt, endedAt)
+
+  const interaction = interactions.reduce((max, entry) => Math.max(max, entry.duration), 0)
+  const interactionDelay = interactions.reduce((max, entry) => Math.max(max, entry.delay), 0)
+  const longTaskMax = longTasks.reduce((max, entry) => Math.max(max, entry.duration), 0)
+  const tbt = longTasks.reduce((sum, entry) => sum + Math.max(0, entry.duration - 50), 0)
+  const frameDurations = frames.map((entry) => entry.duration)
+  const frameGapP95 = percentile(frameDurations, 0.95)
+  const frameGapMax = frameDurations.reduce((max, value) => Math.max(max, value), 0)
+  const jankCount = frameDurations.filter((value) => value > 50).length
+  const cls = shifts.reduce((sum, entry) => sum + entry.value, 0)
+
+  return {
+    interaction_ms: round(interaction),
+    interaction_delay_ms: round(interactionDelay),
+    long_task_count: longTasks.length,
+    long_task_max_ms: round(longTaskMax),
+    tbt_ms: round(tbt),
+    frame_gap_p95_ms: round(frameGapP95),
+    frame_gap_max_ms: round(frameGapMax),
+    jank_count_50ms: jankCount,
+    cls: round(cls),
+    window_ms: round(endedAt - startedAt),
+    fcp_ms: input.fcpMs === undefined ? undefined : round(input.fcpMs),
+    lcp_ms: input.lcpMs === undefined ? undefined : round(input.lcpMs),
+    heap_used_mb: input.heapUsedMb === undefined
+      ?
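+        // Optional metrics pass through as undefined rather than defaulting to zero.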
+        undefined
+      : round(input.heapUsedMb),
+  }
+}
+
+export function aggregatePerfRuns(input: {
+  branch: string
+  scenario: string
+  runs: PerfRunSummary[]
+}): PerfScenarioSummary {
+  const runs = input.runs
+  if (runs.length === 0) {
+    throw new Error(`Cannot aggregate perf runs for ${input.scenario} without samples`)
+  }
+
+  return {
+    branch: input.branch,
+    scenario: input.scenario,
+    runs: runs.length,
+    interaction_ms_median: round(median(runs.map((run) => run.interaction_ms))),
+    interaction_ms_worst: round(runs.reduce((max, run) => Math.max(max, run.interaction_ms), 0)),
+    interaction_ms: round(median(runs.map((run) => run.interaction_ms))),
+    interaction_delay_ms: round(median(runs.map((run) => run.interaction_delay_ms))),
+    long_task_count: round(median(runs.map((run) => run.long_task_count))),
+    long_task_max_ms: round(median(runs.map((run) => run.long_task_max_ms))),
+    tbt_ms: round(median(runs.map((run) => run.tbt_ms))),
+    frame_gap_p95_ms: round(median(runs.map((run) => run.frame_gap_p95_ms))),
+    frame_gap_max_ms: round(median(runs.map((run) => run.frame_gap_max_ms))),
+    jank_count_50ms: round(median(runs.map((run) => run.jank_count_50ms))),
+    cls: round(median(runs.map((run) => run.cls))),
+    window_ms: round(median(runs.map((run) => run.window_ms))),
+    fcp_ms: optionalMedian(runs.map((run) => run.fcp_ms)),
+    lcp_ms: optionalMedian(runs.map((run) => run.lcp_ms)),
+    heap_used_mb: optionalMedian(runs.map((run) => run.heap_used_mb)),
+    run_details: runs,
+  }
+}
diff --git a/packages/opencode/test/lib/llm-server.ts b/packages/opencode/test/lib/llm-server.ts
index 2e2a2ea89..6c7814576 100644
--- a/packages/opencode/test/lib/llm-server.ts
+++ b/packages/opencode/test/lib/llm-server.ts
@@ -35,6 +35,10 @@ type Wait = {
 type Sse = {
   type: "sse"
   head: unknown[]
+  stages?: Array<{
+    wait?: PromiseLike<void>
+    chunks: unknown[]
+  }>
   tail: unknown[]
   wait?: PromiseLike<void>
   hang?: boolean
@@ -288,9 +292,9 @@ function choices(part: unknown) {
   return choice
 }
 
-function flow(item: Sse) {
+function flowParts(parts: Iterable<unknown>) {
   const out: Flow[] = []
-  for (const part of [...item.head, ...item.tail]) {
+  for (const part of parts) {
     const choice = choices(part)
     const delta = choice && "delta" in choice && choice.delta && typeof choice.delta === "object" ? choice.delta : undefined
@@ -344,85 +348,100 @@ function responses(item: Sse, model: string) {
   } | undefined
   let usage: Usage | undefined
 
-  const lines: unknown[] = [responseCreated(model)]
-
-  for (const part of flow(item)) {
-    if (part.type === "text") {
-      msg ??= "msg_1"
-      if (!hasMsg) {
-        hasMsg = true
+  const pushFlow = (target: unknown[], parts: Iterable<unknown>) => {
+    for (const part of flowParts(parts)) {
+      if (part.type === "text") {
+        msg ??= "msg_1"
+        if (!hasMsg) {
+          hasMsg = true
+          seq += 1
+          target.push(responseMessage(msg, seq))
+        }
         seq += 1
-        lines.push(responseMessage(msg, seq))
+        target.push(responseText(msg, part.text, seq))
+        continue
       }
-      seq += 1
-      lines.push(responseText(msg, part.text, seq))
-      continue
-    }
-    if (part.type === "reason") {
-      reason ||= "rs_1"
-      if (!hasReason) {
-        hasReason = true
+      if (part.type === "reason") {
+        reason ||= "rs_1"
+        if (!hasReason) {
+          hasReason = true
+          seq += 1
+          target.push(responseReason(reason, seq))
+          seq += 1
+          target.push(responseReasonPart(reason, seq))
+        }
         seq += 1
-        lines.push(responseReason(reason, seq))
+        target.push(responseReasonText(reason, part.text, seq))
+        continue
+      }
+
+      if (part.type === "tool-start") {
+        call ||= { id: part.id, item: "fc_1", name: part.name, args: "" }
         seq += 1
-        lines.push(responseReasonPart(reason, seq))
+        target.push(responseTool(call.id, call.item, call.name, seq))
+        continue
       }
-      seq += 1
-      lines.push(responseReasonText(reason, part.text, seq))
-      continue
-    }
-    if (part.type === "tool-start") {
-      call ||= { id: part.id, item: "fc_1", name: part.name, args: "" }
-      seq += 1
-      lines.push(responseTool(call.id, call.item, call.name, seq))
-      continue
-    }
+      if (part.type === "tool-args") {
+        if (!call) continue
+        call.args += part.text
+        seq += 1
+        target.push(responseToolArgs(call.item, part.text, seq))
+        continue
+      }
-    if (part.type === "tool-args") {
-      if (!call) continue
-      call.args += part.text
-      seq += 1
-      lines.push(responseToolArgs(call.item, part.text, seq))
-      continue
+      usage = part.usage
     }
-
-    usage = part.usage
   }
+  const head = [responseCreated(model)]
+  pushFlow(head, item.head)
+  const stages =
+    item.stages?.map((stage) => {
+      const chunks: unknown[] = []
+      pushFlow(chunks, stage.chunks)
+      return {
+        wait: stage.wait,
+        chunks,
+      }
+    }) ?? []
+  const tail: unknown[] = []
+  pushFlow(tail, item.tail)
   if (msg) {
     seq += 1
-    lines.push(responseMessageDone(msg, seq))
+    tail.push(responseMessageDone(msg, seq))
   }
   if (reason) {
     seq += 1
-    lines.push(responseReasonDone(reason, seq))
+    tail.push(responseReasonDone(reason, seq))
   }
   if (call && !item.hang && !item.error) {
    seq += 1
-    lines.push(responseToolArgsDone(call.item, call.args, seq))
+    tail.push(responseToolArgsDone(call.item, call.args, seq))
     seq += 1
-    lines.push(responseToolDone(call, seq))
+    tail.push(responseToolDone(call, seq))
   }
-  if (!item.hang && !item.error) lines.push(responseCompleted({ seq: seq + 1, usage }))
-  return { ...item, head: lines, tail: [] } satisfies Sse
-}
-
-function modelFrom(body: unknown) {
-  if (!body || typeof body !== "object") return "test-model"
-  if (!("model" in body) || typeof body.model !== "string") return "test-model"
-  return body.model
+  if (!item.hang && !item.error) tail.push(responseCompleted({ seq: seq + 1, usage }))
+  return { ...item, head, stages, tail } satisfies Sse
 }
 
 function send(item: Sse) {
   const head = bytes(item.head)
+  const segments = item.stages ?? []
   const tail = bytes([...item.tail, ...(item.hang || item.error ? [] : [done])])
   const empty = Stream.fromIterable([])
+
+  let body: Stream.Stream<Uint8Array> = head
+  for (const stage of segments) {
+    const chunkStream = bytes(stage.chunks)
+    const wait = stage.wait
+    const segment = wait ? Stream.fromEffect(Effect.promise(() => wait)).pipe(Stream.flatMap(() => chunkStream)) : chunkStream
+    body = Stream.concat(body, segment)
+  }
   const wait = item.wait
-  const body: Stream.Stream<Uint8Array> = wait
-    ? Stream.concat(head, Stream.fromEffect(Effect.promise(() => wait)).pipe(Stream.flatMap(() => tail)))
-    : Stream.concat(head, tail)
+  body = Stream.concat(body, wait ? Stream.fromEffect(Effect.promise(() => wait)).pipe(Stream.flatMap(() => tail)) : tail)
+
   let end: Stream.Stream<Uint8Array, unknown> = empty
   if (item.error) end = Stream.concat(empty, Stream.fail(item.error))
   else if (item.hang) end = Stream.concat(empty, Stream.never)
@@ -430,23 +449,32 @@ function send(item: Sse) {
   return HttpServerResponse.stream(Stream.concat(body, end), { contentType: "text/event-stream" })
 }
 
+function fail(item: HttpError) {
+  return HttpServerResponse.text(JSON.stringify(item.body), {
+    status: item.status,
+    contentType: "application/json",
+  })
+}
+
 const reset = Effect.fn("TestLLMServer.reset")(function* (item: Sse) {
   const req = yield* HttpServerRequest.HttpServerRequest
   const res = NodeHttpServerRequest.toServerResponse(req)
   yield* Effect.sync(() => {
     res.writeHead(200, { "content-type": "text/event-stream" })
     for (const part of item.head) res.write(line(part))
+    for (const stage of item.stages ?? []) {
+      for (const part of stage.chunks) res.write(line(part))
+    }
     for (const part of item.tail) res.write(line(part))
     res.destroy(new Error("connection reset"))
   })
   return yield* Effect.never
 })
 
-function fail(item: HttpError) {
-  return HttpServerResponse.text(JSON.stringify(item.body), {
-    status: item.status,
-    contentType: "application/json",
-  })
+function modelFrom(body: unknown) {
+  if (!body || typeof body !== "object") return "test-model"
+  if (!("model" in body) || typeof body.model !== "string") return "test-model"
+  return body.model
 }
@@ -568,6 +596,10 @@ export function httpError(status: number, body: unknown): Item {
 export function raw(input: {
   chunks?: unknown[]
   head?: unknown[]
+  stages?: Array<{
+    wait?: PromiseLike<void>
+    chunks: unknown[]
+  }>
   tail?: unknown[]
   wait?: PromiseLike<void>
   hang?: boolean
@@ -577,6 +609,7 @@ export function raw(input: {
   return {
     type: "sse",
     head: input.head ?? input.chunks ?? [],
+    stages: input.stages,
     tail: input.tail ?? [],
     wait: input.wait,
     hang: input.hang,