diff --git a/bin/test b/bin/test new file mode 100755 index 00000000..166495fe --- /dev/null +++ b/bin/test @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# Runs the same checks as CI by parsing .github/workflows/ci.yml directly. +# If CI steps change, this script automatically picks them up. +# +# Local adaptations: +# - `npm ci` checks if node_modules is in sync with package-lock.json +# and runs a clean install if not (CI always does npm ci). +# - `npm run format:check` checks only git-tracked files because CI +# runs on a clean checkout but locally we have untracked x.* scratch +# files that fail prettier. +set -euo pipefail + +cd "$(git rev-parse --show-toplevel)" + +ci_yaml=".github/workflows/ci.yml" + +if ! command -v yq &>/dev/null; then + echo "error: yq is required (brew install yq)" >&2 + exit 1 +fi + +# Extract run steps +mapfile -t names < <(yq '.jobs.build.steps[] | select(.run) | .name' "$ci_yaml") +mapfile -t commands < <(yq '.jobs.build.steps[] | select(.run) | .run' "$ci_yaml") + +for i in "${!commands[@]}"; do + cmd="${commands[$i]}" + name="${names[$i]}" + + echo "=== ${name} ===" + + if [[ "$cmd" == "npm ci" ]]; then + # Check if node_modules matches package-lock.json. If not, run + # npm ci to match what CI does. This catches stale-dependency bugs + # like sdk-tools.d.ts resolving locally but not in CI. 
+ if npm ls --all >/dev/null 2>&1; then + echo "(node_modules in sync — skipping npm ci)" + else + echo "(node_modules out of sync — running npm ci)" + npm ci + fi + elif [[ "$cmd" == "npm run format:check" ]]; then + # Local override: format:check on git-tracked files only + git ls-files -z '*.ts' '*.tsx' '*.js' '*.jsx' '*.json' '*.md' '*.yml' '*.yaml' '*.css' '*.html' \ + | xargs -0 npx prettier --check + else + eval "$cmd" + fi + + echo "" +done + +echo "=== All CI checks passed ===" diff --git a/src/acp-agent.ts b/src/acp-agent.ts index 864d027f..17c38bc3 100644 --- a/src/acp-agent.ts +++ b/src/acp-agent.ts @@ -482,6 +482,8 @@ export class ClaudeAcpAgent implements Agent { }; let lastAssistantTotalUsage: number | null = null; + let lastAssistantModel: string | null = null; + let lastContextWindowSize: number = 200000; const userMessage = promptToClaude(params); @@ -538,9 +540,26 @@ export class ClaudeAcpAgent implements Agent { break; } case "compact_boundary": { - // We don't know the exact size, but since we compacted, - // we set it to zero. The client gets the exact size on the next message. + // Send used:0 immediately so the client doesn't keep showing + // the stale pre-compaction context size until the next turn. + // + // This is a deliberate approximation: we don't know the exact + // post-compaction token count (only the SDK's next API call + // reveals that). But used:0 is directionally correct — context + // just dropped dramatically — and the real value replaces it + // within seconds when the next result message arrives. + // The alternative (no update) leaves the client showing e.g. + // "944k/1m" right after the user sees "Compacting completed", + // which is confusing and wrong. 
lastAssistantTotalUsage = 0; + await this.client.sessionUpdate({ + sessionId: message.session_id, + update: { + sessionUpdate: "usage_update", + used: 0, + size: lastContextWindowSize, + }, + }); await this.client.sessionUpdate({ sessionId: message.session_id, update: { @@ -584,10 +603,23 @@ export class ClaudeAcpAgent implements Agent { session.accumulatedUsage.cachedReadTokens += message.usage.cache_read_input_tokens; session.accumulatedUsage.cachedWriteTokens += message.usage.cache_creation_input_tokens; - // Calculate context window size from modelUsage (minimum across all models used) - const contextWindows = Object.values(message.modelUsage).map((m) => m.contextWindow); - const contextWindowSize = - contextWindows.length > 0 ? Math.min(...contextWindows) : 200000; + // Calculate context window size from the current model's usage. + // The modelUsage keys may use the requested model alias (e.g. "claude-opus-4-6") + // while message.model on assistant messages has the resolved API response model + // (e.g. "claude-opus-4-6-20250514"), so we fall back to prefix matching. + const currentModel = lastAssistantModel; + const matchingModelUsage = currentModel + ? (message.modelUsage[currentModel] ?? + Object.entries(message.modelUsage) + .filter(([key]) => currentModel.startsWith(key) || key.startsWith(currentModel)) + .sort((a, b) => b[0].length - a[0].length)[0]?.[1]) + : undefined; + // Fallback to 200k: this is hit when lastAssistantModel is null (e.g. the + // assistant message lacked a model field) or no modelUsage key matches. + // 200k is a conservative default — the Anthropic API should always populate + // BetaMessage.model, so this path is unlikely in practice. + const contextWindowSize = matchingModelUsage?.contextWindow ?? 
200000; + lastContextWindowSize = contextWindowSize; // Send usage_update notification if (lastAssistantTotalUsage !== null) { @@ -716,6 +748,11 @@ export class ClaudeAcpAgent implements Agent { } // Store latest assistant usage (excluding subagents) + // Sum all token types as a proxy for post-turn context occupancy: + // current turn's output will become next turn's input. + // Note: per the Anthropic API, input_tokens excludes cache tokens — + // cache_read and cache_creation are reported separately, so summing + // all four fields is not double-counting. if ((message.message as any).usage && message.parent_tool_use_id === null) { const messageWithUsage = message.message as unknown as SDKResultMessage; lastAssistantTotalUsage = @@ -724,6 +761,16 @@ export class ClaudeAcpAgent implements Agent { messageWithUsage.usage.cache_read_input_tokens + messageWithUsage.usage.cache_creation_input_tokens; } + // Track the current top-level model for context window size lookup + // (exclude subagent messages to stay in sync with lastAssistantTotalUsage) + if ( + message.type === "assistant" && + message.parent_tool_use_id === null && + message.message.model && + message.message.model !== "" + ) { + lastAssistantModel = message.message.model; + } // Slash commands like /compact can generate invalid output... 
doesn't match // their own docs: https://docs.anthropic.com/en/docs/claude-code/sdk/sdk-slash-commands#%2Fcompact-compact-conversation-history diff --git a/src/tests/acp-agent.test.ts b/src/tests/acp-agent.test.ts index 3a0d8d43..b47f896a 100644 --- a/src/tests/acp-agent.test.ts +++ b/src/tests/acp-agent.test.ts @@ -1581,3 +1581,431 @@ describe("session/close", () => { expect(agent.sessions["session-b"]).toBeDefined(); }); }); + +describe("usage_update computation", () => { + function createAssistantMessage(overrides: { + model: string; + usage?: { + input_tokens: number; + output_tokens: number; + cache_read_input_tokens: number; + cache_creation_input_tokens: number; + }; + }) { + return { + type: "assistant" as const, + parent_tool_use_id: null, + uuid: randomUUID(), + session_id: "test-session", + message: { + model: overrides.model, + content: [{ type: "text", text: "hello" }], + usage: overrides.usage ?? { + input_tokens: 100, + output_tokens: 50, + cache_read_input_tokens: 20, + cache_creation_input_tokens: 10, + }, + }, + }; + } + + function createResultMessageWithModel(overrides: { + modelUsage: Record< + string, + { + inputTokens: number; + outputTokens: number; + cacheReadInputTokens: number; + cacheCreationInputTokens: number; + webSearchRequests: number; + costUSD: number; + contextWindow: number; + maxOutputTokens: number; + } + >; + }) { + return { + type: "result" as const, + subtype: "success" as const, + stop_reason: "end_turn", + is_error: false, + result: "", + errors: [], + duration_ms: 0, + duration_api_ms: 0, + num_turns: 1, + total_cost_usd: 0.01, + usage: { + input_tokens: 10, + output_tokens: 5, + cache_read_input_tokens: 0, + cache_creation_input_tokens: 0, + }, + modelUsage: overrides.modelUsage, + permission_denials: [], + uuid: randomUUID(), + session_id: "test-session", + }; + } + + function createMockAgentWithCapture() { + const updates: any[] = []; + const mockClient = { + sessionUpdate: async (notification: any) => { + 
updates.push(notification); + }, + } as unknown as AgentSideConnection; + const agent = new ClaudeAcpAgent(mockClient, { log: () => {}, error: () => {} }); + return { agent, updates }; + } + + function injectSession(agent: ClaudeAcpAgent, messages: any[]) { + const input = new Pushable(); + async function* messageGenerator() { + // Wait for the prompt to push its user message so we can replay it + const iter = input[Symbol.asyncIterator](); + const { value: userMessage, done } = await iter.next(); + if (!done && userMessage) { + yield { + type: "user", + message: userMessage.message, + parent_tool_use_id: null, + uuid: userMessage.uuid, + session_id: "test-session", + isReplay: true, + }; + } + yield* messages; + } + agent.sessions["test-session"] = { + query: messageGenerator() as any, + input, + cancelled: false, + cwd: "/test", + permissionMode: "default", + settingsManager: {} as any, + accumulatedUsage: { + inputTokens: 0, + outputTokens: 0, + cachedReadTokens: 0, + cachedWriteTokens: 0, + }, + configOptions: [], + promptRunning: false, + pendingMessages: new Map(), + nextPendingOrder: 0, + abortController: new AbortController(), + }; + } + + it("used sums all token types as post-turn context occupancy proxy", async () => { + const { agent, updates } = createMockAgentWithCapture(); + injectSession(agent, [ + createAssistantMessage({ + model: "claude-opus-4-20250514", + usage: { + input_tokens: 1000, + output_tokens: 500, + cache_read_input_tokens: 200, + cache_creation_input_tokens: 100, + }, + }), + createResultMessageWithModel({ + modelUsage: { + "claude-opus-4-20250514": { + inputTokens: 1000, + outputTokens: 500, + cacheReadInputTokens: 200, + cacheCreationInputTokens: 100, + webSearchRequests: 0, + costUSD: 0.01, + contextWindow: 1000000, + maxOutputTokens: 16384, + }, + }, + }), + ]); + + await agent.prompt({ sessionId: "test-session", prompt: [{ type: "text", text: "test" }] }); + + const usageUpdate = updates.find((u: any) => u.update?.sessionUpdate 
=== "usage_update"); + expect(usageUpdate).toBeDefined(); + // used = input(1000) + output(500) + cache_read(200) + cache_creation(100) = 1800 + expect(usageUpdate.update.used).toBe(1800); + }); + + it("size reflects the current model's context window, not min across all", async () => { + const { agent, updates } = createMockAgentWithCapture(); + injectSession(agent, [ + createAssistantMessage({ model: "claude-opus-4-20250514" }), + createResultMessageWithModel({ + modelUsage: { + "claude-opus-4-20250514": { + inputTokens: 100, + outputTokens: 50, + cacheReadInputTokens: 20, + cacheCreationInputTokens: 10, + webSearchRequests: 0, + costUSD: 0.01, + contextWindow: 1000000, + maxOutputTokens: 16384, + }, + "claude-sonnet-4-20250514": { + inputTokens: 50, + outputTokens: 25, + cacheReadInputTokens: 10, + cacheCreationInputTokens: 5, + webSearchRequests: 0, + costUSD: 0.005, + contextWindow: 200000, + maxOutputTokens: 16384, + }, + }, + }), + ]); + + await agent.prompt({ sessionId: "test-session", prompt: [{ type: "text", text: "test" }] }); + + const usageUpdate = updates.find((u: any) => u.update?.sessionUpdate === "usage_update"); + expect(usageUpdate).toBeDefined(); + // size should be 1000000 (Opus), not 200000 (min of both) + expect(usageUpdate.update.size).toBe(1000000); + }); + + it("after model switch, size updates to the new model's window", async () => { + const { agent, updates } = createMockAgentWithCapture(); + // Simulate: assistant on Sonnet with both models in modelUsage + injectSession(agent, [ + createAssistantMessage({ model: "claude-sonnet-4-20250514" }), + createResultMessageWithModel({ + modelUsage: { + "claude-opus-4-20250514": { + inputTokens: 100, + outputTokens: 50, + cacheReadInputTokens: 20, + cacheCreationInputTokens: 10, + webSearchRequests: 0, + costUSD: 0.01, + contextWindow: 1000000, + maxOutputTokens: 16384, + }, + "claude-sonnet-4-20250514": { + inputTokens: 50, + outputTokens: 25, + cacheReadInputTokens: 10, + 
cacheCreationInputTokens: 5, + webSearchRequests: 0, + costUSD: 0.005, + contextWindow: 200000, + maxOutputTokens: 16384, + }, + }, + }), + ]); + + await agent.prompt({ sessionId: "test-session", prompt: [{ type: "text", text: "test" }] }); + + const usageUpdate = updates.find((u: any) => u.update?.sessionUpdate === "usage_update"); + expect(usageUpdate).toBeDefined(); + // size should be 200000 (Sonnet - the current model) + expect(usageUpdate.update.size).toBe(200000); + }); + + it("after switching back to original model, size returns to original window", async () => { + const { agent, updates } = createMockAgentWithCapture(); + // Last assistant message is Opus again + injectSession(agent, [ + createAssistantMessage({ model: "claude-sonnet-4-20250514" }), + createAssistantMessage({ model: "claude-opus-4-20250514" }), + createResultMessageWithModel({ + modelUsage: { + "claude-opus-4-20250514": { + inputTokens: 200, + outputTokens: 100, + cacheReadInputTokens: 40, + cacheCreationInputTokens: 20, + webSearchRequests: 0, + costUSD: 0.02, + contextWindow: 1000000, + maxOutputTokens: 16384, + }, + "claude-sonnet-4-20250514": { + inputTokens: 50, + outputTokens: 25, + cacheReadInputTokens: 10, + cacheCreationInputTokens: 5, + webSearchRequests: 0, + costUSD: 0.005, + contextWindow: 200000, + maxOutputTokens: 16384, + }, + }, + }), + ]); + + await agent.prompt({ sessionId: "test-session", prompt: [{ type: "text", text: "test" }] }); + + const usageUpdate = updates.find((u: any) => u.update?.sessionUpdate === "usage_update"); + expect(usageUpdate).toBeDefined(); + // size should be 1000000 (Opus - switched back) + expect(usageUpdate.update.size).toBe(1000000); + }); + + it("subagent assistant messages do not affect size (top-level model is used)", async () => { + const { agent, updates } = createMockAgentWithCapture(); + // Top-level assistant on Opus, then subagent on Haiku (parent_tool_use_id set) + injectSession(agent, [ + createAssistantMessage({ model: 
"claude-opus-4-20250514" }), + { + type: "assistant" as const, + parent_tool_use_id: "tool_use_123", + uuid: randomUUID(), + session_id: "test-session", + message: { + model: "claude-haiku-4-5-20251001", + content: [{ type: "text", text: "subagent response" }], + usage: { + input_tokens: 50, + output_tokens: 25, + cache_read_input_tokens: 0, + cache_creation_input_tokens: 0, + }, + }, + }, + createResultMessageWithModel({ + modelUsage: { + "claude-opus-4-20250514": { + inputTokens: 100, + outputTokens: 50, + cacheReadInputTokens: 20, + cacheCreationInputTokens: 10, + webSearchRequests: 0, + costUSD: 0.01, + contextWindow: 1000000, + maxOutputTokens: 16384, + }, + "claude-haiku-4-5-20251001": { + inputTokens: 50, + outputTokens: 25, + cacheReadInputTokens: 0, + cacheCreationInputTokens: 0, + webSearchRequests: 0, + costUSD: 0.001, + contextWindow: 200000, + maxOutputTokens: 8192, + }, + }, + }), + ]); + + await agent.prompt({ sessionId: "test-session", prompt: [{ type: "text", text: "test" }] }); + + const usageUpdate = updates.find((u: any) => u.update?.sessionUpdate === "usage_update"); + expect(usageUpdate).toBeDefined(); + // size should be 1000000 (Opus - the top-level model), NOT 200000 (Haiku subagent) + expect(usageUpdate.update.size).toBe(1000000); + }); + + it("prefix-matches when assistant model has date suffix but modelUsage key does not", async () => { + const { agent, updates } = createMockAgentWithCapture(); + // The API response has the full versioned model ID on assistant messages, + // but the SDK's streaming path may key modelUsage by the shorter alias. 
+ injectSession(agent, [ + createAssistantMessage({ model: "claude-opus-4-6-20250514" }), + createResultMessageWithModel({ + modelUsage: { + "claude-opus-4-6": { + inputTokens: 100, + outputTokens: 50, + cacheReadInputTokens: 20, + cacheCreationInputTokens: 10, + webSearchRequests: 0, + costUSD: 0.01, + contextWindow: 1000000, + maxOutputTokens: 16384, + }, + }, + }), + ]); + + await agent.prompt({ sessionId: "test-session", prompt: [{ type: "text", text: "test" }] }); + + const usageUpdate = updates.find((u: any) => u.update?.sessionUpdate === "usage_update"); + expect(usageUpdate).toBeDefined(); + // Should match via prefix: "claude-opus-4-6-20250514".startsWith("claude-opus-4-6") + expect(usageUpdate.update.size).toBe(1000000); + }); + + it("prefix-matches when modelUsage key has date suffix but assistant model does not", async () => { + const { agent, updates } = createMockAgentWithCapture(); + injectSession(agent, [ + createAssistantMessage({ model: "claude-opus-4-6" }), + createResultMessageWithModel({ + modelUsage: { + "claude-opus-4-6-20250514": { + inputTokens: 100, + outputTokens: 50, + cacheReadInputTokens: 20, + cacheCreationInputTokens: 10, + webSearchRequests: 0, + costUSD: 0.01, + contextWindow: 1000000, + maxOutputTokens: 16384, + }, + }, + }), + ]); + + await agent.prompt({ sessionId: "test-session", prompt: [{ type: "text", text: "test" }] }); + + const usageUpdate = updates.find((u: any) => u.update?.sessionUpdate === "usage_update"); + expect(usageUpdate).toBeDefined(); + expect(usageUpdate.update.size).toBe(1000000); + }); + + it("synthetic assistant messages do not override lastAssistantModel", async () => { + const { agent, updates } = createMockAgentWithCapture(); + // Real assistant on Opus, then a synthetic message (e.g. 
from /compact)
+ injectSession(agent, [
+ createAssistantMessage({ model: "claude-opus-4-20250514" }),
+ {
+ type: "assistant" as const,
+ parent_tool_use_id: null,
+ uuid: randomUUID(),
+ session_id: "test-session",
+ message: {
+ model: "",
+ content: [{ type: "text", text: "compacted" }],
+ usage: {
+ input_tokens: 0,
+ output_tokens: 0,
+ cache_read_input_tokens: 0,
+ cache_creation_input_tokens: 0,
+ },
+ },
+ },
+ createResultMessageWithModel({
+ modelUsage: {
+ "claude-opus-4-20250514": {
+ inputTokens: 100,
+ outputTokens: 50,
+ cacheReadInputTokens: 20,
+ cacheCreationInputTokens: 10,
+ webSearchRequests: 0,
+ costUSD: 0.01,
+ contextWindow: 1000000,
+ maxOutputTokens: 16384,
+ },
+ },
+ }),
+ ]);
+
+ await agent.prompt({ sessionId: "test-session", prompt: [{ type: "text", text: "test" }] });
+
+ const usageUpdate = updates.find((u: any) => u.update?.sessionUpdate === "usage_update");
+ expect(usageUpdate).toBeDefined();
+ // size should be 1000000 (Opus), not 200000 (the fallback that would apply if the synthetic message had overridden the model)
+ expect(usageUpdate.update.size).toBe(1000000);
+ });
+});