diff --git a/.github/workflows/pr-quality.yml b/.github/workflows/pr-quality.yml index 28ce016a2..78d143341 100644 --- a/.github/workflows/pr-quality.yml +++ b/.github/workflows/pr-quality.yml @@ -37,6 +37,9 @@ jobs: with: bun-version: latest + - name: Install dependencies + run: bun install + - name: Run policy tests run: bun run check:policy diff --git a/src/constants/system.ts b/src/constants/system.ts index d6d3efe13..7122c6bfb 100644 --- a/src/constants/system.ts +++ b/src/constants/system.ts @@ -47,13 +47,14 @@ export function getCLISyspromptPrefix(options?: { /** * Check if attribution header is enabled. - * Enabled by default, can be disabled via env var or GrowthBook killswitch. + * Disabled by default to preserve prompt cache stability. + * A falsy CLAUDE_CODE_ATTRIBUTION_HEADER env var forces it off; only the GrowthBook feature flag can turn it on. */ function isAttributionHeaderEnabled(): boolean { if (isEnvDefinedFalsy(process.env.CLAUDE_CODE_ATTRIBUTION_HEADER)) { return false } - return getFeatureValue_CACHED_MAY_BE_STALE('tengu_attribution_header', true) + return getFeatureValue_CACHED_MAY_BE_STALE('tengu_attribution_header', false) } /** diff --git a/src/query.ts b/src/query.ts index ccd17a0ff..085a00154 100644 --- a/src/query.ts +++ b/src/query.ts @@ -7,8 +7,13 @@ import type { CanUseToolFn } from './hooks/useCanUseTool.js' import { FallbackTriggeredError } from './services/api/withRetry.js' import { calculateTokenWarningState, + COMPACT_PRECHECK_FOLD_RATIO, + estimateTurnStartUsage, + getEffectiveContextWindowSize, isAutoCompactEnabled, + shouldPreFold, type AutoCompactTrackingState, + type CacheMetrics, } from './services/compact/autoCompact.js' import { buildPostCompactMessages } from './services/compact/compact.js' /* eslint-disable @typescript-eslint/no-require-imports */ @@ -452,7 +457,47 @@ async function* queryLoop( ) queryCheckpoint('query_autocompact_start') - const { compactionResult, consecutiveFailures } = await deps.autocompact( + + // Turn-start
pre-estimation: check whether accumulated context from the + // last turn has pushed us into dangerous territory BEFORE the next API + // call. When the 90% threshold is crossed and we haven't already folded + // this turn, force a pre-fold via the existing autocompact pipeline. + let forcePreFold = false + if (feature('TURN_START_PRE_ESTIMATION')) { + const effectiveWindow = getEffectiveContextWindowSize( + toolUseContext.options.mainLoopModel, + ) + const { ratio, estimateTokens } = estimateTurnStartUsage( + messagesForQuery, + effectiveWindow, + ) + if ( + shouldPreFold(tracking, estimateTokens, effectiveWindow) + ) { + forcePreFold = true + logForDebugging( + `turnStartPreEstimate: context at ${(ratio * 100).toFixed(1)}% ` + + `(~${estimateTokens.toLocaleString()} tokens) — forcing pre-fold before API call`, + { level: 'warn' }, + ) + logEvent('tengu_turn_start_prefold_triggered', { + estimatedTokens: estimateTokens, + ratio: Math.round(ratio * 100), + }) + } else if (ratio >= COMPACT_PRECHECK_FOLD_RATIO) { + // Above threshold but suppressed by alreadyFoldedThisTurn + logForDebugging( + `turnStartPreEstimate: context at ${(ratio * 100).toFixed(1)}% ` + + `but pre-fold suppressed (already folded this turn)`, + ) + } + } + + const { + compactionResult, + consecutiveFailures, + cacheMetrics, + } = await deps.autocompact( messagesForQuery, toolUseContext, { @@ -465,9 +510,18 @@ async function* queryLoop( querySource, tracking, snipTokensFreed, + forcePreFold, ) queryCheckpoint('query_autocompact_end') + if (cacheMetrics) { + logForDebugging( + `cacheMetrics.compaction: hit=${cacheMetrics.cacheHitTokens.toLocaleString()} ` + + `miss=${cacheMetrics.cacheMissTokens.toLocaleString()} ` + + `ratio=${(cacheMetrics.cacheHitRatio * 100).toFixed(1)}%`, + ) + } + if (compactionResult) { const { preCompactTokenCount, @@ -491,6 +545,7 @@ async function* queryLoop( compactionUsage?.cache_read_input_tokens ?? 
0, compactionCacheCreationTokens: compactionUsage?.cache_creation_input_tokens ?? 0, + compactionCacheHitRatio: cacheMetrics?.cacheHitRatio ?? 0, compactionTotalTokens: compactionUsage ? compactionUsage.input_tokens + (compactionUsage.cache_creation_input_tokens ?? 0) + diff --git a/src/server/proxy/handler.ts b/src/server/proxy/handler.ts index cb0fc7e48..75f6a76f6 100644 --- a/src/server/proxy/handler.ts +++ b/src/server/proxy/handler.ts @@ -19,6 +19,7 @@ import { openaiResponsesToAnthropic } from './transform/openaiResponsesToAnthrop import { openaiChatStreamToAnthropic } from './streaming/openaiChatStreamToAnthropic.js' import { openaiResponsesStreamToAnthropic } from './streaming/openaiResponsesStreamToAnthropic.js' import type { AnthropicRequest } from './transform/types.js' +import { normalizeModelStringForAPI } from '../../utils/model/model.js' import { getProxyFetchOptions } from '../../utils/proxy.js' import { getManualNetworkProxyUrl, loadNetworkSettings } from '../services/networkSettings.js' @@ -128,6 +129,9 @@ export async function handleProxyRequest(req: Request, url: URL): Promise { + const normalizedModelId = normalizeModelStringForAPI(modelId) const start = Date.now() try { - const { url, headers, body } = buildDirectTestRequest(base, apiKey, modelId, format, authStrategy) + const { url, headers, body } = buildDirectTestRequest(base, apiKey, normalizedModelId, format, authStrategy) const proxyOptions = getProxyFetchOptions({ proxyUrl: getManualNetworkProxyUrl(networkSettings) }) const response = await fetch(url, { method: 'POST', @@ -497,22 +499,22 @@ export class ProviderService { if (resBody?.error && typeof resBody.error === 'object') { error = ((resBody.error as Record).message as string) || error } - return { success: false, latencyMs, error, modelUsed: modelId, httpStatus: response.status } + return { success: false, latencyMs, error, modelUsed: normalizedModelId, httpStatus: response.status } } // Validate response structure const valid = 
validateResponseBody(resBody, format) if (!valid.ok) { - return { success: false, latencyMs, error: valid.error, modelUsed: modelId, httpStatus: response.status } + return { success: false, latencyMs, error: valid.error, modelUsed: normalizedModelId, httpStatus: response.status } } - return { success: true, latencyMs, modelUsed: valid.model || modelId, httpStatus: response.status } + return { success: true, latencyMs, modelUsed: valid.model || normalizedModelId, httpStatus: response.status } } catch (err: unknown) { const latencyMs = Date.now() - start if (err instanceof DOMException && err.name === 'TimeoutError') { - return { success: false, latencyMs, error: `Request timed out (${Math.round(networkSettings.aiRequestTimeoutMs / 1000)}s)`, modelUsed: modelId } + return { success: false, latencyMs, error: `Request timed out (${Math.round(networkSettings.aiRequestTimeoutMs / 1000)}s)`, modelUsed: normalizedModelId } } - return { success: false, latencyMs, error: err instanceof Error ? err.message : String(err), modelUsed: modelId } + return { success: false, latencyMs, error: err instanceof Error ? err.message : String(err), modelUsed: normalizedModelId } } } @@ -524,11 +526,12 @@ export class ProviderService { format: 'openai_chat' | 'openai_responses', networkSettings: NetworkSettings, ): Promise { + const normalizedModelId = normalizeModelStringForAPI(modelId) const start = Date.now() try { // Build an Anthropic Messages API request (same shape as what CLI sends) const anthropicReq: AnthropicRequest = { - model: modelId, + model: normalizedModelId, max_tokens: 64, messages: [{ role: 'user', content: 'Say "ok" and nothing else.' 
}], } @@ -557,31 +560,31 @@ export class ProviderService { if (!response.ok) { const latencyMs = Date.now() - start const errText = await response.text().catch(() => '') - return { success: false, latencyMs, modelUsed: modelId, httpStatus: response.status, + return { success: false, latencyMs, modelUsed: normalizedModelId, httpStatus: response.status, error: `Upstream HTTP ${response.status}: ${errText.slice(0, 200)}` } } // Transform response back to Anthropic format const responseBody = await response.json() const anthropicRes = format === 'openai_chat' - ? openaiChatToAnthropic(responseBody, modelId) - : openaiResponsesToAnthropic(responseBody, modelId) + ? openaiChatToAnthropic(responseBody, normalizedModelId) + : openaiResponsesToAnthropic(responseBody, normalizedModelId) const latencyMs = Date.now() - start // Validate the final Anthropic response if (anthropicRes.type !== 'message' || !Array.isArray(anthropicRes.content)) { - return { success: false, latencyMs, modelUsed: modelId, + return { success: false, latencyMs, modelUsed: normalizedModelId, error: 'Proxy transform produced invalid Anthropic response' } } - return { success: true, latencyMs, modelUsed: anthropicRes.model || modelId, httpStatus: response.status } + return { success: true, latencyMs, modelUsed: anthropicRes.model || normalizedModelId, httpStatus: response.status } } catch (err: unknown) { const latencyMs = Date.now() - start if (err instanceof DOMException && err.name === 'TimeoutError') { - return { success: false, latencyMs, error: `Proxy pipeline timed out (${Math.round(networkSettings.aiRequestTimeoutMs / 1000)}s)`, modelUsed: modelId } + return { success: false, latencyMs, error: `Proxy pipeline timed out (${Math.round(networkSettings.aiRequestTimeoutMs / 1000)}s)`, modelUsed: normalizedModelId } } - return { success: false, latencyMs, error: err instanceof Error ? err.message : String(err), modelUsed: modelId } + return { success: false, latencyMs, error: err instanceof Error ? 
err.message : String(err), modelUsed: normalizedModelId } } } } diff --git a/src/services/compact/autoCompact.ts b/src/services/compact/autoCompact.ts index 8c0a786d7..aeb67d8ca 100644 --- a/src/services/compact/autoCompact.ts +++ b/src/services/compact/autoCompact.ts @@ -57,6 +57,10 @@ export type AutoCompactTrackingState = { // Used as a circuit breaker to stop retrying when the context is // irrecoverably over the limit (e.g., prompt_too_long). consecutiveFailures?: number + // True when compaction has already run this turn (prevents double-fold). + // Mirrors Reasonix's alreadyFoldedThisTurn — the post-response check + // should not trigger a second fold if the pre-check already folded. + alreadyFoldedThisTurn?: boolean } export const AUTOCOMPACT_BUFFER_TOKENS = 13_000 @@ -64,6 +68,56 @@ export const WARNING_THRESHOLD_BUFFER_TOKENS = 20_000 export const ERROR_THRESHOLD_BUFFER_TOKENS = 20_000 export const MANUAL_COMPACT_BUFFER_TOKENS = 3_000 +// --------------------------------------------------------------------------- +// Percentage-based multi-level compaction thresholds (supplement, not replace) +// +// The fixed-buffer threshold (effectiveWindow - 13_000) is the "final defense" +// at ~93-98% of the window. These percentage thresholds provide earlier, +// gentler interventions that work well across all context window sizes +// (200K through 1M+). 
+// --------------------------------------------------------------------------- + +/** Normal fold: compact older messages, keep 20% of context window as tail budget */ +export const COMPACT_NORMAL_FOLD_RATIO = 0.75 +export const COMPACT_NORMAL_FOLD_TAIL_RATIO = 0.20 + +/** Aggressive fold: compact harder, keep 10% of context window as tail budget */ +export const COMPACT_AGGRESSIVE_FOLD_RATIO = 0.78 +export const COMPACT_AGGRESSIVE_FOLD_TAIL_RATIO = 0.10 + +/** Force summary exit: stop the agent with a summary — no more room for folds */ +export const COMPACT_FORCE_SUMMARY_RATIO = 0.80 + +/** Turn-start pre-fold: pre-check before the API call (used by estimateTurnStartUsage) */ +export const COMPACT_PRECHECK_FOLD_RATIO = 0.90 + +/** + * Compaction levels ordered by severity. + * - none / turn_start_prefold are soft checks + * - normal_fold / aggressive_fold are actual compactions with tail budgets + * - force_summary / fixed_buffer are hard exits (no more folds) + */ +export type CompactionLevel = + | 'none' + | 'turn_start_prefold' + | 'normal_fold' + | 'aggressive_fold' + | 'force_summary' + | 'fixed_buffer' + +export type CompactionLevelResult = { + level: CompactionLevel + /** Token budget for the recent tail when level is normal_fold or aggressive_fold */ + tailBudgetTokens: number + effectiveWindow: number + fixedBufferThreshold: number +} + +// Minimum fraction of context that must be in the compactable "head" portion +// for compaction to be worthwhile. Prevents wasting a compact API call when +// the savings are marginal (Reasonix reference: HISTORY_FOLD_MIN_SAVINGS_FRACTION). +export const MIN_COMPACTION_SAVINGS_RATIO = 0.30 + // Stop trying autocompact after this many consecutive failures. // BQ 2026-03-10: 1,279 sessions had 50+ consecutive failures (up to 3,272) // in a single session, wasting ~250K API calls/day globally. 
@@ -90,6 +144,251 @@ export function getAutoCompactThreshold(model: string): number { return autocompactThreshold } +/** + * Gate for the multi-level percentage-based compaction feature. + * When disabled, the existing fixed-buffer behavior is unchanged. + */ +export function isPercentageCompactionEnabled(): boolean { + if (!isAutoCompactEnabled()) return false + return getFeatureValue_CACHED_MAY_BE_STALE('tengu_multi_level_compact', true) +} + +/** + * Determine the compaction level based on percentage thresholds AND the + * fixed-buffer threshold. Percentage thresholds act as earlier, gentler + * interventions; the fixed-buffer threshold is the "final defense." + * + * Checks in descending severity order so the most urgent level wins. + * + * @param tokenCount - current estimated token usage + * @param model - model name for context window lookup + */ +export function getCompactionLevel( + tokenCount: number, + model: string, +): CompactionLevelResult { + const effectiveWindow = getEffectiveContextWindowSize(model) + const fixedBufferThreshold = effectiveWindow - AUTOCOMPACT_BUFFER_TOKENS + + // Fixed buffer is the "final defense" — triggers closest to the window limit + if (tokenCount >= fixedBufferThreshold) { + return { + level: 'fixed_buffer', + tailBudgetTokens: Math.floor(effectiveWindow * 0.05), + effectiveWindow, + fixedBufferThreshold, + } + } + + const forceSummaryThreshold = Math.floor( + effectiveWindow * COMPACT_FORCE_SUMMARY_RATIO, + ) + if (tokenCount >= forceSummaryThreshold) { + return { + level: 'force_summary', + tailBudgetTokens: 0, // No tail — force exit + effectiveWindow, + fixedBufferThreshold, + } + } + + const aggressiveFoldThreshold = Math.floor( + effectiveWindow * COMPACT_AGGRESSIVE_FOLD_RATIO, + ) + if (tokenCount >= aggressiveFoldThreshold) { + return { + level: 'aggressive_fold', + tailBudgetTokens: Math.floor( + effectiveWindow * COMPACT_AGGRESSIVE_FOLD_TAIL_RATIO, + ), + effectiveWindow, + fixedBufferThreshold, + } + } + + const 
normalFoldThreshold = Math.floor(
+    effectiveWindow * COMPACT_NORMAL_FOLD_RATIO,
+  )
+  if (tokenCount >= normalFoldThreshold) {
+    return {
+      level: 'normal_fold',
+      tailBudgetTokens: Math.floor(
+        effectiveWindow * COMPACT_NORMAL_FOLD_TAIL_RATIO,
+      ),
+      effectiveWindow,
+      fixedBufferThreshold,
+    }
+  }
+
+  return {
+    level: 'none',
+    tailBudgetTokens: effectiveWindow,
+    effectiveWindow,
+    fixedBufferThreshold,
+  }
+}
+
+/**
+ * Cheap gate deciding whether a compaction call is worth its cost.
+ *
+ * Does NOT measure the summarizable "head" of the conversation: it only checks
+ * that total usage is at least MIN_COMPACTION_SAVINGS_RATIO of the window, or
+ * has reached the window (emergency), in which case the answer is always true.
+ * @param estimatedTotalTokens - estimated tokens currently in context
+ * @param effectiveWindow - effective context window size, in tokens
+ * @returns true if compaction is worthwhile, false to skip
+ */
+export function isCompactionWorthwhile(
+  estimatedTotalTokens: number,
+  effectiveWindow: number,
+): boolean {
+  // Circuit breaker: usage at or beyond the window always compacts (emergency).
+  // For non-negative usage this also keeps a zero window out of the division.
+  if (estimatedTotalTokens >= effectiveWindow) return true
+
+  // Share of the window currently in use. Despite the `headFraction` label in
+  // the log line below, this is NOT the summarizable head: the gate only
+  // rejects contexts smaller than MIN_COMPACTION_SAVINGS_RATIO of the window.
+  const usageFraction = estimatedTotalTokens / effectiveWindow
+
+  logForDebugging(
+    `compaction_savings_check: tokens=${estimatedTotalTokens} window=${effectiveWindow} ` +
+      `headFraction=${(usageFraction * 100).toFixed(1)}% ` +
+      `minRequired=${(MIN_COMPACTION_SAVINGS_RATIO * 100).toFixed(0)}%`,
+  )
+
+  return usageFraction >= MIN_COMPACTION_SAVINGS_RATIO
+}
+
+/**
+ * Fast turn-start token estimation using rough heuristics.
+ * Does NOT make an API call — intentionally a coarse estimate.
+ * + * Delegates to tokenCountWithEstimation(messages) and adds nothing on top: no + * separate overhead for the system prompt or tool schemas is applied here. + * + * @returns the estimated token count and its ratio to effectiveWindow (0 if the window is <= 0) + */ +export function estimateTurnStartUsage( + messages: Message[], + effectiveWindow: number, +): { estimateTokens: number; ratio: number } { + // Use the same token estimation pipeline that shouldAutoCompact uses + const dynamicTokens = tokenCountWithEstimation(messages) + // Pre-check ratio: compare against the effective context window + const ratio = effectiveWindow > 0 ? dynamicTokens / effectiveWindow : 0 + + logForDebugging( + `turnStartEstimate: tokens=${dynamicTokens} window=${effectiveWindow} ratio=${(ratio * 100).toFixed(1)}%`, + ) + + return { estimateTokens: dynamicTokens, ratio } +} + +// Hysteresis buffer: only trigger a second pre-fold when context grows by +// at least this ratio beyond the threshold, preventing oscillation when +// context hovers right at the boundary. +export const COMPACT_PRECHECK_FOLD_HYSTERESIS = 0.05 + +/** + * Check whether the turn-start pre-estimation triggers a pre-fold. + * + * This is NOT redundant with shouldAutoCompact — it uses the 90% threshold + * (vs 75%) and is called BEFORE the API call, catching the case where the + * last turn's tool output pushed context way up but no assistant response + * carried the usage data yet. + * + * The hysteresis buffer prevents oscillating fold/no-fold when token counts + * hover near the threshold (Reasonix: requireTailBoundary equivalent). 
+ *
+ * @returns true when a pre-fold is recommended before the next API call
+ */
+export function needsTurnStartPreFold(
+  estimateTokens: number,
+  effectiveWindow: number,
+  lastPreFoldTokens?: number,
+): boolean {
+  const threshold = Math.floor(effectiveWindow * COMPACT_PRECHECK_FOLD_RATIO)
+  if (estimateTokens < threshold) return false
+
+  // Hysteresis: if we pre-folded recently, only re-trigger when context
+  // has grown significantly beyond the threshold (avoids oscillation).
+  if (lastPreFoldTokens !== undefined && lastPreFoldTokens > 0) {
+    const hysteresisThreshold = Math.floor(
+      threshold * (1 + COMPACT_PRECHECK_FOLD_HYSTERESIS),
+    )
+    if (estimateTokens < hysteresisThreshold) return false
+  }
+
+  return true
+}
+
+// ---------------------------------------------------------------------------
+// Cache Economics — per-session cache hit/miss tracking (Reasonix SessionStats)
+// ---------------------------------------------------------------------------
+
+/** Per-turn cache metrics extracted from API usage data */
+export type CacheMetrics = {
+  /** Tokens read from prompt cache (HIT) */
+  cacheHitTokens: number
+  /** Tokens NOT read from cache — fresh input (MISS) */
+  cacheMissTokens: number
+  /** Tokens written to cache by this request */
+  cacheWriteTokens: number
+  /** Total prompt-side tokens: hit + miss + write (Anthropic `input_tokens` alone is only the miss part) */
+  totalPromptTokens: number
+  /** Cache hit ratio: hit / (hit + miss). 1.0 = perfect cache, 0.0 = all miss */
+  cacheHitRatio: number
+}
+
+/**
+ * Compute cache efficiency metrics from API usage data.
+ *
+ * Follows Anthropic usage semantics: `input_tokens` is the uncached input only;
+ * `cache_read_input_tokens` / `cache_creation_input_tokens` are reported separately.
+ * Other providers (DeepSeek, OpenAI) must be normalized to this shape by the caller.
+ *
+ * Pure function — no side effects, no state. Safe to call in any context.
+ */
+export function computeCacheMetrics(usage: {
+  input_tokens: number
+  cache_read_input_tokens?: number | null
+  cache_creation_input_tokens?: number | null
+}): CacheMetrics {
+  const cacheHitTokens = usage.cache_read_input_tokens ?? 0
+  const cacheWriteTokens = usage.cache_creation_input_tokens ?? 0
+  // Anthropic's `input_tokens` already EXCLUDES cache reads and writes (total
+  // input = input_tokens + cache_read + cache_creation), so it is the miss count
+  // as-is. Subtracting hit/write from it clamped misses to 0 and the ratio to 1.
+  const cacheMissTokens = Math.max(0, usage.input_tokens)
+  const totalPromptTokens = cacheHitTokens + cacheMissTokens + cacheWriteTokens
+  const cacheableLookups = cacheHitTokens + cacheMissTokens
+  // Guard the division: a usage object with no hits and no misses yields 0.
+  const cacheHitRatio =
+    cacheableLookups > 0 ? cacheHitTokens / cacheableLookups : 0
+
+  return {
+    cacheHitTokens,
+    cacheMissTokens,
+    cacheWriteTokens,
+    totalPromptTokens,
+    cacheHitRatio,
+  }
+}
+
+/**
+ * Pre-fold decision gated on `alreadyFoldedThisTurn`; exported so query.ts need not read tracking internals.
+ * NOTE: lastPreFoldTokens is not forwarded, so needsTurnStartPreFold's hysteresis branch never runs here.
+ */
+export function shouldPreFold(
+  tracking: AutoCompactTrackingState | undefined,
+  estimateTokens: number,
+  effectiveWindow: number,
+): boolean {
+  if (tracking?.alreadyFoldedThisTurn) return false
+  return needsTurnStartPreFold(estimateTokens, effectiveWindow)
+}
+
 export function calculateTokenWarningState(
   tokenUsage: number,
   model: string,
@@ -230,12 +529,26 @@ export async function shouldAutoCompact(
     `autocompact: tokens=${tokenCount} threshold=${threshold} effectiveWindow=${effectiveWindow}${snipTokensFreed > 0 ?
` snipFreed=${snipTokensFreed}` : ''}`, ) + // Existing fixed-buffer check: final defense at ~93-98% of window const { isAboveAutoCompactThreshold } = calculateTokenWarningState( tokenCount, model, ) - return isAboveAutoCompactThreshold + if (isAboveAutoCompactThreshold) return true + + // New: percentage-based multi-level check — earlier, gentler intervention + if (isPercentageCompactionEnabled()) { + const level = getCompactionLevel(tokenCount, model) + if (level.level !== 'none' && level.level !== 'turn_start_prefold') { + logForDebugging( + `autocompact: percentage threshold triggered (level=${level.level}, ratio=${(tokenCount / effectiveWindow * 100).toFixed(1)}%)`, + ) + return true + } + } + + return false } export async function autoCompactIfNeeded( @@ -245,10 +558,14 @@ export async function autoCompactIfNeeded( querySource?: QuerySource, tracking?: AutoCompactTrackingState, snipTokensFreed?: number, + /** Pass true from query.ts turn-start pre-estimation to run a pre-fold */ + forcePreFold?: boolean, ): Promise<{ wasCompacted: boolean compactionResult?: CompactionResult consecutiveFailures?: number + /** Loggable cache metrics from the compaction call (if one ran) */ + cacheMetrics?: CacheMetrics }> { if (isEnvTruthy(process.env.DISABLE_COMPACT)) { return { wasCompacted: false } @@ -272,10 +589,59 @@ export async function autoCompactIfNeeded( snipTokensFreed, ) - if (!shouldCompact) { + // alreadyFoldedThisTurn guard: if compaction ran via pre-fold earlier this + // turn, the post-response check should not re-trigger. Mirrors Reasonix's + // decideAfterUsage: alreadyFoldedThisTurn → skip fold. 
+ if (!forcePreFold && tracking?.alreadyFoldedThisTurn) { + logForDebugging('autocompact: skipping post-response check — already folded this turn') + return { wasCompacted: false } + } + + if (!shouldCompact && !forcePreFold) { return { wasCompacted: false } } + // Route decision: forcePreFold overrides the passive check when the + // turn-start pre-estimation flagged us above 90%. In that case we only + // skip if the minimum savings check says there's nothing worth freeing. + if (!shouldCompact && forcePreFold) { + const tokenCount = tokenCountWithEstimation(messages) - (snipTokensFreed ?? 0) + const effectiveWindow = getEffectiveContextWindowSize(model) + if (!isCompactionWorthwhile(tokenCount, effectiveWindow)) { + logForDebugging( + `autocompact: skipping forced pre-fold — head portion too small (tokens=${tokenCount}, window=${effectiveWindow})`, + ) + return { wasCompacted: false } + } + logForDebugging( + `autocompact: forcePreFold active — triggering pre-fold (tokens=${tokenCount})`, + ) + } + + // Minimum savings gate: skip compaction when the head portion is too small + // to save meaningful tokens. Prevents wasting a compact API call when the + // summary alone costs nearly as many tokens as it frees. + // Skip this check for forcePreFold — already handled above. + if (!forcePreFold && isPercentageCompactionEnabled()) { + const tokenCount = tokenCountWithEstimation(messages) - (snipTokensFreed ?? 0) + const effectiveWindow = getEffectiveContextWindowSize(model) + if ( + !isCompactionWorthwhile(tokenCount, effectiveWindow) + ) { + logForDebugging( + `autocompact: skipping — head portion too small for worthwhile savings (tokens=${tokenCount}, window=${effectiveWindow})`, + ) + return { wasCompacted: false } + } + } + + // Compute the compaction level for use in recompactionInfo and to guide + // the compaction strategy (tail budget, aggressiveness). + const tokenCount = tokenCountWithEstimation(messages) - (snipTokensFreed ?? 
0) + const compactionLevel = isPercentageCompactionEnabled() + ? getCompactionLevel(tokenCount, model).level + : 'fixed_buffer' + const recompactionInfo: RecompactionInfo = { isRecompactionInChain: tracking?.compacted === true, turnsSincePreviousCompact: tracking?.turnCounter ?? -1, @@ -284,6 +650,10 @@ export async function autoCompactIfNeeded( querySource, } + logForDebugging( + `autocompact: triggering compaction (level=${compactionLevel}, tokens=${tokenCount})`, + ) + // EXPERIMENT: Try session memory compaction first const sessionMemoryResult = await trySessionMemoryCompaction( messages, @@ -297,12 +667,13 @@ export async function autoCompactIfNeeded( runPostCompactCleanup(querySource) // Reset cache read baseline so the post-compact drop isn't flagged as a // break. compactConversation does this internally; SM-compact doesn't. - // BQ 2026-03-01: missing this made 20% of tengu_prompt_cache_break events - // false positives (systemPromptChanged=true, timeSinceLastAssistantMsg=-1). if (feature('PROMPT_CACHE_BREAK_DETECTION')) { notifyCompaction(querySource ?? 'compact', toolUseContext.agentId) } markPostCompaction() + // Mark alreadyFoldedThisTurn to prevent the post-response check from + // double-folding (Reasonix: decideAfterUsage returns 'none' when true). + if (tracking) tracking.alreadyFoldedThisTurn = true return { wasCompacted: true, compactionResult: sessionMemoryResult, @@ -325,11 +696,25 @@ export async function autoCompactIfNeeded( setLastSummarizedMessageId(undefined) runPostCompactCleanup(querySource) + // Compute cache metrics from the compaction agent's usage for visibility + const compactionUsage = compactionResult.compactionUsage + const compactionCacheMetrics = compactionUsage + ? 
computeCacheMetrics({ + input_tokens: compactionUsage.input_tokens, + cache_read_input_tokens: compactionUsage.cache_read_input_tokens, + cache_creation_input_tokens: compactionUsage.cache_creation_input_tokens, + }) + : undefined + + // Mark alreadyFoldedThisTurn to prevent the post-response check from + // double-folding (Reasonix: decideAfterUsage returns 'none' when true). + if (tracking) tracking.alreadyFoldedThisTurn = true + return { wasCompacted: true, compactionResult, - // Reset failure count on success consecutiveFailures: 0, + cacheMetrics: compactionCacheMetrics, } } catch (error) { if (!hasExactErrorMessage(error, ERROR_MESSAGE_USER_ABORT)) { diff --git a/src/utils/toolResultStorage.ts b/src/utils/toolResultStorage.ts index f4dfef326..ec119d985 100644 --- a/src/utils/toolResultStorage.ts +++ b/src/utils/toolResultStorage.ts @@ -14,6 +14,7 @@ import { } from '../constants/toolLimits.js' import { getFeatureValue_CACHED_MAY_BE_STALE } from '../services/analytics/growthbook.js' import { logEvent } from '../services/analytics/index.js' +import { roughTokenCountEstimation } from '../services/tokenEstimation.js' import { sanitizeToolNameForAnalytics } from '../services/analytics/metadata.js' import type { Message } from '../types/message.js' import { logForDebugging } from './debug.js' @@ -333,6 +334,55 @@ async function maybePersistLargeToolResult( return { ...toolResultBlock, content: message } } +/** + * Truncate tool result content to fit within a token budget. + * + * Uses rough character-based estimation (char/4) for speed. For CJK content + * this is conservative (undercounts tokens per char), but provides better + * accuracy than pure char-based truncation for mixed-language content. + * + * The truncation preserves newline boundaries when possible, and appends + * a marker so the model knows content was truncated. 
+ *
+ * @param content - The content to potentially truncate
+ * @param maxTokens - Maximum allowed tokens before truncation
+ * @returns truncated content and whether truncation occurred
+ */
+export function truncateToolResultByTokens(
+  content: string,
+  maxTokens: number,
+): { truncated: string; wasTruncated: boolean; estimatedTokens: number } {
+  const estimatedTokens = roughTokenCountEstimation(content)
+  // Approximation: each token ~4 chars on average
+  const charBudget = maxTokens * BYTES_PER_TOKEN
+  // Nothing to cut if the text already fits the char budget (avoids a bogus marker).
+  if (estimatedTokens <= maxTokens || content.length <= charBudget) {
+    return { truncated: content, wasTruncated: false, estimatedTokens }
+  }
+  let truncated = content.slice(0, charBudget)
+
+  // Try to find a clean boundary (newline) near the cut point to avoid
+  // splitting mid-line or mid-word
+  const lastNewline = truncated.lastIndexOf('\n')
+  if (lastNewline > charBudget * 0.7) {
+    truncated = truncated.slice(0, lastNewline)
+  }
+
+  // tool_result_content_block_start already wraps each block; this marker
+  // replaces bulky output with a compact signal the model can act on.
+  // Wording mirrors the per-tool persist message so the model already
+  // knows what to expect when it just *couldn't* fit in context.
+  truncated +=
+    `\n\n[Content truncated from ~${formatFileSize(content.length)} to fit ` +
+    `within ~${maxTokens.toLocaleString()} tokens. Full output may be available in the session tool-results directory.]`
+
+  return {
+    truncated,
+    wasTruncated: true,
+    estimatedTokens: roughTokenCountEstimation(truncated),
+  }
+}
+
 /**
  * Generate a preview of content, truncating at a newline boundary when possible.
*/ diff --git a/tests/cacheOptimization.integration.test.ts b/tests/cacheOptimization.integration.test.ts new file mode 100644 index 000000000..fa5962440 --- /dev/null +++ b/tests/cacheOptimization.integration.test.ts @@ -0,0 +1,423 @@ +import { expect, test } from 'bun:test' +import { + COMPACT_NORMAL_FOLD_RATIO, + COMPACT_AGGRESSIVE_FOLD_RATIO, + COMPACT_FORCE_SUMMARY_RATIO, + COMPACT_PRECHECK_FOLD_RATIO, + COMPACT_NORMAL_FOLD_TAIL_RATIO, + COMPACT_AGGRESSIVE_FOLD_TAIL_RATIO, + COMPACT_PRECHECK_FOLD_HYSTERESIS, + MIN_COMPACTION_SAVINGS_RATIO, + type CompactionLevel, + computeCacheMetrics, + isCompactionWorthwhile, + needsTurnStartPreFold, + shouldPreFold, +} from '../src/services/compact/autoCompact.js' +import { truncateToolResultByTokens } from '../src/utils/toolResultStorage.js' +import type { AutoCompactTrackingState } from '../src/services/compact/autoCompact.js' + +// =========================================================================== +// TDD Integration Suite — cache optimization decision chain + boundary tests +// =========================================================================== + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +const WINDOWS = { + SMALL: 200_000, + MEDIUM: 500_000, + LARGE: 1_000_000, +} as const + +function makeTracking(overrides: Partial = {}): AutoCompactTrackingState { + return { + compacted: false, + turnCounter: 1, + turnId: 'tdd-test-turn', + ...overrides, + } +} + +function thresholdTokens(window: number, ratio: number): number { + return Math.floor(window * ratio) +} + +// --------------------------------------------------------------------------- +// Integration 1: Percentage threshold cross-window consistency +// +// The entire point of percentage thresholds is to work consistently across +// all context window sizes. 
Verify that for every window size the ratios +// produce the same percentile behavior. +// --------------------------------------------------------------------------- + +test('INTEGRATION: normal fold fires at same % regardless of window size', () => { + for (const w of Object.values(WINDOWS)) { + const tokens = thresholdTokens(w, COMPACT_NORMAL_FOLD_RATIO) + const ratio = tokens / w + expect(ratio).toBeGreaterThanOrEqual(COMPACT_NORMAL_FOLD_RATIO - 0.01) + expect(ratio).toBeLessThanOrEqual(COMPACT_NORMAL_FOLD_RATIO + 0.01) + } +}) + +test('INTEGRATION: aggressive fold fires at same % regardless of window size', () => { + for (const w of Object.values(WINDOWS)) { + const tokens = thresholdTokens(w, COMPACT_AGGRESSIVE_FOLD_RATIO) + const ratio = tokens / w + expect(ratio).toBeGreaterThanOrEqual(COMPACT_AGGRESSIVE_FOLD_RATIO - 0.01) + } +}) + +test('INTEGRATION: force summary fires at same % regardless of window size', () => { + for (const w of Object.values(WINDOWS)) { + const tokens = thresholdTokens(w, COMPACT_FORCE_SUMMARY_RATIO) + const ratio = tokens / w + expect(ratio).toBeGreaterThanOrEqual(COMPACT_FORCE_SUMMARY_RATIO - 0.01) + } +}) + +test('INTEGRATION: pre-check fires at same % regardless of window size', () => { + for (const w of Object.values(WINDOWS)) { + const tokens = thresholdTokens(w, COMPACT_PRECHECK_FOLD_RATIO) + const ratio = tokens / w + expect(ratio).toBeGreaterThanOrEqual(COMPACT_PRECHECK_FOLD_RATIO - 0.01) + } +}) + +// --------------------------------------------------------------------------- +// Integration 2: Tail budget ratio — aggressive < normal (cross-window) +// --------------------------------------------------------------------------- + +test('INTEGRATION: aggressive tail budget is half of normal across all window sizes', () => { + for (const w of Object.values(WINDOWS)) { + const normalTail = Math.floor(w * COMPACT_NORMAL_FOLD_TAIL_RATIO) + const aggressiveTail = Math.floor(w * COMPACT_AGGRESSIVE_FOLD_TAIL_RATIO) + 
expect(aggressiveTail).toBeLessThan(normalTail) + // Aggressive tail should be exactly half of normal + expect(aggressiveTail).toBe(Math.floor(normalTail / 2)) + } +}) + +// --------------------------------------------------------------------------- +// Integration 3: Decision chain — isCompactionWorthwhile gates normal vs emergency +// --------------------------------------------------------------------------- + +test('INTEGRATION: isCompactionWorthwhile gates at exact boundary across windows', () => { + for (const w of Object.values(WINDOWS)) { + const boundary = Math.floor(w * MIN_COMPACTION_SAVINGS_RATIO) + // At exact boundary — worthwhile + expect(isCompactionWorthwhile(boundary, w)).toBe(true) + // Just below boundary — not worthwhile + if (boundary > 1) { + expect(isCompactionWorthwhile(boundary - 1, w)).toBe(false) + } + } +}) + +test('INTEGRATION: isCompactionWorthwhile emergency gate consistent', () => { + // When tokens exceed window, always worthwhile regardless of window size + for (const w of Object.values(WINDOWS)) { + expect(isCompactionWorthwhile(w + 1, w)).toBe(true) + expect(isCompactionWorthwhile(w * 2, w)).toBe(true) + } +}) + +// --------------------------------------------------------------------------- +// Integration 4: Pre-fold chain — estimate → needsPreFold → shouldPreFold +// Simulates a full turn-start pre-estimation decision +// --------------------------------------------------------------------------- + +test('INTEGRATION: pre-fold triggers at 90% regardless of window size', () => { + for (const w of Object.values(WINDOWS)) { + const atThreshold = thresholdTokens(w, COMPACT_PRECHECK_FOLD_RATIO) + expect(needsTurnStartPreFold(atThreshold, w)).toBe(true) + } +}) + +test('INTEGRATION: pre-fold does NOT trigger at 89% across windows', () => { + for (const w of Object.values(WINDOWS)) { + const belowThreshold = thresholdTokens(w, COMPACT_PRECHECK_FOLD_RATIO * 0.99) + expect(needsTurnStartPreFold(belowThreshold, w)).toBe(false) + } +}) + 
+test('INTEGRATION: hysteresis prevents oscillation at boundary', () => { + // Simulate context hovering right at 90% boundary + const w = WINDOWS.LARGE + const threshold = thresholdTokens(w, COMPACT_PRECHECK_FOLD_RATIO) + const hysteresisThreshold = thresholdTokens( + w, + COMPACT_PRECHECK_FOLD_RATIO * (1 + COMPACT_PRECHECK_FOLD_HYSTERESIS), + ) + + // First pre-fold: at threshold → true + const lastFoldAt = threshold + 1000 + expect(needsTurnStartPreFold(threshold, w)).toBe(true) + + // Second check: context slightly above threshold but below hysteresis + const afterFold = threshold + Math.floor((hysteresisThreshold - threshold) * 0.5) + expect(needsTurnStartPreFold(afterFold, w, lastFoldAt)).toBe(false) + + // Third check: context now well above hysteresis → re-trigger + const aboveHysteresis = hysteresisThreshold + 100 + expect(needsTurnStartPreFold(aboveHysteresis, w, lastFoldAt)).toBe(true) +}) + +// --------------------------------------------------------------------------- +// Integration 5: alreadyFoldedThisTurn — prevents double-fold in the same turn +// --------------------------------------------------------------------------- + +test('INTEGRATION: alreadyFoldedThisTurn suppresses shouldPreFold', () => { + const tracking = makeTracking({ alreadyFoldedThisTurn: true }) + // 95% context — would normally pre-fold, but already did this turn + expect(shouldPreFold(tracking, thresholdTokens(200_000, 0.95), 200_000)).toBe(false) +}) + +test('INTEGRATION: shouldPreFold returns true on fresh turn (not yet folded)', () => { + const tracking = makeTracking({ alreadyFoldedThisTurn: false }) + expect(shouldPreFold(tracking, thresholdTokens(200_000, 0.95), 200_000)).toBe(true) +}) + +test('INTEGRATION: shouldPreFold returns false below threshold even on fresh turn', () => { + const tracking = makeTracking({ alreadyFoldedThisTurn: false }) + // 80% — below 90% pre-check threshold + expect(shouldPreFold(tracking, thresholdTokens(200_000, 0.80), 200_000)).toBe(false) 
+}) + +// --------------------------------------------------------------------------- +// Integration 6: Compaction level ordering — verify severity hierarchy +// --------------------------------------------------------------------------- + +test('INTEGRATION: compaction level severity is monotonically ordered', () => { + // The severity order of compaction levels, from least to most urgent: + // none < normal_fold < aggressive_fold < force_summary + // turn_start_prefold (90%) is a pre-check at a different point in the + // turn lifecycle, so its ratio doesn't follow the post-response severity + // chain. fixed_buffer depends on the window size. + const postResponseLevels: CompactionLevel[] = [ + 'none', + 'normal_fold', + 'aggressive_fold', + 'force_summary', + ] + + const ratios: Record = { + none: 0, + turn_start_prefold: COMPACT_PRECHECK_FOLD_RATIO, + normal_fold: COMPACT_NORMAL_FOLD_RATIO, + aggressive_fold: COMPACT_AGGRESSIVE_FOLD_RATIO, + force_summary: COMPACT_FORCE_SUMMARY_RATIO, + fixed_buffer: 0.935, + } + + for (let i = 1; i < postResponseLevels.length; i++) { + const prev = postResponseLevels[i - 1]! + const curr = postResponseLevels[i]! 
+ expect(ratios[curr]).toBeGreaterThanOrEqual(ratios[prev]) + } +}) + +// --------------------------------------------------------------------------- +// Integration 7: Cache metrics computation consistency +// --------------------------------------------------------------------------- + +test('INTEGRATION: cacheHitRatio consistent across different hit/miss splits', () => { + const scenarios = [ + { input: 100_000, hit: 99_000, write: 500, expectedRatio: 99_000 / (99_000 + 500) }, + { input: 100_000, hit: 50_000, write: 0, expectedRatio: 0.5 }, + { input: 100_000, hit: 0, write: 0, expectedRatio: 0 }, + { input: 100_000, hit: 100_000, write: 0, expectedRatio: 1 }, + { input: 1_000_000, hit: 750_000, write: 100_000, expectedRatio: 750_000 / (750_000 + 150_000) }, + ] + + for (const s of scenarios) { + const m = computeCacheMetrics({ + input_tokens: s.input, + cache_read_input_tokens: s.hit, + cache_creation_input_tokens: s.write, + }) + expect(m.cacheHitRatio).toBeCloseTo(s.expectedRatio, 4) + expect(m.cacheHitTokens).toBe(s.hit) + expect(m.cacheWriteTokens).toBe(s.write) + // hit + miss + write ≈ input (miss may be adjusted if hit+write > input) + expect(m.totalPromptTokens).toBeGreaterThanOrEqual(s.input - 10) + } +}) + +test('INTEGRATION: cacheMetrics invariant: hitRatio ∈ [0, 1]', () => { + // Random-ish sampling of plausible usage patterns + const patterns = [ + { input: 1, hit: 0, write: 0 }, + { input: 1, hit: 1, write: 0 }, + { input: 999_999, hit: 1, write: 0 }, + { input: 500_000, hit: 500_000, write: 0 }, + { input: 500_000, hit: 0, write: 500_000 }, + ] + for (const p of patterns) { + const m = computeCacheMetrics({ + input_tokens: p.input, + cache_read_input_tokens: p.hit, + cache_creation_input_tokens: p.write, + }) + expect(m.cacheHitRatio).toBeGreaterThanOrEqual(0) + expect(m.cacheHitRatio).toBeLessThanOrEqual(1) + } +}) + +// --------------------------------------------------------------------------- +// Integration 8: Token truncation — CJK and 
mixed-language edge cases +// --------------------------------------------------------------------------- + +test('INTEGRATION: truncation preserves CJK character integrity', () => { + // CJK: each character is 1-3 tokens. The truncation uses char/4 estimate + // which is conservative for CJK (undercounts). Verify it still truncates + // gracefully without corrupting characters. + const content = 'これは日本語のテストです。'.repeat(1000) + const result = truncateToolResultByTokens(content, 100) + expect(result.wasTruncated).toBe(true) + // Truncated content should be valid Unicode (no orphan surrogate pairs) + expect(() => encodeURIComponent(result.truncated)).not.toThrow() + // Should contain the truncation marker + expect(result.truncated).toContain('Content truncated') +}) + +test('INTEGRATION: truncation with data-like content is valid', () => { + // When content is dense data (no natural line breaks), truncation at the + // exact char boundary is acceptable — the function guarantees content + // is a prefix of the original and that the marker is present. 
+ const lines: string[] = [] + for (let i = 0; i < 100; i++) { + lines.push(`Line ${i.toString().padStart(4, '0')}: ${'data '.repeat(50)}`) + } + const content = lines.join('\n') + const result = truncateToolResultByTokens(content, 100) + + if (result.wasTruncated) { + // Verify truncated is shorter than original + expect(result.truncated.length).toBeLessThan(content.length) + + // The truncated content should be a prefix of the original + // (before the marker is appended) + const markerIdx = result.truncated.lastIndexOf('[Content truncated') + expect(markerIdx).toBeGreaterThan(0) + + const beforeMarker = result.truncated.slice(0, markerIdx) + // Content before the marker should be contained in the original + // (may be truncated mid-word, which is acceptable) + expect(content.includes(beforeMarker.trim())).toBe(true) + } +}) + +test('INTEGRATION: truncation with mixed ASCII plus emoji content', () => { + const content = 'Regular text with emoji 🚀🔥💻 mixed in. '.repeat(200) + const result = truncateToolResultByTokens(content, 100) + expect(result.wasTruncated).toBe(true) + // Emoji are multi-byte; verify no orphan bytes + expect(() => encodeURIComponent(result.truncated)).not.toThrow() +}) + +// --------------------------------------------------------------------------- +// Integration 9: Threshold cross-check — all ratios are in valid range +// --------------------------------------------------------------------------- + +test('INTEGRATION: all compaction ratios are in (0, 1)', () => { + const ratios = [ + COMPACT_NORMAL_FOLD_RATIO, + COMPACT_AGGRESSIVE_FOLD_RATIO, + COMPACT_FORCE_SUMMARY_RATIO, + COMPACT_PRECHECK_FOLD_RATIO, + COMPACT_NORMAL_FOLD_TAIL_RATIO, + COMPACT_AGGRESSIVE_FOLD_TAIL_RATIO, + MIN_COMPACTION_SAVINGS_RATIO, + COMPACT_PRECHECK_FOLD_HYSTERESIS, + ] + for (const r of ratios) { + expect(r).toBeGreaterThan(0) + expect(r).toBeLessThan(1) + } +}) + +test('INTEGRATION: normal fold threshold < aggressive < force_summary < precheck', () => { + 
expect(COMPACT_NORMAL_FOLD_RATIO).toBeLessThan(COMPACT_AGGRESSIVE_FOLD_RATIO) + expect(COMPACT_AGGRESSIVE_FOLD_RATIO).toBeLessThan(COMPACT_FORCE_SUMMARY_RATIO) + expect(COMPACT_FORCE_SUMMARY_RATIO).toBeLessThan(COMPACT_PRECHECK_FOLD_RATIO) +}) + +// --------------------------------------------------------------------------- +// Integration 10: Savings check never blocks emergency compaction +// --------------------------------------------------------------------------- + +test('INTEGRATION: savings check does not block when tokens exceed window', () => { + // Emergency: tokens > window → always true regardless of savings ratio + for (const w of Object.values(WINDOWS)) { + expect(isCompactionWorthwhile(w + 1, w)).toBe(true) + expect(isCompactionWorthwhile(w + 1000, w)).toBe(true) + } +}) + +test('INTEGRATION: savings check criteria consistent with min savings ratio', () => { + const w = WINDOWS.MEDIUM + const minFraction = MIN_COMPACTION_SAVINGS_RATIO + + // At exact fraction → worthwhile + expect(isCompactionWorthwhile(Math.floor(w * minFraction), w)).toBe(true) + + // Slightly below → not worthwhile (but we test 1 token below for small ratios) + const belowFraction = Math.floor(w * (minFraction - 0.01)) + if (belowFraction > 0) { + expect(isCompactionWorthwhile(belowFraction, w)).toBe(false) + } +}) + +// --------------------------------------------------------------------------- +// Integration 11: Pre-fold + post-fold never overlap (alreadyFoldedThisTurn invariant) +// --------------------------------------------------------------------------- + +test('INTEGRATION: pre-fold + post-fold coordination — alreadyFolded prevents second fold', () => { + // Simulate a full turn: + // 1. Turn starts + // 2. Pre-estimation finds 92% → triggers pre-fold + // 3. Pre-fold succeeds → alreadyFoldedThisTurn = true + // 4. API call runs + // 5. 
Post-response check → should NOT re-fold + + const tracking = makeTracking({ alreadyFoldedThisTurn: false }) + const w = WINDOWS.SMALL + + // Step 2: pre-estimation at 92% + const tokens = thresholdTokens(w, 0.92) + expect(shouldPreFold(tracking, tokens, w)).toBe(true) + + // Step 3: after pre-fold, mark + tracking.alreadyFoldedThisTurn = true + + // Step 5: post-response check — suppressed + expect(shouldPreFold(tracking, tokens, w)).toBe(false) +}) + +test('INTEGRATION: next turn resets alreadyFoldedThisTurn (caller responsibility)', () => { + // The tracking object is expected to be reset by the caller (query.ts) + // at the start of each new turn. Verify the flag is not sticky. + const tracking = makeTracking({ alreadyFoldedThisTurn: true }) + // Caller resets for new turn + tracking.alreadyFoldedThisTurn = false + expect(shouldPreFold(tracking, thresholdTokens(200_000, 0.95), 200_000)).toBe(true) +}) + +// --------------------------------------------------------------------------- +// Integration 12: Hysteresis prevents thrashing across all window sizes +// --------------------------------------------------------------------------- + +test('INTEGRATION: hysteresis gap is at least 4% of threshold across windows', () => { + for (const w of Object.values(WINDOWS)) { + const threshold = thresholdTokens(w, COMPACT_PRECHECK_FOLD_RATIO) + const hysteresisGap = thresholdTokens( + w, + COMPACT_PRECHECK_FOLD_RATIO * (1 + COMPACT_PRECHECK_FOLD_HYSTERESIS), + ) - threshold + // Gap should be ~5% of the threshold value + const expectedGap = Math.floor(threshold * COMPACT_PRECHECK_FOLD_HYSTERESIS) + expect(hysteresisGap).toBeGreaterThanOrEqual(expectedGap - 1) + } +}) diff --git a/tests/compactionThresholds.test.ts b/tests/compactionThresholds.test.ts new file mode 100644 index 000000000..a6bdeba23 --- /dev/null +++ b/tests/compactionThresholds.test.ts @@ -0,0 +1,323 @@ +import { expect, test } from 'bun:test' +import { + COMPACT_NORMAL_FOLD_RATIO, + 
+  COMPACT_AGGRESSIVE_FOLD_RATIO,
+  COMPACT_FORCE_SUMMARY_RATIO,
+  COMPACT_PRECHECK_FOLD_RATIO,
+  COMPACT_NORMAL_FOLD_TAIL_RATIO,
+  COMPACT_AGGRESSIVE_FOLD_TAIL_RATIO,
+  COMPACT_PRECHECK_FOLD_HYSTERESIS,
+  MIN_COMPACTION_SAVINGS_RATIO,
+  computeCacheMetrics,
+  isCompactionWorthwhile,
+  needsTurnStartPreFold,
+  shouldPreFold,
+} from '../src/services/compact/autoCompact.js'
+import { truncateToolResultByTokens } from '../src/utils/toolResultStorage.js'
+import type { AutoCompactTrackingState } from '../src/services/compact/autoCompact.js'
+
+// ---------------------------------------------------------------------------
+// Constant validation — ensure thresholds stay at their expected values
+// ---------------------------------------------------------------------------
+
+test('percentage thresholds are correctly ordered', () => {
+  expect(COMPACT_NORMAL_FOLD_RATIO).toBe(0.75)
+  expect(COMPACT_AGGRESSIVE_FOLD_RATIO).toBe(0.78)
+  expect(COMPACT_FORCE_SUMMARY_RATIO).toBe(0.80)
+  expect(COMPACT_PRECHECK_FOLD_RATIO).toBe(0.90)
+
+  // Thresholds must be monotonically increasing
+  expect(COMPACT_NORMAL_FOLD_RATIO).toBeLessThan(COMPACT_AGGRESSIVE_FOLD_RATIO)
+  expect(COMPACT_AGGRESSIVE_FOLD_RATIO).toBeLessThan(
+    COMPACT_FORCE_SUMMARY_RATIO,
+  )
+  expect(COMPACT_FORCE_SUMMARY_RATIO).toBeLessThan(COMPACT_PRECHECK_FOLD_RATIO)
+})
+
+test('tail budget ratios are correctly ordered', () => {
+  expect(COMPACT_NORMAL_FOLD_TAIL_RATIO).toBe(0.20)
+  expect(COMPACT_AGGRESSIVE_FOLD_TAIL_RATIO).toBe(0.10)
+
+  // Normal fold should preserve more tail than aggressive fold
+  expect(COMPACT_AGGRESSIVE_FOLD_TAIL_RATIO).toBeLessThan(
+    COMPACT_NORMAL_FOLD_TAIL_RATIO,
+  )
+})
+
+test('minimum savings ratio is a reasonable value', () => {
+  expect(MIN_COMPACTION_SAVINGS_RATIO).toBe(0.30)
+  expect(MIN_COMPACTION_SAVINGS_RATIO).toBeGreaterThan(0)
+  expect(MIN_COMPACTION_SAVINGS_RATIO).toBeLessThan(1)
+})
+
+test('pre-check hysteresis is a small positive fraction', () => {
+  expect(COMPACT_PRECHECK_FOLD_HYSTERESIS).toBe(0.05)
+  expect(COMPACT_PRECHECK_FOLD_HYSTERESIS).toBeGreaterThan(0)
+  expect(COMPACT_PRECHECK_FOLD_HYSTERESIS).toBeLessThan(0.15)
+})
+
+// ---------------------------------------------------------------------------
+// isCompactionWorthwhile
+// ---------------------------------------------------------------------------
+
+test('isCompactionWorthwhile returns true when most of context is occupied', () => {
+  // 90K tokens in 100K window → 90% occupied → worthwhile
+  expect(isCompactionWorthwhile(90_000, 100_000)).toBe(true)
+})
+
+test('isCompactionWorthwhile returns true at the boundary (30%)', () => {
+  // 30K tokens in 100K window → exactly 30% → still worthwhile
+  expect(isCompactionWorthwhile(30_000, 100_000)).toBe(true)
+})
+
+test('isCompactionWorthwhile returns false when below threshold', () => {
+  // 20K tokens in 100K window → 20% → not worthwhile
+  expect(isCompactionWorthwhile(20_000, 100_000)).toBe(false)
+})
+
+test('isCompactionWorthwhile returns true when tokens exceed window (emergency)', () => {
+  // Emergency: tokens exceed the context window — always worthwhile
+  expect(isCompactionWorthwhile(105_000, 100_000)).toBe(true)
+})
+
+test('isCompactionWorthwhile handles large 1M context window', () => {
+  // 400K tokens in 1M window → 40% → worthwhile
+  expect(isCompactionWorthwhile(400_000, 1_000_000)).toBe(true)
+
+  // 200K tokens in 1M window → 20% → not worthwhile
+  expect(isCompactionWorthwhile(200_000, 1_000_000)).toBe(false)
+
+  // 205K tokens in 200K window → >100% → emergency → worthwhile
+  expect(isCompactionWorthwhile(205_000, 200_000)).toBe(true)
+})
+
+// ---------------------------------------------------------------------------
+// computeCacheMetrics — cache economics tracking (Reasonix SessionStats parity)
+// ---------------------------------------------------------------------------
+
+test('computeCacheMetrics with perfect cache hit', () => {
+  const result = computeCacheMetrics({
+    input_tokens: 10_000,
+    cache_read_input_tokens: 9_000,
+    cache_creation_input_tokens: 500,
+  })
+  expect(result.cacheHitTokens).toBe(9_000)
+  expect(result.cacheWriteTokens).toBe(500)
+  // miss = input - hit - write = 10000 - 9000 - 500 = 500
+  expect(result.cacheMissTokens).toBe(500)
+  expect(result.totalPromptTokens).toBe(10_000)
+  // ratio = 9000 / (9000 + 500) ≈ 0.947
+  expect(result.cacheHitRatio).toBeCloseTo(0.947, 2)
+})
+
+test('computeCacheMetrics with complete cache miss', () => {
+  const result = computeCacheMetrics({
+    input_tokens: 50_000,
+    cache_read_input_tokens: 0,
+    cache_creation_input_tokens: 0,
+  })
+  expect(result.cacheHitTokens).toBe(0)
+  expect(result.cacheMissTokens).toBe(50_000)
+  expect(result.cacheWriteTokens).toBe(0)
+  expect(result.cacheHitRatio).toBe(0)
+})
+
+test('computeCacheMetrics with null fields defaults to zero', () => {
+  const result = computeCacheMetrics({
+    input_tokens: 5_000,
+    cache_read_input_tokens: null,
+    cache_creation_input_tokens: null,
+  })
+  expect(result.cacheHitTokens).toBe(0)
+  expect(result.cacheWriteTokens).toBe(0)
+  expect(result.cacheMissTokens).toBe(5_000)
+})
+
+test('computeCacheMetrics with undefined fields defaults to zero', () => {
+  const result = computeCacheMetrics({
+    input_tokens: 3_000,
+  })
+  expect(result.cacheHitTokens).toBe(0)
+  expect(result.cacheWriteTokens).toBe(0)
+  expect(result.cacheMissTokens).toBe(3_000)
+})
+
+test('computeCacheMetrics handles mixed cache scenario', () => {
+  // 200K total prompt: 150K cache hit, 30K miss, 20K new writes
+  const result = computeCacheMetrics({
+    input_tokens: 200_000,
+    cache_read_input_tokens: 150_000,
+    cache_creation_input_tokens: 20_000,
+  })
+  expect(result.cacheHitTokens).toBe(150_000)
+  expect(result.cacheWriteTokens).toBe(20_000)
+  expect(result.cacheMissTokens).toBe(30_000)
+  // ratio = 150000 / (150000 + 30000) ≈ 0.833
+  expect(result.cacheHitRatio).toBeCloseTo(0.833, 2)
+})
+
+test('computeCacheMetrics guards against negative miss (defensive)', () => {
+  // Edge case: if API reports more hit+write than input (shouldn't happen
+  // but the function should never return negative)
+  const result = computeCacheMetrics({
+    input_tokens: 1_000,
+    cache_read_input_tokens: 800,
+    cache_creation_input_tokens: 300,
+  })
+  // hit + write = 1100 > input = 1000 → miss clamped to 0
+  expect(result.cacheMissTokens).toBe(0)
+  expect(result.cacheHitTokens).toBe(800)
+  expect(result.cacheWriteTokens).toBe(300)
+  expect(result.cacheHitRatio).toBe(1.0) // 800/(800+0) = 1.0
+})
+
+// ---------------------------------------------------------------------------
+// needsTurnStartPreFold — 90% threshold with hysteresis
+// ---------------------------------------------------------------------------
+
+test('needsTurnStartPreFold returns false when below 90% threshold', () => {
+  // 80K tokens in 100K window → 80% → not at 90% threshold
+  expect(needsTurnStartPreFold(80_000, 100_000)).toBe(false)
+})
+
+test('needsTurnStartPreFold returns true at exactly 90%', () => {
+  // 90K tokens in 100K window → exactly 90%
+  expect(needsTurnStartPreFold(90_000, 100_000)).toBe(true)
+})
+
+test('needsTurnStartPreFold returns true well above 90%', () => {
+  expect(needsTurnStartPreFold(95_000, 100_000)).toBe(true)
+})
+
+test('needsTurnStartPreFold with hysteresis: skip when near previous fold', () => {
+  // 92K tokens in 100K window: 92% → above 90% threshold
+  // But we folded at 91K recently → hysteresis threshold = 90K * 1.05 = 94.5K
+  // 92K < 94.5K → hysteresis suppresses re-fold
+  expect(needsTurnStartPreFold(92_000, 100_000, 91_000)).toBe(false)
+})
+
+test('needsTurnStartPreFold with hysteresis: trigger when significantly above', () => {
+  // 96K tokens in 100K window: 96% → above 90% threshold
+  // Folded at 91K recently → hysteresis threshold = 94.5K
+  // 96K > 94.5K → hysteresis does NOT suppress
+  expect(needsTurnStartPreFold(96_000, 100_000, 91_000)).toBe(true)
+})
+
+test('needsTurnStartPreFold with hysteresis under the threshold is fine', () => {
+  // 89K tokens in 100K window: 89% → below 90% threshold
+  // Hysteresis doesn't matter when under threshold
+  expect(needsTurnStartPreFold(89_000, 100_000, 88_000)).toBe(false)
+})
+
+test('needsTurnStartPreFold with large 1M context window', () => {
+  // 920K in 1M window → 92% → above 90%
+  expect(needsTurnStartPreFold(920_000, 1_000_000)).toBe(true)
+
+  // 880K in 1M window → 88% → below 90%
+  expect(needsTurnStartPreFold(880_000, 1_000_000)).toBe(false)
+
+  // Hysteresis: 905K in 1M, folded at 900K
+  // Hysteresis threshold = 900K * 1.05 = 945K, 905K < 945K → suppressed
+  expect(needsTurnStartPreFold(905_000, 1_000_000, 900_000)).toBe(false)
+})
+
+// ---------------------------------------------------------------------------
+// shouldPreFold — respects alreadyFoldedThisTurn
+// ---------------------------------------------------------------------------
+
+function makeTracking(alreadyFolded: boolean): AutoCompactTrackingState {
+  return {
+    compacted: false,
+    turnCounter: 0,
+    turnId: 'test',
+    alreadyFoldedThisTurn: alreadyFolded,
+  }
+}
+
+test('shouldPreFold returns false when already folded this turn', () => {
+  const tracking = makeTracking(true)
+  // 95K in 100K → 95% → would normally trigger, but alreadyFolded suppresses
+  expect(shouldPreFold(tracking, 95_000, 100_000)).toBe(false)
+})
+
+test('shouldPreFold returns true when not yet folded this turn', () => {
+  const tracking = makeTracking(false)
+  expect(shouldPreFold(tracking, 95_000, 100_000)).toBe(true)
+})
+
+test('shouldPreFold falls back to the threshold check when tracking is undefined', () => {
+  // Without tracking there is no alreadyFoldedThisTurn flag, so the threshold alone decides
+  expect(shouldPreFold(undefined, 95_000, 100_000)).toBe(true)
+  // But when under threshold, still false
+  expect(shouldPreFold(undefined, 85_000, 100_000)).toBe(false)
+})
+
+// ---------------------------------------------------------------------------
+// truncateToolResultByTokens
+// ---------------------------------------------------------------------------
+
+test('truncateToolResultByTokens returns content unchanged when under limit', () => {
+  const content = 'short content'
+  const result = truncateToolResultByTokens(content, 100)
+  expect(result.wasTruncated).toBe(false)
+  expect(result.truncated).toBe(content)
+})
+
+test('truncateToolResultByTokens returns content unchanged when exactly at limit', () => {
+  // 400 bytes → ~100 tokens at 4 bytes/token
+  const content = 'A'.repeat(400)
+  const result = truncateToolResultByTokens(content, 100)
+  // May or may not truncate depending on rough estimate — but marker
+  // should not appear when content is small enough
+  if (!result.wasTruncated) {
+    expect(result.truncated).toBe(content)
+  }
+})
+
+test('truncateToolResultByTokens truncates when well above limit', () => {
+  // ~50K chars → ~12,500 tokens at 4 bytes/token
+  const content = 'A'.repeat(50_000)
+  const result = truncateToolResultByTokens(content, 100)
+  expect(result.wasTruncated).toBe(true)
+  expect(result.truncated.length).toBeLessThan(content.length)
+  expect(result.truncated).toContain('Content truncated')
+})
+
+test('truncateToolResultByTokens includes marker in truncated content', () => {
+  const content = 'B'.repeat(10_000)
+  const result = truncateToolResultByTokens(content, 100)
+  expect(result.wasTruncated).toBe(true)
+  expect(result.truncated).toContain(
+    'Content truncated',
+  )
+})
+
+test('truncateToolResultByTokens handles CJK content', () => {
+  // CJK characters are ~1-3 tokens each, so char/4 underestimates tokens.
+  // The function should still gracefully handle and truncate CJK content.
+  const content = '中文测试内容'.repeat(5_000)
+  const result = truncateToolResultByTokens(content, 500)
+  expect(result.wasTruncated).toBe(true)
+  expect(result.truncated.length).toBeLessThan(content.length)
+})
+
+test('truncateToolResultByTokens preserves content integrity', () => {
+  const content = 'Hello World\nThis is a test\n'.repeat(200)
+  const result = truncateToolResultByTokens(content, 100)
+  expect(result.wasTruncated).toBe(true)
+  // Should not end with a partial line when possible
+  // (if a newline falls in the last 30% of the character budget)
+  const truncatedPart = result.truncated.replace(
+    /\n\n\[Content truncated.*\]$/s,
+    '',
+  )
+  // Content before the marker should be a prefix of the original
+  expect(content.startsWith(truncatedPart)).toBe(true)
+})
+
+test('truncateToolResultByTokens handles empty content', () => {
+  const result = truncateToolResultByTokens('', 100)
+  expect(result.wasTruncated).toBe(false)
+  expect(result.truncated).toBe('')
+  expect(result.estimatedTokens).toBe(0)
+})