From b7c17ad932d11c71ebdc71574eb80f1199144dd5 Mon Sep 17 00:00:00 2001 From: lishixiang Date: Wed, 11 Mar 2026 13:12:57 +0800 Subject: [PATCH] fix: CJK-aware token estimation in estimateTokens MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous implementation (Math.ceil(text.length / 4)) assumes ~4 chars per token, which is accurate for English/ASCII text but severely underestimates token counts for CJK (Chinese/Japanese/Korean) content. In JavaScript, string.length returns UTF-16 code units, where each CJK character counts as 1. However, most LLM tokenizers (Claude, GPT, etc.) encode CJK characters at ~1.5 tokens per character, not 0.25. This causes a ~3x underestimation for Chinese-heavy conversations: - A message with 40% CJK + 53% ASCII (28,949 chars): - Old estimate: 7,238 tokens - Corrected estimate: 22,180 tokens (3.1x higher) - In production, LCM estimated context at 59k when actual API usage was 174k, causing compaction to trigger far too late and sessions to hit the 200k hard limit. 
The fix applies per-character weighting: - CJK Unified Ideographs (U+4E00–U+9FFF) and Extension A (U+3400–U+4DBF): 1.5 tokens/char - All other characters: 0.25 tokens/char (unchanged from before) Note: despite the "CJK" subject, only Han ideographs are weighted — Japanese kana (U+3040–U+30FF) and Hangul syllables (U+AC00–U+D7A3) still count 0.25 tokens/char; extend the ranges in a follow-up if Japanese/Korean-heavy sessions remain underestimated. --- src/assembler.ts | 12 +++++++++++- src/compaction.ts | 12 +++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/assembler.ts b/src/assembler.ts index 8b19187c..9e3f7b20 100644 --- a/src/assembler.ts +++ b/src/assembler.ts @@ -37,7 +37,17 @@ export interface AssembleContextResult { /** Simple token estimate: ~4 chars per token, same as VoltCode's Token.estimate */ function estimateTokens(text: string): number { - return Math.ceil(text.length / 4); + let count = 0; + for (const ch of text) { + const code = ch.codePointAt(0)!; + // CJK Unified Ideographs + Extension A: ~1.5 tokens per char + if ((code >= 0x4E00 && code <= 0x9FFF) || (code >= 0x3400 && code <= 0x4DBF)) { + count += 1.5; + } else { + count += 0.25; + } + } + return Math.ceil(count); } type SummaryPromptSignal = Pick; diff --git a/src/compaction.ts b/src/compaction.ts index ca60b057..79dfa48c 100644 --- a/src/compaction.ts +++ b/src/compaction.ts @@ -82,7 +82,17 @@ type CondensedPhaseCandidate = { /** Estimate token count from character length (~4 chars per token). */ function estimateTokens(content: string): number { - return Math.ceil(content.length / 4); + let count = 0; + for (const ch of content) { + const code = ch.codePointAt(0)!; + // CJK Unified Ideographs + Extension A: ~1.5 tokens per char + if ((code >= 0x4E00 && code <= 0x9FFF) || (code >= 0x3400 && code <= 0x4DBF)) { + count += 1.5; + } else { + count += 0.25; + } + } + return Math.ceil(count); } /** Format a timestamp as `YYYY-MM-DD HH:mm TZ` for prompt source text. */