diff --git a/src/assembler.ts b/src/assembler.ts
index 8b19187c..9e3f7b20 100644
--- a/src/assembler.ts
+++ b/src/assembler.ts
@@ -37,7 +37,17 @@ export interface AssembleContextResult {
 
 /** Simple token estimate: ~4 chars per token, same as VoltCode's Token.estimate */
 function estimateTokens(text: string): number {
-  return Math.ceil(text.length / 4);
+  let count = 0;
+  for (const ch of text) {
+    const code = ch.codePointAt(0)!;
+    // CJK Unified Ideographs + Extension A: ~1.5 tokens per char
+    if ((code >= 0x4E00 && code <= 0x9FFF) || (code >= 0x3400 && code <= 0x4DBF)) {
+      count += 1.5;
+    } else {
+      count += 0.25;
+    }
+  }
+  return Math.ceil(count);
 }
 
 type SummaryPromptSignal = Pick;
diff --git a/src/compaction.ts b/src/compaction.ts
index ca60b057..79dfa48c 100644
--- a/src/compaction.ts
+++ b/src/compaction.ts
@@ -82,7 +82,17 @@ type CondensedPhaseCandidate = {
 
 /** Estimate token count from character length (~4 chars per token). */
 function estimateTokens(content: string): number {
-  return Math.ceil(content.length / 4);
+  let count = 0;
+  for (const ch of content) {
+    const code = ch.codePointAt(0)!;
+    // CJK Unified Ideographs + Extension A: ~1.5 tokens per char
+    if ((code >= 0x4E00 && code <= 0x9FFF) || (code >= 0x3400 && code <= 0x4DBF)) {
+      count += 1.5;
+    } else {
+      count += 0.25;
+    }
+  }
+  return Math.ceil(count);
 }
 
 /** Format a timestamp as `YYYY-MM-DD HH:mm TZ` for prompt source text. */