From b7c17ad932d11c71ebdc71574eb80f1199144dd5 Mon Sep 17 00:00:00 2001 From: lishixiang Date: Wed, 11 Mar 2026 13:12:57 +0800 Subject: [PATCH] fix: CJK-aware token estimation in estimateTokens MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous implementation (Math.ceil(text.length / 4)) assumes ~4 chars per token, which is accurate for English/ASCII text but severely underestimates token counts for CJK (Chinese/Japanese/Korean) content. In JavaScript, string.length returns UTF-16 code units, where each CJK character counts as 1. However, most LLM tokenizers (Claude, GPT, etc.) encode CJK characters at ~1.5 tokens per character, not 0.25. This causes a ~3x underestimation for Chinese-heavy conversations: - A message with 40% CJK + 53% ASCII (28,949 chars): - Old estimate: 7,238 tokens - Corrected estimate: 22,180 tokens (3.1x higher) - In production, LCM estimated context at 59k when actual API usage was 174k, causing compaction to trigger far too late and sessions to hit the 200k hard limit. 
The fix applies per-character weighting: - CJK Unified Ideographs (U+4E00–U+9FFF) and Extension A (U+3400–U+4DBF): 1.5 tokens/char - All other characters: 0.25 tokens/char (unchanged from before) Note: despite the "CJK" subject, only Han ideographs are weighted — Japanese kana (U+3040–U+30FF) and Hangul syllables (U+AC00–U+D7A3) still count 0.25 tokens/char; extend the ranges in a follow-up if Japanese/Korean-heavy sessions remain underestimated. --- src/assembler.ts | 12 +++++++++++- src/compaction.ts | 12 +++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/assembler.ts b/src/assembler.ts index 8b19187c..9e3f7b20 100644 --- a/src/assembler.ts +++ b/src/assembler.ts @@ -37,7 +37,17 @@ export interface AssembleContextResult { /** Simple token estimate: ~4 chars per token, same as VoltCode's Token.estimate */ function estimateTokens(text: string): number { - return Math.ceil(text.length / 4); + let count = 0; + for (const ch of text) { + const code = ch.codePointAt(0)!; + // CJK Unified Ideographs + Extension A: ~1.5 tokens per char + if ((code >= 0x4E00 && code <= 0x9FFF) || (code >= 0x3400 && code <= 0x4DBF)) { + count += 1.5; + } else { + count += 0.25; + } + } + return Math.ceil(count); } type SummaryPromptSignal = Pick; diff --git a/src/compaction.ts b/src/compaction.ts index ca60b057..79dfa48c 100644 --- a/src/compaction.ts +++ b/src/compaction.ts @@ -82,7 +82,17 @@ type CondensedPhaseCandidate = { /** Estimate token count from character length (~4 chars per token). */ function estimateTokens(content: string): number { - return Math.ceil(content.length / 4); + let count = 0; + for (const ch of content) { + const code = ch.codePointAt(0)!; + // CJK Unified Ideographs + Extension A: ~1.5 tokens per char + if ((code >= 0x4E00 && code <= 0x9FFF) || (code >= 0x3400 && code <= 0x4DBF)) { + count += 1.5; + } else { + count += 0.25; + } + } + return Math.ceil(count); } /** Format a timestamp as `YYYY-MM-DD HH:mm TZ` for prompt source text. */