-
Notifications
You must be signed in to change notification settings - Fork 73
fix(tokenizer): support Kimi K2/K2.5/K2.6 tiktoken models #1482
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
CatherineSue
wants to merge
4
commits into
main
Choose a base branch
from
claude/kimi-k2-tiktoken-support
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+370
−28
Open
Changes from 3 commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
c2df5be
fix(tokenizer): support Kimi K2/K2.5/K2.6 tiktoken models
CatherineSue 9eb42a1
refactor(tokenizer): inline tiktoken loader, drop pub(crate) escape h…
CatherineSue 9c48829
fix(tokenizer): reuse parsed tokenizer_config, tighten Kimi matcher
CatherineSue 7000219
test(tokenizer): cover missing tokenizer_config.json branch
CatherineSue File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,270 @@ | ||
//! Kimi-K2 / K2.5 / K2.6 detection and special-token helpers.
//!
//! Kimi models use the standard tiktoken BPE engine but with a Han-aware regex
//! and a 256-slot reserved-special-token range starting at `len(mergeable_ranks)`.
//! These helpers let the generic `TiktokenTokenizer` loader specialize itself
//! when it sees a Kimi directory, without exposing a separate public type.
//!
//! Upstream reference (identical across all three Kimi variants):
//! - moonshotai/Kimi-K2-Thinking/tokenization_kimi.py
//! - moonshotai/Kimi-K2.5/tokenization_kimi.py
//! - moonshotai/Kimi-K2.6/tokenization_kimi.py

use std::{
    collections::{HashMap, HashSet},
    path::Path,
};

use serde_json::Value;

use crate::traits::TokenIdType;

// Size of the synthetic special-token range appended after the base vocab;
// mirrors upstream's `range(num_base_tokens, num_base_tokens + 256)`.
const NUM_RESERVED_SPECIAL_TOKENS: usize = 256;

/// Han-aware tokenization regex used by Kimi K2/K2.5/K2.6. Byte-identical to
/// the `pat_str` in upstream `tokenization_kimi.py`.
///
/// Note the `[X&&[^\p{Han}]]` character-class intersections: these require a
/// regex backend that supports class set operations (tiktoken-rs's
/// fancy-regex does).
pub(crate) const KIMI_K2_PATTERN: &str = r"[\p{Han}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+";
|
|
||
| /// Returns true if `dir` looks like a Kimi K2/K2.5/K2.6 model directory. | ||
| /// | ||
| /// Primary signal: the already-parsed `tokenizer_config.json` references | ||
| /// `tokenization_kimi` (via `auto_map`, `tokenizer_class`, etc.). Callers pass | ||
| /// the parsed JSON so we don't re-read the file the tiktoken loader already | ||
| /// parsed. Fallback: read sibling `config.json` and check `model_type` ∈ | ||
| /// `{kimi_k2, kimi_k25}`. | ||
| pub(crate) fn matches(tokenizer_config: Option<&Value>, dir: &Path) -> bool { | ||
| if tokenizer_config.is_some_and(value_mentions_kimi_tokenizer) { | ||
| return true; | ||
| } | ||
| read_json(&dir.join("config.json")).is_some_and(|config| model_config_is_kimi(&config)) | ||
| } | ||
|
|
||
| /// Fill the 256-slot reserved special-token range starting at `base_vocab_size` | ||
| /// with synthetic `<|reserved_token_{id}|>` entries, preserving any explicit | ||
| /// `added_tokens_decoder` entries that already occupy slots in that range. | ||
| /// | ||
| /// Mirrors upstream `tokenization_kimi.py`: | ||
| /// ```python | ||
| /// {special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i | ||
| /// for i in range(num_base_tokens, num_base_tokens + 256)} | ||
| /// ``` | ||
| /// where `num_base_tokens = len(mergeable_ranks)` — i.e., `encoder.len()`. | ||
| pub(crate) fn apply_reserved_special_tokens( | ||
| added_tokens: &mut HashMap<String, TokenIdType>, | ||
| base_vocab_size: usize, | ||
| ) { | ||
| let Ok(start) = TokenIdType::try_from(base_vocab_size) else { | ||
| return; | ||
| }; | ||
|
|
||
| let occupied_ids: HashSet<TokenIdType> = added_tokens.values().copied().collect(); | ||
| for offset in 0..NUM_RESERVED_SPECIAL_TOKENS { | ||
| let id = start + offset as TokenIdType; | ||
| if occupied_ids.contains(&id) { | ||
| continue; | ||
| } | ||
|
|
||
| added_tokens | ||
| .entry(format!("<|reserved_token_{id}|>")) | ||
| .or_insert(id); | ||
| } | ||
| } | ||
|
|
||
| fn read_json(path: &Path) -> Option<Value> { | ||
| let content = std::fs::read_to_string(path).ok()?; | ||
| serde_json::from_str(&content).ok() | ||
| } | ||
|
|
||
| fn model_config_is_kimi(config: &Value) -> bool { | ||
| let model_type = config.get("model_type").and_then(Value::as_str); | ||
| matches!(model_type, Some("kimi_k2") | Some("kimi_k25")) | ||
|
CatherineSue marked this conversation as resolved.
|
||
| } | ||
|
|
||
| fn value_mentions_kimi_tokenizer(value: &Value) -> bool { | ||
| match value { | ||
| Value::String(s) => mentions_kimi_tokenizer_module(s), | ||
| Value::Array(values) => values.iter().any(value_mentions_kimi_tokenizer), | ||
| Value::Object(map) => map.values().any(value_mentions_kimi_tokenizer), | ||
| _ => false, | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
| } | ||
| } | ||
|
|
||
/// Match the dotted-path identifier `tokenization_kimi` as a whole segment,
/// not a substring — so `tokenization_kimi.TikTokenTokenizer` matches but
/// `tokenization_kimi_v2` or `my_tokenization_kimi_helper` do not.
fn mentions_kimi_tokenizer_module(s: &str) -> bool {
    for segment in s.split('.') {
        if segment == "tokenization_kimi" {
            return true;
        }
    }
    false
}
|
|
||
#[cfg(test)]
mod tests {
    use base64::{engine::general_purpose::STANDARD, Engine as _};

    use super::*;
    use crate::{
        tiktoken::TiktokenTokenizer,
        traits::{Decoder, Encoder, Tokenizer},
    };

    // Minimal BPE: bytes 'a' (rank 0), 'b' (rank 1). Used for tests that only
    // exercise decode of synthetic special tokens (no BPE encode).
    const MINIMAL_TIKTOKEN_MODEL: &str = "YQ== 0\nYg== 1\n";
    // Minimal BPE: "hello's" (rank 0), "hello" (rank 1), "'s" (rank 2).
    // Lets the contraction test distinguish "kept whole" (rank 0) from
    // "split by the pre-tokenizer" (ranks 1 + 2).
    const CONTRACTION_TIKTOKEN_MODEL: &str = "aGVsbG8ncw== 0\naGVsbG8= 1\nJ3M= 2\n";

    /// Build a tiktoken model file with all 256 single-byte tokens (ranks 0..256)
    /// plus the given multi-byte tokens at successive ranks starting at 256.
    /// tiktoken's BPE requires every input byte to have a rank, so a real-world
    /// encode test needs the full byte layer.
    fn full_byte_bpe(extra: &[&[u8]]) -> String {
        let mut out = String::new();
        // One "<base64-of-byte> <rank>" line per byte value.
        for b in 0u32..256 {
            out.push_str(&format!("{} {}\n", STANDARD.encode([b as u8]), b));
        }
        // Extra multi-byte merges occupy ranks 256, 257, ...
        for (offset, bytes) in extra.iter().enumerate() {
            out.push_str(&format!("{} {}\n", STANDARD.encode(bytes), 256 + offset));
        }
        out
    }

    /// Create a temp dir containing `tiktoken.model` + `tokenizer_config.json`,
    /// the minimal layout the tiktoken loader expects for a Kimi model.
    fn write_kimi_dir(model: &str, tokenizer_config: &str) -> tempfile::TempDir {
        let dir = tempfile::tempdir().unwrap();
        std::fs::write(dir.path().join("tiktoken.model"), model).unwrap();
        std::fs::write(dir.path().join("tokenizer_config.json"), tokenizer_config).unwrap();
        dir
    }

    // tokenizer_config whose `auto_map` names the upstream Kimi module — the
    // primary detection signal exercised by most tests below.
    const KIMI_AUTO_MAP_CONFIG: &str = r#"{
        "tokenizer_class": "TikTokenTokenizer",
        "auto_map": {
            "AutoTokenizer": ["tokenization_kimi.TikTokenTokenizer", null]
        }
    }"#;

    /// Read tokenizer_config.json from `dir`, mirroring what the real loader
    /// hands to `matches`. Used so each test exercises the same call shape.
    fn tokenizer_config(dir: &Path) -> Option<Value> {
        read_json(&dir.join("tokenizer_config.json"))
    }

    #[test]
    fn reserved_special_tokens_are_synthesized() {
        // Base vocab is 2 ('a', 'b'), so the reserved range is ids 2..258.
        // Slots 2 and 5 are pre-claimed by explicit added_tokens_decoder
        // entries and must NOT be overwritten with synthetic names.
        let dir = write_kimi_dir(
            MINIMAL_TIKTOKEN_MODEL,
            r#"{
                "tokenizer_class": "TikTokenTokenizer",
                "auto_map": {
                    "AutoTokenizer": ["tokenization_kimi.TikTokenTokenizer", null]
                },
                "added_tokens_decoder": {
                    "2": { "content": "[BOS]", "special": true },
                    "5": { "content": "<|im_assistant|>", "special": true }
                }
            }"#,
        );
        let tokenizer = TiktokenTokenizer::from_dir(dir.path()).unwrap();

        // 2 base tokens + 256 reserved slots = 258 total.
        assert_eq!(tokenizer.vocab_size(), 258);
        assert_eq!(
            tokenizer.decode(&[4], false).unwrap(),
            "<|reserved_token_4|>"
        );
        assert_eq!(tokenizer.decode(&[5], false).unwrap(), "<|im_assistant|>");
        assert_eq!(tokenizer.token_to_id("<|reserved_token_4|>"), Some(4));
        assert_eq!(
            tokenizer.id_to_token(4).as_deref(),
            Some("<|reserved_token_4|>")
        );
    }

    #[test]
    fn matches_via_model_type_kimi_k2() {
        // tokenizer_config.json carries no Kimi signal here; detection must
        // fall back to config.json's model_type.
        let dir = tempfile::tempdir().unwrap();
        std::fs::write(dir.path().join("tiktoken.model"), MINIMAL_TIKTOKEN_MODEL).unwrap();
        std::fs::write(
            dir.path().join("tokenizer_config.json"),
            r#"{ "added_tokens_decoder": {} }"#,
        )
        .unwrap();
        std::fs::write(
            dir.path().join("config.json"),
            r#"{ "model_type": "kimi_k2" }"#,
        )
        .unwrap();

        assert!(matches(tokenizer_config(dir.path()).as_ref(), dir.path()));
        // Round-trip a synthetic reserved token to confirm Kimi load path was taken.
        let tokenizer = TiktokenTokenizer::from_dir(dir.path()).unwrap();
        assert_eq!(
            tokenizer.decode(&[42], false).unwrap(),
            "<|reserved_token_42|>"
        );
    }

    #[test]
    fn matches_via_model_type_kimi_k25() {
        // Same fallback path as above but for the K2.5 model_type string.
        let dir = tempfile::tempdir().unwrap();
        std::fs::write(dir.path().join("tiktoken.model"), MINIMAL_TIKTOKEN_MODEL).unwrap();
        std::fs::write(
            dir.path().join("tokenizer_config.json"),
            r#"{ "added_tokens_decoder": {} }"#,
        )
        .unwrap();
        std::fs::write(
            dir.path().join("config.json"),
            r#"{ "model_type": "kimi_k25" }"#,
        )
        .unwrap();

        assert!(matches(tokenizer_config(dir.path()).as_ref(), dir.path()));
    }

    #[test]
    fn substring_does_not_falsely_match_kimi() {
        // Names that *contain* "tokenization_kimi" as a substring but aren't
        // the module identifier itself must not trigger Kimi detection.
        assert!(!mentions_kimi_tokenizer_module("tokenization_kimi_v2"));
        assert!(!mentions_kimi_tokenizer_module(
            "my_tokenization_kimi_helper"
        ));
        // Real upstream forms should still match.
        assert!(mentions_kimi_tokenizer_module("tokenization_kimi"));
        assert!(mentions_kimi_tokenizer_module(
            "tokenization_kimi.TikTokenTokenizer"
        ));
        assert!(mentions_kimi_tokenizer_module(
            "pkg.tokenization_kimi.TikTokenTokenizer"
        ));
    }

    #[test]
    fn uses_kimi_pattern_for_contractions() {
        let dir = write_kimi_dir(CONTRACTION_TIKTOKEN_MODEL, KIMI_AUTO_MAP_CONFIG);
        let tokenizer = TiktokenTokenizer::from_dir(dir.path()).unwrap();
        // The Kimi regex keeps "hello's" as a single match (contraction handling
        // in the third alternation), so the BPE returns rank 0. A non-Kimi
        // pattern would split it and yield ranks 1 + 2 instead.
        assert_eq!(
            tokenizer.encode("hello's", false).unwrap().token_ids(),
            &[0]
        );
    }

    #[test]
    fn han_input_round_trips_through_kimi_pattern() {
        // The Kimi regex's leading alternation is `[\p{Han}]+`. The main
        // regressions this guards against are (a) the character-class
        // intersection `[X&&[^\p{Han}]]` failing to compile under tiktoken-rs's
        // fancy-regex backend, and (b) Han input being rejected at the
        // pre-tokenizer. A minimal synthetic BPE can't reproduce a real Kimi
        // vocab, so we assert byte-level round-trip rather than exact token
        // IDs: encode must not panic, and decode must reconstruct the input.
        let model = full_byte_bpe(&[]);
        let dir = write_kimi_dir(&model, KIMI_AUTO_MAP_CONFIG);
        let tokenizer = TiktokenTokenizer::from_dir(dir.path()).unwrap();

        let text = "你好世界 hello!";
        let encoding = tokenizer.encode(text, false).unwrap();
        let decoded = tokenizer.decode(encoding.token_ids(), false).unwrap();
        assert_eq!(decoded, text);
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.