270 changes: 270 additions & 0 deletions crates/tokenizer/src/kimi_k2_tokenizer.rs
@@ -0,0 +1,270 @@
//! Kimi-K2 / K2.5 / K2.6 detection and special-token helpers.
//!
//! Kimi models use the standard tiktoken BPE engine but with a Han-aware regex
//! and a 256-slot reserved-special-token range starting at `len(mergeable_ranks)`.
//! These helpers let the generic `TiktokenTokenizer` loader specialize itself
//! when it sees a Kimi directory, without exposing a separate public type.
//!
//! Upstream reference (identical across all three Kimi variants):
//! - moonshotai/Kimi-K2-Thinking/tokenization_kimi.py
//! - moonshotai/Kimi-K2.5/tokenization_kimi.py
//! - moonshotai/Kimi-K2.6/tokenization_kimi.py
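//!
//! A sketch of the intended call shape from the loader side (paraphrased,
//! not the loader's literal code):
//! ```text
//! if kimi_k2_tokenizer::matches(tokenizer_config.as_ref(), dir) {
//!     // use KIMI_K2_PATTERN as the pre-tokenizer regex, and
//!     apply_reserved_special_tokens(&mut added_tokens, encoder.len());
//! }
//! ```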

use std::{
    collections::{HashMap, HashSet},
    path::Path,
};

use serde_json::Value;

use crate::traits::TokenIdType;

/// Size of the reserved special-token range appended after the BPE ranks;
/// upstream `tokenization_kimi.py` reserves exactly 256 ids.
const NUM_RESERVED_SPECIAL_TOKENS: usize = 256;

/// Han-aware tokenization regex used by Kimi K2/K2.5/K2.6. Byte-identical to
/// the `pat_str` in upstream `tokenization_kimi.py`.
pub(crate) const KIMI_K2_PATTERN: &str = r"[\p{Han}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+";
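// Reading guide for the alternations above, in match-priority order:
//   1. `[\p{Han}]+` — runs of Han characters segment on their own
//   2. words ending in lowercase letters ("hello", "Hello"), with an optional
//      single non-letter prefix and optional English contraction suffix
//   3. uppercase-led words with an optional lowercase tail ("HTTP", "ABCdef"),
//      same prefix/suffix; both word classes exclude Han via `&&[^\p{Han}]`
//   4. `\p{N}{1,3}` — digit runs capped at three digits
//   5. punctuation runs with trailing newlines, then newline runs and
//      whitespace fallbacks.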

/// Returns true if `dir` looks like a Kimi K2/K2.5/K2.6 model directory.
///
/// Primary signal: the already-parsed `tokenizer_config.json` references
/// `tokenization_kimi` (via `auto_map`, `tokenizer_class`, etc.). Callers pass
/// the parsed JSON so we don't re-read the file the tiktoken loader already
/// parsed. Fallback: read sibling `config.json` and check `model_type` ∈
/// `{kimi_k2, kimi_k25}`.
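///
/// A typical matching `tokenizer_config.json`, abridged to the shape the
/// detection cares about:
/// ```json
/// { "auto_map": { "AutoTokenizer": ["tokenization_kimi.TikTokenTokenizer", null] } }
/// ```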
pub(crate) fn matches(tokenizer_config: Option<&Value>, dir: &Path) -> bool {
    if tokenizer_config.is_some_and(value_mentions_kimi_tokenizer) {
        return true;
    }
    read_json(&dir.join("config.json")).is_some_and(|config| model_config_is_kimi(&config))
}

/// Fill the 256-slot reserved special-token range starting at `base_vocab_size`
/// with synthetic `<|reserved_token_{id}|>` entries, preserving any explicit
/// `added_tokens_decoder` entries that already occupy slots in that range.
///
/// Mirrors upstream `tokenization_kimi.py`:
/// ```python
/// {special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
/// for i in range(num_base_tokens, num_base_tokens + 256)}
/// ```
/// where `num_base_tokens = len(mergeable_ranks)` — i.e., `encoder.len()`.
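///
/// A minimal sketch of the effect, with hypothetical ids: given
/// `base_vocab_size = 1000` and id 1001 already claimed by an explicit
/// special token,
/// ```text
/// before: { "<|im_end|>": 1001 }
/// after:  { "<|im_end|>": 1001, "<|reserved_token_1000|>": 1000,
///           "<|reserved_token_1002|>": 1002, ..., "<|reserved_token_1255|>": 1255 }
/// ```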
pub(crate) fn apply_reserved_special_tokens(
    added_tokens: &mut HashMap<String, TokenIdType>,
    base_vocab_size: usize,
) {
    // Bail out quietly if the base vocab size can't be represented as a
    // TokenIdType; there is nothing to synthesize in that case.
    let Ok(start) = TokenIdType::try_from(base_vocab_size) else {
        return;
    };

    let occupied_ids: HashSet<TokenIdType> = added_tokens.values().copied().collect();
    for offset in 0..NUM_RESERVED_SPECIAL_TOKENS {
        let id = start + offset as TokenIdType;
        if occupied_ids.contains(&id) {
            continue;
        }

        added_tokens
            .entry(format!("<|reserved_token_{id}|>"))
            .or_insert(id);
    }
}

fn read_json(path: &Path) -> Option<Value> {
    let content = std::fs::read_to_string(path).ok()?;
    serde_json::from_str(&content).ok()
}

fn model_config_is_kimi(config: &Value) -> bool {
    let model_type = config.get("model_type").and_then(Value::as_str);
    matches!(model_type, Some("kimi_k2") | Some("kimi_k25"))
}

/// Recursively walk a parsed JSON value, returning true if any string at any
/// depth names the `tokenization_kimi` module (e.g. inside `auto_map` or
/// `tokenizer_class`).
fn value_mentions_kimi_tokenizer(value: &Value) -> bool {
    match value {
        Value::String(s) => mentions_kimi_tokenizer_module(s),
        Value::Array(values) => values.iter().any(value_mentions_kimi_tokenizer),
        Value::Object(map) => map.values().any(value_mentions_kimi_tokenizer),
        _ => false,
    }
}

/// Match the dotted-path identifier `tokenization_kimi` as a whole segment,
/// not a substring — so `tokenization_kimi.TikTokenTokenizer` matches but
/// `tokenization_kimi_v2` or `my_tokenization_kimi_helper` do not.
fn mentions_kimi_tokenizer_module(s: &str) -> bool {
    s.split('.').any(|seg| seg == "tokenization_kimi")
}

#[cfg(test)]
mod tests {
    use base64::{engine::general_purpose::STANDARD, Engine as _};

    use super::*;
    use crate::{
        tiktoken::TiktokenTokenizer,
        traits::{Decoder, Encoder, Tokenizer},
    };

    // Minimal BPE: bytes 'a' (rank 0), 'b' (rank 1). Used for tests that only
    // exercise decode of synthetic special tokens (no BPE encode).
    const MINIMAL_TIKTOKEN_MODEL: &str = "YQ== 0\nYg== 1\n";
    // Minimal BPE: "hello's" (rank 0), "hello" (rank 1), "'s" (rank 2).
    const CONTRACTION_TIKTOKEN_MODEL: &str = "aGVsbG8ncw== 0\naGVsbG8= 1\nJ3M= 2\n";
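    // (Both constants use the tiktoken.model format: one base64-encoded token
    // followed by its rank, one pair per line.)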

    /// Build a tiktoken model file with all 256 single-byte tokens (ranks 0..256)
    /// plus the given multi-byte tokens at successive ranks starting at 256.
    /// tiktoken's BPE requires every input byte to have a rank, so a real-world
    /// encode test needs the full byte layer.
    fn full_byte_bpe(extra: &[&[u8]]) -> String {
        let mut out = String::new();
        for b in 0u32..256 {
            out.push_str(&format!("{} {}\n", STANDARD.encode([b as u8]), b));
        }
        for (offset, bytes) in extra.iter().enumerate() {
            out.push_str(&format!("{} {}\n", STANDARD.encode(bytes), 256 + offset));
        }
        out
    }

    fn write_kimi_dir(model: &str, tokenizer_config: &str) -> tempfile::TempDir {
        let dir = tempfile::tempdir().unwrap();
        std::fs::write(dir.path().join("tiktoken.model"), model).unwrap();
        std::fs::write(dir.path().join("tokenizer_config.json"), tokenizer_config).unwrap();
        dir
    }

    const KIMI_AUTO_MAP_CONFIG: &str = r#"{
        "tokenizer_class": "TikTokenTokenizer",
        "auto_map": {
            "AutoTokenizer": ["tokenization_kimi.TikTokenTokenizer", null]
        }
    }"#;

    /// Read tokenizer_config.json from `dir`, mirroring what the real loader
    /// hands to `matches`. Used so each test exercises the same call shape.
    fn tokenizer_config(dir: &Path) -> Option<Value> {
        read_json(&dir.join("tokenizer_config.json"))
    }

    #[test]
    fn reserved_special_tokens_are_synthesized() {
        let dir = write_kimi_dir(
            MINIMAL_TIKTOKEN_MODEL,
            r#"{
                "tokenizer_class": "TikTokenTokenizer",
                "auto_map": {
                    "AutoTokenizer": ["tokenization_kimi.TikTokenTokenizer", null]
                },
                "added_tokens_decoder": {
                    "2": { "content": "[BOS]", "special": true },
                    "5": { "content": "<|im_assistant|>", "special": true }
                }
            }"#,
        );
        let tokenizer = TiktokenTokenizer::from_dir(dir.path()).unwrap();

        assert_eq!(tokenizer.vocab_size(), 258);
        assert_eq!(
            tokenizer.decode(&[4], false).unwrap(),
            "<|reserved_token_4|>"
        );
        assert_eq!(tokenizer.decode(&[5], false).unwrap(), "<|im_assistant|>");
        assert_eq!(tokenizer.token_to_id("<|reserved_token_4|>"), Some(4));
        assert_eq!(
            tokenizer.id_to_token(4).as_deref(),
            Some("<|reserved_token_4|>")
        );
    }

    #[test]
    fn matches_via_model_type_kimi_k2() {
        let dir = tempfile::tempdir().unwrap();
        std::fs::write(dir.path().join("tiktoken.model"), MINIMAL_TIKTOKEN_MODEL).unwrap();
        std::fs::write(
            dir.path().join("tokenizer_config.json"),
            r#"{ "added_tokens_decoder": {} }"#,
        )
        .unwrap();
        std::fs::write(
            dir.path().join("config.json"),
            r#"{ "model_type": "kimi_k2" }"#,
        )
        .unwrap();

        assert!(matches(tokenizer_config(dir.path()).as_ref(), dir.path()));
        // Round-trip a synthetic reserved token to confirm the Kimi load path was taken.
        let tokenizer = TiktokenTokenizer::from_dir(dir.path()).unwrap();
        assert_eq!(
            tokenizer.decode(&[42], false).unwrap(),
            "<|reserved_token_42|>"
        );
    }

    #[test]
    fn matches_via_model_type_kimi_k25() {
        let dir = tempfile::tempdir().unwrap();
        std::fs::write(dir.path().join("tiktoken.model"), MINIMAL_TIKTOKEN_MODEL).unwrap();
        std::fs::write(
            dir.path().join("tokenizer_config.json"),
            r#"{ "added_tokens_decoder": {} }"#,
        )
        .unwrap();
        std::fs::write(
            dir.path().join("config.json"),
            r#"{ "model_type": "kimi_k25" }"#,
        )
        .unwrap();

        assert!(matches(tokenizer_config(dir.path()).as_ref(), dir.path()));
    }

    #[test]
    fn substring_does_not_falsely_match_kimi() {
        // Names that *contain* "tokenization_kimi" as a substring but aren't
        // the module identifier itself must not trigger Kimi detection.
        assert!(!mentions_kimi_tokenizer_module("tokenization_kimi_v2"));
        assert!(!mentions_kimi_tokenizer_module(
            "my_tokenization_kimi_helper"
        ));
        // Real upstream forms should still match.
        assert!(mentions_kimi_tokenizer_module("tokenization_kimi"));
        assert!(mentions_kimi_tokenizer_module(
            "tokenization_kimi.TikTokenTokenizer"
        ));
        assert!(mentions_kimi_tokenizer_module(
            "pkg.tokenization_kimi.TikTokenTokenizer"
        ));
    }

    #[test]
    fn uses_kimi_pattern_for_contractions() {
        let dir = write_kimi_dir(CONTRACTION_TIKTOKEN_MODEL, KIMI_AUTO_MAP_CONFIG);
        let tokenizer = TiktokenTokenizer::from_dir(dir.path()).unwrap();
        // The Kimi regex keeps "hello's" as a single match (contraction handling
        // in the second alternation, the lowercase-led word form), so the BPE
        // returns rank 0.
        assert_eq!(
            tokenizer.encode("hello's", false).unwrap().token_ids(),
            &[0]
        );
    }

    #[test]
    fn han_input_round_trips_through_kimi_pattern() {
        // The Kimi regex's leading alternation is `[\p{Han}]+`. The main
        // regressions this guards against are (a) the character-class
        // intersection `[X&&[^\p{Han}]]` failing to compile under tiktoken-rs's
        // fancy-regex backend, and (b) Han input being rejected at the
        // pre-tokenizer. A minimal synthetic BPE can't reproduce a real Kimi
        // vocab, so we assert byte-level round-trip rather than exact token
        // IDs: encode must not panic, and decode must reconstruct the input.
        let model = full_byte_bpe(&[]);
        let dir = write_kimi_dir(&model, KIMI_AUTO_MAP_CONFIG);
        let tokenizer = TiktokenTokenizer::from_dir(dir.path()).unwrap();

        let text = "你好世界 hello!";
        let encoding = tokenizer.encode(text, false).unwrap();
        let decoded = tokenizer.decode(encoding.token_ids(), false).unwrap();
        assert_eq!(decoded, text);
    }
}
1 change: 1 addition & 0 deletions crates/tokenizer/src/lib.rs
@@ -16,6 +16,7 @@ pub mod traits;

pub mod chat_template;
pub mod huggingface;
mod kimi_k2_tokenizer;
pub mod tiktoken;

#[cfg(test)]