diff --git a/crates/tokenizer/src/encoders/kimi_k25_tools.rs b/crates/tokenizer/src/encoders/kimi_k25_tools.rs new file mode 100644 index 000000000..d4ba2a615 --- /dev/null +++ b/crates/tokenizer/src/encoders/kimi_k25_tools.rs @@ -0,0 +1,693 @@ +//! Kimi-K2.5 tool-declaration encoder. See module-level docs. +//! +//! Ported from . +//! Mirrors the upstream Python reference function-by-function. Output must be +//! byte-equal to the Python reference; gated by golden tests in +//! `tests/kimi_k25.rs`. + +use std::{collections::HashMap, fmt::Write}; + +use anyhow::Result; +use serde_json::Value; + +use crate::chat_template::{ChatTemplateParams, ChatTemplateState}; + +const TS_INDENT: &str = " "; +const TS_FIELD_DELIMITER: &str = ",\n"; + +// On overflow, emit `any` and continue — never panic on adversarial schemas. +const MAX_RECURSION_DEPTH: usize = 32; + +pub fn encode_tools_to_typescript(tools: &[Value]) -> Option { + if tools.is_empty() { + return None; + } + let function_strs: Vec = tools + .iter() + .filter_map(|t| { + if t.get("type").and_then(Value::as_str) != Some("function") { + return None; + } + let func = t.get("function")?; + Some(openai_function_to_typescript(func)) + }) + .collect(); + if function_strs.is_empty() { + return None; + } + let mut out = String::from("# Tools\n\n## functions\nnamespace functions {\n"); + out.push_str(&function_strs.join("\n")); + out.push('\n'); + out.push_str("}\n"); + Some(out) +} + +/// Renderer for `Renderer::KimiK25Tools`. Computes `tools_ts_str` and merges +/// it into `template_kwargs`, then delegates to the standard minijinja path. +pub(crate) fn apply_kimi_k25_tools( + chat_template: &ChatTemplateState, + messages: &[Value], + params: ChatTemplateParams, +) -> Result { + let ts_str = params.tools.and_then(encode_tools_to_typescript); + + let owned: Option> = match (params.template_kwargs, ts_str.as_ref()) { + (Some(existing), Some(ts)) => { + let mut m = existing.clone(); + m.insert("tools_ts_str".to_string(), Value::String(ts.clone())); + Some(m) + } + (None, Some(ts)) => { + let mut m = HashMap::with_capacity(1); + m.insert("tools_ts_str".to_string(), Value::String(ts.clone())); + Some(m) + } + _ => None, // No tools → leave tools_ts_str undefined + }; + + let new_params = ChatTemplateParams { + template_kwargs: owned.as_ref().or(params.template_kwargs), + ..params + }; + chat_template.apply(messages, new_params) +} + +// --------------------------------------------------------------------------- +// Function-level encoding +// --------------------------------------------------------------------------- + +fn openai_function_to_typescript(function: &Value) -> String { + let parameters = function + .get("parameters") + .cloned() + .unwrap_or_else(|| Value::Object(Default::default())); + let mut registry = SchemaRegistry::default(); + let parsed = ParameterTypeObject::parse(¶meters, &mut registry); + + let mut interfaces: Vec = Vec::new(); + let mut root_interface_name: Option<&str> = None; + + if registry.has_self_ref { + root_interface_name = Some("parameters"); + let body = parsed + .properties + .iter() + .map(|p| p.to_typescript(TS_INDENT, ®istry)) + .collect::>() + .join(TS_FIELD_DELIMITER); + let body = if body.is_empty() { + String::new() + } else { + format!("\n{body}\n") + }; + interfaces.push(format!("interface parameters {{{body}}}")); + } + + let mut defs_clone: Vec<(String, Value)> = registry + .definitions + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect(); + defs_clone.sort_by(|a, b| a.0.cmp(&b.0)); + for (name, schema) in defs_clone { + let obj_type = parse_parameter_type(&schema, &mut registry); + let body = obj_type.to_typescript("", ®istry); + let mut def_str = String::new(); + if let Some(desc) = schema.get("description").and_then(Value::as_str) { + if !desc.is_empty() { + def_str.push_str(&format_description(desc, "")); + def_str.push('\n'); + } + } + #[expect( + clippy::unwrap_used, + reason = "write!/writeln! into String cannot fail" + )] + { + write!(def_str, "interface {name} {body}").unwrap(); + } + interfaces.push(def_str); + } + + let interface_str = interfaces.join("\n"); + let function_name = function + .get("name") + .and_then(Value::as_str) + .unwrap_or("function"); + let type_def = match root_interface_name { + Some(n) => format!("type {function_name} = (_: {n}) => any;"), + None => format!( + "type {function_name} = (_: {}) => any;", + parsed.to_typescript("", ®istry) + ), + }; + let description = function + .get("description") + .and_then(Value::as_str) + .unwrap_or(""); + let desc_block = if description.is_empty() { + String::new() + } else { + format_description(description, "") + }; + + [interface_str, desc_block, type_def] + .into_iter() + .filter(|s| !s.is_empty()) + .collect::>() + .join("\n") +} + +// --------------------------------------------------------------------------- +// Schema parsing +// --------------------------------------------------------------------------- + +#[derive(Default)] +struct SchemaRegistry { + definitions: HashMap, + has_self_ref: bool, + depth: usize, +} + +impl SchemaRegistry { + fn register_definitions(&mut self, defs: &Value) { + if let Some(map) = defs.as_object() { + for (name, schema) in map { + self.definitions.insert(name.clone(), schema.clone()); + } + } + } + + fn resolve_ref(&mut self, reference: &str) -> Option { + if reference == "#" { + self.has_self_ref = true; + return Some(serde_json::json!({"$self_ref": true})); + } + if let Some(name) = reference.strip_prefix("#/$defs/") { + return self.definitions.get(name).cloned(); + } + None + } +} + +enum ParameterType { + Scalar(ParameterTypeScalar), + Object(ParameterTypeObject), + Array(ParameterTypeArray), + Enum(ParameterTypeEnum), + AnyOf(ParameterTypeAnyOf), + Union(ParameterTypeUnion), + Ref(ParameterTypeRef), +} + +impl ParameterType { + fn format_docstring(&self, indent: &str) -> String { + match self { + ParameterType::Scalar(s) => s.base.format_docstring(indent), + ParameterType::Object(o) => o.base.format_docstring(indent), + ParameterType::Array(a) => a.base.format_docstring(indent), + ParameterType::Enum(e) => e.base.format_docstring(indent), + ParameterType::AnyOf(a) => a.base.format_docstring(indent), + ParameterType::Union(u) => u.base.format_docstring(indent), + ParameterType::Ref(r) => r.base.format_docstring(indent), + } + } + + fn to_typescript(&self, indent: &str, registry: &SchemaRegistry) -> String { + match self { + ParameterType::Scalar(s) => s.to_typescript(), + ParameterType::Object(o) => o.to_typescript(indent, registry), + ParameterType::Array(a) => a.to_typescript(indent, registry), + ParameterType::Enum(e) => e.to_typescript(), + ParameterType::AnyOf(a) => a.to_typescript(indent, registry), + ParameterType::Union(u) => u.to_typescript(), + ParameterType::Ref(r) => r.to_typescript(), + } + } +} + +#[derive(Default)] +struct BaseType { + description: String, + constraints: Vec<(String, Value)>, +} + +impl BaseType { + fn from_extra_props(props: &Value, allowed_keys: &[&str]) -> Self { + let description = props + .get("description") + .and_then(Value::as_str) + .unwrap_or("") + .to_string(); + let mut constraints: Vec<(String, Value)> = props + .as_object() + .map(|m| { + m.iter() + .filter(|(k, _)| allowed_keys.contains(&k.as_str())) + .map(|(k, v)| (k.clone(), v.clone())) + .collect() + }) + .unwrap_or_default(); + constraints.sort_by(|a, b| a.0.cmp(&b.0)); + Self { + description, + constraints, + } + } + + fn format_docstring(&self, indent: &str) -> String { + let mut out = String::new(); + if !self.description.is_empty() { + out.push_str(&format_description(&self.description, indent)); + out.push('\n'); + } + if !self.constraints.is_empty() { + let parts: Vec = self + .constraints + .iter() + .map(|(k, v)| format!("{k}: {}", json_inline(v))) + .collect(); + #[expect( + clippy::unwrap_used, + reason = "write!/writeln! into String cannot fail" + )] + { + writeln!(out, "{indent}// {}", parts.join(", ")).unwrap(); + } + } + out + } +} + +struct ParameterTypeScalar { + base: BaseType, + typ: String, +} + +impl ParameterTypeScalar { + fn parse(typ: &str, props: &Value) -> Self { + let allowed: &[&str] = match typ { + "string" => &["maxLength", "minLength", "pattern"], + "number" | "integer" => &["maximum", "minimum"], + _ => &[], + }; + Self { + base: BaseType::from_extra_props(props, allowed), + typ: typ.to_string(), + } + } + + fn any() -> Self { + Self { + base: BaseType::default(), + typ: "any".to_string(), + } + } + + fn to_typescript(&self) -> String { + match self.typ.as_str() { + "integer" => "number".to_string(), + other => other.to_string(), + } + } +} + +struct Parameter { + name: String, + typ: ParameterType, + optional: bool, + default: Option, +} + +impl Parameter { + fn to_typescript(&self, indent: &str, registry: &SchemaRegistry) -> String { + let mut out = self.typ.format_docstring(indent); + if let Some(d) = &self.default { + let repr = match d { + Value::Bool(true) => "True".to_string(), + Value::Bool(false) => "False".to_string(), + Value::Number(_) => d.to_string(), + _ => serde_json::to_string(d).unwrap_or_else(|_| "null".to_string()), + }; + #[expect( + clippy::unwrap_used, + reason = "write!/writeln! into String cannot fail" + )] + { + writeln!(out, "{indent}// Default: {repr}").unwrap(); + } + } + let opt_marker = if self.optional { "?" } else { "" }; + #[expect( + clippy::unwrap_used, + reason = "write!/writeln! into String cannot fail" + )] + { + write!( + out, + "{indent}{}{opt_marker}: {}", + self.name, + self.typ.to_typescript(indent, registry) + ) + .unwrap(); + } + out + } +} + +struct ParameterTypeObject { + base: BaseType, + properties: Vec, + additional_properties: AdditionalProperties, +} + +enum AdditionalProperties { + None, + True, + False, + Schema(Box), +} + +impl ParameterTypeObject { + fn parse(schema: &Value, registry: &mut SchemaRegistry) -> Self { + let base = BaseType::from_extra_props(schema, &[]); + if let Some(defs) = schema.get("$defs") { + registry.register_definitions(defs); + } + + let additional_properties = match schema.get("additionalProperties") { + None => AdditionalProperties::None, + Some(Value::Bool(true)) => AdditionalProperties::True, + Some(Value::Bool(false)) => AdditionalProperties::False, + Some(other) => { + AdditionalProperties::Schema(Box::new(parse_parameter_type(other, registry))) + } + }; + + let props_map = schema.get("properties").and_then(Value::as_object); + let required: Vec<&str> = schema + .get("required") + .and_then(Value::as_array) + .map(|arr| arr.iter().filter_map(Value::as_str).collect()) + .unwrap_or_default(); + + let properties: Vec = props_map + .map(|props| { + props + .iter() + .map(|(name, prop_schema)| { + let optional = !required.contains(&name.as_str()); + let default = prop_schema.get("default").cloned(); + let typ = parse_parameter_type(prop_schema, registry); + Parameter { + name: name.clone(), + typ, + optional, + default, + } + }) + .collect() + }) + .unwrap_or_default(); + + Self { + base, + properties, + additional_properties, + } + } + + fn to_typescript(&self, indent: &str, registry: &SchemaRegistry) -> String { + let mut required: Vec<&Parameter> = + self.properties.iter().filter(|p| !p.optional).collect(); + let mut optional: Vec<&Parameter> = self.properties.iter().filter(|p| p.optional).collect(); + required.sort_by(|a, b| a.name.cmp(&b.name)); + optional.sort_by(|a, b| a.name.cmp(&b.name)); + let inner_indent = format!("{indent}{TS_INDENT}"); + let mut parts: Vec = required + .into_iter() + .chain(optional) + .map(|p| p.to_typescript(&inner_indent, registry)) + .collect(); + + match &self.additional_properties { + AdditionalProperties::None => {} + AdditionalProperties::True => parts.push(format!("{inner_indent}[k: string]: any")), + AdditionalProperties::False => parts.push(format!("{inner_indent}[k: string]: never")), + AdditionalProperties::Schema(inner) => { + let ty = inner.to_typescript(&inner_indent, registry); + parts.push(format!("{inner_indent}[k: string]: {ty}")); + } + } + + if parts.is_empty() { + return "{}".to_string(); + } + let body = parts.join(TS_FIELD_DELIMITER); + let body = format!("\n{body}\n"); + format!("{{{body}{indent}}}") + } +} + +struct ParameterTypeArray { + base: BaseType, + item: Box, +} + +impl ParameterTypeArray { + fn parse(schema: &Value, registry: &mut SchemaRegistry) -> Self { + let base = BaseType::from_extra_props(schema, &["minItems", "maxItems"]); + let item = match schema.get("items") { + Some(items) if !items.is_null() => parse_parameter_type(items, registry), + _ => ParameterType::Scalar(ParameterTypeScalar::any()), + }; + Self { + base, + item: Box::new(item), + } + } + + fn to_typescript(&self, indent: &str, registry: &SchemaRegistry) -> String { + let inner_indent = format!("{indent}{TS_INDENT}"); + let item_doc = self.item.format_docstring(&inner_indent); + let item_ts = self.item.to_typescript(&inner_indent, registry); + if item_doc.is_empty() { + format!("Array<{item_ts}>") + } else { + format!("Array<\n{item_doc}{inner_indent}{item_ts}\n{indent}>") + } + } +} + +struct ParameterTypeEnum { + base: BaseType, + values: Vec, +} + +impl ParameterTypeEnum { + fn parse(schema: &Value) -> Self { + let values = schema + .get("enum") + .and_then(Value::as_array) + .cloned() + .unwrap_or_default(); + Self { + base: BaseType::from_extra_props(schema, &[]), + values, + } + } + + fn to_typescript(&self) -> String { + self.values + .iter() + .map(|v| match v { + Value::String(s) => format!("\"{}\"", s.replace('\\', "\\\\").replace('"', "\\\"")), + Value::Null => "None".to_string(), + Value::Bool(true) => "True".to_string(), + Value::Bool(false) => "False".to_string(), + other => other.to_string(), + }) + .collect::>() + .join(" | ") + } +} + +struct ParameterTypeAnyOf { + base: BaseType, + branches: Vec, +} + +impl ParameterTypeAnyOf { + fn parse(schema: &Value, registry: &mut SchemaRegistry) -> Self { + let branches = schema + .get("anyOf") + .and_then(Value::as_array) + .map(|arr| { + arr.iter() + .map(|s| parse_parameter_type(s, registry)) + .collect() + }) + .unwrap_or_default(); + Self { + base: BaseType::from_extra_props(schema, &[]), + branches, + } + } + + fn to_typescript(&self, indent: &str, registry: &SchemaRegistry) -> String { + self.branches + .iter() + .map(|b| b.to_typescript(indent, registry)) + .collect::>() + .join(" | ") + } +} + +struct ParameterTypeUnion { + base: BaseType, + types: Vec, +} + +struct ParameterTypeRef { + base: BaseType, + ref_name: String, +} + +impl ParameterTypeUnion { + fn parse(schema: &Value) -> Self { + let raw_types = schema + .get("type") + .and_then(Value::as_array) + .map(|arr| arr.iter().filter_map(Value::as_str).collect::>()) + .unwrap_or_default(); + let types = raw_types + .into_iter() + .map(|t| match t { + "string" => "string".to_string(), + "number" => "number".to_string(), + "integer" => "number".to_string(), + "boolean" => "boolean".to_string(), + "null" => "null".to_string(), + "object" => "{}".to_string(), + "array" => "Array".to_string(), + other => other.to_string(), + }) + .collect(); + Self { + base: BaseType::from_extra_props(schema, &[]), + types, + } + } + + fn to_typescript(&self) -> String { + self.types.join(" | ") + } +} + +impl ParameterTypeRef { + fn parse(schema: &Value, registry: &mut SchemaRegistry) -> Self { + let reference = schema.get("$ref").and_then(Value::as_str).unwrap_or(""); + let resolved = registry.resolve_ref(reference); + let ref_name = match resolved { + Some(ref v) if v.get("$self_ref").and_then(Value::as_bool) == Some(true) => { + "parameters".to_string() + } + Some(_) => reference.rsplit('/').next().unwrap_or("").to_string(), + None => "any".to_string(), + }; + Self { + base: BaseType::from_extra_props(schema, &[]), + ref_name, + } + } + + fn to_typescript(&self) -> String { + self.ref_name.clone() + } +} + +fn parse_parameter_type(schema: &Value, registry: &mut SchemaRegistry) -> ParameterType { + if registry.depth >= MAX_RECURSION_DEPTH { + return ParameterType::Scalar(ParameterTypeScalar::any()); + } + registry.depth += 1; + let result = parse_parameter_type_inner(schema, registry); + registry.depth -= 1; + result +} + +fn parse_parameter_type_inner(schema: &Value, registry: &mut SchemaRegistry) -> ParameterType { + if schema.is_boolean() { + return ParameterType::Scalar(ParameterTypeScalar { + base: BaseType::default(), + typ: if schema.as_bool() == Some(true) { + "any" + } else { + "null" + } + .to_string(), + }); + } + let obj = match schema.as_object() { + Some(o) => o, + None => return ParameterType::Scalar(ParameterTypeScalar::any()), + }; + + if obj.contains_key("$ref") { + return ParameterType::Ref(ParameterTypeRef::parse(schema, registry)); + } + + if obj.contains_key("anyOf") { + return ParameterType::AnyOf(ParameterTypeAnyOf::parse(schema, registry)); + } + if obj.contains_key("enum") { + return ParameterType::Enum(ParameterTypeEnum::parse(schema)); + } + + if let Some(typ_value) = obj.get("type") { + if typ_value.is_array() { + return ParameterType::Union(ParameterTypeUnion::parse(schema)); + } + if let Some(typ) = typ_value.as_str() { + return match typ { + "object" => ParameterType::Object(ParameterTypeObject::parse(schema, registry)), + "array" => ParameterType::Array(ParameterTypeArray::parse(schema, registry)), + other => ParameterType::Scalar(ParameterTypeScalar::parse(other, schema)), + }; + } + } + if obj.is_empty() { + return ParameterType::Scalar(ParameterTypeScalar::any()); + } + // Fallthrough: schemas with no type/anyOf/enum/$ref. Degrade to `any` + // permissively rather than erroring — matches the Python reference. + ParameterType::Scalar(ParameterTypeScalar::any()) +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +fn format_description(description: &str, indent: &str) -> String { + description + .split('\n') + .map(|line| { + if line.is_empty() { + String::new() + } else { + format!("{indent}// {line}") + } + }) + .collect::>() + .join("\n") +} + +fn json_inline(v: &Value) -> String { + match v { + Value::String(s) => s.clone(), + Value::Bool(b) => b.to_string(), + Value::Number(n) => n.to_string(), + Value::Null => "null".to_string(), + other => serde_json::to_string(other).unwrap_or_default(), + } +} diff --git a/crates/tokenizer/src/encoders/mod.rs b/crates/tokenizer/src/encoders/mod.rs index a1be5a254..a20d380e0 100644 --- a/crates/tokenizer/src/encoders/mod.rs +++ b/crates/tokenizer/src/encoders/mod.rs @@ -1,2 +1,3 @@ pub mod deepseek_v32; pub mod deepseek_v4; +pub mod kimi_k25_tools; diff --git a/crates/tokenizer/src/tiktoken.rs b/crates/tokenizer/src/tiktoken.rs index eb5178832..933616b5e 100644 --- a/crates/tokenizer/src/tiktoken.rs +++ b/crates/tokenizer/src/tiktoken.rs @@ -13,10 +13,17 @@ use crate::{ load_chat_template_from_file, ChatTemplateContentFormat, ChatTemplateParams, ChatTemplateState, ThinkingKeyName, ThinkingToggle, }, + encoders::kimi_k25_tools::apply_kimi_k25_tools, factory::discover_chat_template_in_dir, traits::{Decoder, Encoder, Encoding, SpecialTokens, TokenIdType, Tokenizer as TokenizerTrait}, }; +#[derive(Debug, Clone, Copy)] +enum Renderer { + Jinja, + KimiK25Tools, +} + /// Regex pattern for cl100k_base tokenization. /// /// This pattern is correct for OpenAI models and most open-source tiktoken models (e.g. @@ -131,6 +138,7 @@ pub struct TiktokenTokenizer { vocab_size: usize, chat_template: ChatTemplateState, eos_token_ids: Vec, + renderer: Renderer, } /// Supported Tiktoken models @@ -180,6 +188,7 @@ impl TiktokenTokenizer { vocab_size, chat_template: ChatTemplateState::empty(), eos_token_ids: Vec::new(), // No directory path in from_model + renderer: Renderer::Jinja, }) } @@ -264,6 +273,9 @@ impl TiktokenTokenizer { // Load merged EOS token IDs from config.json + generation_config.json let eos_token_ids = crate::eos::load_eos_token_ids(dir); + // Detect which chat-template renderer to use based on config.json::architectures + let renderer = detect_renderer_from_config(dir); + Ok(TiktokenTokenizer { tokenizer, special_tokens: config.special_tokens, @@ -272,6 +284,7 @@ impl TiktokenTokenizer { vocab_size, chat_template: ChatTemplateState::new(chat_template)?, eos_token_ids, + renderer, }) } @@ -518,14 +531,18 @@ impl TokenizerTrait for TiktokenTokenizer { params: ChatTemplateParams, ) -> Result { // Inject special tokens if the caller didn't provide them - if params.special_tokens.is_some() { - return self.chat_template.apply(messages, params); - } - let params = ChatTemplateParams { - special_tokens: Some(&self.special_tokens), - ..params + let params = if params.special_tokens.is_some() { + params + } else { + ChatTemplateParams { + special_tokens: Some(&self.special_tokens), + ..params + } }; - self.chat_template.apply(messages, params) + match self.renderer { + Renderer::Jinja => self.chat_template.apply(messages, params), + Renderer::KimiK25Tools => apply_kimi_k25_tools(&self.chat_template, messages, params), + } } fn chat_template_content_format(&self) -> ChatTemplateContentFormat { @@ -552,6 +569,45 @@ impl TokenizerTrait for TiktokenTokenizer { } } +// --------------------------------------------------------------------------- +// Renderer detection (config.json::architectures) +// --------------------------------------------------------------------------- +/// Inspect the sibling `config.json` to decide which chat-template renderer to +/// use. Missing / unreadable / malformed config falls back to `Renderer::Jinja` +/// silently with a debug log, mirroring `huggingface.rs::detect_renderer_from_config`. +fn detect_renderer_from_config(dir: &Path) -> Renderer { + let path = dir.join("config.json"); + if !path.exists() { + return Renderer::Jinja; + } + let content = match std::fs::read_to_string(&path) { + Ok(c) => c, + Err(err) => { + tracing::debug!(?err, ?path, "config.json unreadable; using Jinja renderer"); + return Renderer::Jinja; + } + }; + let value: serde_json::Value = match serde_json::from_str(&content) { + Ok(v) => v, + Err(err) => { + tracing::debug!(?err, ?path, "config.json malformed; using Jinja renderer"); + return Renderer::Jinja; + } + }; + let is_kimi = value + .get("architectures") + .and_then(|v| v.as_array()) + .is_some_and(|a| { + a.iter() + .any(|v| v.as_str() == Some("KimiK25ForConditionalGeneration")) + }); + if is_kimi { + tracing::debug!(?path, "selected KimiK25Tools chat-template renderer"); + return Renderer::KimiK25Tools; + } + Renderer::Jinja +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/tokenizer/tests/fixtures/kimi_k25/chat_template.jinja b/crates/tokenizer/tests/fixtures/kimi_k25/chat_template.jinja new file mode 100644 index 000000000..c8812e3ac --- /dev/null +++ b/crates/tokenizer/tests/fixtures/kimi_k25/chat_template.jinja @@ -0,0 +1,108 @@ +{%- macro render_content(msg) -%} + {%- set c = msg.get('content') -%} + {%- if c is string -%} + {{ c }} + {%- elif c is not none -%} + {% for content in c -%} + {% if content['type'] == 'image' or content['type'] == 'image_url' -%} + <|media_begin|>image<|media_content|><|media_pad|><|media_end|> + {% elif content['type'] == 'video' or content['type']== 'video_url'-%} + <|kimi_k25_video_placeholder|> + {% else -%} + {{ content['text'] }} + {%- endif -%} + {%- endfor -%} + {%- endif -%} +{%- endmacro -%} + +{% macro set_roles(message) -%} + {%- set role_name = message.get('name') or message['role'] -%} + {%- if message['role'] == 'user' -%} + <|im_user|>{{role_name}}<|im_middle|> + {%- elif message['role'] == 'assistant' -%} + <|im_assistant|>{{role_name}}<|im_middle|> + {%- else -%} + <|im_system|>{{role_name}}<|im_middle|> + {%- endif -%} +{%- endmacro -%} + + +{%- macro render_toolcalls(message) -%} + <|tool_calls_section_begin|> + {%- for tool_call in message['tool_calls'] -%} + {%- set formatted_id = tool_call['id'] -%} + <|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{% if tool_call['function']['arguments'] is string %}{{ tool_call['function']['arguments'] }}{% else %}{{ tool_call['function']['arguments'] | tojson }}{% endif %}<|tool_call_end|> + {%- endfor -%} + <|tool_calls_section_end|> +{%- endmacro -%} + + +{# Find last non-tool-call assistant message #} +{%- set ns = namespace(last_non_tool_call_assistant_msg=-1) -%} +{%- for idx in range(messages|length-1, -1, -1) -%} + {%- if messages[idx]['role'] == 'assistant' and not messages[idx].get('tool_calls') -%} + {%- set ns.last_non_tool_call_assistant_msg = idx -%} + {%- break -%} + {%- endif -%} +{%- endfor -%} + +{# split all messages into history & suffix, reasoning_content in suffix should be reserved.#} +{%- set hist_msgs = messages[:ns.last_non_tool_call_assistant_msg+1] -%} +{%- set suffix_msgs = messages[ns.last_non_tool_call_assistant_msg+1:] -%} + +{%- if tools -%} + {%- if tools_ts_str -%} + <|im_system|>tool_declare<|im_middle|>{{ tools_ts_str }}<|im_end|> + {%- else -%} + <|im_system|>tool_declare<|im_middle|>{{ tools | tojson(separators=(',', ':')) }}<|im_end|> + {%- endif -%} +{%- endif -%} + +{%- for message in hist_msgs -%} + {{set_roles(message)}} + {%- if message['role'] == 'assistant' -%} + {{render_content(message)}} + {%- if message.get('tool_calls') -%} + {{render_toolcalls(message)}} + {%- endif -%} + {%- elif message['role'] == 'tool' -%} + {%- set tool_call_id = message.tool_call_id -%} + ## Return of {{ tool_call_id }} +{{render_content(message)}} + {%- elif message['content'] is not none -%} + {{render_content(message)}} + {%- endif -%} + <|im_end|> +{%- endfor -%} + +{%- for message in suffix_msgs -%} + {{set_roles(message)}} + {%- if message['role'] == 'assistant' -%} + {%- if thinking is defined and thinking is false -%} + {{render_content(message)}} + {%- else -%} + {%- set rc = message.get('reasoning_content', '') -%} + {{rc}}{{render_content(message)}} + {%- endif -%} + {%- if message.get('tool_calls') -%} + {{render_toolcalls(message)}} + {%- endif -%} + {%- elif message['role'] == 'tool' -%} + {%- set tool_call_id = message.tool_call_id -%} + ## Return of {{ tool_call_id }} +{{render_content(message)}} + {%- elif message['content'] is not none -%} + {{render_content(message)}} + {%- endif -%} + <|im_end|> +{%- endfor -%} + + +{%- if add_generation_prompt -%} + <|im_assistant|>assistant<|im_middle|> + {%- if thinking is defined and thinking is false -%} + + {%- else -%} + + {%- endif -%} +{%- endif -%} diff --git a/crates/tokenizer/tests/kimi_k25.rs b/crates/tokenizer/tests/kimi_k25.rs new file mode 100644 index 000000000..b1af20a66 --- /dev/null +++ b/crates/tokenizer/tests/kimi_k25.rs @@ -0,0 +1,131 @@ +//! Essential coverage for the Kimi-K2.5 tool renderer: +//! - encoder shapes a nested-object schema with `?` for optional fields +//! - encoder emits a TS union for `enum` schemas +//! - end-to-end: a tokenizer loaded with `KimiK25ForConditionalGeneration` +//! dispatches tools through the TS-namespace encoder (not the JSON fallback) + +#![allow(clippy::expect_used, clippy::unwrap_used)] + +use std::fs; + +use llm_tokenizer::{ + chat_template::ChatTemplateParams, encoders::kimi_k25_tools::encode_tools_to_typescript, + traits::Tokenizer as TokenizerTrait, TiktokenTokenizer, +}; +use serde_json::{json, Value}; +use tempfile::TempDir; + +const MIN_TIKTOKEN_MODEL: &str = "aGVsbG8= 0\n"; + +#[test] +fn encoder_renders_nested_object_with_required() { + let tools: Vec = serde_json::from_str( + r#"[{ + "type": "function", + "function": { + "name": "create_user", + "description": "Create a new user record.", + "parameters": { + "type": "object", + "properties": { + "user": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "integer"} + }, + "required": ["name"] + } + }, + "required": ["user"] + } + } + }]"#, + ) + .unwrap(); + + let expected = "# Tools\n\n## functions\nnamespace functions {\n// Create a new user record.\ntype create_user = (_: {\n user: {\n name: string,\n age?: number\n }\n}) => any;\n}\n"; + assert_eq!( + encode_tools_to_typescript(&tools).as_deref(), + Some(expected) + ); +} + +#[test] +fn encoder_renders_enum_as_union() { + let tools: Vec = serde_json::from_str( + r#"[{ + "type": "function", + "function": { + "name": "set_status", + "parameters": { + "type": "object", + "properties": { + "status": {"enum": ["active", "paused", "done"]} + }, + "required": ["status"] + } + } + }]"#, + ) + .unwrap(); + + let expected = "# Tools\n\n## functions\nnamespace functions {\ntype set_status = (_: {\n status: \"active\" | \"paused\" | \"done\"\n}) => any;\n}\n"; + assert_eq!( + encode_tools_to_typescript(&tools).as_deref(), + Some(expected) + ); +} + +#[test] +fn chat_template_renders_typescript_namespace() { + let dir = TempDir::new().unwrap(); + let template = fs::read_to_string( + std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests/fixtures/kimi_k25/chat_template.jinja"), + ) + .expect("vendored chat_template.jinja must exist"); + fs::write(dir.path().join("tiktoken.model"), MIN_TIKTOKEN_MODEL).unwrap(); + fs::write( + dir.path().join("config.json"), + r#"{"architectures": ["KimiK25ForConditionalGeneration"]}"#, + ) + .unwrap(); + fs::write(dir.path().join("chat_template.jinja"), template).unwrap(); + fs::write(dir.path().join("tokenizer_config.json"), "{}").unwrap(); + + let tok = TiktokenTokenizer::from_dir(dir.path()).expect("tokenizer should load"); + let messages = vec![json!({"role": "user", "content": "what's 2+2?"})]; + let tools = vec![json!({ + "type": "function", + "function": { + "name": "calc", + "description": "Compute an expression.", + "parameters": { + "type": "object", + "properties": {"expr": {"type": "string"}}, + "required": ["expr"] + } + } + })]; + + let rendered = tok + .apply_chat_template( + &messages, + ChatTemplateParams { + add_generation_prompt: true, + tools: Some(&tools), + ..Default::default() + }, + ) + .expect("render should succeed"); + + assert!( + rendered.contains("namespace functions") && rendered.contains("type calc = (_:"), + "expected TS namespace block, got:\n{rendered}" + ); + assert!( + !rendered.contains(r#"[{"function":"#), + "rendered prompt fell into JSON fallback path:\n{rendered}" + ); +}