Changes from all commits (27 commits)
4da2617
fix(grpc): skip reasoning parser when constrained decoding is active
vschandramourya Apr 15, 2026
b5830db
fix(grpc): register raise_exception in chat templates and coerce tool…
ConnorLi96 Apr 22, 2026
404afea
fix(tool_parser): fix function call parsing for models with native to…
ConnorLi96 Apr 7, 2026
163dc59
fix(gateway): comprehensive func call and response quality fixes
ConnorLi96 Apr 8, 2026
0a999ea
fix(tokenizer): load merged EOS token IDs from config.json + generati…
ConnorLi96 Apr 8, 2026
efc36fa
fix(tokenizer): reduce tiktoken partial UTF-8 decode log from warn to…
ConnorLi96 Apr 10, 2026
35de058
feat(protocol): add thinking param to Chat API and support bare strin…
ConnorLi96 Apr 11, 2026
2d10dbb
fix(reasoning): run reasoning parser before JSON/tool post-processing…
ConnorLi96 Apr 18, 2026
849382a
style: fix formatting, clippy warnings, and merge artifacts from cher…
ConnorLi96 Apr 23, 2026
1ea9977
fix(streaming): enable reasoning parser for constrained outputs
ConnorLi96 Apr 23, 2026
4e123d9
fix(kimik2): rewrite tool_call IDs and fix cross-chunk fence stripping
ConnorLi96 Apr 24, 2026
0bc9f24
feat(health): make /health_generate issue a real backend probe with l…
ConnorLi96 Apr 24, 2026
65dc892
feat(logging): pass through user-supplied request_id to engine
ConnorLi96 Apr 24, 2026
ddccd3b
feat(logging): compute per-message SHA-256 hashes for session reconst…
ConnorLi96 Apr 24, 2026
9edea19
fix(streaming): replace fence_buffer with simple cross-chunk fence st…
ConnorLi96 Apr 24, 2026
7086874
refactor(streaming): extract strip_json_fence helper with unit tests
ConnorLi96 Apr 25, 2026
52dd84b
feat(grpc): pass message_hashes through gRPC proto to TRT-LLM
ConnorLi96 Apr 25, 2026
f979166
fix(health): increase health_generate probe timeout from 3s to 60s
ConnorLi96 Apr 28, 2026
65ebdbf
fix(reasoning): skip reasoning parsing for structured output requests
ConnorLi96 Apr 28, 2026
0061eea
feat(health): skip inference probe when tokens forwarded recently
ConnorLi96 Apr 28, 2026
10af42d
fix(multimodal): use 1 placeholder per image for Kimi-K2.5
ConnorLi96 Apr 28, 2026
6764b57
fix(multimodal): collapse media placeholders for TRT-LLM only
ConnorLi96 Apr 29, 2026
4398a4f
fix(logging): downgrade message_hash log from info to debug
ConnorLi96 Apr 29, 2026
6a2278f
feat(protocols): add response_format.type=regex support
vschandramourya Apr 18, 2026
ad34ef9
Fix regex structured output reasoning parsing
May 2, 2026
883c8e7
Fix lint formatting
May 2, 2026
3c42c86
Make health_generate probe timeout configurable
May 2, 2026
5 changes: 5 additions & 0 deletions bindings/python/src/lib.rs
@@ -445,6 +445,7 @@ struct Router {
reasoning_parser: Option<String>,
tool_call_parser: Option<String>,
mcp_config_path: Option<String>,
enable_message_hash: bool,
storage_hook_wasm_path: Option<String>,
backend: BackendType,
history_backend: HistoryBackendType,
@@ -731,6 +732,7 @@ impl Router {
.maybe_tool_call_parser(self.tool_call_parser.as_ref())
.maybe_mcp_config_path(self.mcp_config_path.as_ref())
.maybe_storage_hook_wasm_path(self.storage_hook_wasm_path.as_deref())
.enable_message_hash(self.enable_message_hash)
.dp_aware(self.dp_aware)
.retries(!self.disable_retries)
.circuit_breaker(!self.disable_circuit_breaker)
@@ -833,6 +835,7 @@ impl Router {
reasoning_parser = None,
tool_call_parser = None,
mcp_config_path = None,
enable_message_hash = false,
storage_hook_wasm_path = None,
backend = BackendType::Sglang,
history_backend = HistoryBackendType::Memory,
@@ -942,6 +945,7 @@ impl Router {
reasoning_parser: Option<String>,
tool_call_parser: Option<String>,
mcp_config_path: Option<String>,
enable_message_hash: bool,
storage_hook_wasm_path: Option<String>,
backend: BackendType,
history_backend: HistoryBackendType,
@@ -1062,6 +1066,7 @@ impl Router {
reasoning_parser,
tool_call_parser,
mcp_config_path,
enable_message_hash,
storage_hook_wasm_path,
backend,
history_backend,
8 changes: 8 additions & 0 deletions bindings/python/src/smg/router_args.py
@@ -119,6 +119,8 @@ class RouterArgs:
mcp_config_path: str | None = None
# Backend selection
backend: str = "sglang"
# Message hash logging for session reconstruction
enable_message_hash: bool = False
# Storage hooks (WASM)
storage_hook_wasm_path: str | None = None
# History backend configuration
@@ -472,6 +474,12 @@ def add_cli_args(
default=RouterArgs.log_json,
help="Output logs in JSON format",
)
logging_group.add_argument(
f"--{prefix}enable-message-hash",
action="store_true",
default=RouterArgs.enable_message_hash,
help="Compute per-message SHA-256 hashes for session reconstruction logging",
)

# Service discovery configuration
k8s_group.add_argument(
10 changes: 10 additions & 0 deletions crates/grpc_client/proto/trtllm_service.proto
@@ -132,6 +132,16 @@ message GenerateRequest {
// When true, stop token IDs are retained in output_token_ids instead of
// being stripped.
bool include_stop_token_in_output = 26;

// Per-message SHA-256 hashes for session reconstruction.
repeated MessageHash message_hashes = 27;
}

// Per-message hash for session reconstruction auditing.
// Hash is the first 12 hex chars of sha256(role + "\x00" + content).
message MessageHash {
string role = 1;
string hash = 2;
}

// Tokenized input from router
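For reference, a minimal Rust sketch of the documented hash (the sha2 crate and function name here are assumptions for illustration; the router's actual implementation is not part of this diff):

    use sha2::{Digest, Sha256};

    // First 12 hex chars of sha256(role + "\x00" + content), as documented
    // in the MessageHash proto comment above.
    fn message_hash(role: &str, content: &str) -> String {
        let mut hasher = Sha256::new();
        hasher.update(role.as_bytes());
        hasher.update(b"\x00");
        hasher.update(content.as_bytes());
        let digest = hasher.finalize();
        // Two hex chars per byte, so the first 6 bytes give the 12-char prefix.
        digest[..6].iter().map(|b| format!("{b:02x}")).collect()
    }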
3 changes: 3 additions & 0 deletions crates/grpc_client/src/sglang_scheduler.rs
@@ -497,6 +497,9 @@ impl SglangSchedulerClient {
.map_err(|e| format!("Failed to serialize JSON schema: {e}"))?;
constraints.push(proto::sampling_params::Constraint::JsonSchema(schema_str));
}
Some(ResponseFormat::Regex { pattern }) => {
constraints.push(proto::sampling_params::Constraint::Regex(pattern.clone()));
}
Some(ResponseFormat::Text) | None => {
// No constraint for text format
}
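As a usage illustration, a chat request exercising the new regex constraint might look like this (a hedged sketch: the model name is a placeholder, and the wire format is inferred from the ResponseFormat::Regex { pattern } variant handled above):

    use serde_json::json;

    // Hypothetical request body for response_format.type=regex.
    fn example_regex_request() -> serde_json::Value {
        json!({
            "model": "my-model",
            "messages": [{ "role": "user", "content": "Reply with a date." }],
            "response_format": { "type": "regex", "pattern": r"\d{4}-\d{2}-\d{2}" }
        })
    }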
58 changes: 57 additions & 1 deletion crates/grpc_client/src/trtllm_service.rs
@@ -265,6 +265,10 @@ impl TrtllmServiceClient {
clippy::unused_self,
reason = "method receiver kept for consistent public API across gRPC backends"
)]
#[expect(
clippy::too_many_arguments,
reason = "gRPC request builder requires all fields for the proto message"
)]
pub fn build_generate_request_from_chat(
&self,
request_id: String,
@@ -273,6 +277,8 @@
token_ids: Vec<u32>,
multimodal_input: Option<proto::MultimodalInput>,
tool_call_constraint: Option<(String, String)>, // (constraint_type, constraint_value)
eos_token_ids: &[u32],
message_hashes: Option<Vec<(String, String)>>,
) -> Result<proto::GenerateRequest, String> {
// Build sampling config
let sampling_config = Self::build_sampling_config_from_chat(body);
@@ -287,6 +293,23 @@

let max_tokens = body.max_completion_tokens.unwrap_or(2048);

// Pass merged EOS token IDs from config.json + generation_config.json.
// TRT-LLM's gRPC path does not reliably merge these internally,
// so we provide them explicitly via the standard stop_token_ids field.
let stop_token_ids: Vec<u32> = if body.ignore_eos {
vec![]
} else {
eos_token_ids.to_vec()
};

let proto_message_hashes = message_hashes
.map(|h| {
h.into_iter()
.map(|(role, hash)| proto::MessageHash { role, hash })
.collect()
})
.unwrap_or_default();

let grpc_request = proto::GenerateRequest {
request_id,
tokenized: Some(proto::TokenizedInput {
@@ -299,7 +322,7 @@
max_tokens,
streaming: body.stream,
stop,
stop_token_ids: vec![],
stop_token_ids,
ignore_eos: body.ignore_eos,
bad: vec![],
bad_token_ids: vec![],
@@ -314,6 +337,7 @@
cache_salt_id: None,
arrival_time: None,
include_stop_token_in_output: false,
message_hashes: proto_message_hashes,
};

Ok(grpc_request)
@@ -397,6 +421,7 @@ impl TrtllmServiceClient {
cache_salt_id: None,
arrival_time: None,
include_stop_token_in_output: false,
message_hashes: vec![],
};

Ok(grpc_request)
@@ -414,6 +439,7 @@
processed_text: String,
token_ids: Vec<u32>,
constraint: Option<(String, String)>,
message_hashes: Option<Vec<(String, String)>>,
) -> Result<proto::GenerateRequest, String> {
let sampling_config = Self::build_sampling_config_from_responses(body);
let output_config = proto::OutputConfig {
@@ -430,6 +456,14 @@

let max_tokens = body.max_output_tokens.unwrap_or(2048);

let proto_message_hashes = message_hashes
.map(|h| {
h.into_iter()
.map(|(role, hash)| proto::MessageHash { role, hash })
.collect()
})
.unwrap_or_default();

let grpc_request = proto::GenerateRequest {
request_id,
tokenized: Some(proto::TokenizedInput {
@@ -457,6 +491,7 @@
cache_salt_id: None,
arrival_time: None,
include_stop_token_in_output: false,
message_hashes: proto_message_hashes,
};

Ok(grpc_request)
@@ -545,6 +580,12 @@ impl TrtllmServiceClient {
guide: schema_str,
});
}
Some(ResponseFormat::Regex { pattern }) => {
return Ok(Some(proto::GuidedDecodingParams {
guide_type: proto::guided_decoding_params::GuideType::Regex as i32,
guide: pattern.clone(),
}));
}
Some(ResponseFormat::Text) | None => {}
}

@@ -641,6 +682,10 @@ impl TrtllmServiceClient {
clippy::unused_self,
reason = "method receiver kept for consistent public API"
)]
#[expect(
clippy::too_many_arguments,
reason = "gRPC request builder needs all fields from the Messages API request"
)]
pub fn build_generate_request_from_messages(
&self,
request_id: String,
@@ -649,6 +694,7 @@
token_ids: Vec<u32>,
multimodal_input: Option<proto::MultimodalInput>,
tool_call_constraint: Option<(String, String)>,
message_hashes: Option<Vec<(String, String)>>,
) -> Result<proto::GenerateRequest, String> {
let sampling_config = Self::build_sampling_config_from_messages(body);
let output_config = proto::OutputConfig {
@@ -666,6 +712,14 @@
let stop = body.stop_sequences.clone().unwrap_or_default();
let max_tokens = body.max_tokens;

let proto_message_hashes = message_hashes
.map(|h| {
h.into_iter()
.map(|(role, hash)| proto::MessageHash { role, hash })
.collect()
})
.unwrap_or_default();

let grpc_request = proto::GenerateRequest {
request_id,
tokenized: Some(proto::TokenizedInput {
@@ -693,6 +747,7 @@
cache_salt_id: None,
arrival_time: None,
include_stop_token_in_output: false,
message_hashes: proto_message_hashes,
};

Ok(grpc_request)
@@ -784,6 +839,7 @@ impl TrtllmServiceClient {
cache_salt_id: None,
arrival_time: None,
include_stop_token_in_output: body.no_stop_trim,
message_hashes: vec![],
};

Ok(grpc_request)
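The stop_token_ids change above depends on EOS IDs merged from config.json and generation_config.json. A minimal sketch of such a merge, assuming both sources have already been parsed into ID lists (the function name and signature are illustrative, not the router's actual API):

    use std::collections::BTreeSet;

    // Merge and deduplicate EOS token IDs from config.json and
    // generation_config.json; either file may define one ID or a list.
    fn merge_eos_token_ids(config_eos: &[u32], generation_eos: &[u32]) -> Vec<u32> {
        config_eos
            .iter()
            .chain(generation_eos.iter())
            .copied()
            .collect::<BTreeSet<u32>>()
            .into_iter()
            .collect()
    }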
3 changes: 3 additions & 0 deletions crates/grpc_client/src/vllm_engine.rs
@@ -435,6 +435,9 @@ impl VllmEngineClient {
.map_err(|e| format!("Failed to serialize JSON schema: {e}"))?;
constraints.push(proto::sampling_params::Constraint::JsonSchema(schema_str));
}
Some(ResponseFormat::Regex { pattern }) => {
constraints.push(proto::sampling_params::Constraint::Regex(pattern.clone()));
}
Some(ResponseFormat::Text) | None => {
// No constraint for text format
}
6 changes: 5 additions & 1 deletion crates/multimodal/src/registry/kimi_k25.rs
@@ -61,6 +61,10 @@ impl ModelProcessorSpec for KimiK25VisionSpec {
) -> RegistryResult<Vec<PromptReplacement>> {
let pad_token_id = Self::pad_token_id(metadata)?;
let placeholder_token = self.placeholder_token(metadata)?;
// Expand to N pad tokens per image. SGLang uses these directly for
// embedding lookup. TRT-LLM needs only 1 (it re-expands server-side),
// so the router collapses consecutive runs before sending to TRT-LLM
// via `collapse_media_placeholders` in multimodal.rs.
Ok(preprocessed
.num_img_tokens
.iter()
@@ -142,7 +146,7 @@ mod tests {
)
.unwrap();

// 256 pad tokens (no start/end wrapper — SGLang handles that in the chat template)
// N pad tokens per image (SGLang uses directly; TRT-LLM collapses to 1)
assert_eq!(replacements[0].tokens.len(), 256);
assert!(replacements[0].tokens.iter().all(|&t| t == 163605));
}
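A hedged sketch of the collapse step referenced in the kimi_k25.rs comment above (the real collapse_media_placeholders helper lives in multimodal.rs and is not shown in this diff; this signature is illustrative):

    // Reduce each consecutive run of the image pad token to a single
    // occurrence before sending to TRT-LLM, which re-expands placeholders
    // server-side. SGLang receives the fully expanded sequence unchanged.
    fn collapse_media_placeholders(tokens: &[u32], pad_token_id: u32) -> Vec<u32> {
        let mut out: Vec<u32> = Vec::with_capacity(tokens.len());
        for &t in tokens {
            if t == pad_token_id && out.last() == Some(&pad_token_id) {
                continue; // drop repeats inside a pad-token run
            }
            out.push(t);
        }
        out
    }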
9 changes: 1 addition & 8 deletions crates/multimodal/src/registry/mod.rs
@@ -126,14 +126,7 @@ pub(super) mod test_helpers {

fn get_special_tokens(&self) -> &SpecialTokens {
static TOKENS: Lazy<SpecialTokens> = Lazy::new(|| SpecialTokens {
bos_token: None,
eos_token: None,
unk_token: None,
sep_token: None,
pad_token: None,
cls_token: None,
mask_token: None,
additional_special_tokens: vec![],
..Default::default()
});
&TOKENS
}
35 changes: 35 additions & 0 deletions crates/protocols/src/chat.rs
@@ -11,6 +11,7 @@ use super::{
StringOrArray, Tool, ToolCall, ToolCallDelta, ToolChoice, ToolChoiceValue, ToolReference,
Usage,
},
messages::ThinkingConfig,
sampling_params::{validate_top_k_value, validate_top_p_value},
};
use crate::{
@@ -48,6 +49,7 @@ pub enum ChatMessage {
Tool {
content: MessageContent,
tool_call_id: String,
name: Option<String>,
},
#[serde(rename = "function")]
Function { content: String, name: String },
@@ -201,6 +203,10 @@ pub struct ChatCompletionRequest {
/// Effort level for reasoning models (low, medium, high)
pub reasoning_effort: Option<String>,

/// Configuration for extended thinking (Anthropic-style).
/// Maps to chat_template_kwargs for thinking-capable models.
pub thinking: Option<ThinkingConfig>,

/// An object specifying the format that the model must output
pub response_format: Option<ResponseFormat>,

@@ -317,6 +323,11 @@ pub struct ChatCompletionRequest {
/// Random seed for sampling for deterministic outputs
pub sampling_seed: Option<u64>,

/// User-supplied request ID for log correlation.
/// If set, SMG passes it through to the engine instead of generating its own UUID.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub request_id: Option<String>,

/// Additional fields not explicitly defined above (e.g. engine-specific parameters)
#[serde(flatten)]
pub other: Map<String, Value>,
@@ -403,6 +414,7 @@ fn validate_chat_cross_parameters(
req.regex.is_some(),
req.ebnf.is_some(),
matches!(req.response_format, Some(ResponseFormat::JsonSchema { .. })),
matches!(req.response_format, Some(ResponseFormat::Regex { .. })),
]
.iter()
.filter(|&&x| x)
@@ -567,6 +579,29 @@ impl Normalizable for ChatCompletionRequest {
self.function_call = None; // Clear deprecated field
}

// Migrate thinking → chat_template_kwargs
if let Some(ref thinking) = self.thinking {
let kwargs = self.chat_template_kwargs.get_or_insert_with(HashMap::new);
match thinking {
ThinkingConfig::Enabled { .. } => {
kwargs
.entry("enable_thinking".to_string())
.or_insert(Value::Bool(true));
kwargs
.entry("thinking".to_string())
.or_insert(Value::Bool(true));
Comment on lines +586 to +592

🟡 Nit: ThinkingConfig::Enabled { .. } silently drops the budget_tokens field. Users who send {"type": "enabled", "budget_tokens": 4096} will get thinking enabled but their budget constraint ignored. Consider passing budget_tokens through as a kwarg (e.g., "budget_tokens") so chat templates or downstream engines that support it can respect the value.

Suggested change
-            ThinkingConfig::Enabled { .. } => {
-                kwargs
-                    .entry("enable_thinking".to_string())
-                    .or_insert(Value::Bool(true));
-                kwargs
-                    .entry("thinking".to_string())
-                    .or_insert(Value::Bool(true));
+            ThinkingConfig::Enabled { budget_tokens } => {
+                kwargs
+                    .entry("enable_thinking".to_string())
+                    .or_insert(Value::Bool(true));
+                kwargs
+                    .entry("thinking".to_string())
+                    .or_insert(Value::Bool(true));
+                kwargs
+                    .entry("budget_tokens".to_string())
+                    .or_insert(Value::Number((*budget_tokens).into()));
}
ThinkingConfig::Disabled => {
kwargs
.entry("enable_thinking".to_string())
.or_insert(Value::Bool(false));
kwargs
.entry("thinking".to_string())
.or_insert(Value::Bool(false));
}
}
}

// Apply tool_choice defaults
if self.tool_choice.is_none() {
if let Some(tools) = &self.tools {
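To illustrate the normalization above, a client request using the new thinking parameter (a sketch: per this diff the router migrates it into chat_template_kwargs as enable_thinking/thinking = true; budget_tokens passthrough follows the review suggestion and may not be in the merged code):

    use serde_json::json;

    // Hypothetical chat request with Anthropic-style extended thinking.
    fn example_thinking_request() -> serde_json::Value {
        json!({
            "model": "my-model",
            "messages": [{ "role": "user", "content": "Why is the sky blue?" }],
            "thinking": { "type": "enabled", "budget_tokens": 4096 }
        })
    }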