Changes from all commits (27 commits)
4da2617
fix(grpc): skip reasoning parser when constrained decoding is active
vschandramourya Apr 15, 2026
b5830db
fix(grpc): register raise_exception in chat templates and coerce tool…
ConnorLi96 Apr 22, 2026
404afea
fix(tool_parser): fix function call parsing for models with native to…
ConnorLi96 Apr 7, 2026
163dc59
fix(gateway): comprehensive func call and response quality fixes
ConnorLi96 Apr 8, 2026
0a999ea
fix(tokenizer): load merged EOS token IDs from config.json + generati…
ConnorLi96 Apr 8, 2026
efc36fa
fix(tokenizer): reduce tiktoken partial UTF-8 decode log from warn to…
ConnorLi96 Apr 10, 2026
35de058
feat(protocol): add thinking param to Chat API and support bare strin…
ConnorLi96 Apr 11, 2026
2d10dbb
fix(reasoning): run reasoning parser before JSON/tool post-processing…
ConnorLi96 Apr 18, 2026
849382a
style: fix formatting, clippy warnings, and merge artifacts from cher…
ConnorLi96 Apr 23, 2026
1ea9977
fix(streaming): enable reasoning parser for constrained outputs
ConnorLi96 Apr 23, 2026
4e123d9
fix(kimik2): rewrite tool_call IDs and fix cross-chunk fence stripping
ConnorLi96 Apr 24, 2026
0bc9f24
feat(health): make /health_generate issue a real backend probe with l…
ConnorLi96 Apr 24, 2026
65dc892
feat(logging): pass through user-supplied request_id to engine
ConnorLi96 Apr 24, 2026
ddccd3b
feat(logging): compute per-message SHA-256 hashes for session reconst…
ConnorLi96 Apr 24, 2026
9edea19
fix(streaming): replace fence_buffer with simple cross-chunk fence st…
ConnorLi96 Apr 24, 2026
7086874
refactor(streaming): extract strip_json_fence helper with unit tests
ConnorLi96 Apr 25, 2026
52dd84b
feat(grpc): pass message_hashes through gRPC proto to TRT-LLM
ConnorLi96 Apr 25, 2026
f979166
fix(health): increase health_generate probe timeout from 3s to 60s
ConnorLi96 Apr 28, 2026
65ebdbf
fix(reasoning): skip reasoning parsing for structured output requests
ConnorLi96 Apr 28, 2026
0061eea
feat(health): skip inference probe when tokens forwarded recently
ConnorLi96 Apr 28, 2026
10af42d
fix(multimodal): use 1 placeholder per image for Kimi-K2.5
ConnorLi96 Apr 28, 2026
6764b57
fix(multimodal): collapse media placeholders for TRT-LLM only
ConnorLi96 Apr 29, 2026
4398a4f
fix(logging): downgrade message_hash log from info to debug
ConnorLi96 Apr 29, 2026
6a2278f
feat(protocols): add response_format.type=regex support
vschandramourya Apr 18, 2026
ad34ef9
Fix regex structured output reasoning parsing
May 2, 2026
883c8e7
Fix lint formatting
May 2, 2026
3c42c86
Make health_generate probe timeout configurable
May 2, 2026
5 changes: 5 additions & 0 deletions bindings/python/src/lib.rs
@@ -445,6 +445,7 @@ struct Router {
reasoning_parser: Option<String>,
tool_call_parser: Option<String>,
mcp_config_path: Option<String>,
enable_message_hash: bool,
storage_hook_wasm_path: Option<String>,
backend: BackendType,
history_backend: HistoryBackendType,
@@ -731,6 +732,7 @@ impl Router {
.maybe_tool_call_parser(self.tool_call_parser.as_ref())
.maybe_mcp_config_path(self.mcp_config_path.as_ref())
.maybe_storage_hook_wasm_path(self.storage_hook_wasm_path.as_deref())
.enable_message_hash(self.enable_message_hash)
.dp_aware(self.dp_aware)
.retries(!self.disable_retries)
.circuit_breaker(!self.disable_circuit_breaker)
@@ -833,6 +835,7 @@ impl Router {
reasoning_parser = None,
tool_call_parser = None,
mcp_config_path = None,
enable_message_hash = false,
storage_hook_wasm_path = None,
backend = BackendType::Sglang,
history_backend = HistoryBackendType::Memory,
@@ -942,6 +945,7 @@ impl Router {
reasoning_parser: Option<String>,
tool_call_parser: Option<String>,
mcp_config_path: Option<String>,
enable_message_hash: bool,
storage_hook_wasm_path: Option<String>,
backend: BackendType,
history_backend: HistoryBackendType,
@@ -1062,6 +1066,7 @@ impl Router {
reasoning_parser,
tool_call_parser,
mcp_config_path,
enable_message_hash,
storage_hook_wasm_path,
backend,
history_backend,
8 changes: 8 additions & 0 deletions bindings/python/src/smg/router_args.py
@@ -119,6 +119,8 @@ class RouterArgs:
mcp_config_path: str | None = None
# Backend selection
backend: str = "sglang"
# Message hash logging for session reconstruction
enable_message_hash: bool = False
# Storage hooks (WASM)
storage_hook_wasm_path: str | None = None
# History backend configuration
@@ -472,6 +474,12 @@ def add_cli_args(
default=RouterArgs.log_json,
help="Output logs in JSON format",
)
logging_group.add_argument(
f"--{prefix}enable-message-hash",
action="store_true",
default=RouterArgs.enable_message_hash,
help="Compute per-message SHA-256 hashes for session reconstruction logging",
)

# Service discovery configuration
k8s_group.add_argument(
10 changes: 10 additions & 0 deletions crates/grpc_client/proto/trtllm_service.proto
@@ -132,6 +132,16 @@ message GenerateRequest {
// When true, stop token IDs are retained in output_token_ids instead of
// being stripped.
bool include_stop_token_in_output = 26;

// Per-message SHA-256 hashes for session reconstruction.
repeated MessageHash message_hashes = 27;
}

// Per-message hash for session reconstruction auditing.
// Hash is the first 12 hex chars of sha256(role + "\x00" + content).
message MessageHash {
string role = 1;
string hash = 2;
}

// Tokenized input from router
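For reference, a minimal Rust sketch of the documented hash (the sha2 crate and function name here are assumptions for illustration; the router's actual implementation is not part of this diff):

    use sha2::{Digest, Sha256};

    // First 12 hex chars of sha256(role + "\x00" + content), as documented
    // in the MessageHash proto comment above.
    fn message_hash(role: &str, content: &str) -> String {
        let mut hasher = Sha256::new();
        hasher.update(role.as_bytes());
        hasher.update(b"\x00");
        hasher.update(content.as_bytes());
        let digest = hasher.finalize();
        // Two hex chars per byte, so the first 6 bytes give the 12-char prefix.
        digest[..6].iter().map(|b| format!("{b:02x}")).collect()
    }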
3 changes: 3 additions & 0 deletions crates/grpc_client/src/sglang_scheduler.rs
@@ -497,6 +497,9 @@ impl SglangSchedulerClient {
.map_err(|e| format!("Failed to serialize JSON schema: {e}"))?;
constraints.push(proto::sampling_params::Constraint::JsonSchema(schema_str));
}
Some(ResponseFormat::Regex { pattern }) => {
constraints.push(proto::sampling_params::Constraint::Regex(pattern.clone()));
}
Some(ResponseFormat::Text) | None => {
// No constraint for text format
}
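As a usage illustration, a chat request exercising the new regex constraint might look like this (a hedged sketch: the model name is a placeholder, and the wire format is inferred from the ResponseFormat::Regex { pattern } variant handled above):

    use serde_json::json;

    // Hypothetical request body for response_format.type=regex.
    fn example_regex_request() -> serde_json::Value {
        json!({
            "model": "my-model",
            "messages": [{ "role": "user", "content": "Reply with a date." }],
            "response_format": { "type": "regex", "pattern": r"\d{4}-\d{2}-\d{2}" }
        })
    }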
58 changes: 57 additions & 1 deletion crates/grpc_client/src/trtllm_service.rs
@@ -265,6 +265,10 @@ impl TrtllmServiceClient {
clippy::unused_self,
reason = "method receiver kept for consistent public API across gRPC backends"
)]
#[expect(
clippy::too_many_arguments,
reason = "gRPC request builder requires all fields for the proto message"
)]
pub fn build_generate_request_from_chat(
&self,
request_id: String,
@@ -273,6 +277,8 @@
token_ids: Vec<u32>,
multimodal_input: Option<proto::MultimodalInput>,
tool_call_constraint: Option<(String, String)>, // (constraint_type, constraint_value)
eos_token_ids: &[u32],
message_hashes: Option<Vec<(String, String)>>,
) -> Result<proto::GenerateRequest, String> {
// Build sampling config
let sampling_config = Self::build_sampling_config_from_chat(body);
@@ -287,6 +293,23 @@

let max_tokens = body.max_completion_tokens.unwrap_or(2048);

// Pass merged EOS token IDs from config.json + generation_config.json.
// TRT-LLM's gRPC path does not reliably merge these internally,
// so we provide them explicitly via the standard stop_token_ids field.
let stop_token_ids: Vec<u32> = if body.ignore_eos {
vec![]
} else {
eos_token_ids.to_vec()
};

let proto_message_hashes = message_hashes
.map(|h| {
h.into_iter()
.map(|(role, hash)| proto::MessageHash { role, hash })
.collect()
})
.unwrap_or_default();

let grpc_request = proto::GenerateRequest {
request_id,
tokenized: Some(proto::TokenizedInput {
@@ -299,7 +322,7 @@
max_tokens,
streaming: body.stream,
stop,
stop_token_ids: vec![],
stop_token_ids,
ignore_eos: body.ignore_eos,
bad: vec![],
bad_token_ids: vec![],
@@ -314,6 +337,7 @@
cache_salt_id: None,
arrival_time: None,
include_stop_token_in_output: false,
message_hashes: proto_message_hashes,
};

Ok(grpc_request)
@@ -397,6 +421,7 @@ impl TrtllmServiceClient {
cache_salt_id: None,
arrival_time: None,
include_stop_token_in_output: false,
message_hashes: vec![],
};

Ok(grpc_request)
@@ -414,6 +439,7 @@
processed_text: String,
token_ids: Vec<u32>,
constraint: Option<(String, String)>,
message_hashes: Option<Vec<(String, String)>>,
) -> Result<proto::GenerateRequest, String> {
let sampling_config = Self::build_sampling_config_from_responses(body);
let output_config = proto::OutputConfig {
@@ -430,6 +456,14 @@

let max_tokens = body.max_output_tokens.unwrap_or(2048);

let proto_message_hashes = message_hashes
.map(|h| {
h.into_iter()
.map(|(role, hash)| proto::MessageHash { role, hash })
.collect()
})
.unwrap_or_default();

let grpc_request = proto::GenerateRequest {
request_id,
tokenized: Some(proto::TokenizedInput {
@@ -457,6 +491,7 @@
cache_salt_id: None,
arrival_time: None,
include_stop_token_in_output: false,
message_hashes: proto_message_hashes,
};

Ok(grpc_request)
@@ -545,6 +580,12 @@ impl TrtllmServiceClient {
guide: schema_str,
});
}
Some(ResponseFormat::Regex { pattern }) => {
return Ok(Some(proto::GuidedDecodingParams {
guide_type: proto::guided_decoding_params::GuideType::Regex as i32,
guide: pattern.clone(),
}));
}
Some(ResponseFormat::Text) | None => {}
}

@@ -641,6 +682,10 @@ impl TrtllmServiceClient {
clippy::unused_self,
reason = "method receiver kept for consistent public API"
)]
#[expect(
clippy::too_many_arguments,
reason = "gRPC request builder needs all fields from the Messages API request"
)]
pub fn build_generate_request_from_messages(
&self,
request_id: String,
@@ -649,6 +694,7 @@
token_ids: Vec<u32>,
multimodal_input: Option<proto::MultimodalInput>,
tool_call_constraint: Option<(String, String)>,
message_hashes: Option<Vec<(String, String)>>,
) -> Result<proto::GenerateRequest, String> {
let sampling_config = Self::build_sampling_config_from_messages(body);
let output_config = proto::OutputConfig {
@@ -666,6 +712,14 @@
let stop = body.stop_sequences.clone().unwrap_or_default();
let max_tokens = body.max_tokens;

let proto_message_hashes = message_hashes
.map(|h| {
h.into_iter()
.map(|(role, hash)| proto::MessageHash { role, hash })
.collect()
})
.unwrap_or_default();

let grpc_request = proto::GenerateRequest {
request_id,
tokenized: Some(proto::TokenizedInput {
@@ -693,6 +747,7 @@
cache_salt_id: None,
arrival_time: None,
include_stop_token_in_output: false,
message_hashes: proto_message_hashes,
};

Ok(grpc_request)
@@ -784,6 +839,7 @@ impl TrtllmServiceClient {
cache_salt_id: None,
arrival_time: None,
include_stop_token_in_output: body.no_stop_trim,
message_hashes: vec![],
};

Ok(grpc_request)
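The stop_token_ids change above depends on EOS IDs merged from config.json and generation_config.json. A minimal sketch of such a merge, assuming both sources have already been parsed into ID lists (the function name and signature are illustrative, not the router's actual API):

    use std::collections::BTreeSet;

    // Merge and deduplicate EOS token IDs from config.json and
    // generation_config.json; either file may define one ID or a list.
    fn merge_eos_token_ids(config_eos: &[u32], generation_eos: &[u32]) -> Vec<u32> {
        config_eos
            .iter()
            .chain(generation_eos.iter())
            .copied()
            .collect::<BTreeSet<u32>>()
            .into_iter()
            .collect()
    }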
3 changes: 3 additions & 0 deletions crates/grpc_client/src/vllm_engine.rs
@@ -435,6 +435,9 @@ impl VllmEngineClient {
.map_err(|e| format!("Failed to serialize JSON schema: {e}"))?;
constraints.push(proto::sampling_params::Constraint::JsonSchema(schema_str));
}
Some(ResponseFormat::Regex { pattern }) => {
constraints.push(proto::sampling_params::Constraint::Regex(pattern.clone()));
}
Some(ResponseFormat::Text) | None => {
// No constraint for text format
}
6 changes: 5 additions & 1 deletion crates/multimodal/src/registry/kimi_k25.rs
@@ -61,6 +61,10 @@ impl ModelProcessorSpec for KimiK25VisionSpec {
) -> RegistryResult<Vec<PromptReplacement>> {
let pad_token_id = Self::pad_token_id(metadata)?;
let placeholder_token = self.placeholder_token(metadata)?;
// Expand to N pad tokens per image. SGLang uses these directly for
// embedding lookup. TRT-LLM needs only 1 (it re-expands server-side),
// so the router collapses consecutive runs before sending to TRT-LLM
// via `collapse_media_placeholders` in multimodal.rs.
Ok(preprocessed
.num_img_tokens
.iter()
@@ -142,7 +146,7 @@ mod tests {
)
.unwrap();

// 256 pad tokens (no start/end wrapper — SGLang handles that in the chat template)
// N pad tokens per image (SGLang uses directly; TRT-LLM collapses to 1)
assert_eq!(replacements[0].tokens.len(), 256);
assert!(replacements[0].tokens.iter().all(|&t| t == 163605));
}
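A hedged sketch of the collapse step referenced in the kimi_k25.rs comment above (the real collapse_media_placeholders helper lives in multimodal.rs and is not shown in this diff; this signature is illustrative):

    // Reduce each consecutive run of the image pad token to a single
    // occurrence before sending to TRT-LLM, which re-expands placeholders
    // server-side. SGLang receives the fully expanded sequence unchanged.
    fn collapse_media_placeholders(tokens: &[u32], pad_token_id: u32) -> Vec<u32> {
        let mut out: Vec<u32> = Vec::with_capacity(tokens.len());
        for &t in tokens {
            if t == pad_token_id && out.last() == Some(&pad_token_id) {
                continue; // drop repeats inside a pad-token run
            }
            out.push(t);
        }
        out
    }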
9 changes: 1 addition & 8 deletions crates/multimodal/src/registry/mod.rs
@@ -126,14 +126,7 @@ pub(super) mod test_helpers {

fn get_special_tokens(&self) -> &SpecialTokens {
static TOKENS: Lazy<SpecialTokens> = Lazy::new(|| SpecialTokens {
bos_token: None,
eos_token: None,
unk_token: None,
sep_token: None,
pad_token: None,
cls_token: None,
mask_token: None,
additional_special_tokens: vec![],
..Default::default()
});
&TOKENS
}
35 changes: 35 additions & 0 deletions crates/protocols/src/chat.rs
@@ -11,6 +11,7 @@ use super::{
StringOrArray, Tool, ToolCall, ToolCallDelta, ToolChoice, ToolChoiceValue, ToolReference,
Usage,
},
messages::ThinkingConfig,
sampling_params::{validate_top_k_value, validate_top_p_value},
};
use crate::{
@@ -48,6 +49,7 @@ pub enum ChatMessage {
Tool {
content: MessageContent,
tool_call_id: String,
name: Option<String>,
},
#[serde(rename = "function")]
Function { content: String, name: String },
@@ -201,6 +203,10 @@ pub struct ChatCompletionRequest {
/// Effort level for reasoning models (low, medium, high)
pub reasoning_effort: Option<String>,

/// Configuration for extended thinking (Anthropic-style).
/// Maps to chat_template_kwargs for thinking-capable models.
pub thinking: Option<ThinkingConfig>,

/// An object specifying the format that the model must output
pub response_format: Option<ResponseFormat>,

@@ -317,6 +323,11 @@ pub struct ChatCompletionRequest {
/// Random seed for sampling for deterministic outputs
pub sampling_seed: Option<u64>,

/// User-supplied request ID for log correlation.
/// If set, SMG passes it through to the engine instead of generating its own UUID.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub request_id: Option<String>,

/// Additional fields not explicitly defined above (e.g. engine-specific parameters)
#[serde(flatten)]
pub other: Map<String, Value>,
@@ -403,6 +414,7 @@ fn validate_chat_cross_parameters(
req.regex.is_some(),
req.ebnf.is_some(),
matches!(req.response_format, Some(ResponseFormat::JsonSchema { .. })),
matches!(req.response_format, Some(ResponseFormat::Regex { .. })),
]
.iter()
.filter(|&&x| x)
@@ -567,6 +579,29 @@ impl Normalizable for ChatCompletionRequest {
self.function_call = None; // Clear deprecated field
}

// Migrate thinking → chat_template_kwargs
if let Some(ref thinking) = self.thinking {
let kwargs = self.chat_template_kwargs.get_or_insert_with(HashMap::new);
match thinking {
ThinkingConfig::Enabled { .. } => {
kwargs
.entry("enable_thinking".to_string())
.or_insert(Value::Bool(true));
kwargs
.entry("thinking".to_string())
.or_insert(Value::Bool(true));
Comment on lines +586 to +592

🟡 Nit: ThinkingConfig::Enabled { .. } silently drops the budget_tokens field. Users who send {"type": "enabled", "budget_tokens": 4096} will get thinking enabled but their budget constraint ignored. Consider passing budget_tokens through as a kwarg (e.g., "budget_tokens") so chat templates or downstream engines that support it can respect the value.

Suggested change
-            ThinkingConfig::Enabled { .. } => {
-                kwargs
-                    .entry("enable_thinking".to_string())
-                    .or_insert(Value::Bool(true));
-                kwargs
-                    .entry("thinking".to_string())
-                    .or_insert(Value::Bool(true));
+            ThinkingConfig::Enabled { budget_tokens } => {
+                kwargs
+                    .entry("enable_thinking".to_string())
+                    .or_insert(Value::Bool(true));
+                kwargs
+                    .entry("thinking".to_string())
+                    .or_insert(Value::Bool(true));
+                kwargs
+                    .entry("budget_tokens".to_string())
+                    .or_insert(Value::Number((*budget_tokens).into()));
}
ThinkingConfig::Disabled => {
kwargs
.entry("enable_thinking".to_string())
.or_insert(Value::Bool(false));
kwargs
.entry("thinking".to_string())
.or_insert(Value::Bool(false));
}
}
}

// Apply tool_choice defaults
if self.tool_choice.is_none() {
if let Some(tools) = &self.tools {
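To illustrate the normalization above, a client request using the new thinking parameter (a sketch: per this diff the router migrates it into chat_template_kwargs as enable_thinking/thinking = true; budget_tokens passthrough follows the review suggestion and may not be in the merged code):

    use serde_json::json;

    // Hypothetical chat request with Anthropic-style extended thinking.
    fn example_thinking_request() -> serde_json::Value {
        json!({
            "model": "my-model",
            "messages": [{ "role": "user", "content": "Why is the sky blue?" }],
            "thinking": { "type": "enabled", "budget_tokens": 4096 }
        })
    }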