diff --git a/src/server/src/routes/openai/chat_completions.rs b/src/server/src/routes/openai/chat_completions.rs
index 106dcea..3afd395 100644
--- a/src/server/src/routes/openai/chat_completions.rs
+++ b/src/server/src/routes/openai/chat_completions.rs
@@ -26,8 +26,9 @@ use vllm_engine_core_client::protocol::StopReason;
 use crate::error::{ApiError, bail_server_error, server_error};
 use crate::routes::openai::chat_completions::convert::prepare_chat_request;
 use crate::routes::openai::chat_completions::types::{
-    ChatCompletionChoice, ChatCompletionMessage, ChatCompletionRequest, ChatCompletionResponse,
-    ChatCompletionStreamChoice, ChatCompletionStreamResponse, ChatMessageDelta,
+    AssistantRole, ChatCompletionChoice, ChatCompletionMessage, ChatCompletionRequest,
+    ChatCompletionResponse, ChatCompletionStreamChoice, ChatCompletionStreamResponse,
+    ChatMessageDelta,
 };
 use crate::routes::openai::utils::logprobs::{
     decoded_logprobs_to_openai_chat, decoded_prompt_logprobs_to_maps,
@@ -202,7 +203,7 @@ async fn collect_chat_completion(
         choices: vec![ChatCompletionChoice {
             index: 0,
             message: ChatCompletionMessage {
-                role: "assistant".to_string(),
+                role: AssistantRole,
                 content: match &echo {
                     Some(prefix) => Some(format!("{prefix}{}", message.text())),
                     None => Some(message.text()).filter(|t| !t.is_empty()),
@@ -612,7 +613,7 @@ fn start_chunk(
     let mut chunk = ChatCompletionStreamResponse::new(request_id, response_model, created);
     chunk.choices.push(ChatCompletionStreamChoice {
         delta: ChatMessageDelta {
-            role: Some("assistant".to_string()),
+            role: Some(AssistantRole),
             ..Default::default()
         },
         ..Default::default()
diff --git a/src/server/src/routes/openai/chat_completions/convert.rs b/src/server/src/routes/openai/chat_completions/convert.rs
index 4f2ee1c..1d010a9 100644
--- a/src/server/src/routes/openai/chat_completions/convert.rs
+++ b/src/server/src/routes/openai/chat_completions/convert.rs
@@ -200,16 +200,14 @@ fn convert_message(message: ChatMessage) -> Result {
         ChatMessage::Assistant {
             content,
             tool_calls,
-            reasoning_content,
+            reasoning,
             name: _,
         } => {
             let mut blocks = Vec::new();
-            if let Some(reasoning_content) = reasoning_content
-                && !reasoning_content.is_empty()
+            if let Some(reasoning) = reasoning
+                && !reasoning.is_empty()
             {
-                blocks.push(AssistantContentBlock::Reasoning {
-                    text: reasoning_content,
-                });
+                blocks.push(AssistantContentBlock::Reasoning { text: reasoning });
             }
             if let Some(content) = content {
                 blocks.extend(convert_assistant_text_blocks(content)?);
@@ -345,7 +343,9 @@ mod tests {
     use vllm_text::output::TextDecodeOptions;
 
     use super::prepare_chat_request;
-    use crate::routes::openai::chat_completions::types::ChatCompletionRequest;
+    use crate::routes::openai::chat_completions::types::{
+        AssistantRole, ChatCompletionMessage, ChatCompletionRequest,
+    };
     use crate::routes::openai::utils::types::{
         ChatMessage, ContentPart, Function, FunctionCallResponse, ImageUrl, MessageContent, Tool,
         ToolCall, ToolChoice, ToolChoiceValue,
@@ -377,7 +377,7 @@
             }])),
             name: None,
             tool_calls: None,
-            reasoning_content: None,
+            reasoning: None,
         }];
         request.add_generation_prompt = Some(false);
         request.continue_final_message = true;
@@ -583,13 +583,18 @@
 
     #[test]
     fn prepare_chat_request_accepts_assistant_reasoning_history() {
+        let message = ChatCompletionMessage {
+            role: AssistantRole,
+            content: Some("answer".to_string()),
+            tool_calls: None,
+            reasoning: Some("inner".to_string()),
+        };
+        let message_json = serde_json::to_value(message).expect("message serializes");
+
         let request = ChatCompletionRequest {
-            messages: vec![ChatMessage::Assistant {
-                content: Some(MessageContent::Text("answer".to_string())),
-                name: None,
-                tool_calls: None,
-                reasoning_content: Some("inner".to_string()),
-            }],
+            messages: vec![
+                serde_json::from_value(message_json).expect("response message is valid history"),
+            ],
             add_generation_prompt: Some(false),
             ..base_request()
         };
@@ -615,6 +620,40 @@
         assert_eq!(prepared.chat_request.tool_choice, ChatToolChoice::Auto);
     }
 
+    #[test]
+    fn prepare_chat_request_accepts_legacy_reasoning_content_alias() {
+        let request = ChatCompletionRequest {
+            messages: vec![
+                serde_json::from_value(json!({
+                    "role": "assistant",
+                    "content": "answer",
+                    "reasoning_content": "inner",
+                }))
+                .expect("legacy reasoning_content alias is accepted"),
+            ],
+            add_generation_prompt: Some(false),
+            ..base_request()
+        };
+
+        let prepared = prepare_chat_request(
+            request,
+            "Qwen/Qwen1.5-0.5B-Chat",
+            ResolvedRequestContext::default(),
+        )
+        .expect("request is valid");
+        assert_eq!(
+            prepared.chat_request.messages,
+            vec![VllmChatMessage::assistant_blocks(vec![
+                AssistantContentBlock::Reasoning {
+                    text: "inner".to_string(),
+                },
+                AssistantContentBlock::Text {
+                    text: "answer".to_string(),
+                },
+            ])]
+        );
+    }
+
     #[test]
     fn prepare_chat_request_accepts_tools_and_tool_history() {
         let request = ChatCompletionRequest {
@@ -630,7 +669,7 @@
                         arguments: Some(r#"{"city":"Paris"}"#.to_string()),
                     },
                 }]),
-                reasoning_content: None,
+                reasoning: None,
             },
             ChatMessage::Tool {
                 content: MessageContent::Text("Sunny".to_string()),
@@ -799,7 +838,7 @@
             content: Some(MessageContent::Text("hello".to_string())),
             name: None,
             tool_calls: None,
-            reasoning_content: None,
+            reasoning: None,
         }];
 
         request.continue_final_message = true;
@@ -839,7 +878,7 @@
             content: Some(MessageContent::Text("hello".to_string())),
             name: None,
             tool_calls: None,
-            reasoning_content: None,
+            reasoning: None,
         }],
         ..base_request()
     };
diff --git a/src/server/src/routes/openai/chat_completions/types.rs b/src/server/src/routes/openai/chat_completions/types.rs
index 82c14d9..a334e01 100644
--- a/src/server/src/routes/openai/chat_completions/types.rs
+++ b/src/server/src/routes/openai/chat_completions/types.rs
@@ -1,7 +1,9 @@
 use std::collections::HashMap;
+use std::fmt;
 
 use serde::{Deserialize, Serialize};
 use serde_json::Value;
+use serde_with::SerializeDisplay;
 use validator::Validate;
 use vllm_chat::ReasoningEffort;
 
@@ -347,11 +349,22 @@ pub(super) struct ChatCompletionChoice {
     pub token_ids: Option<Vec<u32>>,
 }
 
+/// A literal type for the "assistant" role, since the API only allows that specific value in
+/// responses.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, SerializeDisplay)]
+pub(super) struct AssistantRole;
+
+impl fmt::Display for AssistantRole {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str("assistant")
+    }
+}
+
 /// Mirrors the Python vLLM response `ChatMessage` class.
 #[serde_with::skip_serializing_none]
 #[derive(Debug, Clone, Serialize)]
 pub(super) struct ChatCompletionMessage {
-    pub role: String,
+    pub role: AssistantRole,
     pub content: Option<String>,
     pub tool_calls: Option<Vec<ToolCall>>,
     pub reasoning: Option<String>,
@@ -401,7 +414,7 @@ pub(super) struct ChatCompletionStreamChoice {
 #[serde_with::skip_serializing_none]
 #[derive(Debug, Clone, Default, Serialize)]
 pub(super) struct ChatMessageDelta {
-    pub role: Option<String>,
+    pub role: Option<AssistantRole>,
     pub content: Option<String>,
     pub tool_calls: Option<Vec<ToolCallDelta>>,
     pub reasoning: Option<String>,
diff --git a/src/server/src/routes/openai/utils/types.rs b/src/server/src/routes/openai/utils/types.rs
index 8187717..505713c 100644
--- a/src/server/src/routes/openai/utils/types.rs
+++ b/src/server/src/routes/openai/utils/types.rs
@@ -273,8 +273,10 @@ pub enum ChatMessage {
         content: Option<MessageContent>,
         name: Option<String>,
         tool_calls: Option<Vec<ToolCall>>,
-        /// Reasoning content for O1-style models.
-        reasoning_content: Option<String>,
+        /// Reasoning content for reasoning-capable models.
+        #[serde(alias = "reasoning_content")]
+        #[serde(alias = "thinking")]
+        reasoning: Option<String>,
     },
     #[serde(rename = "tool")]
     Tool {
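
A minimal, self-contained sketch of the two serde patterns the patch relies on:
`#[serde(alias = ...)]` keeps the request-side `reasoning` field accepting the
legacy `reasoning_content` spelling (plus `thinking`), while
`serde_with::SerializeDisplay` makes the `AssistantRole` unit struct serialize
as the literal JSON string "assistant". The `AssistantTurn` struct below is a
hypothetical stand-in for illustration, not a type from this patch:

use std::fmt;

use serde::Deserialize;
use serde_with::SerializeDisplay;

// Hypothetical stand-in for the request-side assistant message. Serde aliases
// apply to deserialization only, so re-serializing always emits the canonical
// `reasoning` key.
#[derive(Debug, Deserialize)]
struct AssistantTurn {
    #[serde(alias = "reasoning_content", alias = "thinking")]
    reasoning: Option<String>,
}

// Role literal mirroring the patch's AssistantRole: the Display impl is reused
// for serialization, so the field always emits "assistant".
#[derive(Debug, Clone, Copy, SerializeDisplay)]
struct AssistantRole;

impl fmt::Display for AssistantRole {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("assistant")
    }
}

fn main() {
    // All three request spellings land in the same `reasoning` field.
    for body in [
        r#"{"reasoning": "inner"}"#,
        r#"{"reasoning_content": "inner"}"#,
        r#"{"thinking": "inner"}"#,
    ] {
        let turn: AssistantTurn = serde_json::from_str(body).expect("valid JSON");
        assert_eq!(turn.reasoning.as_deref(), Some("inner"));
    }

    // The role literal serializes as a plain JSON string.
    assert_eq!(
        serde_json::to_string(&AssistantRole).expect("role serializes"),
        r#""assistant""#
    );
}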