Merged
9 changes: 5 additions & 4 deletions src/server/src/routes/openai/chat_completions.rs
@@ -26,8 +26,9 @@ use vllm_engine_core_client::protocol::StopReason;
 use crate::error::{ApiError, bail_server_error, server_error};
 use crate::routes::openai::chat_completions::convert::prepare_chat_request;
 use crate::routes::openai::chat_completions::types::{
-    ChatCompletionChoice, ChatCompletionMessage, ChatCompletionRequest, ChatCompletionResponse,
-    ChatCompletionStreamChoice, ChatCompletionStreamResponse, ChatMessageDelta,
+    AssistantRole, ChatCompletionChoice, ChatCompletionMessage, ChatCompletionRequest,
+    ChatCompletionResponse, ChatCompletionStreamChoice, ChatCompletionStreamResponse,
+    ChatMessageDelta,
 };
 use crate::routes::openai::utils::logprobs::{
     decoded_logprobs_to_openai_chat, decoded_prompt_logprobs_to_maps,
@@ -202,7 +203,7 @@ async fn collect_chat_completion(
         choices: vec![ChatCompletionChoice {
             index: 0,
             message: ChatCompletionMessage {
-                role: "assistant".to_string(),
+                role: AssistantRole,
                 content: match &echo {
                     Some(prefix) => Some(format!("{prefix}{}", message.text())),
                     None => Some(message.text()).filter(|t| !t.is_empty()),
@@ -612,7 +613,7 @@ fn start_chunk(
     let mut chunk = ChatCompletionStreamResponse::new(request_id, response_model, created);
     chunk.choices.push(ChatCompletionStreamChoice {
         delta: ChatMessageDelta {
-            role: Some("assistant".to_string()),
+            role: Some(AssistantRole),
             ..Default::default()
         },
         ..Default::default()
73 changes: 56 additions & 17 deletions src/server/src/routes/openai/chat_completions/convert.rs
@@ -200,16 +200,14 @@ fn convert_message(message: ChatMessage) -> Result<VllmChatMessage, ApiError> {
         ChatMessage::Assistant {
             content,
             tool_calls,
-            reasoning_content,
+            reasoning,
             name: _,
         } => {
             let mut blocks = Vec::new();
-            if let Some(reasoning_content) = reasoning_content
-                && !reasoning_content.is_empty()
+            if let Some(reasoning) = reasoning
+                && !reasoning.is_empty()
             {
-                blocks.push(AssistantContentBlock::Reasoning {
-                    text: reasoning_content,
-                });
+                blocks.push(AssistantContentBlock::Reasoning { text: reasoning });
             }
             if let Some(content) = content {
                 blocks.extend(convert_assistant_text_blocks(content)?);
@@ -345,7 +343,9 @@ mod tests {
     use vllm_text::output::TextDecodeOptions;
 
     use super::prepare_chat_request;
-    use crate::routes::openai::chat_completions::types::ChatCompletionRequest;
+    use crate::routes::openai::chat_completions::types::{
+        AssistantRole, ChatCompletionMessage, ChatCompletionRequest,
+    };
     use crate::routes::openai::utils::types::{
         ChatMessage, ContentPart, Function, FunctionCallResponse, ImageUrl, MessageContent, Tool,
         ToolCall, ToolChoice, ToolChoiceValue,
@@ -377,7 +377,7 @@
             }])),
             name: None,
             tool_calls: None,
-            reasoning_content: None,
+            reasoning: None,
         }];
         request.add_generation_prompt = Some(false);
         request.continue_final_message = true;
@@ -583,13 +583,18 @@
 
     #[test]
     fn prepare_chat_request_accepts_assistant_reasoning_history() {
+        let message = ChatCompletionMessage {
+            role: AssistantRole,
+            content: Some("answer".to_string()),
+            tool_calls: None,
+            reasoning: Some("inner".to_string()),
+        };
+        let message_json = serde_json::to_value(message).expect("message serializes");
+
         let request = ChatCompletionRequest {
-            messages: vec![ChatMessage::Assistant {
-                content: Some(MessageContent::Text("answer".to_string())),
-                name: None,
-                tool_calls: None,
-                reasoning_content: Some("inner".to_string()),
-            }],
+            messages: vec![
+                serde_json::from_value(message_json).expect("response message is valid history"),
+            ],
             add_generation_prompt: Some(false),
             ..base_request()
         };
@@ -615,6 +620,40 @@
         assert_eq!(prepared.chat_request.tool_choice, ChatToolChoice::Auto);
     }
 
+    #[test]
+    fn prepare_chat_request_accepts_legacy_reasoning_content_alias() {
+        let request = ChatCompletionRequest {
+            messages: vec![
+                serde_json::from_value(json!({
+                    "role": "assistant",
+                    "content": "answer",
+                    "reasoning_content": "inner",
+                }))
+                .expect("legacy reasoning_content alias is accepted"),
+            ],
+            add_generation_prompt: Some(false),
+            ..base_request()
+        };
+
+        let prepared = prepare_chat_request(
+            request,
+            "Qwen/Qwen1.5-0.5B-Chat",
+            ResolvedRequestContext::default(),
+        )
+        .expect("request is valid");
+        assert_eq!(
+            prepared.chat_request.messages,
+            vec![VllmChatMessage::assistant_blocks(vec![
+                AssistantContentBlock::Reasoning {
+                    text: "inner".to_string(),
+                },
+                AssistantContentBlock::Text {
+                    text: "answer".to_string(),
+                },
+            ])]
+        );
+    }
+
     #[test]
     fn prepare_chat_request_accepts_tools_and_tool_history() {
         let request = ChatCompletionRequest {
@@ -630,7 +669,7 @@
                             arguments: Some(r#"{"city":"Paris"}"#.to_string()),
                         },
                     }]),
-                    reasoning_content: None,
+                    reasoning: None,
                 },
                 ChatMessage::Tool {
                     content: MessageContent::Text("Sunny".to_string()),
@@ -799,7 +838,7 @@
             content: Some(MessageContent::Text("hello".to_string())),
             name: None,
             tool_calls: None,
-            reasoning_content: None,
+            reasoning: None,
         }];
         request.continue_final_message = true;
 
@@ -839,7 +878,7 @@
                 content: Some(MessageContent::Text("hello".to_string())),
                 name: None,
                 tool_calls: None,
-                reasoning_content: None,
+                reasoning: None,
             }],
             ..base_request()
         };
17 changes: 15 additions & 2 deletions src/server/src/routes/openai/chat_completions/types.rs
@@ -1,7 +1,9 @@
 use std::collections::HashMap;
+use std::fmt;
 
 use serde::{Deserialize, Serialize};
 use serde_json::Value;
+use serde_with::SerializeDisplay;
 use validator::Validate;
 use vllm_chat::ReasoningEffort;
 
@@ -347,11 +349,22 @@ pub(super) struct ChatCompletionChoice {
     pub token_ids: Option<Vec<u32>>,
 }
 
+/// A literal type for the "assistant" role, since the API only allows that specific value in
+/// responses.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, SerializeDisplay)]
+pub(super) struct AssistantRole;
+
+impl fmt::Display for AssistantRole {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str("assistant")
+    }
+}
+
 /// Mirrors the Python vLLM response `ChatMessage` class.
 #[serde_with::skip_serializing_none]
 #[derive(Debug, Clone, Serialize)]
 pub(super) struct ChatCompletionMessage {
-    pub role: String,
+    pub role: AssistantRole,
     pub content: Option<String>,
     pub tool_calls: Option<Vec<ToolCall>>,
     pub reasoning: Option<String>,
@@ -401,7 +414,7 @@ pub(super) struct ChatCompletionStreamChoice {
 #[serde_with::skip_serializing_none]
 #[derive(Debug, Clone, Default, Serialize)]
 pub(super) struct ChatMessageDelta {
-    pub role: Option<String>,
+    pub role: Option<AssistantRole>,
     pub content: Option<String>,
     pub tool_calls: Option<Vec<ToolCallDelta>>,
     pub reasoning: Option<String>,
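Note on the wire format: with serde_with's SerializeDisplay, the unit struct serializes through its Display impl, so `role` still emits the JSON string "assistant" exactly as the old `String` field did, while any other role value becomes unrepresentable at the type level. A minimal sketch of the round trip, using only the fields this diff shows on `ChatCompletionMessage`:

// Sketch: SerializeDisplay routes serde through the Display impl above,
// so the serialized form is unchanged from the old String-typed field.
let message = ChatCompletionMessage {
    role: AssistantRole,
    content: Some("hi".to_string()),
    tool_calls: None,
    reasoning: None,
};
let value = serde_json::to_value(&message).expect("message serializes");
assert_eq!(value["role"], serde_json::json!("assistant"));
// skip_serializing_none drops the None fields entirely:
assert_eq!(value, serde_json::json!({ "role": "assistant", "content": "hi" }));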
6 changes: 4 additions & 2 deletions src/server/src/routes/openai/utils/types.rs
@@ -273,8 +273,10 @@ pub enum ChatMessage {
         content: Option<MessageContent>,
         name: Option<String>,
         tool_calls: Option<Vec<ToolCall>>,
-        /// Reasoning content for O1-style models.
-        reasoning_content: Option<String>,
+        /// Reasoning content for reasoning-capable models.
+        #[serde(alias = "reasoning_content")]
+        #[serde(alias = "thinking")]
+        reasoning: Option<String>,
     },
     #[serde(rename = "tool")]
     Tool {
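The pair of aliases means request parsing accepts any of `reasoning`, `reasoning_content` (the legacy spelling exercised by the new test above), or `thinking` for this field. A minimal sketch of what the attributes buy, mirroring that test:

// Sketch: all three spellings deserialize into the single `reasoning` field.
for body in [
    r#"{"role": "assistant", "content": "a", "reasoning": "r"}"#,
    r#"{"role": "assistant", "content": "a", "reasoning_content": "r"}"#,
    r#"{"role": "assistant", "content": "a", "thinking": "r"}"#,
] {
    let message: ChatMessage =
        serde_json::from_str(body).expect("assistant message deserializes");
    match message {
        ChatMessage::Assistant { reasoning, .. } => {
            assert_eq!(reasoning.as_deref(), Some("r"));
        }
        _ => panic!("expected an assistant message"),
    }
}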