Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions rustfmt.toml
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
style_edition = "2024"
chain_width = 80
use_field_init_shorthand = true
8 changes: 6 additions & 2 deletions rustfmt.unstable.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@
# Apply manually with:
# cargo +nightly fmt -- --config-path rustfmt.unstable.toml

unstable_features = true
style_edition = "2024"
comment_width = 100
chain_width = 80
use_field_init_shorthand = true

# Unstable features go here.
unstable_features = true

format_code_in_doc_comments = true
format_macro_matchers = true
normalize_comments = true
Expand Down
9 changes: 2 additions & 7 deletions src/chat/examples/external_engine_chat_qwen.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,10 +96,7 @@ async fn main() -> Result<()> {
println!("request_id={request_id}");
println!("prompt={}", args.prompt);

let mut stream = chat
.chat(request)
.await
.context("failed to submit chat request")?;
let mut stream = chat.chat(request).await.context("failed to submit chat request")?;
let output = tokio::time::timeout(output_timeout, async {
let mut final_reasoning = String::new();
let mut final_text = String::new();
Expand Down Expand Up @@ -170,9 +167,7 @@ async fn main() -> Result<()> {
.await
.context("timed out waiting for chat output")??;

chat.shutdown()
.await
.context("failed to shut down chat client")?;
chat.shutdown().await.context("failed to shut down chat client")?;

println!("final_reasoning={:?}", output.0);
println!("final_text={:?}", output.1);
Expand Down
19 changes: 11 additions & 8 deletions src/chat/src/backend/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ pub trait ChatBackend: Send + Sync {
/// Return the renderer used for chat-prompt construction.
fn chat_renderer(&self) -> DynChatRenderer;

/// Create a request-scoped output processor after request-level adjustments are applied.
/// Create a request-scoped output processor after request-level adjustments
/// are applied.
fn new_chat_output_processor(
&self,
request: &mut ChatRequest,
Expand All @@ -36,10 +37,12 @@ pub trait ChatBackend: Send + Sync {
/// Shared trait-object form of [`ChatBackend`].
pub type DynChatBackend = Arc<dyn ChatBackend>;

/// Convenience trait for backends that can serve both raw text generation and chat templating.
/// Convenience trait for backends that can serve both raw text generation and
/// chat templating.
///
/// This is mainly useful in tests and small examples, where one mock/backend often implements
/// both sides and callers want `ChatLlm` to wire the shared object into `TextLlm` automatically.
/// This is mainly useful in tests and small examples, where one mock/backend
/// often implements both sides and callers want `ChatLlm` to wire the shared
/// object into `TextLlm` automatically.
pub trait ChatTextBackend: ChatBackend + TextBackend {}

impl<T> ChatTextBackend for T where T: ChatBackend + TextBackend + ?Sized {}
Expand All @@ -54,11 +57,11 @@ pub struct LoadModelBackendsOptions {
pub renderer: RendererSelection,
/// How to serialize `message.content` when rendering the chat template.
pub chat_template_content_format: ChatTemplateContentFormatOption,
/// Optional server-default chat template override, provided either as an inline template or
/// as a path to a template file.
/// Optional server-default chat template override, provided either as an
/// inline template or as a path to a template file.
pub chat_template: Option<String>,
/// Optional server-default keyword arguments merged into every chat-template render before
/// request-level `chat_template_kwargs`.
/// Optional server-default keyword arguments merged into every
/// chat-template render before request-level `chat_template_kwargs`.
pub default_chat_template_kwargs: HashMap<String, Value>,
}

Expand Down
15 changes: 9 additions & 6 deletions src/chat/src/event.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ impl [AssistantContentBlock] {
.filter(|s: &String| !s.is_empty())
}

/// Return whether this assistant message contains any non-empty reasoning text blocks.
/// Return whether this assistant message contains any non-empty reasoning
/// text blocks.
pub fn has_reasoning(&self) -> bool {
self.iter().any(|block| match block {
AssistantContentBlock::Reasoning { text } => !text.is_empty(),
Expand All @@ -95,8 +96,7 @@ impl [AssistantContentBlock] {

/// Return whether this assistant message contains any tool-call blocks.
pub fn has_tool_calls(&self) -> bool {
self.iter()
.any(|block| matches!(block, AssistantContentBlock::ToolCall(_)))
self.iter().any(|block| matches!(block, AssistantContentBlock::ToolCall(_)))
}
}

Expand Down Expand Up @@ -124,7 +124,8 @@ impl AssistantMessage {
/// Streamed chat event emitted by [`crate::ChatEventStream`].
#[derive(Debug, Clone, PartialEq)]
pub enum ChatEvent {
/// The request was accepted, streaming has started, and prompt metadata is ready.
/// The request was accepted, streaming has started, and prompt metadata is
/// ready.
Start {
/// The actual prompt token IDs for this request.
prompt_token_ids: Arc<[u32]>,
Expand Down Expand Up @@ -158,14 +159,16 @@ pub enum ChatEvent {
id: String,
name: String,
},
/// One incremental tool-call arguments delta for the currently open tool call.
/// One incremental tool-call arguments delta for the currently open tool
/// call.
ToolCallArgumentsDelta { index: usize, delta: String },
/// One tool call has ended.
ToolCallEnd {
index: usize,
call: AssistantToolCall,
},
/// Terminal event carrying the final assembled assistant message and finish metadata.
/// Terminal event carrying the final assembled assistant message and finish
/// metadata.
Done {
message: AssistantMessage,
/// Number of prompt tokens actually sent to the engine after chat
Expand Down
34 changes: 17 additions & 17 deletions src/chat/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
//! Minimal chat facade above [`vllm_text`].
//!
//! This crate keeps the northbound boundary intentionally small:
//! `messages -> rendered prompt -> tokenized prompt -> engine request -> streamed structured
//! assistant events`. The request side remains text-first, while the response side can emit
//! structured reasoning and final-answer blocks. It is closer to vLLM's internal chat-rendering
//! flow than to a full OpenAI-compatible surface.
//! `messages -> rendered prompt -> tokenized prompt -> engine request ->
//! streamed structured assistant events`. The request side remains text-first,
//! while the response side can emit structured reasoning and final-answer
//! blocks. It is closer to vLLM's internal chat-rendering flow than to a full
//! OpenAI-compatible surface.

pub use backend::hf::HfChatBackend;
pub use backend::{
Expand Down Expand Up @@ -90,8 +91,9 @@ pub fn validate_parser_overrides(

/// Structured chat facade above [`TextLlm`].
///
/// This layer stays above raw text semantics: it takes care of chat-template rendering, exposes
/// structured assistant events, and adds chat-specific request semantics such as tool calls.
/// This layer stays above raw text semantics: it takes care of chat-template
/// rendering, exposes structured assistant events, and adds chat-specific
/// request semantics such as tool calls.
pub struct ChatLlm {
text: TextLlm,
backend: DynChatBackend,
Expand All @@ -102,7 +104,8 @@ pub struct ChatLlm {
}

impl ChatLlm {
/// Create a new chat facade from a text-generation facade plus a chat backend.
/// Create a new chat facade from a text-generation facade plus a chat
/// backend.
pub fn new(text: TextLlm, backend: DynChatBackend) -> Self {
Self {
text,
Expand All @@ -112,8 +115,8 @@ impl ChatLlm {
}
}

/// Convenience constructor for one shared backend object that implements both text and chat
/// responsibilities.
/// Convenience constructor for one shared backend object that implements
/// both text and chat responsibilities.
pub fn from_shared_backend(llm: Llm, backend: DynChatTextBackend) -> Self {
let text = TextLlm::new(llm, backend.clone());
Self::new(text, backend)
Expand All @@ -131,7 +134,8 @@ impl ChatLlm {
self
}

/// Expose the underlying text facade for raw text-generation routes such as `/v1/completions`.
/// Expose the underlying text facade for raw text-generation routes such as
/// `/v1/completions`.
pub fn text(&self) -> &TextLlm {
&self.text
}
Expand All @@ -141,7 +145,8 @@ impl ChatLlm {
self.text.model_id()
}

/// Expose the underlying engine-core client for low-level utility/admin calls.
/// Expose the underlying engine-core client for low-level utility/admin
/// calls.
pub fn engine_core_client(&self) -> &EngineCoreClient {
self.text.engine_core_client()
}
Expand Down Expand Up @@ -171,12 +176,7 @@ impl ChatLlm {
add_special_tokens: request.add_special_tokens,
data_parallel_rank: request.data_parallel_rank,
};
let decoded_stream = self
.text
.generate(text_request)
.await?
.map_err(Error::from)
.boxed();
let decoded_stream = self.text.generate(text_request).await?.map_err(Error::from).boxed();

let structured_stream = output_processor.process(decoded_stream)?;

Expand Down
50 changes: 26 additions & 24 deletions src/chat/src/output/default/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,21 +28,25 @@ trait_set! {
trait ContentEventStream = Stream<Item = Result<ContentEvent>> + Send + 'static;
}

/// Default request-scoped output processor used by Hugging Face style chat backends.
/// Default request-scoped output processor used by Hugging Face style chat
/// backends.
///
/// This implementation assumes the backend already emitted decoded text deltas, then optionally
/// layers reasoning parsing and tool-call parsing before assembling final structured chat events.
/// This implementation assumes the backend already emitted decoded text deltas,
/// then optionally layers reasoning parsing and tool-call parsing before
/// assembling final structured chat events.
pub struct DefaultChatOutputProcessor {
intermediate: bool,
reasoning_parser: Option<Box<dyn ReasoningParser>>,
tool_parser: Option<Box<dyn ToolParser>>,
}

impl DefaultChatOutputProcessor {
/// Build the default output processor and apply any parser-specific request adjustments.
/// Build the default output processor and apply any parser-specific request
/// adjustments.
///
/// Parser resolution happens here so that request validation, prompt rendering, and streaming
/// all observe the same parser-adjusted request state.
/// Parser resolution happens here so that request validation, prompt
/// rendering, and streaming all observe the same parser-adjusted
/// request state.
pub fn new(
request: &mut ChatRequest,
model_id: &str,
Expand Down Expand Up @@ -77,8 +81,9 @@ impl DefaultChatOutputProcessor {

/// Build the plain-text-only default output processor.
///
/// This keeps the default structured chat-event assembly but disables both reasoning parsing
/// and tool-call parsing completely, so that all content is treated as opaque text.
/// This keeps the default structured chat-event assembly but disables both
/// reasoning parsing and tool-call parsing completely, so that all
/// content is treated as opaque text.
pub fn plain_text_only() -> Self {
Self {
intermediate: false,
Expand Down Expand Up @@ -106,13 +111,11 @@ impl DefaultChatOutputProcessor {

let parser = factory.create(parser_name, &request.tools)?;

parser
.adjust_request(request)
.map_err(|error| Error::ParserInitialization {
kind: "tool",
name: parser_name.to_string(),
error: error.into(),
})?;
parser.adjust_request(request).map_err(|error| Error::ParserInitialization {
kind: "tool",
name: parser_name.to_string(),
error: error.into(),
})?;

TOOL_PARSER_LOG_ONCE.call_once(|| info!(parser_name, "using tool parser"));
Ok(parser)
Expand All @@ -138,13 +141,11 @@ impl DefaultChatOutputProcessor {

let parser = factory.create(parser_name, tokenizer)?;

parser
.adjust_request(request)
.map_err(|error| Error::ParserInitialization {
kind: "reasoning",
name: parser_name.to_string(),
error: error.into(),
})?;
parser.adjust_request(request).map_err(|error| Error::ParserInitialization {
kind: "reasoning",
name: parser_name.to_string(),
error: error.into(),
})?;

REASONING_PARSER_LOG_ONCE.call_once(|| info!(parser_name, "using reasoning parser"));
Ok(Some(parser))
Expand All @@ -155,8 +156,9 @@ static TOOL_PARSER_LOG_ONCE: Once = Once::new();
static REASONING_PARSER_LOG_ONCE: Once = Once::new();

impl ChatOutputProcessor for DefaultChatOutputProcessor {
/// Transforms a raw generate-output token stream into structured chat events
/// through three sequential stages once text decoding has already happened:
/// Transforms a raw generate-output token stream into structured chat
/// events through three sequential stages once text decoding has
/// already happened:
///
/// 1. [`reasoning_event_stream`] — reasoning/content separation
/// 2. [`tool_event_stream`] — tool-call parsing
Expand Down
3 changes: 2 additions & 1 deletion src/chat/src/output/default/reasoning.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ impl ReasoningState {
}
}

/// Convert one decoded text delta into zero or more semantic assistant deltas.
/// Convert one decoded text delta into zero or more semantic assistant
/// deltas.
fn process_delta(&mut self, delta: String) -> Vec<ContentEvent> {
// If the parser has already failed, skip parsing and return plain text deltas.
if self.parser_failed {
Expand Down
7 changes: 4 additions & 3 deletions src/chat/src/output/default/tool.rs
Original file line number Diff line number Diff line change
Expand Up @@ -184,9 +184,10 @@ fn push_text_delta(events: &mut Vec<AssistantEvent>, kind: AssistantBlockKind, d

/// Tool parsing when `intermediate=false` (`FinalOnly` mode).
///
/// We keep this separate because some adaptor-backed parsers may not correctly handle the full text
/// passed to incremental `push` interface, but override `parse_complete()` with a dedicated
/// one-shot implementation to ensure correctness.
/// We keep this separate because some adaptor-backed parsers may not correctly
/// handle the full text passed to incremental `push` interface, but override
/// `parse_complete()` with a dedicated one-shot implementation to ensure
/// correctness.
#[try_stream]
async fn final_only_tool_event_stream(
stream: impl ContentEventStream,
Expand Down
Loading
Loading