diff --git a/functionary/vllm_inference.py b/functionary/vllm_inference.py
index 2ad427d..1f6a876 100644
--- a/functionary/vllm_inference.py
+++ b/functionary/vllm_inference.py
@@ -48,7 +48,7 @@ def create_error_response(
 
 
 async def check_all_errors(request, served_model) -> Optional[JSONResponse]:
-    if request.model != served_model:
+    if request.model not in served_model:
         return create_error_response(
             status_code=HTTPStatus.NOT_FOUND,
             message=f"The model `{request.model}` does not exist.",
@@ -144,7 +144,7 @@ async def process_chat_completion(
     request: ChatCompletionRequest,
     raw_request: Optional[Request],
     tokenizer: Any,
-    served_model: str,
+    served_model: List[str],
     engine_model_config: Any,
     enable_grammar_sampling: bool,
     engine: Any,
@@ -250,7 +250,7 @@ async def completion_stream_generator(
    async for response in generate_openai_format_from_stream_async(
        generator, prompt_template, tool_choice, tools_or_functions
    ):
-
+
        # Convert tool_calls to function_call if request.functions is provided
        if (
            functions
@@ -290,8 +290,7 @@ async def completion_stream_generator(
                }
            if response["finish_reason"] == "function_call":
                response["finish_reason"] = "tool_calls"
-
-
+
        # Workaround Fixes
        response["delta"]["role"] = "assistant"
        if (
@@ -302,10 +301,11 @@ async def completion_stream_generator(
            for tool_call in response["delta"]["tool_calls"]:
                if tool_call.get("type") is None:
                    tool_call["type"] = "function"
-
-
+
        chunk = StreamChoice(**response)
-        result = ChatCompletionChunk(id=request_id, choices=[chunk], model=served_model)
+        result = ChatCompletionChunk(
+            id=request_id, choices=[chunk], model=model_name
+        )
        chunk_dic = result.dict(exclude_unset=True)
        chunk_data = json.dumps(chunk_dic, ensure_ascii=False)
        yield f"data: {chunk_data}\n\n"
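
For context on the `served_model: List[str]` change above: the server can now answer to several model names (for example a base model plus adapter names), so the request check uses `not in` rather than `!=`. Below is a minimal, self-contained sketch of that membership test; `validate_requested_model` and the sample names are hypothetical and used only for illustration, not part of the patch.

    from http import HTTPStatus
    from typing import List, Optional


    def validate_requested_model(
        requested: str, served_model: List[str]
    ) -> Optional[str]:
        # Mirror the `not in` membership test from the patch: the request is
        # valid only if the requested name is one the server exposes.
        if requested not in served_model:
            return (
                f"The model `{requested}` does not exist."
                f" (HTTP {HTTPStatus.NOT_FOUND.value})"
            )
        return None


    # Hypothetical usage: one process serving a base model and an adapter name.
    served = ["base-model", "my-lora-adapter"]
    assert validate_requested_model("my-lora-adapter", served) is None
    assert validate_requested_model("unknown-model", served) is not None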