MeetKai · jeffreymeetkai · Aug 22, 2024 · Aug 21, 2024 · Aug 22, 2024 · Aug 22, 2024
diff --git a/functionary/vllm_inference.py b/functionary/vllm_inference.py
@@ -48,7 +48,7 @@ def create_error_response(
 
 
 async def check_all_errors(request, served_model) -> Optional[JSONResponse]:
-    if request.model != served_model:
+    if request.model not in served_model:
         return create_error_response(
             status_code=HTTPStatus.NOT_FOUND,
             message=f"The model `{request.model}` does not exist.",
@@ -144,7 +144,7 @@ async def process_chat_completion(
     request: ChatCompletionRequest,
     raw_request: Optional[Request],
     tokenizer: Any,
-    served_model: str,
+    served_model: List[str],
     engine_model_config: Any,
     enable_grammar_sampling: bool,
     engine: Any,
@@ -250,7 +250,7 @@ async def completion_stream_generator(
         async for response in generate_openai_format_from_stream_async(
             generator, prompt_template, tool_choice, tools_or_functions
         ):
-            
+
             # Convert tool_calls to function_call if request.functions is provided
             if (
                 functions
@@ -290,8 +290,7 @@ async def completion_stream_generator(
                     }
                 if response["finish_reason"] == "function_call":
                     response["finish_reason"] = "tool_calls"
-
-
+
             # Workaround Fixes
             response["delta"]["role"] = "assistant"
             if (
@@ -302,10 +301,11 @@ async def completion_stream_generator(
                 for tool_call in response["delta"]["tool_calls"]:
                     if tool_call.get("type") is None:
                         tool_call["type"] = "function"
-
-
+
             chunk = StreamChoice(**response)
-            result = ChatCompletionChunk(id=request_id, choices=[chunk], model=served_model)
+            result = ChatCompletionChunk(
+                id=request_id, choices=[chunk], model=model_name
+            )
             chunk_dic = result.dict(exclude_unset=True)
             chunk_data = json.dumps(chunk_dic, ensure_ascii=False)
             yield f"data: {chunk_data}\n\n"