From dcf6bb300bb907468691def26be1c43365796904 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Wed, 20 Aug 2025 01:13:09 +0000
Subject: [PATCH 1/2] dynamic model switch

Signed-off-by: Ubuntu
---
 ChatQnA/chatqna.py | 4 ++--
 CodeGen/codegen.py | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/ChatQnA/chatqna.py b/ChatQnA/chatqna.py
index 2e462b0f6e..733084e8a4 100644
--- a/ChatQnA/chatqna.py
+++ b/ChatQnA/chatqna.py
@@ -73,7 +73,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
     elif self.services[cur_node].service_type == ServiceType.LLM:
         # convert TGI/vLLM to unified OpenAI /v1/chat/completions format
         next_inputs = {}
-        next_inputs["model"] = LLM_MODEL
+        next_inputs["model"] = inputs["model"]
         next_inputs["messages"] = [{"role": "user", "content": inputs["inputs"]}]
         next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
         next_inputs["top_p"] = llm_parameters_dict["top_p"]
@@ -396,7 +396,7 @@ async def handle_request(self, request: Request):
             repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
             stream=stream_opt,
             chat_template=chat_request.chat_template if chat_request.chat_template else None,
-            model=chat_request.model if chat_request.model else None,
+            model=chat_request.model if chat_request.model else LLM_MODEL,
         )
         retriever_parameters = RetrieverParms(
             search_type=chat_request.search_type if chat_request.search_type else "similarity",
diff --git a/CodeGen/codegen.py b/CodeGen/codegen.py
index af9afdf715..a535c8e8f7 100644
--- a/CodeGen/codegen.py
+++ b/CodeGen/codegen.py
@@ -76,7 +76,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
     elif self.services[cur_node].service_type == ServiceType.LLM:
         # convert TGI/vLLM to unified OpenAI /v1/chat/completions format
         next_inputs = {}
-        next_inputs["model"] = LLM_MODEL_ID
+        next_inputs["model"] = inputs["model"]
         next_inputs["messages"] = [{"role": "user", "content": inputs["query"]}]
         next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
         next_inputs["top_p"] = llm_parameters_dict["top_p"]
@@ -195,6 +195,7 @@ async def handle_request(self, request: Request):
             repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
             stream=stream_opt,
             index_name=chat_request.index_name,
+            model=chat_request.model if chat_request.model else LLM_MODEL_ID
         )
 
         # Initialize the initial inputs with the generated prompt

From 0e2472889b6e27629d16a3d9d82a4e6d7ee5df8c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 20 Aug 2025 01:15:14 +0000
Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 CodeGen/codegen.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CodeGen/codegen.py b/CodeGen/codegen.py
index a535c8e8f7..f95ec94e09 100644
--- a/CodeGen/codegen.py
+++ b/CodeGen/codegen.py
@@ -195,7 +195,7 @@ async def handle_request(self, request: Request):
             repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
             stream=stream_opt,
             index_name=chat_request.index_name,
-            model=chat_request.model if chat_request.model else LLM_MODEL_ID
+            model=chat_request.model if chat_request.model else LLM_MODEL_ID,
         )
 
         # Initialize the initial inputs with the generated prompt
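
Usage note (not part of the patches above): a minimal client sketch of how the per-request model override introduced by this diff could be exercised. The host, port, endpoint path (/v1/chatqna), and model name are assumptions about a typical ChatQnA megaservice deployment, not values taken from the patch; only the behavior of the "model" field (falling back to the service-level LLM_MODEL when omitted) comes from the diff.

    # Hypothetical client sketch exercising the per-request "model" field that
    # the patch threads from chat_request down to the LLM call.
    # Assumes the ChatQnA megaservice is reachable at localhost:8888; adjust
    # MEGASERVICE_URL for your deployment.
    import requests

    MEGASERVICE_URL = "http://localhost:8888/v1/chatqna"  # assumed default endpoint

    payload = {
        "messages": "What is OPEA?",
        # With the patch applied, this value overrides the service-level default
        # (LLM_MODEL / LLM_MODEL_ID); if omitted, the service falls back to that
        # default as before. The name must be a model the backing vLLM/TGI serves.
        "model": "meta-llama/Meta-Llama-3-8B-Instruct",
        "max_tokens": 128,
        "stream": False,
    }

    response = requests.post(MEGASERVICE_URL, json=payload, timeout=120)
    response.raise_for_status()
    print(response.json())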