From a213598e75aae86283b7b80a38677965e2e035d8 Mon Sep 17 00:00:00 2001 From: feng <471899214@qq.com> Date: Mon, 2 Mar 2026 21:35:23 +0800 Subject: [PATCH 1/9] add support qwen3 --- validator/modules/llm_judge/environment.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/validator/modules/llm_judge/environment.yml b/validator/modules/llm_judge/environment.yml index cc75b2f..0fd595b 100644 --- a/validator/modules/llm_judge/environment.yml +++ b/validator/modules/llm_judge/environment.yml @@ -8,11 +8,11 @@ dependencies: - openai>=1.0.0 # OpenAI API client - httpx # HTTP client for OpenAI requests - pydantic>=2.0.0 # Data validation and parsing - - transformers==4.49.0 # HuggingFace transformers library + - transformers==4.53.3 # HuggingFace transformers library - torch>=1.13.1 # PyTorch for model inference - accelerate>=0.27.2 # For efficient model loading - loguru>=0.6.0 # Logging library - - huggingface-hub==0.29.1 + - huggingface-hub>=0.30.0,<1.0 - tenacity - peft>=0.10.0,<0.18.0 - python-dotenv # Load environment variables from .env file \ No newline at end of file From ea5f3faca679643fd40f4e3ecc45b80f87ce3374 Mon Sep 17 00:00:00 2001 From: feng <471899214@qq.com> Date: Wed, 11 Mar 2026 09:05:07 +0800 Subject: [PATCH 2/9] update dependencies --- validator/modules/llm_judge/environment.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/validator/modules/llm_judge/environment.yml b/validator/modules/llm_judge/environment.yml index 0fd595b..361d284 100644 --- a/validator/modules/llm_judge/environment.yml +++ b/validator/modules/llm_judge/environment.yml @@ -8,11 +8,11 @@ dependencies: - openai>=1.0.0 # OpenAI API client - httpx # HTTP client for OpenAI requests - pydantic>=2.0.0 # Data validation and parsing - - transformers==4.53.3 # HuggingFace transformers library + - transformers==5.3.0 # HuggingFace transformers library - torch>=1.13.1 # PyTorch for model inference - accelerate>=0.27.2 # For efficient model loading - loguru>=0.6.0 # Logging library - - huggingface-hub>=0.30.0,<1.0 + - huggingface-hub==1.5.0 - tenacity - - peft>=0.10.0,<0.18.0 + - peft==0.18.1 - python-dotenv # Load environment variables from .env file \ No newline at end of file From 46c9983f3d1218342800e8f3ee2739460dd43879 Mon Sep 17 00:00:00 2001 From: feng <471899214@qq.com> Date: Thu, 12 Mar 2026 12:00:09 +0800 Subject: [PATCH 3/9] only support for qwen model --- validator/modules/llm_judge/__init__.py | 95 ++++--------------- validator/modules/llm_judge/template.py | 118 +----------------------- 2 files changed, 20 insertions(+), 193 deletions(-) diff --git a/validator/modules/llm_judge/__init__.py b/validator/modules/llm_judge/__init__.py index 0bfe3d3..03ed708 100644 --- a/validator/modules/llm_judge/__init__.py +++ b/validator/modules/llm_judge/__init__.py @@ -37,7 +37,7 @@ class LLMJudgeConfig(BaseConfig): gen_batch_size: int = 1 eval_batch_size: int = 16 - gen_temperature: float = 0.1 + gen_temperature: float = 0.7 class LLMJudgeMetrics(BaseMetrics): @@ -139,7 +139,6 @@ def _load_model(self, repo_id: str, revision: str = "main", max_params: int = No model_kwargs = dict( trust_remote_code=True, torch_dtype=compute_dtype, - use_cache=False, device_map="auto", ) if is_lora: @@ -180,74 +179,6 @@ def _load_model(self, repo_id: str, revision: str = "main", max_params: int = No ) raise InvalidModelParametersException(f"Model parameters {total} exceed limit {max_params}") - def _construct_conversation_template( - self, conversation: List[Dict[str, str]], base_model: 
str, - ) -> str: - try: - if base_model not in template_dict: - logger.info(f"Template {base_model} not found, using default") - base_model = "default" - - template = template_dict[base_model] - - conversation_parts = [] - - # Validate conversation structure - if not isinstance(conversation, dict): - raise LLMJudgeException(f"Conversation must be a dict, got {type(conversation)}") - - if "conversations" not in conversation: - raise LLMJudgeException(f"Conversation dict must have 'conversations' key") - - if not conversation["conversations"]: - raise LLMJudgeException(f"Conversation 'conversations' list is empty") - - # Use provided system_text or fall back to template default - if template.system_format: - system_prompt = ( - conversation["system"] if "system" in conversation else None - ) - system_content = ( - system_prompt if system_prompt else "You are a helpful assistant." - ) - if system_content: - formatted_system = template.system_format.format( - content=system_content - ) - conversation_parts.append(formatted_system) - - # Multi-turn conversation: format each message according to template - for msg in conversation["conversations"]: - if not isinstance(msg, dict) or "role" not in msg or "content" not in msg: - logger.warning(f"Skipping invalid message: {msg}") - continue - - if msg["role"] == "user": - user_text = template.user_format.format( - content=msg["content"], - stop_token=self.hf_tokenizer.eos_token, - ) - conversation_parts.append(user_text) - elif msg["role"] == "assistant": - assistant_text = template.assistant_format.format( - content=msg["content"], - stop_token=self.hf_tokenizer.eos_token, - ) - conversation_parts.append(assistant_text) - - conversation_format = "".join(conversation_parts) - - if not conversation_format.strip(): - logger.error(f"Empty template generated. 
Template: {base_model}, Conversation: {conversation}, Parts: {conversation_parts}") - raise LLMJudgeException(f"Generated conversation template is empty after formatting") - - except Exception as e: - raise LLMJudgeException( - f"Failed to construct conversation template: {e}" - ) from e - - return conversation_format - def _generate_response( self, context_length: int, @@ -272,10 +203,22 @@ def _generate_response( # Apply chat template with fallback batch_conversation_templates = [] for conversation in batch_conversations: - template = self._construct_conversation_template( - conversation, base_model=base_model, + + messages = [] + if "system" in conversation: + messages.append({ + "role": "system", + "content": conversation["system"] + }) + + messages += conversation["conversations"] + template = self.hf_tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + enable_thinking=False, ) - + # Validate template is not empty if not template or not template.strip(): logger.error(f"Empty template generated for conversation: {conversation}") @@ -313,10 +256,10 @@ def _generate_response( outputs = self.hf_model.generate( **model_inputs, max_new_tokens=max_length, - temperature=self.config.gen_temperature, + temperature=self.config.gen_temperature, # Non thinking-General 0.7 ,Reasoning 1 do_sample=True, - top_p=0.95, # Nucleus sampling for stability - top_k=50, # Limit vocabulary for stability + top_p=0.8, # Non thinking-General 0.8 ,Reasoning 0.95 + top_k=20, # pad_token_id=self.hf_tokenizer.eos_token_id, eos_token_id=self.hf_tokenizer.eos_token_id, ) diff --git a/validator/modules/llm_judge/template.py b/validator/modules/llm_judge/template.py index 862c52f..6be151e 100644 --- a/validator/modules/llm_judge/template.py +++ b/validator/modules/llm_judge/template.py @@ -56,7 +56,7 @@ def register_template( register_template( - template_name="qwen1.5", + template_name="qwen", system_format="<|im_start|>system\n{content}<|im_end|>\n", user_format="<|im_start|>user\n{content}<|im_end|>\n<|im_start|>assistant\n", assistant_format="{content}<|im_end|>\n", @@ -66,119 +66,3 @@ def register_template( system="You are a helpful assistant.", stop_word="<|im_end|>", ) - -register_template( - template_name="yi", - system_format="<|im_start|>system\n{content}<|im_end|>\n", - user_format="<|im_start|>user\n{content}<|im_end|>\n<|im_start|>assistant\n", - assistant_format="{content}<|im_end|>\n", - tool_format="{content}", - function_format="{content}", - observation_format="<|im_start|>tool\n{content}\n<|im_start|>assistant\n", - system=None, - stop_word="<|im_end|>", -) - - -register_template( - template_name="zephyr", - system_format="<|system|>\n{content}", - user_format="<|user|>\n{content}\n<|assistant|>\n", - assistant_format="{content}\n", - tool_format="{content}", - function_format="{content}", - observation_format="<|tool|>\n{content}\n<|assistant|>\n", - system=None, - stop_word="", -) - -register_template( - template_name="mistral", - system_format="", - user_format="[INST]{content}[/INST]", - assistant_format="{content}", - tool_format="{content}", - function_format="{content}", - observation_format="{content}", - system="", - stop_word="", -) - -register_template( - template_name="mixtral", - system_format="", - user_format="[INST]{content}[/INST]", - assistant_format="{content}", - tool_format="{content}", - function_format="{content}", - observation_format="{content}", - system="", - stop_word="", -) - -register_template( - template_name="llama2", - 
system_format="<>\n{content}\n<>\n\n", - user_format="[INST]{content}[/INST]", - assistant_format="{content} ", - tool_format="{content}", - function_format="{content}", - observation_format="{content}", - system="You are a helpful, respectful and honest assistant. " - "Always answer as helpfully as possible, while being safe. " - "Your answers should not include any harmful, unethical, " - "racist, sexist, toxic, dangerous, or illegal content. " - "Please ensure that your responses are socially unbiased and positive in nature.\n\n" - "If a question does not make any sense, or is not factually coherent, " - "explain why instead of answering something not correct. " - "If you don't know the answer to a question, please don't share false information.", - stop_word="", -) - -register_template( - template_name="gemma", - system_format="", - user_format="user\n{content}\nmodel\n", - assistant_format="{content}\n", - tool_format="{content}", - function_format="{content}", - observation_format="tool\n{content}\nmodel\n", - system="", - stop_word="", -) - -register_template( - template_name="llama3", - system_format="<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{content}<|eot_id|>", - user_format="<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", - assistant_format="{content}<|eot_id|>", - tool_format="{content}", - function_format="{content}", - observation_format="<|start_header_id|>tool<|end_header_id|>\n\n{content}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", - system=None, - stop_word="<|eot_id|>", -) - -register_template( - template_name="phi3", - system_format=None, - user_format="<|user|>\n{content}<|end|>\n<|assistant|>", - assistant_format="{content}<|end|>\n", - tool_format="{content}", - function_format="{content}", - observation_format="<|tool|>\n{content}<|end|>\n<|assistant|>", - system=None, - stop_word="<|end|>", -) - -register_template( - template_name="phi4", - system_format=None, - user_format="<|user|>\n{content}<|end|>\n<|assistant|>", - assistant_format="{content}<|end|>\n", - tool_format="<|tool|>{content}<|/tool|>", - function_format="<|tool_call|>{content}<|/tool_call|>", - observation_format="<|tool|>\n{content}<|end|>\n<|assistant|>", - system=None, - stop_word="<|end|>", -) From cbfeac91bab5c232c2cb121e55771f72a7756650 Mon Sep 17 00:00:00 2001 From: feng <471899214@qq.com> Date: Tue, 17 Mar 2026 16:48:13 +0800 Subject: [PATCH 4/9] delete template.py --- validator/modules/llm_judge/__init__.py | 1 - validator/modules/llm_judge/template.py | 68 ------------------------- 2 files changed, 69 deletions(-) delete mode 100644 validator/modules/llm_judge/template.py diff --git a/validator/modules/llm_judge/__init__.py b/validator/modules/llm_judge/__init__.py index 03ed708..cbeb9ea 100644 --- a/validator/modules/llm_judge/__init__.py +++ b/validator/modules/llm_judge/__init__.py @@ -15,7 +15,6 @@ from validator.modules.llm_judge.prompt import get_prompt from validator.modules.llm_judge.utils import download_file from validator.exceptions import LLMJudgeException, InvalidModelParametersException -from validator.modules.llm_judge.template import template_dict from peft import PeftModel from transformers import AutoTokenizer, AutoModelForCausalLM from validator.modules.base import ( diff --git a/validator/modules/llm_judge/template.py b/validator/modules/llm_judge/template.py deleted file mode 100644 index 6be151e..0000000 --- a/validator/modules/llm_judge/template.py +++ /dev/null 
@@ -1,68 +0,0 @@ -from dataclasses import dataclass -from typing import Dict - - -@dataclass -class Template: - template_name: str - system_format: str - user_format: str - assistant_format: str - tool_format: str - function_format: str - observation_format: str - system: str - stop_word: str - - -template_dict: Dict[str, Template] = dict() - - -def register_template( - template_name, - system_format, - user_format, - assistant_format, - tool_format, - function_format, - observation_format, - system, - stop_word=None, -): - template_dict[template_name] = Template( - template_name=template_name, - system_format=system_format, - user_format=user_format, - assistant_format=assistant_format, - tool_format=tool_format, - function_format=function_format, - observation_format=observation_format, - system=system, - stop_word=stop_word, - ) - - -register_template( - template_name="default", - system_format="System: {content}\n\n", - user_format="User: {content}\nAssistant: ", - assistant_format="{content} {stop_token}", - tool_format="{content}", - function_format="{content}", - observation_format="Tool\n{content}\n", - system=None, - stop_word=None, -) - - -register_template( - template_name="qwen", - system_format="<|im_start|>system\n{content}<|im_end|>\n", - user_format="<|im_start|>user\n{content}<|im_end|>\n<|im_start|>assistant\n", - assistant_format="{content}<|im_end|>\n", - tool_format="{content}", - function_format="{content}", - observation_format="<|im_start|>tool\n{content}\n<|im_start|>assistant\n", - system="You are a helpful assistant.", - stop_word="<|im_end|>", -) From 13df7e527ca15829f462cc4a53bfe7d046b61e1c Mon Sep 17 00:00:00 2001 From: feng <471899214@qq.com> Date: Tue, 17 Mar 2026 23:49:47 +0800 Subject: [PATCH 5/9] add hf_tokenizer mapping --- validator/modules/llm_judge/__init__.py | 18 ++++++- validator/modules/llm_judge/constant.py | 66 +++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 validator/modules/llm_judge/constant.py diff --git a/validator/modules/llm_judge/__init__.py b/validator/modules/llm_judge/__init__.py index 0ce8bda..6655830 100644 --- a/validator/modules/llm_judge/__init__.py +++ b/validator/modules/llm_judge/__init__.py @@ -14,6 +14,7 @@ from typing import List, Dict, Any from validator.modules.llm_judge.prompt import get_prompt from validator.modules.llm_judge.utils import download_file +from validator.modules.llm_judge.constant import SUPPORTED_BASE_MODELS from validator.exceptions import LLMJudgeException, InvalidModelParametersException from peft import PeftModel from transformers import AutoTokenizer, AutoModelForCausalLM @@ -155,6 +156,18 @@ def _load_model(self, repo_id: str, revision: str = "main", max_params: int = No with open("judge/adapter_config.json", "r") as f: adapter_config = json.load(f) base_model = adapter_config["base_model_name_or_path"] + if base_model in SUPPORTED_BASE_MODELS: + logger.info( + f"LoRA's base model '{base_model}' is in SUPPORTED_BASE_MODELS. " + f"Using it for tokenizer." + ) + else: + logger.error( + f"LoRA's base model '{base_model}' is not in SUPPORTED_BASE_MODELS. " + f"Marking assignment as failed." 
+ ) + raise + self.hf_tokenizer = AutoTokenizer.from_pretrained( base_model, trust_remote_code=True, use_fast=True, padding_side="left" ) @@ -843,7 +856,10 @@ def validate(self, data: LLMJudgeInputData, **kwargs) -> LLMJudgeMetrics: self._load_model(data.hg_repo_id, data.revision, data.max_params) except InvalidModelParametersException as e: # lowest possible reward for invalid model parameters - logger.info(f"Invalid model parameters: {e}") + logger.error(f"Invalid model parameters: {e}") + return LLMJudgeMetrics(score=LOWEST_POSSIBLE_SCORE) + except Exception as e: + logger.error(f"Exception when load model: {e}") return LLMJudgeMetrics(score=LOWEST_POSSIBLE_SCORE) # Stage 1: Generate all responses diff --git a/validator/modules/llm_judge/constant.py b/validator/modules/llm_judge/constant.py new file mode 100644 index 0000000..772fb3e --- /dev/null +++ b/validator/modules/llm_judge/constant.py @@ -0,0 +1,66 @@ +SUPPORTED_BASE_MODELS = [ + # qwen3.5 + "Qwen/Qwen3.5-0.8B", + "Qwen/Qwen3.5-0.8B-Base", + "Qwen/Qwen3.5-2B", + "Qwen/Qwen3.5-2B-Base", + "Qwen/Qwen3.5-4B", + "Qwen/Qwen3.5-4B-Base", + "Qwen/Qwen3.5-9B", + "Qwen/Qwen3.5-9B-Base", + "Qwen/Qwen3.5-27B", + # qwen3 + "Qwen/Qwen3-0.6B", + "Qwen/Qwen3-1.7B", + "Qwen/Qwen3-4B", + "Qwen/Qwen3-4B-Base", + "Qwen/Qwen3-4B-Instruct-2507", + "Qwen/Qwen3-8B", + "Qwen/Qwen3-8B-Base", + "Qwen/Qwen3-14B", + "Qwen/Qwen3-14B-Base", + "Qwen/Qwen3-32B", + # qwen2.5 + "Qwen/Qwen2.5-0.5B", + "Qwen/Qwen2.5-0.5B-Instruct", + "Qwen/Qwen2.5-1.5B", + "Qwen/Qwen2.5-1.5B-Instruct", + "Qwen/Qwen2.5-3B", + "Qwen/Qwen2.5-3B-Instruct", + "Qwen/Qwen2.5-7B", + "Qwen/Qwen2.5-7B-Instruct", + "Qwen/Qwen2.5-14B", + "Qwen/Qwen2.5-14B-Instruct", + "Qwen/Qwen2.5-32B", + "Qwen/Qwen2.5-32B-Instruct", + "Qwen/Qwen2.5-72B", + "Qwen/Qwen2.5-72B-Instruct", + # yi 1.5 + "01-ai/Yi-1.5-6B", + "01-ai/Yi-1.5-6B-Chat", + "01-ai/Yi-1.5-9B", + "01-ai/Yi-1.5-9B-Chat", + "01-ai/Yi-1.5-34B", + "01-ai/Yi-1.5-34B-Chat", + # mistral + "mistralai/Mistral-7B-v0.3", + "mistralai/Mistral-7B-Instruct-v0.3", + "mistralai/Ministral-8B-Instruct-2410", + # gemma2 + "google/gemma-2-2b", + "google/gemma-2-9b", + "google/gemma-2-27b", + "google/gemma-2-2b-it", + "google/gemma-2-9b-it", + "google/gemma-2-27b-it", + # llama3 + "meta-llama/Meta-Llama-3-8B", + "meta-llama/Meta-Llama-3-8B-Instruct", + "meta-llama/Meta-Llama-3-70B", + "meta-llama/Meta-Llama-3-70B-Instruct", + # llama3.1 + "meta-llama/Meta-Llama-3.1-8B", + "meta-llama/Meta-Llama-3.1-8B-Instruct", + "meta-llama/Meta-Llama-3.1-70B", + "meta-llama/Meta-Llama-3.1-70B-Instruct", +] \ No newline at end of file From be3280d27f510216cc4427d10a470c7e48b0ac3e Mon Sep 17 00:00:00 2001 From: feng <471899214@qq.com> Date: Wed, 18 Mar 2026 11:52:19 +0800 Subject: [PATCH 6/9] only keep qwen3.5 model --- validator/modules/llm_judge/constant.py | 54 ------------------------- 1 file changed, 54 deletions(-) diff --git a/validator/modules/llm_judge/constant.py b/validator/modules/llm_judge/constant.py index 772fb3e..30efd9f 100644 --- a/validator/modules/llm_judge/constant.py +++ b/validator/modules/llm_judge/constant.py @@ -9,58 +9,4 @@ "Qwen/Qwen3.5-9B", "Qwen/Qwen3.5-9B-Base", "Qwen/Qwen3.5-27B", - # qwen3 - "Qwen/Qwen3-0.6B", - "Qwen/Qwen3-1.7B", - "Qwen/Qwen3-4B", - "Qwen/Qwen3-4B-Base", - "Qwen/Qwen3-4B-Instruct-2507", - "Qwen/Qwen3-8B", - "Qwen/Qwen3-8B-Base", - "Qwen/Qwen3-14B", - "Qwen/Qwen3-14B-Base", - "Qwen/Qwen3-32B", - # qwen2.5 - "Qwen/Qwen2.5-0.5B", - "Qwen/Qwen2.5-0.5B-Instruct", - "Qwen/Qwen2.5-1.5B", - "Qwen/Qwen2.5-1.5B-Instruct", - 
"Qwen/Qwen2.5-3B", - "Qwen/Qwen2.5-3B-Instruct", - "Qwen/Qwen2.5-7B", - "Qwen/Qwen2.5-7B-Instruct", - "Qwen/Qwen2.5-14B", - "Qwen/Qwen2.5-14B-Instruct", - "Qwen/Qwen2.5-32B", - "Qwen/Qwen2.5-32B-Instruct", - "Qwen/Qwen2.5-72B", - "Qwen/Qwen2.5-72B-Instruct", - # yi 1.5 - "01-ai/Yi-1.5-6B", - "01-ai/Yi-1.5-6B-Chat", - "01-ai/Yi-1.5-9B", - "01-ai/Yi-1.5-9B-Chat", - "01-ai/Yi-1.5-34B", - "01-ai/Yi-1.5-34B-Chat", - # mistral - "mistralai/Mistral-7B-v0.3", - "mistralai/Mistral-7B-Instruct-v0.3", - "mistralai/Ministral-8B-Instruct-2410", - # gemma2 - "google/gemma-2-2b", - "google/gemma-2-9b", - "google/gemma-2-27b", - "google/gemma-2-2b-it", - "google/gemma-2-9b-it", - "google/gemma-2-27b-it", - # llama3 - "meta-llama/Meta-Llama-3-8B", - "meta-llama/Meta-Llama-3-8B-Instruct", - "meta-llama/Meta-Llama-3-70B", - "meta-llama/Meta-Llama-3-70B-Instruct", - # llama3.1 - "meta-llama/Meta-Llama-3.1-8B", - "meta-llama/Meta-Llama-3.1-8B-Instruct", - "meta-llama/Meta-Llama-3.1-70B", - "meta-llama/Meta-Llama-3.1-70B-Instruct", ] \ No newline at end of file From ff452a59b205d9a455bfd059b9d0ca59a5ec7d8f Mon Sep 17 00:00:00 2001 From: feng <471899214@qq.com> Date: Fri, 20 Mar 2026 00:44:54 +0800 Subject: [PATCH 7/9] fix qwen3.5 model function calling bug. --- validator/modules/llm_judge/__init__.py | 59 ++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 6 deletions(-) diff --git a/validator/modules/llm_judge/__init__.py b/validator/modules/llm_judge/__init__.py index 6655830..311c0c4 100644 --- a/validator/modules/llm_judge/__init__.py +++ b/validator/modules/llm_judge/__init__.py @@ -151,7 +151,7 @@ def _load_model(self, repo_id: str, revision: str = "main", max_params: int = No repo_id=repo_id, local_dir="judge", revision=revision, - force_download=True, # Disable fallback to cached files + # force_download=True, # Disable fallback to cached files ) with open("judge/adapter_config.json", "r") as f: adapter_config = json.load(f) @@ -232,8 +232,16 @@ def _generate_response( }) messages += conversation["conversations"] + tools_for_template = conversation.get("tools", None) + try: + if isinstance(tools_for_template, str): + tools_for_template = json.loads(tools_for_template) + except Exception: + # leave tools_for_template as-is if parsing fails + pass template = self.hf_tokenizer.apply_chat_template( messages, + tools=tools_for_template, tokenize=False, add_generation_prompt=True, enable_thinking=False, @@ -556,10 +564,47 @@ def _load_jsonl_conversations( for msg in conversations: role = msg.get("role", "") content = msg.get("content", "").strip() - if ( - role in ["user", "assistant", "function_call"] - and content - ): + if not content: + continue + if role == "function_call": + # Convert function_call to assistant message with tool_calls + try: + call_data = json.loads(content) + tool_call_msg = { + "role": "assistant", + "tool_calls": [ + { + "id": f"call_{len(conversation_to_process)}", + "type": "function", + "function": { + "name": call_data.get("name", ""), + "arguments": call_data.get("arguments", {}) + }, + } + ], + } + conversation_to_process.append(tool_call_msg) + except (json.JSONDecodeError, KeyError): + # Fallback: treat as plain assistant message + conversation_to_process.append( + {"role": "assistant", "content": content} + ) + elif role == "observation": + # Convert observation to tool result message + # Find the last tool_call id to reference + tool_call_id = "call_0" + for prev_msg in reversed(conversation_to_process): + if prev_msg.get("role") == "assistant" and 
prev_msg.get("tool_calls"): + tool_call_id = prev_msg["tool_calls"][0]["id"] + break + conversation_to_process.append( + { + "role": "tool", + "tool_call_id": tool_call_id, + "content": content, + } + ) + elif role in ["user", "assistant"]: conversation_to_process.append( {"role": role, "content": content} ) @@ -567,7 +612,7 @@ def _load_jsonl_conversations( # Extract reference response (last assistant or function_call message) reference_response = None if conversation_to_process: - last_msg = conversation_to_process[-1] + last_msg = conversations[-1] if last_msg["role"] in ["assistant", "function_call"]: reference_response = last_msg["content"] conversation_to_process = conversation_to_process[:-1] @@ -591,6 +636,8 @@ def _load_jsonl_conversations( continue input_conversations_data["conversations"] = conversation_to_process + if tools_info is not None: + input_conversations_data["tools"] = tools_info input_conversations.append( { From 4ec68cfccdec3c295730e39b4fb63ff3b8f5004b Mon Sep 17 00:00:00 2001 From: feng <471899214@qq.com> Date: Fri, 20 Mar 2026 00:45:44 +0800 Subject: [PATCH 8/9] fix qwen3.5 model function calling bug. --- validator/modules/llm_judge/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validator/modules/llm_judge/__init__.py b/validator/modules/llm_judge/__init__.py index 311c0c4..5a76084 100644 --- a/validator/modules/llm_judge/__init__.py +++ b/validator/modules/llm_judge/__init__.py @@ -151,7 +151,7 @@ def _load_model(self, repo_id: str, revision: str = "main", max_params: int = No repo_id=repo_id, local_dir="judge", revision=revision, - # force_download=True, # Disable fallback to cached files + force_download=True, # Disable fallback to cached files ) with open("judge/adapter_config.json", "r") as f: adapter_config = json.load(f) From 6440edc12da9313813acb3e5783dad5d07963836 Mon Sep 17 00:00:00 2001 From: feng <471899214@qq.com> Date: Fri, 20 Mar 2026 23:42:20 +0800 Subject: [PATCH 9/9] fix some Tool call bug --- validator/modules/llm_judge/__init__.py | 40 +++++++++++++++-------- validator/modules/llm_judge/prompt.py | 42 +++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 13 deletions(-) diff --git a/validator/modules/llm_judge/__init__.py b/validator/modules/llm_judge/__init__.py index 5a76084..cb11487 100644 --- a/validator/modules/llm_judge/__init__.py +++ b/validator/modules/llm_judge/__init__.py @@ -12,11 +12,12 @@ from loguru import logger from huggingface_hub import HfApi from typing import List, Dict, Any -from validator.modules.llm_judge.prompt import get_prompt +from validator.modules.llm_judge.prompt import get_prompt,template_str from validator.modules.llm_judge.utils import download_file from validator.modules.llm_judge.constant import SUPPORTED_BASE_MODELS from validator.exceptions import LLMJudgeException, InvalidModelParametersException from peft import PeftModel +from jinja2 import Environment from transformers import AutoTokenizer, AutoModelForCausalLM from validator.modules.base import ( BaseValidationModule, @@ -556,7 +557,8 @@ def _load_jsonl_conversations( conversation_to_process = [] reference_response = None tools_info = None - + pending_tool_call_ids: list[str] = [] + tool_call_counter = 0 if "conversations" in json_data: conversations = json_data["conversations"] if isinstance(conversations, list) and conversations: @@ -567,14 +569,17 @@ def _load_jsonl_conversations( if not content: continue if role == "function_call": - # Convert function_call to assistant message with tool_calls + 
tool_call_counter += 1 + tool_call_id = f"call_{tool_call_counter}" + try: call_data = json.loads(content) tool_call_msg = { "role": "assistant", + "content": "", "tool_calls": [ { - "id": f"call_{len(conversation_to_process)}", + "id": tool_call_id, "type": "function", "function": { "name": call_data.get("name", ""), @@ -585,18 +590,19 @@ def _load_jsonl_conversations( } conversation_to_process.append(tool_call_msg) except (json.JSONDecodeError, KeyError): - # Fallback: treat as plain assistant message + tool_call_id = None conversation_to_process.append( {"role": "assistant", "content": content} ) + if tool_call_id: + pending_tool_call_ids.append(tool_call_id) + elif role == "observation": - # Convert observation to tool result message - # Find the last tool_call id to reference - tool_call_id = "call_0" - for prev_msg in reversed(conversation_to_process): - if prev_msg.get("role") == "assistant" and prev_msg.get("tool_calls"): - tool_call_id = prev_msg["tool_calls"][0]["id"] - break + if pending_tool_call_ids: + tool_call_id = pending_tool_call_ids.pop(0) + else: + tool_call_id = "call_unknown" + conversation_to_process.append( { "role": "tool", @@ -611,11 +617,19 @@ def _load_jsonl_conversations( # Extract reference response (last assistant or function_call message) reference_response = None + if conversation_to_process: last_msg = conversations[-1] - if last_msg["role"] in ["assistant", "function_call"]: + if last_msg["role"] in ["assistant"]: reference_response = last_msg["content"] conversation_to_process = conversation_to_process[:-1] + elif last_msg["role"] in ["function_call"]: + env = Environment(trim_blocks=True, lstrip_blocks=True) + conversation_template = env.from_string(template_str) + reference_response = conversation_template.render( + messages=[conversation_to_process[-1]], trim_blocks=True, + lstrip_blocks=True) + conversation_to_process = conversation_to_process[:-1] # Extract tools information if available (for function_call evaluation) if "tools" in json_data: diff --git a/validator/modules/llm_judge/prompt.py b/validator/modules/llm_judge/prompt.py index 726d3e7..5bbf37e 100644 --- a/validator/modules/llm_judge/prompt.py +++ b/validator/modules/llm_judge/prompt.py @@ -181,3 +181,45 @@ def function_call_ref_eval_prompt( tools=Tools, assistant_response=assistant_response, ) + +template_str= """{% for message in messages %} + +{% if message.role == "system" %} + +{{ message.content }} + + +{% elif message.role == "user" %} + +{{ message.content }} + + +{% elif message.role == "assistant" %} + + {% if message.tool_calls %} + + {% for tool in message.tool_calls %} + + {% set args = tool.function.arguments %} + {% if args is string %} + {% set args = args | from_json %} + {% endif %} + {% for key, value in args.items() %} +{{ value }} + {% endfor %} + + {% endfor %} + + {% else %} + +{{ message.content }} + + {% endif %} +{% elif message.role == "tool" %} + +{{ message.content }} + + +{% endif %} + +{% endfor %}""" \ No newline at end of file
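
Taken together, patches 3, 7 and 9 drop the hand-rolled prompt templates and instead convert ShareGPT-style `function_call`/`observation` turns into OpenAI-style `tool_calls`/`tool` messages, then let the tokenizer's own chat template render the prompt (with `tools=` and `enable_thinking=False`). The sketch below illustrates that flow outside the module; it is not part of the patches. The sample record, the `get_weather` tool, the simplified tool-call-id bookkeeping, and the choice of "Qwen/Qwen3-8B" as tokenizer are illustrative assumptions only.

# Illustrative sketch (not part of the patch series): convert a ShareGPT-style
# record with function_call/observation turns into tool_calls/tool messages
# and render it with the tokenizer's chat template, along the lines of what
# patches 7 and 9 do. Model choice and record contents are made-up examples.
import json
from transformers import AutoTokenizer

record = {
    "system": "You are a helpful assistant.",
    "tools": json.dumps([{
        "type": "function",
        "function": {
            "name": "get_weather",
            "parameters": {"type": "object",
                           "properties": {"city": {"type": "string"}}},
        },
    }]),
    "conversations": [
        {"role": "user", "content": "What's the weather in Paris?"},
        {"role": "function_call",
         "content": json.dumps({"name": "get_weather",
                                "arguments": {"city": "Paris"}})},
        {"role": "observation", "content": '{"temp_c": 18}'},
    ],
}

messages = [{"role": "system", "content": record["system"]}]
call_counter = 0
for msg in record["conversations"]:
    if msg["role"] == "function_call":
        # function_call -> assistant message carrying an OpenAI-style tool call
        call_counter += 1
        call = json.loads(msg["content"])
        messages.append({
            "role": "assistant",
            "content": "",
            "tool_calls": [{
                "id": f"call_{call_counter}",
                "type": "function",
                "function": {"name": call["name"],
                             "arguments": call["arguments"]},
            }],
        })
    elif msg["role"] == "observation":
        # observation -> tool result message (id bookkeeping from patch 9 omitted)
        messages.append({"role": "tool", "content": msg["content"]})
    else:
        messages.append(msg)

# Any tokenizer whose chat template understands tools would work here; a Qwen3
# checkpoint from the patch's SUPPORTED_BASE_MODELS list is used as an example.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B", trust_remote_code=True)
prompt = tokenizer.apply_chat_template(
    messages,
    tools=json.loads(record["tools"]),
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,  # non-thinking mode, matching the patch's generation settings
)
print(prompt)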