From a213598e75aae86283b7b80a38677965e2e035d8 Mon Sep 17 00:00:00 2001 From: feng <471899214@qq.com> Date: Mon, 2 Mar 2026 21:35:23 +0800 Subject: [PATCH 1/9] add support qwen3 --- validator/modules/llm_judge/environment.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/validator/modules/llm_judge/environment.yml b/validator/modules/llm_judge/environment.yml index cc75b2f..0fd595b 100644 --- a/validator/modules/llm_judge/environment.yml +++ b/validator/modules/llm_judge/environment.yml @@ -8,11 +8,11 @@ dependencies: - openai>=1.0.0 # OpenAI API client - httpx # HTTP client for OpenAI requests - pydantic>=2.0.0 # Data validation and parsing - - transformers==4.49.0 # HuggingFace transformers library + - transformers==4.53.3 # HuggingFace transformers library - torch>=1.13.1 # PyTorch for model inference - accelerate>=0.27.2 # For efficient model loading - loguru>=0.6.0 # Logging library - - huggingface-hub==0.29.1 + - huggingface-hub>=0.30.0,<1.0 - tenacity - peft>=0.10.0,<0.18.0 - python-dotenv # Load environment variables from .env file \ No newline at end of file From ea5f3faca679643fd40f4e3ecc45b80f87ce3374 Mon Sep 17 00:00:00 2001 From: feng <471899214@qq.com> Date: Wed, 11 Mar 2026 09:05:07 +0800 Subject: [PATCH 2/9] update dependencies --- validator/modules/llm_judge/environment.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/validator/modules/llm_judge/environment.yml b/validator/modules/llm_judge/environment.yml index 0fd595b..361d284 100644 --- a/validator/modules/llm_judge/environment.yml +++ b/validator/modules/llm_judge/environment.yml @@ -8,11 +8,11 @@ dependencies: - openai>=1.0.0 # OpenAI API client - httpx # HTTP client for OpenAI requests - pydantic>=2.0.0 # Data validation and parsing - - transformers==4.53.3 # HuggingFace transformers library + - transformers==5.3.0 # HuggingFace transformers library - torch>=1.13.1 # PyTorch for model inference - accelerate>=0.27.2 # For efficient model loading - loguru>=0.6.0 # Logging library - - huggingface-hub>=0.30.0,<1.0 + - huggingface-hub==1.5.0 - tenacity - - peft>=0.10.0,<0.18.0 + - peft==0.18.1 - python-dotenv # Load environment variables from .env file \ No newline at end of file From 46c9983f3d1218342800e8f3ee2739460dd43879 Mon Sep 17 00:00:00 2001 From: feng <471899214@qq.com> Date: Thu, 12 Mar 2026 12:00:09 +0800 Subject: [PATCH 3/9] only support for qwen model --- validator/modules/llm_judge/__init__.py | 95 ++++--------------- validator/modules/llm_judge/template.py | 118 +----------------------- 2 files changed, 20 insertions(+), 193 deletions(-) diff --git a/validator/modules/llm_judge/__init__.py b/validator/modules/llm_judge/__init__.py index 0bfe3d3..03ed708 100644 --- a/validator/modules/llm_judge/__init__.py +++ b/validator/modules/llm_judge/__init__.py @@ -37,7 +37,7 @@ class LLMJudgeConfig(BaseConfig): gen_batch_size: int = 1 eval_batch_size: int = 16 - gen_temperature: float = 0.1 + gen_temperature: float = 0.7 class LLMJudgeMetrics(BaseMetrics): @@ -139,7 +139,6 @@ def _load_model(self, repo_id: str, revision: str = "main", max_params: int = No model_kwargs = dict( trust_remote_code=True, torch_dtype=compute_dtype, - use_cache=False, device_map="auto", ) if is_lora: @@ -180,74 +179,6 @@ def _load_model(self, repo_id: str, revision: str = "main", max_params: int = No ) raise InvalidModelParametersException(f"Model parameters {total} exceed limit {max_params}") - def _construct_conversation_template( - self, conversation: List[Dict[str, str]], base_model: 
str, - ) -> str: - try: - if base_model not in template_dict: - logger.info(f"Template {base_model} not found, using default") - base_model = "default" - - template = template_dict[base_model] - - conversation_parts = [] - - # Validate conversation structure - if not isinstance(conversation, dict): - raise LLMJudgeException(f"Conversation must be a dict, got {type(conversation)}") - - if "conversations" not in conversation: - raise LLMJudgeException(f"Conversation dict must have 'conversations' key") - - if not conversation["conversations"]: - raise LLMJudgeException(f"Conversation 'conversations' list is empty") - - # Use provided system_text or fall back to template default - if template.system_format: - system_prompt = ( - conversation["system"] if "system" in conversation else None - ) - system_content = ( - system_prompt if system_prompt else "You are a helpful assistant." - ) - if system_content: - formatted_system = template.system_format.format( - content=system_content - ) - conversation_parts.append(formatted_system) - - # Multi-turn conversation: format each message according to template - for msg in conversation["conversations"]: - if not isinstance(msg, dict) or "role" not in msg or "content" not in msg: - logger.warning(f"Skipping invalid message: {msg}") - continue - - if msg["role"] == "user": - user_text = template.user_format.format( - content=msg["content"], - stop_token=self.hf_tokenizer.eos_token, - ) - conversation_parts.append(user_text) - elif msg["role"] == "assistant": - assistant_text = template.assistant_format.format( - content=msg["content"], - stop_token=self.hf_tokenizer.eos_token, - ) - conversation_parts.append(assistant_text) - - conversation_format = "".join(conversation_parts) - - if not conversation_format.strip(): - logger.error(f"Empty template generated. 
Template: {base_model}, Conversation: {conversation}, Parts: {conversation_parts}") - raise LLMJudgeException(f"Generated conversation template is empty after formatting") - - except Exception as e: - raise LLMJudgeException( - f"Failed to construct conversation template: {e}" - ) from e - - return conversation_format - def _generate_response( self, context_length: int, @@ -272,10 +203,22 @@ def _generate_response( # Apply chat template with fallback batch_conversation_templates = [] for conversation in batch_conversations: - template = self._construct_conversation_template( - conversation, base_model=base_model, + + messages = [] + if "system" in conversation: + messages.append({ + "role": "system", + "content": conversation["system"] + }) + + messages += conversation["conversations"] + template = self.hf_tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + enable_thinking=False, ) - + # Validate template is not empty if not template or not template.strip(): logger.error(f"Empty template generated for conversation: {conversation}") @@ -313,10 +256,10 @@ def _generate_response( outputs = self.hf_model.generate( **model_inputs, max_new_tokens=max_length, - temperature=self.config.gen_temperature, + temperature=self.config.gen_temperature, # Non thinking-General 0.7 ,Reasoning 1 do_sample=True, - top_p=0.95, # Nucleus sampling for stability - top_k=50, # Limit vocabulary for stability + top_p=0.8, # Non thinking-General 0.8 ,Reasoning 0.95 + top_k=20, # pad_token_id=self.hf_tokenizer.eos_token_id, eos_token_id=self.hf_tokenizer.eos_token_id, ) diff --git a/validator/modules/llm_judge/template.py b/validator/modules/llm_judge/template.py index 862c52f..6be151e 100644 --- a/validator/modules/llm_judge/template.py +++ b/validator/modules/llm_judge/template.py @@ -56,7 +56,7 @@ def register_template( register_template( - template_name="qwen1.5", + template_name="qwen", system_format="<|im_start|>system\n{content}<|im_end|>\n", user_format="<|im_start|>user\n{content}<|im_end|>\n<|im_start|>assistant\n", assistant_format="{content}<|im_end|>\n", @@ -66,119 +66,3 @@ def register_template( system="You are a helpful assistant.", stop_word="<|im_end|>", ) - -register_template( - template_name="yi", - system_format="<|im_start|>system\n{content}<|im_end|>\n", - user_format="<|im_start|>user\n{content}<|im_end|>\n<|im_start|>assistant\n", - assistant_format="{content}<|im_end|>\n", - tool_format="{content}", - function_format="{content}", - observation_format="<|im_start|>tool\n{content}\n<|im_start|>assistant\n", - system=None, - stop_word="<|im_end|>", -) - - -register_template( - template_name="zephyr", - system_format="<|system|>\n{content}", - user_format="<|user|>\n{content}\n<|assistant|>\n", - assistant_format="{content}\n", - tool_format="{content}", - function_format="{content}", - observation_format="<|tool|>\n{content}\n<|assistant|>\n", - system=None, - stop_word="", -) - -register_template( - template_name="mistral", - system_format="", - user_format="[INST]{content}[/INST]", - assistant_format="{content}", - tool_format="{content}", - function_format="{content}", - observation_format="{content}", - system="", - stop_word="", -) - -register_template( - template_name="mixtral", - system_format="", - user_format="[INST]{content}[/INST]", - assistant_format="{content}", - tool_format="{content}", - function_format="{content}", - observation_format="{content}", - system="", - stop_word="", -) - -register_template( - template_name="llama2", - 
system_format="<>\n{content}\n<>\n\n", - user_format="[INST]{content}[/INST]", - assistant_format="{content} ", - tool_format="{content}", - function_format="{content}", - observation_format="{content}", - system="You are a helpful, respectful and honest assistant. " - "Always answer as helpfully as possible, while being safe. " - "Your answers should not include any harmful, unethical, " - "racist, sexist, toxic, dangerous, or illegal content. " - "Please ensure that your responses are socially unbiased and positive in nature.\n\n" - "If a question does not make any sense, or is not factually coherent, " - "explain why instead of answering something not correct. " - "If you don't know the answer to a question, please don't share false information.", - stop_word="", -) - -register_template( - template_name="gemma", - system_format="", - user_format="user\n{content}\nmodel\n", - assistant_format="{content}\n", - tool_format="{content}", - function_format="{content}", - observation_format="tool\n{content}\nmodel\n", - system="", - stop_word="", -) - -register_template( - template_name="llama3", - system_format="<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{content}<|eot_id|>", - user_format="<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", - assistant_format="{content}<|eot_id|>", - tool_format="{content}", - function_format="{content}", - observation_format="<|start_header_id|>tool<|end_header_id|>\n\n{content}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", - system=None, - stop_word="<|eot_id|>", -) - -register_template( - template_name="phi3", - system_format=None, - user_format="<|user|>\n{content}<|end|>\n<|assistant|>", - assistant_format="{content}<|end|>\n", - tool_format="{content}", - function_format="{content}", - observation_format="<|tool|>\n{content}<|end|>\n<|assistant|>", - system=None, - stop_word="<|end|>", -) - -register_template( - template_name="phi4", - system_format=None, - user_format="<|user|>\n{content}<|end|>\n<|assistant|>", - assistant_format="{content}<|end|>\n", - tool_format="<|tool|>{content}<|/tool|>", - function_format="<|tool_call|>{content}<|/tool_call|>", - observation_format="<|tool|>\n{content}<|end|>\n<|assistant|>", - system=None, - stop_word="<|end|>", -) From cbfeac91bab5c232c2cb121e55771f72a7756650 Mon Sep 17 00:00:00 2001 From: feng <471899214@qq.com> Date: Tue, 17 Mar 2026 16:48:13 +0800 Subject: [PATCH 4/9] delete template.py --- validator/modules/llm_judge/__init__.py | 1 - validator/modules/llm_judge/template.py | 68 ------------------------- 2 files changed, 69 deletions(-) delete mode 100644 validator/modules/llm_judge/template.py diff --git a/validator/modules/llm_judge/__init__.py b/validator/modules/llm_judge/__init__.py index 03ed708..cbeb9ea 100644 --- a/validator/modules/llm_judge/__init__.py +++ b/validator/modules/llm_judge/__init__.py @@ -15,7 +15,6 @@ from validator.modules.llm_judge.prompt import get_prompt from validator.modules.llm_judge.utils import download_file from validator.exceptions import LLMJudgeException, InvalidModelParametersException -from validator.modules.llm_judge.template import template_dict from peft import PeftModel from transformers import AutoTokenizer, AutoModelForCausalLM from validator.modules.base import ( diff --git a/validator/modules/llm_judge/template.py b/validator/modules/llm_judge/template.py deleted file mode 100644 index 6be151e..0000000 --- a/validator/modules/llm_judge/template.py +++ /dev/null 
@@ -1,68 +0,0 @@ -from dataclasses import dataclass -from typing import Dict - - -@dataclass -class Template: - template_name: str - system_format: str - user_format: str - assistant_format: str - tool_format: str - function_format: str - observation_format: str - system: str - stop_word: str - - -template_dict: Dict[str, Template] = dict() - - -def register_template( - template_name, - system_format, - user_format, - assistant_format, - tool_format, - function_format, - observation_format, - system, - stop_word=None, -): - template_dict[template_name] = Template( - template_name=template_name, - system_format=system_format, - user_format=user_format, - assistant_format=assistant_format, - tool_format=tool_format, - function_format=function_format, - observation_format=observation_format, - system=system, - stop_word=stop_word, - ) - - -register_template( - template_name="default", - system_format="System: {content}\n\n", - user_format="User: {content}\nAssistant: ", - assistant_format="{content} {stop_token}", - tool_format="{content}", - function_format="{content}", - observation_format="Tool\n{content}\n", - system=None, - stop_word=None, -) - - -register_template( - template_name="qwen", - system_format="<|im_start|>system\n{content}<|im_end|>\n", - user_format="<|im_start|>user\n{content}<|im_end|>\n<|im_start|>assistant\n", - assistant_format="{content}<|im_end|>\n", - tool_format="{content}", - function_format="{content}", - observation_format="<|im_start|>tool\n{content}\n<|im_start|>assistant\n", - system="You are a helpful assistant.", - stop_word="<|im_end|>", -) From 13df7e527ca15829f462cc4a53bfe7d046b61e1c Mon Sep 17 00:00:00 2001 From: feng <471899214@qq.com> Date: Tue, 17 Mar 2026 23:49:47 +0800 Subject: [PATCH 5/9] add hf_tokenizer mapping --- validator/modules/llm_judge/__init__.py | 18 ++++++- validator/modules/llm_judge/constant.py | 66 +++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 validator/modules/llm_judge/constant.py diff --git a/validator/modules/llm_judge/__init__.py b/validator/modules/llm_judge/__init__.py index 0ce8bda..6655830 100644 --- a/validator/modules/llm_judge/__init__.py +++ b/validator/modules/llm_judge/__init__.py @@ -14,6 +14,7 @@ from typing import List, Dict, Any from validator.modules.llm_judge.prompt import get_prompt from validator.modules.llm_judge.utils import download_file +from validator.modules.llm_judge.constant import SUPPORTED_BASE_MODELS from validator.exceptions import LLMJudgeException, InvalidModelParametersException from peft import PeftModel from transformers import AutoTokenizer, AutoModelForCausalLM @@ -155,6 +156,18 @@ def _load_model(self, repo_id: str, revision: str = "main", max_params: int = No with open("judge/adapter_config.json", "r") as f: adapter_config = json.load(f) base_model = adapter_config["base_model_name_or_path"] + if base_model in SUPPORTED_BASE_MODELS: + logger.info( + f"LoRA's base model '{base_model}' is in SUPPORTED_BASE_MODELS. " + f"Using it for tokenizer." + ) + else: + logger.error( + f"LoRA's base model '{base_model}' is not in SUPPORTED_BASE_MODELS. " + f"Marking assignment as failed." 
+ ) + raise + self.hf_tokenizer = AutoTokenizer.from_pretrained( base_model, trust_remote_code=True, use_fast=True, padding_side="left" ) @@ -843,7 +856,10 @@ def validate(self, data: LLMJudgeInputData, **kwargs) -> LLMJudgeMetrics: self._load_model(data.hg_repo_id, data.revision, data.max_params) except InvalidModelParametersException as e: # lowest possible reward for invalid model parameters - logger.info(f"Invalid model parameters: {e}") + logger.error(f"Invalid model parameters: {e}") + return LLMJudgeMetrics(score=LOWEST_POSSIBLE_SCORE) + except Exception as e: + logger.error(f"Exception when load model: {e}") return LLMJudgeMetrics(score=LOWEST_POSSIBLE_SCORE) # Stage 1: Generate all responses diff --git a/validator/modules/llm_judge/constant.py b/validator/modules/llm_judge/constant.py new file mode 100644 index 0000000..772fb3e --- /dev/null +++ b/validator/modules/llm_judge/constant.py @@ -0,0 +1,66 @@ +SUPPORTED_BASE_MODELS = [ + # qwen3.5 + "Qwen/Qwen3.5-0.8B", + "Qwen/Qwen3.5-0.8B-Base", + "Qwen/Qwen3.5-2B", + "Qwen/Qwen3.5-2B-Base", + "Qwen/Qwen3.5-4B", + "Qwen/Qwen3.5-4B-Base", + "Qwen/Qwen3.5-9B", + "Qwen/Qwen3.5-9B-Base", + "Qwen/Qwen3.5-27B", + # qwen3 + "Qwen/Qwen3-0.6B", + "Qwen/Qwen3-1.7B", + "Qwen/Qwen3-4B", + "Qwen/Qwen3-4B-Base", + "Qwen/Qwen3-4B-Instruct-2507", + "Qwen/Qwen3-8B", + "Qwen/Qwen3-8B-Base", + "Qwen/Qwen3-14B", + "Qwen/Qwen3-14B-Base", + "Qwen/Qwen3-32B", + # qwen2.5 + "Qwen/Qwen2.5-0.5B", + "Qwen/Qwen2.5-0.5B-Instruct", + "Qwen/Qwen2.5-1.5B", + "Qwen/Qwen2.5-1.5B-Instruct", + "Qwen/Qwen2.5-3B", + "Qwen/Qwen2.5-3B-Instruct", + "Qwen/Qwen2.5-7B", + "Qwen/Qwen2.5-7B-Instruct", + "Qwen/Qwen2.5-14B", + "Qwen/Qwen2.5-14B-Instruct", + "Qwen/Qwen2.5-32B", + "Qwen/Qwen2.5-32B-Instruct", + "Qwen/Qwen2.5-72B", + "Qwen/Qwen2.5-72B-Instruct", + # yi 1.5 + "01-ai/Yi-1.5-6B", + "01-ai/Yi-1.5-6B-Chat", + "01-ai/Yi-1.5-9B", + "01-ai/Yi-1.5-9B-Chat", + "01-ai/Yi-1.5-34B", + "01-ai/Yi-1.5-34B-Chat", + # mistral + "mistralai/Mistral-7B-v0.3", + "mistralai/Mistral-7B-Instruct-v0.3", + "mistralai/Ministral-8B-Instruct-2410", + # gemma2 + "google/gemma-2-2b", + "google/gemma-2-9b", + "google/gemma-2-27b", + "google/gemma-2-2b-it", + "google/gemma-2-9b-it", + "google/gemma-2-27b-it", + # llama3 + "meta-llama/Meta-Llama-3-8B", + "meta-llama/Meta-Llama-3-8B-Instruct", + "meta-llama/Meta-Llama-3-70B", + "meta-llama/Meta-Llama-3-70B-Instruct", + # llama3.1 + "meta-llama/Meta-Llama-3.1-8B", + "meta-llama/Meta-Llama-3.1-8B-Instruct", + "meta-llama/Meta-Llama-3.1-70B", + "meta-llama/Meta-Llama-3.1-70B-Instruct", +] \ No newline at end of file From be3280d27f510216cc4427d10a470c7e48b0ac3e Mon Sep 17 00:00:00 2001 From: feng <471899214@qq.com> Date: Wed, 18 Mar 2026 11:52:19 +0800 Subject: [PATCH 6/9] only keep qwen3.5 model --- validator/modules/llm_judge/constant.py | 54 ------------------------- 1 file changed, 54 deletions(-) diff --git a/validator/modules/llm_judge/constant.py b/validator/modules/llm_judge/constant.py index 772fb3e..30efd9f 100644 --- a/validator/modules/llm_judge/constant.py +++ b/validator/modules/llm_judge/constant.py @@ -9,58 +9,4 @@ "Qwen/Qwen3.5-9B", "Qwen/Qwen3.5-9B-Base", "Qwen/Qwen3.5-27B", - # qwen3 - "Qwen/Qwen3-0.6B", - "Qwen/Qwen3-1.7B", - "Qwen/Qwen3-4B", - "Qwen/Qwen3-4B-Base", - "Qwen/Qwen3-4B-Instruct-2507", - "Qwen/Qwen3-8B", - "Qwen/Qwen3-8B-Base", - "Qwen/Qwen3-14B", - "Qwen/Qwen3-14B-Base", - "Qwen/Qwen3-32B", - # qwen2.5 - "Qwen/Qwen2.5-0.5B", - "Qwen/Qwen2.5-0.5B-Instruct", - "Qwen/Qwen2.5-1.5B", - "Qwen/Qwen2.5-1.5B-Instruct", - 
"Qwen/Qwen2.5-3B", - "Qwen/Qwen2.5-3B-Instruct", - "Qwen/Qwen2.5-7B", - "Qwen/Qwen2.5-7B-Instruct", - "Qwen/Qwen2.5-14B", - "Qwen/Qwen2.5-14B-Instruct", - "Qwen/Qwen2.5-32B", - "Qwen/Qwen2.5-32B-Instruct", - "Qwen/Qwen2.5-72B", - "Qwen/Qwen2.5-72B-Instruct", - # yi 1.5 - "01-ai/Yi-1.5-6B", - "01-ai/Yi-1.5-6B-Chat", - "01-ai/Yi-1.5-9B", - "01-ai/Yi-1.5-9B-Chat", - "01-ai/Yi-1.5-34B", - "01-ai/Yi-1.5-34B-Chat", - # mistral - "mistralai/Mistral-7B-v0.3", - "mistralai/Mistral-7B-Instruct-v0.3", - "mistralai/Ministral-8B-Instruct-2410", - # gemma2 - "google/gemma-2-2b", - "google/gemma-2-9b", - "google/gemma-2-27b", - "google/gemma-2-2b-it", - "google/gemma-2-9b-it", - "google/gemma-2-27b-it", - # llama3 - "meta-llama/Meta-Llama-3-8B", - "meta-llama/Meta-Llama-3-8B-Instruct", - "meta-llama/Meta-Llama-3-70B", - "meta-llama/Meta-Llama-3-70B-Instruct", - # llama3.1 - "meta-llama/Meta-Llama-3.1-8B", - "meta-llama/Meta-Llama-3.1-8B-Instruct", - "meta-llama/Meta-Llama-3.1-70B", - "meta-llama/Meta-Llama-3.1-70B-Instruct", ] \ No newline at end of file From ff452a59b205d9a455bfd059b9d0ca59a5ec7d8f Mon Sep 17 00:00:00 2001 From: feng <471899214@qq.com> Date: Fri, 20 Mar 2026 00:44:54 +0800 Subject: [PATCH 7/9] fix qwen3.5 model function calling bug. --- validator/modules/llm_judge/__init__.py | 59 ++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 6 deletions(-) diff --git a/validator/modules/llm_judge/__init__.py b/validator/modules/llm_judge/__init__.py index 6655830..311c0c4 100644 --- a/validator/modules/llm_judge/__init__.py +++ b/validator/modules/llm_judge/__init__.py @@ -151,7 +151,7 @@ def _load_model(self, repo_id: str, revision: str = "main", max_params: int = No repo_id=repo_id, local_dir="judge", revision=revision, - force_download=True, # Disable fallback to cached files + # force_download=True, # Disable fallback to cached files ) with open("judge/adapter_config.json", "r") as f: adapter_config = json.load(f) @@ -232,8 +232,16 @@ def _generate_response( }) messages += conversation["conversations"] + tools_for_template = conversation.get("tools", None) + try: + if isinstance(tools_for_template, str): + tools_for_template = json.loads(tools_for_template) + except Exception: + # leave tools_for_template as-is if parsing fails + pass template = self.hf_tokenizer.apply_chat_template( messages, + tools=tools_for_template, tokenize=False, add_generation_prompt=True, enable_thinking=False, @@ -556,10 +564,47 @@ def _load_jsonl_conversations( for msg in conversations: role = msg.get("role", "") content = msg.get("content", "").strip() - if ( - role in ["user", "assistant", "function_call"] - and content - ): + if not content: + continue + if role == "function_call": + # Convert function_call to assistant message with tool_calls + try: + call_data = json.loads(content) + tool_call_msg = { + "role": "assistant", + "tool_calls": [ + { + "id": f"call_{len(conversation_to_process)}", + "type": "function", + "function": { + "name": call_data.get("name", ""), + "arguments": call_data.get("arguments", {}) + }, + } + ], + } + conversation_to_process.append(tool_call_msg) + except (json.JSONDecodeError, KeyError): + # Fallback: treat as plain assistant message + conversation_to_process.append( + {"role": "assistant", "content": content} + ) + elif role == "observation": + # Convert observation to tool result message + # Find the last tool_call id to reference + tool_call_id = "call_0" + for prev_msg in reversed(conversation_to_process): + if prev_msg.get("role") == "assistant" and 
prev_msg.get("tool_calls"): + tool_call_id = prev_msg["tool_calls"][0]["id"] + break + conversation_to_process.append( + { + "role": "tool", + "tool_call_id": tool_call_id, + "content": content, + } + ) + elif role in ["user", "assistant"]: conversation_to_process.append( {"role": role, "content": content} ) @@ -567,7 +612,7 @@ def _load_jsonl_conversations( # Extract reference response (last assistant or function_call message) reference_response = None if conversation_to_process: - last_msg = conversation_to_process[-1] + last_msg = conversations[-1] if last_msg["role"] in ["assistant", "function_call"]: reference_response = last_msg["content"] conversation_to_process = conversation_to_process[:-1] @@ -591,6 +636,8 @@ def _load_jsonl_conversations( continue input_conversations_data["conversations"] = conversation_to_process + if tools_info is not None: + input_conversations_data["tools"] = tools_info input_conversations.append( { From 4ec68cfccdec3c295730e39b4fb63ff3b8f5004b Mon Sep 17 00:00:00 2001 From: feng <471899214@qq.com> Date: Fri, 20 Mar 2026 00:45:44 +0800 Subject: [PATCH 8/9] fix qwen3.5 model function calling bug. --- validator/modules/llm_judge/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validator/modules/llm_judge/__init__.py b/validator/modules/llm_judge/__init__.py index 311c0c4..5a76084 100644 --- a/validator/modules/llm_judge/__init__.py +++ b/validator/modules/llm_judge/__init__.py @@ -151,7 +151,7 @@ def _load_model(self, repo_id: str, revision: str = "main", max_params: int = No repo_id=repo_id, local_dir="judge", revision=revision, - # force_download=True, # Disable fallback to cached files + force_download=True, # Disable fallback to cached files ) with open("judge/adapter_config.json", "r") as f: adapter_config = json.load(f) From 6440edc12da9313813acb3e5783dad5d07963836 Mon Sep 17 00:00:00 2001 From: feng <471899214@qq.com> Date: Fri, 20 Mar 2026 23:42:20 +0800 Subject: [PATCH 9/9] fix some Tool call bug --- validator/modules/llm_judge/__init__.py | 40 +++++++++++++++-------- validator/modules/llm_judge/prompt.py | 42 +++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 13 deletions(-) diff --git a/validator/modules/llm_judge/__init__.py b/validator/modules/llm_judge/__init__.py index 5a76084..cb11487 100644 --- a/validator/modules/llm_judge/__init__.py +++ b/validator/modules/llm_judge/__init__.py @@ -12,11 +12,12 @@ from loguru import logger from huggingface_hub import HfApi from typing import List, Dict, Any -from validator.modules.llm_judge.prompt import get_prompt +from validator.modules.llm_judge.prompt import get_prompt,template_str from validator.modules.llm_judge.utils import download_file from validator.modules.llm_judge.constant import SUPPORTED_BASE_MODELS from validator.exceptions import LLMJudgeException, InvalidModelParametersException from peft import PeftModel +from jinja2 import Environment from transformers import AutoTokenizer, AutoModelForCausalLM from validator.modules.base import ( BaseValidationModule, @@ -556,7 +557,8 @@ def _load_jsonl_conversations( conversation_to_process = [] reference_response = None tools_info = None - + pending_tool_call_ids: list[str] = [] + tool_call_counter = 0 if "conversations" in json_data: conversations = json_data["conversations"] if isinstance(conversations, list) and conversations: @@ -567,14 +569,17 @@ def _load_jsonl_conversations( if not content: continue if role == "function_call": - # Convert function_call to assistant message with tool_calls + 
tool_call_counter += 1 + tool_call_id = f"call_{tool_call_counter}" + try: call_data = json.loads(content) tool_call_msg = { "role": "assistant", + "content": "", "tool_calls": [ { - "id": f"call_{len(conversation_to_process)}", + "id": tool_call_id, "type": "function", "function": { "name": call_data.get("name", ""), @@ -585,18 +590,19 @@ def _load_jsonl_conversations( } conversation_to_process.append(tool_call_msg) except (json.JSONDecodeError, KeyError): - # Fallback: treat as plain assistant message + tool_call_id = None conversation_to_process.append( {"role": "assistant", "content": content} ) + if tool_call_id: + pending_tool_call_ids.append(tool_call_id) + elif role == "observation": - # Convert observation to tool result message - # Find the last tool_call id to reference - tool_call_id = "call_0" - for prev_msg in reversed(conversation_to_process): - if prev_msg.get("role") == "assistant" and prev_msg.get("tool_calls"): - tool_call_id = prev_msg["tool_calls"][0]["id"] - break + if pending_tool_call_ids: + tool_call_id = pending_tool_call_ids.pop(0) + else: + tool_call_id = "call_unknown" + conversation_to_process.append( { "role": "tool", @@ -611,11 +617,19 @@ def _load_jsonl_conversations( # Extract reference response (last assistant or function_call message) reference_response = None + if conversation_to_process: last_msg = conversations[-1] - if last_msg["role"] in ["assistant", "function_call"]: + if last_msg["role"] in ["assistant"]: reference_response = last_msg["content"] conversation_to_process = conversation_to_process[:-1] + elif last_msg["role"] in ["function_call"]: + env = Environment(trim_blocks=True, lstrip_blocks=True) + conversation_template = env.from_string(template_str) + reference_response = conversation_template.render( + messages=[conversation_to_process[-1]], trim_blocks=True, + lstrip_blocks=True) + conversation_to_process = conversation_to_process[:-1] # Extract tools information if available (for function_call evaluation) if "tools" in json_data: diff --git a/validator/modules/llm_judge/prompt.py b/validator/modules/llm_judge/prompt.py index 726d3e7..5bbf37e 100644 --- a/validator/modules/llm_judge/prompt.py +++ b/validator/modules/llm_judge/prompt.py @@ -181,3 +181,45 @@ def function_call_ref_eval_prompt( tools=Tools, assistant_response=assistant_response, ) + +template_str= """{% for message in messages %} + +{% if message.role == "system" %} + +{{ message.content }} + + +{% elif message.role == "user" %} + +{{ message.content }} + + +{% elif message.role == "assistant" %} + + {% if message.tool_calls %} + + {% for tool in message.tool_calls %} + + {% set args = tool.function.arguments %} + {% if args is string %} + {% set args = args | from_json %} + {% endif %} + {% for key, value in args.items() %} +{{ value }} + {% endfor %} + + {% endfor %} + + {% else %} + +{{ message.content }} + + {% endif %} +{% elif message.role == "tool" %} + +{{ message.content }} + + +{% endif %} + +{% endfor %}""" \ No newline at end of file
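
Taken together, patches 3, 7 and 9 drop the hand-rolled prompt templates and instead convert ShareGPT-style `function_call`/`observation` turns into OpenAI-style `tool_calls`/`tool` messages, then let the tokenizer's own chat template render the prompt (with `tools=` and `enable_thinking=False`). The sketch below illustrates that flow outside the module; it is not part of the patches. The sample record, the `get_weather` tool, the simplified tool-call-id bookkeeping, and the choice of "Qwen/Qwen3-8B" as tokenizer are illustrative assumptions only.

# Illustrative sketch (not part of the patch series): convert a ShareGPT-style
# record with function_call/observation turns into tool_calls/tool messages
# and render it with the tokenizer's chat template, along the lines of what
# patches 7 and 9 do. Model choice and record contents are made-up examples.
import json
from transformers import AutoTokenizer

record = {
    "system": "You are a helpful assistant.",
    "tools": json.dumps([{
        "type": "function",
        "function": {
            "name": "get_weather",
            "parameters": {"type": "object",
                           "properties": {"city": {"type": "string"}}},
        },
    }]),
    "conversations": [
        {"role": "user", "content": "What's the weather in Paris?"},
        {"role": "function_call",
         "content": json.dumps({"name": "get_weather",
                                "arguments": {"city": "Paris"}})},
        {"role": "observation", "content": '{"temp_c": 18}'},
    ],
}

messages = [{"role": "system", "content": record["system"]}]
call_counter = 0
for msg in record["conversations"]:
    if msg["role"] == "function_call":
        # function_call -> assistant message carrying an OpenAI-style tool call
        call_counter += 1
        call = json.loads(msg["content"])
        messages.append({
            "role": "assistant",
            "content": "",
            "tool_calls": [{
                "id": f"call_{call_counter}",
                "type": "function",
                "function": {"name": call["name"],
                             "arguments": call["arguments"]},
            }],
        })
    elif msg["role"] == "observation":
        # observation -> tool result message (id bookkeeping from patch 9 omitted)
        messages.append({"role": "tool", "content": msg["content"]})
    else:
        messages.append(msg)

# Any tokenizer whose chat template understands tools would work here; a Qwen3
# checkpoint from the patch's SUPPORTED_BASE_MODELS list is used as an example.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B", trust_remote_code=True)
prompt = tokenizer.apply_chat_template(
    messages,
    tools=json.loads(record["tools"]),
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,  # non-thinking mode, matching the patch's generation settings
)
print(prompt)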