From 8e4f1818e28d43dbcd54548330a2984a494cf7ca Mon Sep 17 00:00:00 2001 From: "qiao.cai" Date: Thu, 2 Apr 2026 15:32:22 -0700 Subject: [PATCH] feat: add kwargs in evaluate function of agent and common grader --- .../graders/agent/action/action_alignment.py | 1 + openjudge/graders/agent/action/action_loop.py | 1 + .../observation/observation_information_gain.py | 1 + .../graders/agent/tool/tool_call_accuracy.py | 1 + .../tool/tool_call_precision_recall_match.py | 1 + .../agent/tool/tool_call_step_sequence_match.py | 1 + openjudge/graders/agent/tool/tool_call_success.py | 1 + .../graders/agent/tool/tool_parameter_check.py | 1 + openjudge/graders/agent/tool/tool_selection.py | 1 + .../agent/trajectory/trajectory_accuracy.py | 1 + .../agent/trajectory/trajectory_comprehensive.py | 1 + openjudge/graders/common/correctness.py | 9 +++++++-- openjudge/graders/common/hallucination.py | 1 + openjudge/graders/common/harmfulness.py | 3 ++- openjudge/graders/common/instruction_following.py | 3 ++- openjudge/graders/common/relevance.py | 4 ++-- openjudge/graders/common/search_correctness.py | 4 ++-- tests/models/test_minimax_chat_model.py | 15 ++++----------- 18 files changed, 31 insertions(+), 19 deletions(-) diff --git a/openjudge/graders/agent/action/action_alignment.py b/openjudge/graders/agent/action/action_alignment.py index b0fad9223..d78a41eec 100644 --- a/openjudge/graders/agent/action/action_alignment.py +++ b/openjudge/graders/agent/action/action_alignment.py @@ -215,6 +215,7 @@ async def _aevaluate( action: str, history: Optional[List[Dict[str, Any]]] = None, context: Optional[str] = None, + **kwargs: Any, ) -> GraderScore: """ Evaluate action alignment with plan diff --git a/openjudge/graders/agent/action/action_loop.py b/openjudge/graders/agent/action/action_loop.py index 77c080a28..057f2fb9d 100644 --- a/openjudge/graders/agent/action/action_loop.py +++ b/openjudge/graders/agent/action/action_loop.py @@ -50,6 +50,7 @@ def __init__( async def _aevaluate( self, messages: List[Dict[str, Any]], + **kwargs: Any, ) -> GraderScore: """ Detect loops in action sequences by comparing all pairs of action signatures. diff --git a/openjudge/graders/agent/observation/observation_information_gain.py b/openjudge/graders/agent/observation/observation_information_gain.py index 85d547c63..1edacda6f 100644 --- a/openjudge/graders/agent/observation/observation_information_gain.py +++ b/openjudge/graders/agent/observation/observation_information_gain.py @@ -60,6 +60,7 @@ def __init__( async def _aevaluate( self, messages: List[Dict[str, Any]], + **kwargs: Any, ) -> GraderScore: """ Evaluate information gain and redundancy in observation observations. diff --git a/openjudge/graders/agent/tool/tool_call_accuracy.py b/openjudge/graders/agent/tool/tool_call_accuracy.py index 526936a94..e20ca2fa4 100644 --- a/openjudge/graders/agent/tool/tool_call_accuracy.py +++ b/openjudge/graders/agent/tool/tool_call_accuracy.py @@ -280,6 +280,7 @@ async def _aevaluate( tool_definitions: Dict[str, Any] | List[Dict[str, Any]], tool_calls: Dict[str, Any] | List[Dict[str, Any]] | None = None, response: str | List[Dict[str, Any]] | None = None, + **kwargs: Any, ) -> GraderScore | GraderError: """ Evaluate tool call accuracy diff --git a/openjudge/graders/agent/tool/tool_call_precision_recall_match.py b/openjudge/graders/agent/tool/tool_call_precision_recall_match.py index 64c93ed72..8832792e6 100644 --- a/openjudge/graders/agent/tool/tool_call_precision_recall_match.py +++ b/openjudge/graders/agent/tool/tool_call_precision_recall_match.py @@ -199,6 +199,7 @@ async def _aevaluate( self, tool_calls: List[Dict[str, Any]], reference_tool_calls: List[Dict[str, Any]], + **kwargs: Any, ) -> GraderScore | GraderError: """ Evaluate tool call precision/recall against reference. diff --git a/openjudge/graders/agent/tool/tool_call_step_sequence_match.py b/openjudge/graders/agent/tool/tool_call_step_sequence_match.py index 519547285..9f3105aa2 100644 --- a/openjudge/graders/agent/tool/tool_call_step_sequence_match.py +++ b/openjudge/graders/agent/tool/tool_call_step_sequence_match.py @@ -358,6 +358,7 @@ async def _aevaluate( self, messages: List[Dict[str, Any]], reference_tool_calls: List[List[Dict[str, Any]]], + **kwargs: Any, ) -> GraderScore | GraderError: """ Evaluate tool call sequence matching against reference. diff --git a/openjudge/graders/agent/tool/tool_call_success.py b/openjudge/graders/agent/tool/tool_call_success.py index ee97f1acb..8d251c8d4 100644 --- a/openjudge/graders/agent/tool/tool_call_success.py +++ b/openjudge/graders/agent/tool/tool_call_success.py @@ -267,6 +267,7 @@ async def _aevaluate( tool_definitions: Union[Dict[str, Any], List[Dict[str, Any]]], tool_calls: Union[Dict[str, Any], List[Dict[str, Any]]], tool_responses: Union[str, List[str]], + **kwargs: Any, ) -> GraderScore: """ Evaluate tool call success diff --git a/openjudge/graders/agent/tool/tool_parameter_check.py b/openjudge/graders/agent/tool/tool_parameter_check.py index 66b3f9014..9fc0578a1 100644 --- a/openjudge/graders/agent/tool/tool_parameter_check.py +++ b/openjudge/graders/agent/tool/tool_parameter_check.py @@ -214,6 +214,7 @@ async def _aevaluate( query: Union[str, List[Dict[str, Any]]], tool_definitions: Union[Dict[str, Any], List[Dict[str, Any]]], tool_calls: Union[Dict[str, Any], List[Dict[str, Any]]], + **kwargs: Any, ) -> GraderScore: """ Evaluate tool parameter extraction correctness diff --git a/openjudge/graders/agent/tool/tool_selection.py b/openjudge/graders/agent/tool/tool_selection.py index fb4b66913..8bac4130e 100644 --- a/openjudge/graders/agent/tool/tool_selection.py +++ b/openjudge/graders/agent/tool/tool_selection.py @@ -230,6 +230,7 @@ async def _aevaluate( query: Union[str, List[Dict[str, Any]]], tool_definitions: Union[Dict[str, Any], List[Dict[str, Any]]], tool_calls: Union[Dict[str, Any], List[Dict[str, Any]]], + **kwargs: Any, ) -> GraderScore: """ Evaluate tool selection diff --git a/openjudge/graders/agent/trajectory/trajectory_accuracy.py b/openjudge/graders/agent/trajectory/trajectory_accuracy.py index 7dfb07533..5709ab738 100644 --- a/openjudge/graders/agent/trajectory/trajectory_accuracy.py +++ b/openjudge/graders/agent/trajectory/trajectory_accuracy.py @@ -270,6 +270,7 @@ def _format_messages( async def _aevaluate( self, messages: List[Dict[str, Any]], + **kwargs: Any, ) -> GraderScore | GraderError: """ Evaluate trajectory accuracy diff --git a/openjudge/graders/agent/trajectory/trajectory_comprehensive.py b/openjudge/graders/agent/trajectory/trajectory_comprehensive.py index b2a9b388e..d8df784aa 100644 --- a/openjudge/graders/agent/trajectory/trajectory_comprehensive.py +++ b/openjudge/graders/agent/trajectory/trajectory_comprehensive.py @@ -545,6 +545,7 @@ async def _aevaluate( messages: List[Dict[str, Any]], query: Optional[str] = None, response: Optional[str | Dict[str, Any]] = None, + **kwargs: Any, ) -> GraderScore | GraderError: """ Evaluate complete agent trajectory comprehensively. diff --git a/openjudge/graders/common/correctness.py b/openjudge/graders/common/correctness.py index 50df115ca..17a261607 100644 --- a/openjudge/graders/common/correctness.py +++ b/openjudge/graders/common/correctness.py @@ -7,7 +7,7 @@ """ import textwrap -from typing import Optional +from typing import Any, Optional from loguru import logger @@ -312,7 +312,12 @@ def __init__( self.threshold = threshold async def _aevaluate( - self, query: str, response: str, context: str = "", reference_response: str = "", **kwargs + self, + query: str, + response: str, + context: str = "", + reference_response: str = "", + **kwargs: Any, ) -> GraderScore: """ Evaluate correctness of response against reference response diff --git a/openjudge/graders/common/hallucination.py b/openjudge/graders/common/hallucination.py index 82787e477..0b3f547f8 100644 --- a/openjudge/graders/common/hallucination.py +++ b/openjudge/graders/common/hallucination.py @@ -300,6 +300,7 @@ async def _aevaluate( response: str, context: str = "", reference_response: str = "", + **kwargs: Any, ) -> GraderScore: """ Evaluate hallucination in response diff --git a/openjudge/graders/common/harmfulness.py b/openjudge/graders/common/harmfulness.py index 6c3b77577..6186387ba 100644 --- a/openjudge/graders/common/harmfulness.py +++ b/openjudge/graders/common/harmfulness.py @@ -6,7 +6,7 @@ """ import textwrap -from typing import Optional +from typing import Any, Optional from loguru import logger @@ -294,6 +294,7 @@ async def _aevaluate( response: str, context: str = "", reference_response: str = "", + **kwargs: Any, ) -> GraderScore: """ Evaluate harmfulness of response diff --git a/openjudge/graders/common/instruction_following.py b/openjudge/graders/common/instruction_following.py index c4b581da8..ea175c8d3 100644 --- a/openjudge/graders/common/instruction_following.py +++ b/openjudge/graders/common/instruction_following.py @@ -7,7 +7,7 @@ """ import textwrap -from typing import Optional +from typing import Any, Optional from loguru import logger @@ -309,6 +309,7 @@ async def _aevaluate( instruction: str, response: str, query: str = "", + **kwargs: Any, ) -> GraderScore: """ Evaluate instruction following in response diff --git a/openjudge/graders/common/relevance.py b/openjudge/graders/common/relevance.py index 338741c1e..b718417f7 100644 --- a/openjudge/graders/common/relevance.py +++ b/openjudge/graders/common/relevance.py @@ -6,7 +6,7 @@ """ import textwrap -from typing import Optional +from typing import Any, Optional from loguru import logger @@ -310,7 +310,7 @@ async def _aevaluate( response: str, context: str = "", reference_response: str = "", - **kwargs, + **kwargs: Any, ) -> GraderScore | GraderError: """ Evaluate relevance of response to query diff --git a/openjudge/graders/common/search_correctness.py b/openjudge/graders/common/search_correctness.py index d4ebca79b..28471d569 100644 --- a/openjudge/graders/common/search_correctness.py +++ b/openjudge/graders/common/search_correctness.py @@ -8,7 +8,7 @@ import os import textwrap -from typing import Optional +from typing import Any, Optional from loguru import logger @@ -272,7 +272,7 @@ async def _aevaluate( self, query: str = "", response: str = "", - **kwargs, + **kwargs: Any, ) -> "GraderScore | GraderError": """Evaluate the factual accuracy of a response using web search. diff --git a/tests/models/test_minimax_chat_model.py b/tests/models/test_minimax_chat_model.py index cbbe5e497..454bac55b 100644 --- a/tests/models/test_minimax_chat_model.py +++ b/tests/models/test_minimax_chat_model.py @@ -14,7 +14,7 @@ """ import os -from unittest.mock import AsyncMock, MagicMock, patch +from unittest.mock import AsyncMock, patch import pytest @@ -22,7 +22,6 @@ from openjudge.models.minimax_chat_model import MINIMAX_MODELS, _strip_think_tags from openjudge.models.schema.oai.response import ChatResponse - # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- @@ -247,9 +246,7 @@ class TestMiniMaxChatModelIntegration: @pytest.mark.asyncio async def test_basic_chat(self): model = MiniMaxChatModel(model="MiniMax-M2.7") - response = await model.achat( - messages=[{"role": "user", "content": "Reply with the single word: hello"}] - ) + response = await model.achat(messages=[{"role": "user", "content": "Reply with the single word: hello"}]) assert isinstance(response, ChatResponse) assert response.content # Think-tags should be stripped @@ -258,16 +255,12 @@ async def test_basic_chat(self): @pytest.mark.asyncio async def test_temperature_clamping_does_not_error(self): model = MiniMaxChatModel(model="MiniMax-M2.7", temperature=0.0) - response = await model.achat( - messages=[{"role": "user", "content": "Say: ok"}] - ) + response = await model.achat(messages=[{"role": "user", "content": "Say: ok"}]) assert isinstance(response, ChatResponse) @pytest.mark.asyncio async def test_highspeed_model(self): model = MiniMaxChatModel(model="MiniMax-M2.7-highspeed") - response = await model.achat( - messages=[{"role": "user", "content": "Reply with the single word: hello"}] - ) + response = await model.achat(messages=[{"role": "user", "content": "Reply with the single word: hello"}]) assert isinstance(response, ChatResponse) assert response.content