From 8e4f1818e28d43dbcd54548330a2984a494cf7ca Mon Sep 17 00:00:00 2001
From: "qiao.cai" <qiao.cai@alibaba-inc.com>
Date: Thu, 2 Apr 2026 15:32:22 -0700
Subject: [PATCH] feat: add **kwargs to _aevaluate in agent and common
 graders

---
 .../graders/agent/action/action_alignment.py      |  1 +
 openjudge/graders/agent/action/action_loop.py     |  1 +
 .../observation/observation_information_gain.py   |  1 +
 .../graders/agent/tool/tool_call_accuracy.py      |  1 +
 .../tool/tool_call_precision_recall_match.py      |  1 +
 .../agent/tool/tool_call_step_sequence_match.py   |  1 +
 openjudge/graders/agent/tool/tool_call_success.py |  1 +
 .../graders/agent/tool/tool_parameter_check.py    |  1 +
 openjudge/graders/agent/tool/tool_selection.py    |  1 +
 .../agent/trajectory/trajectory_accuracy.py       |  1 +
 .../agent/trajectory/trajectory_comprehensive.py  |  1 +
 openjudge/graders/common/correctness.py           |  9 +++++++--
 openjudge/graders/common/hallucination.py         |  1 +
 openjudge/graders/common/harmfulness.py           |  3 ++-
 openjudge/graders/common/instruction_following.py |  3 ++-
 openjudge/graders/common/relevance.py             |  4 ++--
 openjudge/graders/common/search_correctness.py    |  4 ++--
 tests/models/test_minimax_chat_model.py           | 15 ++++-----------
 18 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/openjudge/graders/agent/action/action_alignment.py b/openjudge/graders/agent/action/action_alignment.py
index b0fad9223..d78a41eec 100644
--- a/openjudge/graders/agent/action/action_alignment.py
+++ b/openjudge/graders/agent/action/action_alignment.py
@@ -215,6 +215,7 @@ async def _aevaluate(
         action: str,
         history: Optional[List[Dict[str, Any]]] = None,
         context: Optional[str] = None,
+        **kwargs: Any,
     ) -> GraderScore:
         """
         Evaluate action alignment with plan
diff --git a/openjudge/graders/agent/action/action_loop.py b/openjudge/graders/agent/action/action_loop.py
index 77c080a28..057f2fb9d 100644
--- a/openjudge/graders/agent/action/action_loop.py
+++ b/openjudge/graders/agent/action/action_loop.py
@@ -50,6 +50,7 @@ def __init__(
     async def _aevaluate(
         self,
         messages: List[Dict[str, Any]],
+        **kwargs: Any,
     ) -> GraderScore:
         """
         Detect loops in action sequences by comparing all pairs of action signatures.
diff --git a/openjudge/graders/agent/observation/observation_information_gain.py b/openjudge/graders/agent/observation/observation_information_gain.py
index 85d547c63..1edacda6f 100644
--- a/openjudge/graders/agent/observation/observation_information_gain.py
+++ b/openjudge/graders/agent/observation/observation_information_gain.py
@@ -60,6 +60,7 @@ def __init__(
     async def _aevaluate(
         self,
         messages: List[Dict[str, Any]],
+        **kwargs: Any,
     ) -> GraderScore:
         """
         Evaluate information gain and redundancy in observation observations.
diff --git a/openjudge/graders/agent/tool/tool_call_accuracy.py b/openjudge/graders/agent/tool/tool_call_accuracy.py
index 526936a94..e20ca2fa4 100644
--- a/openjudge/graders/agent/tool/tool_call_accuracy.py
+++ b/openjudge/graders/agent/tool/tool_call_accuracy.py
@@ -280,6 +280,7 @@ async def _aevaluate(
         tool_definitions: Dict[str, Any] | List[Dict[str, Any]],
         tool_calls: Dict[str, Any] | List[Dict[str, Any]] | None = None,
         response: str | List[Dict[str, Any]] | None = None,
+        **kwargs: Any,
     ) -> GraderScore | GraderError:
         """
         Evaluate tool call accuracy
diff --git a/openjudge/graders/agent/tool/tool_call_precision_recall_match.py b/openjudge/graders/agent/tool/tool_call_precision_recall_match.py
index 64c93ed72..8832792e6 100644
--- a/openjudge/graders/agent/tool/tool_call_precision_recall_match.py
+++ b/openjudge/graders/agent/tool/tool_call_precision_recall_match.py
@@ -199,6 +199,7 @@ async def _aevaluate(
         self,
         tool_calls: List[Dict[str, Any]],
         reference_tool_calls: List[Dict[str, Any]],
+        **kwargs: Any,
     ) -> GraderScore | GraderError:
         """
         Evaluate tool call precision/recall against reference.
diff --git a/openjudge/graders/agent/tool/tool_call_step_sequence_match.py b/openjudge/graders/agent/tool/tool_call_step_sequence_match.py
index 519547285..9f3105aa2 100644
--- a/openjudge/graders/agent/tool/tool_call_step_sequence_match.py
+++ b/openjudge/graders/agent/tool/tool_call_step_sequence_match.py
@@ -358,6 +358,7 @@ async def _aevaluate(
         self,
         messages: List[Dict[str, Any]],
         reference_tool_calls: List[List[Dict[str, Any]]],
+        **kwargs: Any,
     ) -> GraderScore | GraderError:
         """
         Evaluate tool call sequence matching against reference.
diff --git a/openjudge/graders/agent/tool/tool_call_success.py b/openjudge/graders/agent/tool/tool_call_success.py
index ee97f1acb..8d251c8d4 100644
--- a/openjudge/graders/agent/tool/tool_call_success.py
+++ b/openjudge/graders/agent/tool/tool_call_success.py
@@ -267,6 +267,7 @@ async def _aevaluate(
         tool_definitions: Union[Dict[str, Any], List[Dict[str, Any]]],
         tool_calls: Union[Dict[str, Any], List[Dict[str, Any]]],
         tool_responses: Union[str, List[str]],
+        **kwargs: Any,
     ) -> GraderScore:
         """
         Evaluate tool call success
diff --git a/openjudge/graders/agent/tool/tool_parameter_check.py b/openjudge/graders/agent/tool/tool_parameter_check.py
index 66b3f9014..9fc0578a1 100644
--- a/openjudge/graders/agent/tool/tool_parameter_check.py
+++ b/openjudge/graders/agent/tool/tool_parameter_check.py
@@ -214,6 +214,7 @@ async def _aevaluate(
         query: Union[str, List[Dict[str, Any]]],
         tool_definitions: Union[Dict[str, Any], List[Dict[str, Any]]],
         tool_calls: Union[Dict[str, Any], List[Dict[str, Any]]],
+        **kwargs: Any,
     ) -> GraderScore:
         """
         Evaluate tool parameter extraction correctness
diff --git a/openjudge/graders/agent/tool/tool_selection.py b/openjudge/graders/agent/tool/tool_selection.py
index fb4b66913..8bac4130e 100644
--- a/openjudge/graders/agent/tool/tool_selection.py
+++ b/openjudge/graders/agent/tool/tool_selection.py
@@ -230,6 +230,7 @@ async def _aevaluate(
         query: Union[str, List[Dict[str, Any]]],
         tool_definitions: Union[Dict[str, Any], List[Dict[str, Any]]],
         tool_calls: Union[Dict[str, Any], List[Dict[str, Any]]],
+        **kwargs: Any,
     ) -> GraderScore:
         """
         Evaluate tool selection
diff --git a/openjudge/graders/agent/trajectory/trajectory_accuracy.py b/openjudge/graders/agent/trajectory/trajectory_accuracy.py
index 7dfb07533..5709ab738 100644
--- a/openjudge/graders/agent/trajectory/trajectory_accuracy.py
+++ b/openjudge/graders/agent/trajectory/trajectory_accuracy.py
@@ -270,6 +270,7 @@ def _format_messages(
     async def _aevaluate(
         self,
         messages: List[Dict[str, Any]],
+        **kwargs: Any,
     ) -> GraderScore | GraderError:
         """
         Evaluate trajectory accuracy
diff --git a/openjudge/graders/agent/trajectory/trajectory_comprehensive.py b/openjudge/graders/agent/trajectory/trajectory_comprehensive.py
index b2a9b388e..d8df784aa 100644
--- a/openjudge/graders/agent/trajectory/trajectory_comprehensive.py
+++ b/openjudge/graders/agent/trajectory/trajectory_comprehensive.py
@@ -545,6 +545,7 @@ async def _aevaluate(
         messages: List[Dict[str, Any]],
         query: Optional[str] = None,
         response: Optional[str | Dict[str, Any]] = None,
+        **kwargs: Any,
     ) -> GraderScore | GraderError:
         """
         Evaluate complete agent trajectory comprehensively.
diff --git a/openjudge/graders/common/correctness.py b/openjudge/graders/common/correctness.py
index 50df115ca..17a261607 100644
--- a/openjudge/graders/common/correctness.py
+++ b/openjudge/graders/common/correctness.py
@@ -7,7 +7,7 @@
 """
 
 import textwrap
-from typing import Optional
+from typing import Any, Optional
 
 from loguru import logger
 
@@ -312,7 +312,12 @@ def __init__(
         self.threshold = threshold
 
     async def _aevaluate(
-        self, query: str, response: str, context: str = "", reference_response: str = "", **kwargs
+        self,
+        query: str,
+        response: str,
+        context: str = "",
+        reference_response: str = "",
+        **kwargs: Any,
     ) -> GraderScore:
         """
         Evaluate correctness of response against reference response
diff --git a/openjudge/graders/common/hallucination.py b/openjudge/graders/common/hallucination.py
index 82787e477..0b3f547f8 100644
--- a/openjudge/graders/common/hallucination.py
+++ b/openjudge/graders/common/hallucination.py
@@ -300,6 +300,7 @@ async def _aevaluate(
         response: str,
         context: str = "",
         reference_response: str = "",
+        **kwargs: Any,
     ) -> GraderScore:
         """
         Evaluate hallucination in response
diff --git a/openjudge/graders/common/harmfulness.py b/openjudge/graders/common/harmfulness.py
index 6c3b77577..6186387ba 100644
--- a/openjudge/graders/common/harmfulness.py
+++ b/openjudge/graders/common/harmfulness.py
@@ -6,7 +6,7 @@
 """
 
 import textwrap
-from typing import Optional
+from typing import Any, Optional
 
 from loguru import logger
 
@@ -294,6 +294,7 @@ async def _aevaluate(
         response: str,
         context: str = "",
         reference_response: str = "",
+        **kwargs: Any,
     ) -> GraderScore:
         """
         Evaluate harmfulness of response
diff --git a/openjudge/graders/common/instruction_following.py b/openjudge/graders/common/instruction_following.py
index c4b581da8..ea175c8d3 100644
--- a/openjudge/graders/common/instruction_following.py
+++ b/openjudge/graders/common/instruction_following.py
@@ -7,7 +7,7 @@
 """
 
 import textwrap
-from typing import Optional
+from typing import Any, Optional
 
 from loguru import logger
 
@@ -309,6 +309,7 @@ async def _aevaluate(
         instruction: str,
         response: str,
         query: str = "",
+        **kwargs: Any,
     ) -> GraderScore:
         """
         Evaluate instruction following in response
diff --git a/openjudge/graders/common/relevance.py b/openjudge/graders/common/relevance.py
index 338741c1e..b718417f7 100644
--- a/openjudge/graders/common/relevance.py
+++ b/openjudge/graders/common/relevance.py
@@ -6,7 +6,7 @@
 """
 
 import textwrap
-from typing import Optional
+from typing import Any, Optional
 
 from loguru import logger
 
@@ -310,7 +310,7 @@ async def _aevaluate(
         response: str,
         context: str = "",
         reference_response: str = "",
-        **kwargs,
+        **kwargs: Any,
     ) -> GraderScore | GraderError:
         """
         Evaluate relevance of response to query
diff --git a/openjudge/graders/common/search_correctness.py b/openjudge/graders/common/search_correctness.py
index d4ebca79b..28471d569 100644
--- a/openjudge/graders/common/search_correctness.py
+++ b/openjudge/graders/common/search_correctness.py
@@ -8,7 +8,7 @@
 
 import os
 import textwrap
-from typing import Optional
+from typing import Any, Optional
 
 from loguru import logger
 
@@ -272,7 +272,7 @@ async def _aevaluate(
         self,
         query: str = "",
         response: str = "",
-        **kwargs,
+        **kwargs: Any,
     ) -> "GraderScore | GraderError":
         """Evaluate the factual accuracy of a response using web search.
 
diff --git a/tests/models/test_minimax_chat_model.py b/tests/models/test_minimax_chat_model.py
index cbbe5e497..454bac55b 100644
--- a/tests/models/test_minimax_chat_model.py
+++ b/tests/models/test_minimax_chat_model.py
@@ -14,7 +14,7 @@
 """
 
 import os
-from unittest.mock import AsyncMock, MagicMock, patch
+from unittest.mock import AsyncMock, patch
 
 import pytest
 
@@ -22,7 +22,6 @@
 from openjudge.models.minimax_chat_model import MINIMAX_MODELS, _strip_think_tags
 from openjudge.models.schema.oai.response import ChatResponse
 
-
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
@@ -247,9 +246,7 @@ class TestMiniMaxChatModelIntegration:
     @pytest.mark.asyncio
     async def test_basic_chat(self):
         model = MiniMaxChatModel(model="MiniMax-M2.7")
-        response = await model.achat(
-            messages=[{"role": "user", "content": "Reply with the single word: hello"}]
-        )
+        response = await model.achat(messages=[{"role": "user", "content": "Reply with the single word: hello"}])
         assert isinstance(response, ChatResponse)
         assert response.content
         # Think-tags should be stripped
@@ -258,16 +255,12 @@ async def test_basic_chat(self):
     @pytest.mark.asyncio
     async def test_temperature_clamping_does_not_error(self):
         model = MiniMaxChatModel(model="MiniMax-M2.7", temperature=0.0)
-        response = await model.achat(
-            messages=[{"role": "user", "content": "Say: ok"}]
-        )
+        response = await model.achat(messages=[{"role": "user", "content": "Say: ok"}])
         assert isinstance(response, ChatResponse)
 
     @pytest.mark.asyncio
     async def test_highspeed_model(self):
         model = MiniMaxChatModel(model="MiniMax-M2.7-highspeed")
-        response = await model.achat(
-            messages=[{"role": "user", "content": "Reply with the single word: hello"}]
-        )
+        response = await model.achat(messages=[{"role": "user", "content": "Reply with the single word: hello"}])
         assert isinstance(response, ChatResponse)
         assert response.content