Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions openjudge/graders/agent/action/action_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ async def _aevaluate(
action: str,
history: Optional[List[Dict[str, Any]]] = None,
context: Optional[str] = None,
**kwargs: Any,
) -> GraderScore:
"""
Evaluate action alignment with plan
Expand Down
1 change: 1 addition & 0 deletions openjudge/graders/agent/action/action_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def __init__(
async def _aevaluate(
self,
messages: List[Dict[str, Any]],
**kwargs: Any,
) -> GraderScore:
"""
Detect loops in action sequences by comparing all pairs of action signatures.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def __init__(
async def _aevaluate(
self,
messages: List[Dict[str, Any]],
**kwargs: Any,
) -> GraderScore:
"""
Evaluate information gain and redundancy in observations.
Expand Down
1 change: 1 addition & 0 deletions openjudge/graders/agent/tool/tool_call_accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@ async def _aevaluate(
tool_definitions: Dict[str, Any] | List[Dict[str, Any]],
tool_calls: Dict[str, Any] | List[Dict[str, Any]] | None = None,
response: str | List[Dict[str, Any]] | None = None,
**kwargs: Any,
) -> GraderScore | GraderError:
"""
Evaluate tool call accuracy
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ async def _aevaluate(
self,
tool_calls: List[Dict[str, Any]],
reference_tool_calls: List[Dict[str, Any]],
**kwargs: Any,
) -> GraderScore | GraderError:
"""
Evaluate tool call precision/recall against reference.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@ async def _aevaluate(
self,
messages: List[Dict[str, Any]],
reference_tool_calls: List[List[Dict[str, Any]]],
**kwargs: Any,
) -> GraderScore | GraderError:
"""
Evaluate tool call sequence matching against reference.
Expand Down
1 change: 1 addition & 0 deletions openjudge/graders/agent/tool/tool_call_success.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,7 @@ async def _aevaluate(
tool_definitions: Union[Dict[str, Any], List[Dict[str, Any]]],
tool_calls: Union[Dict[str, Any], List[Dict[str, Any]]],
tool_responses: Union[str, List[str]],
**kwargs: Any,
) -> GraderScore:
"""
Evaluate tool call success
Expand Down
1 change: 1 addition & 0 deletions openjudge/graders/agent/tool/tool_parameter_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@ async def _aevaluate(
query: Union[str, List[Dict[str, Any]]],
tool_definitions: Union[Dict[str, Any], List[Dict[str, Any]]],
tool_calls: Union[Dict[str, Any], List[Dict[str, Any]]],
**kwargs: Any,
) -> GraderScore:
"""
Evaluate tool parameter extraction correctness
Expand Down
1 change: 1 addition & 0 deletions openjudge/graders/agent/tool/tool_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ async def _aevaluate(
query: Union[str, List[Dict[str, Any]]],
tool_definitions: Union[Dict[str, Any], List[Dict[str, Any]]],
tool_calls: Union[Dict[str, Any], List[Dict[str, Any]]],
**kwargs: Any,
) -> GraderScore:
"""
Evaluate tool selection
Expand Down
1 change: 1 addition & 0 deletions openjudge/graders/agent/trajectory/trajectory_accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ def _format_messages(
async def _aevaluate(
self,
messages: List[Dict[str, Any]],
**kwargs: Any,
) -> GraderScore | GraderError:
"""
Evaluate trajectory accuracy
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -545,6 +545,7 @@ async def _aevaluate(
messages: List[Dict[str, Any]],
query: Optional[str] = None,
response: Optional[str | Dict[str, Any]] = None,
**kwargs: Any,
) -> GraderScore | GraderError:
"""
Evaluate complete agent trajectory comprehensively.
Expand Down
9 changes: 7 additions & 2 deletions openjudge/graders/common/correctness.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"""

import textwrap
from typing import Optional
from typing import Any, Optional

from loguru import logger

Expand Down Expand Up @@ -312,7 +312,12 @@ def __init__(
self.threshold = threshold

async def _aevaluate(
self, query: str, response: str, context: str = "", reference_response: str = "", **kwargs
self,
query: str,
response: str,
context: str = "",
reference_response: str = "",
**kwargs: Any,
) -> GraderScore:
"""
Evaluate correctness of response against reference response
Expand Down
1 change: 1 addition & 0 deletions openjudge/graders/common/hallucination.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ async def _aevaluate(
response: str,
context: str = "",
reference_response: str = "",
**kwargs: Any,
) -> GraderScore:
"""
Evaluate hallucination in response
Expand Down
3 changes: 2 additions & 1 deletion openjudge/graders/common/harmfulness.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"""

import textwrap
from typing import Optional
from typing import Any, Optional

from loguru import logger

Expand Down Expand Up @@ -294,6 +294,7 @@ async def _aevaluate(
response: str,
context: str = "",
reference_response: str = "",
**kwargs: Any,
) -> GraderScore:
"""
Evaluate harmfulness of response
Expand Down
3 changes: 2 additions & 1 deletion openjudge/graders/common/instruction_following.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"""

import textwrap
from typing import Optional
from typing import Any, Optional

from loguru import logger

Expand Down Expand Up @@ -309,6 +309,7 @@ async def _aevaluate(
instruction: str,
response: str,
query: str = "",
**kwargs: Any,
) -> GraderScore:
"""
Evaluate instruction following in response
Expand Down
4 changes: 2 additions & 2 deletions openjudge/graders/common/relevance.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"""

import textwrap
from typing import Optional
from typing import Any, Optional

from loguru import logger

Expand Down Expand Up @@ -310,7 +310,7 @@ async def _aevaluate(
response: str,
context: str = "",
reference_response: str = "",
**kwargs,
**kwargs: Any,
) -> GraderScore | GraderError:
"""
Evaluate relevance of response to query
Expand Down
4 changes: 2 additions & 2 deletions openjudge/graders/common/search_correctness.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import os
import textwrap
from typing import Optional
from typing import Any, Optional

from loguru import logger

Expand Down Expand Up @@ -272,7 +272,7 @@ async def _aevaluate(
self,
query: str = "",
response: str = "",
**kwargs,
**kwargs: Any,
) -> "GraderScore | GraderError":
"""Evaluate the factual accuracy of a response using web search.

Expand Down
15 changes: 4 additions & 11 deletions tests/models/test_minimax_chat_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,14 @@
"""

import os
from unittest.mock import AsyncMock, MagicMock, patch
from unittest.mock import AsyncMock, patch

import pytest

from openjudge.models import MiniMaxChatModel
from openjudge.models.minimax_chat_model import MINIMAX_MODELS, _strip_think_tags
from openjudge.models.schema.oai.response import ChatResponse


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -247,9 +246,7 @@ class TestMiniMaxChatModelIntegration:
@pytest.mark.asyncio
async def test_basic_chat(self):
model = MiniMaxChatModel(model="MiniMax-M2.7")
response = await model.achat(
messages=[{"role": "user", "content": "Reply with the single word: hello"}]
)
response = await model.achat(messages=[{"role": "user", "content": "Reply with the single word: hello"}])
assert isinstance(response, ChatResponse)
assert response.content
# Think-tags should be stripped
Expand All @@ -258,16 +255,12 @@ async def test_basic_chat(self):
@pytest.mark.asyncio
async def test_temperature_clamping_does_not_error(self):
model = MiniMaxChatModel(model="MiniMax-M2.7", temperature=0.0)
response = await model.achat(
messages=[{"role": "user", "content": "Say: ok"}]
)
response = await model.achat(messages=[{"role": "user", "content": "Say: ok"}])
assert isinstance(response, ChatResponse)

@pytest.mark.asyncio
async def test_highspeed_model(self):
model = MiniMaxChatModel(model="MiniMax-M2.7-highspeed")
response = await model.achat(
messages=[{"role": "user", "content": "Reply with the single word: hello"}]
)
response = await model.achat(messages=[{"role": "user", "content": "Reply with the single word: hello"}])
assert isinstance(response, ChatResponse)
assert response.content