Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions openjudge/graders/agent/action/action_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ async def _aevaluate(
action: str,
history: Optional[List[Dict[str, Any]]] = None,
context: Optional[str] = None,
**kwargs: Any,
) -> GraderScore:
"""
Evaluate action alignment with plan
Expand Down
1 change: 1 addition & 0 deletions openjudge/graders/agent/action/action_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def __init__(
async def _aevaluate(
self,
messages: List[Dict[str, Any]],
**kwargs: Any,
) -> GraderScore:
"""
Detect loops in action sequences by comparing all pairs of action signatures.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def __init__(
async def _aevaluate(
self,
messages: List[Dict[str, Any]],
**kwargs: Any,
) -> GraderScore:
"""
Evaluate information gain and redundancy in observations.
Expand Down
1 change: 1 addition & 0 deletions openjudge/graders/agent/tool/tool_call_accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@ async def _aevaluate(
tool_definitions: Dict[str, Any] | List[Dict[str, Any]],
tool_calls: Dict[str, Any] | List[Dict[str, Any]] | None = None,
response: str | List[Dict[str, Any]] | None = None,
**kwargs: Any,
) -> GraderScore | GraderError:
"""
Evaluate tool call accuracy
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ async def _aevaluate(
self,
tool_calls: List[Dict[str, Any]],
reference_tool_calls: List[Dict[str, Any]],
**kwargs: Any,
) -> GraderScore | GraderError:
"""
Evaluate tool call precision/recall against reference.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@ async def _aevaluate(
self,
messages: List[Dict[str, Any]],
reference_tool_calls: List[List[Dict[str, Any]]],
**kwargs: Any,
) -> GraderScore | GraderError:
"""
Evaluate tool call sequence matching against reference.
Expand Down
1 change: 1 addition & 0 deletions openjudge/graders/agent/tool/tool_call_success.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,7 @@ async def _aevaluate(
tool_definitions: Union[Dict[str, Any], List[Dict[str, Any]]],
tool_calls: Union[Dict[str, Any], List[Dict[str, Any]]],
tool_responses: Union[str, List[str]],
**kwargs: Any,
) -> GraderScore:
"""
Evaluate tool call success
Expand Down
1 change: 1 addition & 0 deletions openjudge/graders/agent/tool/tool_parameter_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@ async def _aevaluate(
query: Union[str, List[Dict[str, Any]]],
tool_definitions: Union[Dict[str, Any], List[Dict[str, Any]]],
tool_calls: Union[Dict[str, Any], List[Dict[str, Any]]],
**kwargs: Any,
) -> GraderScore:
"""
Evaluate tool parameter extraction correctness
Expand Down
1 change: 1 addition & 0 deletions openjudge/graders/agent/tool/tool_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ async def _aevaluate(
query: Union[str, List[Dict[str, Any]]],
tool_definitions: Union[Dict[str, Any], List[Dict[str, Any]]],
tool_calls: Union[Dict[str, Any], List[Dict[str, Any]]],
**kwargs: Any,
) -> GraderScore:
"""
Evaluate tool selection
Expand Down
1 change: 1 addition & 0 deletions openjudge/graders/agent/trajectory/trajectory_accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ def _format_messages(
async def _aevaluate(
self,
messages: List[Dict[str, Any]],
**kwargs: Any,
) -> GraderScore | GraderError:
"""
Evaluate trajectory accuracy
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -545,6 +545,7 @@ async def _aevaluate(
messages: List[Dict[str, Any]],
query: Optional[str] = None,
response: Optional[str | Dict[str, Any]] = None,
**kwargs: Any,
) -> GraderScore | GraderError:
"""
Evaluate complete agent trajectory comprehensively.
Expand Down
9 changes: 7 additions & 2 deletions openjudge/graders/common/correctness.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"""

import textwrap
from typing import Optional
from typing import Any, Optional

from loguru import logger

Expand Down Expand Up @@ -312,7 +312,12 @@ def __init__(
self.threshold = threshold

async def _aevaluate(
self, query: str, response: str, context: str = "", reference_response: str = "", **kwargs
self,
query: str,
response: str,
context: str = "",
reference_response: str = "",
**kwargs: Any,
) -> GraderScore:
"""
Evaluate correctness of response against reference response
Expand Down
1 change: 1 addition & 0 deletions openjudge/graders/common/hallucination.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ async def _aevaluate(
response: str,
context: str = "",
reference_response: str = "",
**kwargs: Any,
) -> GraderScore:
"""
Evaluate hallucination in response
Expand Down
3 changes: 2 additions & 1 deletion openjudge/graders/common/harmfulness.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"""

import textwrap
from typing import Optional
from typing import Any, Optional

from loguru import logger

Expand Down Expand Up @@ -294,6 +294,7 @@ async def _aevaluate(
response: str,
context: str = "",
reference_response: str = "",
**kwargs: Any,
) -> GraderScore:
"""
Evaluate harmfulness of response
Expand Down
3 changes: 2 additions & 1 deletion openjudge/graders/common/instruction_following.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"""

import textwrap
from typing import Optional
from typing import Any, Optional

from loguru import logger

Expand Down Expand Up @@ -309,6 +309,7 @@ async def _aevaluate(
instruction: str,
response: str,
query: str = "",
**kwargs: Any,
) -> GraderScore:
"""
Evaluate instruction following in response
Expand Down
4 changes: 2 additions & 2 deletions openjudge/graders/common/relevance.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"""

import textwrap
from typing import Optional
from typing import Any, Optional

from loguru import logger

Expand Down Expand Up @@ -310,7 +310,7 @@ async def _aevaluate(
response: str,
context: str = "",
reference_response: str = "",
**kwargs,
**kwargs: Any,
) -> GraderScore | GraderError:
"""
Evaluate relevance of response to query
Expand Down
4 changes: 2 additions & 2 deletions openjudge/graders/common/search_correctness.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import os
import textwrap
from typing import Optional
from typing import Any, Optional

from loguru import logger

Expand Down Expand Up @@ -272,7 +272,7 @@ async def _aevaluate(
self,
query: str = "",
response: str = "",
**kwargs,
**kwargs: Any,
) -> "GraderScore | GraderError":
"""Evaluate the factual accuracy of a response using web search.

Expand Down
15 changes: 4 additions & 11 deletions tests/models/test_minimax_chat_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,14 @@
"""

import os
from unittest.mock import AsyncMock, MagicMock, patch
from unittest.mock import AsyncMock, patch

import pytest

from openjudge.models import MiniMaxChatModel
from openjudge.models.minimax_chat_model import MINIMAX_MODELS, _strip_think_tags
from openjudge.models.schema.oai.response import ChatResponse


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -247,9 +246,7 @@ class TestMiniMaxChatModelIntegration:
@pytest.mark.asyncio
async def test_basic_chat(self):
model = MiniMaxChatModel(model="MiniMax-M2.7")
response = await model.achat(
messages=[{"role": "user", "content": "Reply with the single word: hello"}]
)
response = await model.achat(messages=[{"role": "user", "content": "Reply with the single word: hello"}])
assert isinstance(response, ChatResponse)
assert response.content
# Think-tags should be stripped
Expand All @@ -258,16 +255,12 @@ async def test_basic_chat(self):
@pytest.mark.asyncio
async def test_temperature_clamping_does_not_error(self):
model = MiniMaxChatModel(model="MiniMax-M2.7", temperature=0.0)
response = await model.achat(
messages=[{"role": "user", "content": "Say: ok"}]
)
response = await model.achat(messages=[{"role": "user", "content": "Say: ok"}])
assert isinstance(response, ChatResponse)

@pytest.mark.asyncio
async def test_highspeed_model(self):
model = MiniMaxChatModel(model="MiniMax-M2.7-highspeed")
response = await model.achat(
messages=[{"role": "user", "content": "Reply with the single word: hello"}]
)
response = await model.achat(messages=[{"role": "user", "content": "Reply with the single word: hello"}])
assert isinstance(response, ChatResponse)
assert response.content