diff --git a/src/sre_agent/eval/common/mocks/__init__.py b/src/sre_agent/eval/common/mocks/__init__.py new file mode 100644 index 00000000..3a0bdd18 --- /dev/null +++ b/src/sre_agent/eval/common/mocks/__init__.py @@ -0,0 +1 @@ +"""Shared mocks for evaluation suites.""" diff --git a/src/sre_agent/eval/diagnosis_quality/mocks/cloudwatch.py b/src/sre_agent/eval/common/mocks/cloudwatch.py similarity index 76% rename from src/sre_agent/eval/diagnosis_quality/mocks/cloudwatch.py rename to src/sre_agent/eval/common/mocks/cloudwatch.py index 86a2b263..bc837cde 100644 --- a/src/sre_agent/eval/diagnosis_quality/mocks/cloudwatch.py +++ b/src/sre_agent/eval/common/mocks/cloudwatch.py @@ -1,15 +1,30 @@ -"""Mock CloudWatch tools for diagnosis quality evaluation.""" +"""Mock CloudWatch tools shared across evaluation suites.""" + +from typing import Protocol import opik from sre_agent.core.models import LogEntry, LogQueryResult -from sre_agent.eval.diagnosis_quality.mocks.runtime import MockToolRuntime MOCK_TIMESTAMP = "2026-01-01T00:00:00+00:00" +class _MockCloudWatchEntryLike(Protocol): + message: list[str] + + +class _CaseLike(Protocol): + mock_cloudwatch_entries: list[_MockCloudWatchEntryLike] + + +class MockRuntimeLike(Protocol): + """Structural type required by the shared CloudWatch mock.""" + + case: _CaseLike + + async def search_error_logs( - runtime: MockToolRuntime, + runtime: MockRuntimeLike, log_group: str, service_name: str, time_range_minutes: int, @@ -44,7 +59,7 @@ async def search_error_logs( ) -def _normalise_messages(runtime: MockToolRuntime) -> list[str]: +def _normalise_messages(runtime: MockRuntimeLike) -> list[str]: """Convert multiline fixture entries into non-empty log messages. Returns: diff --git a/src/sre_agent/eval/tool_call/mocks/slack.py b/src/sre_agent/eval/common/mocks/slack.py similarity index 93% rename from src/sre_agent/eval/tool_call/mocks/slack.py rename to src/sre_agent/eval/common/mocks/slack.py index 25dcc73f..bbeb4569 100644 --- a/src/sre_agent/eval/tool_call/mocks/slack.py +++ b/src/sre_agent/eval/common/mocks/slack.py @@ -1,4 +1,4 @@ -"""Mock Slack tools for tool call evaluation.""" +"""Mock Slack tools shared across evaluation suites.""" from typing import Any diff --git a/src/sre_agent/eval/diagnosis_quality/mocks/slack.py b/src/sre_agent/eval/diagnosis_quality/mocks/slack.py deleted file mode 100644 index 0c9aab73..00000000 --- a/src/sre_agent/eval/diagnosis_quality/mocks/slack.py +++ /dev/null @@ -1,33 +0,0 @@ -"""Mock Slack tools for diagnosis quality evaluation.""" - -from typing import Any - -import opik - -MOCK_THREAD_TS = "1800000000.1000" - - -async def conversations_add_message( - channel_id: str, - payload: str, - thread_ts: str | None, -) -> dict[str, Any]: - """Mock Slack conversations_add_message. - - Returns: - A mock Slack API response dict. - """ - span_input: dict[str, Any] = {"channel_id": channel_id, "payload": payload} - if thread_ts is not None: - span_input["thread_ts"] = thread_ts - - with opik.start_as_current_span( - name="conversations_add_message", - type="tool", - input=span_input, - metadata={"mocked": True, "provider": "slack"}, - ): - if thread_ts is None: - return {"ok": True, "channel": channel_id, "ts": MOCK_THREAD_TS} - - return {"ok": True, "channel": channel_id, "ts": thread_ts} diff --git a/src/sre_agent/eval/diagnosis_quality/mocks/toolset.py b/src/sre_agent/eval/diagnosis_quality/mocks/toolset.py index 21382627..29d9cf51 100644 --- a/src/sre_agent/eval/diagnosis_quality/mocks/toolset.py +++ b/src/sre_agent/eval/diagnosis_quality/mocks/toolset.py @@ -5,8 +5,8 @@ from pydantic_ai import FunctionToolset from sre_agent.core.models import LogQueryResult -from sre_agent.eval.diagnosis_quality.mocks import cloudwatch as cloudwatch_mocks -from sre_agent.eval.diagnosis_quality.mocks import slack as slack_mocks +from sre_agent.eval.common.mocks import cloudwatch as cloudwatch_mocks +from sre_agent.eval.common.mocks import slack as slack_mocks from sre_agent.eval.diagnosis_quality.mocks.runtime import MockToolRuntime diff --git a/src/sre_agent/eval/tool_call/mocks/cloudwatch.py b/src/sre_agent/eval/tool_call/mocks/cloudwatch.py deleted file mode 100644 index 0706f2a0..00000000 --- a/src/sre_agent/eval/tool_call/mocks/cloudwatch.py +++ /dev/null @@ -1,58 +0,0 @@ -"""Mock CloudWatch tools for tool call evaluation.""" - -import opik - -from sre_agent.core.models import LogEntry, LogQueryResult -from sre_agent.eval.tool_call.mocks.runtime import MockToolRuntime - -MOCK_TIMESTAMP = "2026-01-01T00:00:00+00:00" - - -async def search_error_logs( - runtime: MockToolRuntime, - log_group: str, - service_name: str, - time_range_minutes: int, -) -> LogQueryResult: - """Mock CloudWatch log lookup using case fixtures. - - Returns: - A LogQueryResult populated from case fixtures. - """ - with opik.start_as_current_span( - name="search_error_logs", - type="tool", - input={ - "log_group": log_group, - "service_name": service_name, - "time_range_minutes": time_range_minutes, - }, - metadata={"mocked": True, "provider": "cloudwatch"}, - ): - entries = [ - LogEntry( - timestamp=MOCK_TIMESTAMP, - message=message, - log_stream=None, - ) - for message in _normalise_messages(runtime) - ] - return LogQueryResult( - entries=entries, - log_group=log_group, - query=f"mock: search_error_logs service={service_name}", - ) - - -def _normalise_messages(runtime: MockToolRuntime) -> list[str]: - """Convert multiline fixture entries into non-empty log messages. - - Returns: - A list of non-empty log message strings. - """ - messages: list[str] = [] - for entry in runtime.case.mock_cloudwatch_entries: - message = "\n".join(line.rstrip("\n") for line in entry.message).strip() - if message: - messages.append(message) - return messages diff --git a/src/sre_agent/eval/tool_call/mocks/toolset.py b/src/sre_agent/eval/tool_call/mocks/toolset.py index f0efd754..ddaac246 100644 --- a/src/sre_agent/eval/tool_call/mocks/toolset.py +++ b/src/sre_agent/eval/tool_call/mocks/toolset.py @@ -5,8 +5,8 @@ from pydantic_ai import FunctionToolset from sre_agent.core.models import LogQueryResult -from sre_agent.eval.tool_call.mocks import cloudwatch as cloudwatch_mocks -from sre_agent.eval.tool_call.mocks import slack as slack_mocks +from sre_agent.eval.common.mocks import cloudwatch as cloudwatch_mocks +from sre_agent.eval.common.mocks import slack as slack_mocks from sre_agent.eval.tool_call.mocks.runtime import MockToolRuntime