Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,15 +117,15 @@ When running with the current stack, the flow is:

We built an evaluation suite to test both tool-use behaviour and diagnosis quality. You can find details here:

- [Evaluation overview](src/sre_agent/eval/README.md)
- [Tool call evaluation](src/sre_agent/eval/tool_call/README.md)
- [Diagnosis quality evaluation](src/sre_agent/eval/diagnosis_quality/README.md)
- [Evaluation overview](evals/README.md)
- [Tool call evaluation](evals/tool_call/README.md)
- [Diagnosis quality evaluation](evals/diagnosis_quality/README.md)

Run the suites with:

```bash
uv run sre-agent-run-tool-call-eval
uv run sre-agent-run-diagnosis-quality-eval
uv run python -m evals.tool_call.run
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let’s add `uv sync --group eval` above this to ensure Opik is installed before running the eval suites. We should also make it clear that Opik needs to be set up first.

Let's add the below:

"""
This assumes you already have Opik up and running. If not, please refer to the README in either of the eval suites for setup instructions. Once ready, run the following to install prerequisites:

export GITHUB_PERSONAL_ACCESS_TOKEN="..."
export ANTHROPIC_API_KEY="..."
uv sync --group eval

"""

uv run python -m evals.diagnosis_quality.run
```

# 🤔 Why We Built This
Expand Down
4 changes: 2 additions & 2 deletions src/sre_agent/eval/README.md → evals/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,5 +70,5 @@ When the server is running, open [http://localhost:5173/](http://localhost:5173/

For suite-specific details, see:

- `src/sre_agent/eval/tool_call/README.md`
- `src/sre_agent/eval/diagnosis_quality/README.md`
- `evals/tool_call/README.md`
- `evals/diagnosis_quality/README.md`
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Common helpers for evaluation suites."""

from sre_agent.eval.common.case_loader import load_json_case_models
from evals.common.case_loader import load_json_case_models

__all__ = ["load_json_case_models"]
File renamed without changes.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the command in the run section needs to be updated:

uv sync --group eval
uv run sre-agent-run-diagnosis-quality-eval

does not work.

Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@ The run is hybrid:

Test cases are loaded from:

- `src/sre_agent/eval/diagnosis_quality/dataset/test_cases`
- `evals/diagnosis_quality/dataset/test_cases`

Each case follows `DiagnosisQualityEvalCase` in:

- `src/sre_agent/eval/diagnosis_quality/dataset/schema.py`
- `evals/diagnosis_quality/dataset/schema.py`

Key fields:

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

from opik import Opik

from sre_agent.eval.common.case_loader import load_json_case_models
from sre_agent.eval.diagnosis_quality.dataset.schema import DiagnosisQualityEvalCase
from evals.common.case_loader import load_json_case_models
from evals.diagnosis_quality.dataset.schema import DiagnosisQualityEvalCase

DEFAULT_DATASET_NAME = "sre-agent-diagnosis-quality"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,27 +9,27 @@
from opik.evaluation.evaluation_result import EvaluationResult
from pydantic_ai import Agent

from sre_agent.core.models import ErrorDiagnosis
from sre_agent.core.prompts import SYSTEM_PROMPT
from sre_agent.eval.diagnosis_quality.config import (
from evals.diagnosis_quality.config import (
DEFAULT_EXPERIMENT_NAME,
DEFAULT_JUDGE_MODEL,
DEFAULT_MODEL,
DEFAULT_OPIK_PROJECT_NAME,
)
from sre_agent.eval.diagnosis_quality.dataset.create_and_populate import (
from evals.diagnosis_quality.dataset.create_and_populate import (
DEFAULT_DATASET_NAME,
create_and_populate_dataset,
)
from sre_agent.eval.diagnosis_quality.dataset.schema import DiagnosisQualityEvalCase
from sre_agent.eval.diagnosis_quality.github_toolset import build_github_toolset
from sre_agent.eval.diagnosis_quality.metrics import (
from evals.diagnosis_quality.dataset.schema import DiagnosisQualityEvalCase
from evals.diagnosis_quality.github_toolset import build_github_toolset
from evals.diagnosis_quality.metrics import (
AffectedServicesMatch,
RootCauseCorrectness,
SuggestedFixesQuality,
)
from sre_agent.eval.diagnosis_quality.mocks import MockToolRuntime, build_mock_toolset
from sre_agent.eval.diagnosis_quality.prompts import render_agent_prompt
from evals.diagnosis_quality.mocks import MockToolRuntime, build_mock_toolset
from evals.diagnosis_quality.prompts import render_agent_prompt
from sre_agent.core.models import ErrorDiagnosis
from sre_agent.core.prompts import SYSTEM_PROMPT


def evaluation_task(dataset_item: dict[str, Any]) -> dict[str, Any]:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
"""Metrics for diagnosis quality evaluation."""

from sre_agent.eval.diagnosis_quality.metrics.affected_services_match import (
from evals.diagnosis_quality.metrics.affected_services_match import (
AffectedServicesMatch,
)
from sre_agent.eval.diagnosis_quality.metrics.root_cause_correctness import (
from evals.diagnosis_quality.metrics.root_cause_correctness import (
RootCauseCorrectness,
)
from sre_agent.eval.diagnosis_quality.metrics.suggested_fixes_quality import (
from evals.diagnosis_quality.metrics.suggested_fixes_quality import (
SuggestedFixesQuality,
)

Expand Down
9 changes: 9 additions & 0 deletions evals/diagnosis_quality/mocks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""Mock tools for diagnosis quality evaluation."""

from evals.diagnosis_quality.mocks.runtime import MockToolRuntime
from evals.diagnosis_quality.mocks.toolset import build_mock_toolset

__all__ = [
"MockToolRuntime",
"build_mock_toolset",
]
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

import opik

from evals.diagnosis_quality.mocks.runtime import MockToolRuntime
from sre_agent.core.models import LogEntry, LogQueryResult
from sre_agent.eval.diagnosis_quality.mocks.runtime import MockToolRuntime

MOCK_TIMESTAMP = "2026-01-01T00:00:00+00:00"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from dataclasses import dataclass

from sre_agent.eval.diagnosis_quality.dataset.schema import DiagnosisQualityEvalCase
from evals.diagnosis_quality.dataset.schema import DiagnosisQualityEvalCase


@dataclass
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@

from pydantic_ai import FunctionToolset

from evals.diagnosis_quality.mocks import cloudwatch as cloudwatch_mocks
from evals.diagnosis_quality.mocks import slack as slack_mocks
from evals.diagnosis_quality.mocks.runtime import MockToolRuntime
from sre_agent.core.models import LogQueryResult
from sre_agent.eval.diagnosis_quality.mocks import cloudwatch as cloudwatch_mocks
from sre_agent.eval.diagnosis_quality.mocks import slack as slack_mocks
from sre_agent.eval.diagnosis_quality.mocks.runtime import MockToolRuntime


def build_mock_toolset(runtime: MockToolRuntime) -> FunctionToolset:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
"""Prompt rendering for diagnosis quality evaluation."""

from sre_agent.core.prompts import DIAGNOSIS_PROMPT_TEMPLATE
from sre_agent.eval.diagnosis_quality.config import (
from evals.diagnosis_quality.config import (
DEFAULT_SLACK_CHANNEL_ID,
DEFAULT_TIME_RANGE_MINUTES,
)
from sre_agent.eval.diagnosis_quality.dataset.schema import DiagnosisQualityEvalCase
from evals.diagnosis_quality.dataset.schema import DiagnosisQualityEvalCase
from sre_agent.core.prompts import DIAGNOSIS_PROMPT_TEMPLATE


def render_agent_prompt(case: DiagnosisQualityEvalCase) -> str:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

from pydantic_ai.exceptions import UserError

from sre_agent.eval.diagnosis_quality.config import DEFAULT_EXPERIMENT_NAME
from sre_agent.eval.diagnosis_quality.dataset.create_and_populate import DEFAULT_DATASET_NAME
from sre_agent.eval.diagnosis_quality.experiment import run_experiment
from evals.diagnosis_quality.config import DEFAULT_EXPERIMENT_NAME
from evals.diagnosis_quality.dataset.create_and_populate import DEFAULT_DATASET_NAME
from evals.diagnosis_quality.experiment import run_experiment


def main() -> None:
Expand Down
File renamed without changes
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the command in the run section needs to be updated:

uv sync --group eval
uv run sre-agent-run-tool-call-eval

does not work.

Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,11 @@ The run is hybrid:

Test cases are loaded from:

- `src/sre_agent/eval/tool_call/dataset/test_cases`
- `evals/tool_call/dataset/test_cases`

Each case follows `ToolCallEvalCase` in:

- `src/sre_agent/eval/tool_call/dataset/schema.py`
- `evals/tool_call/dataset/schema.py`

Key fields:

Expand Down
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
"""Dataset for tool call evaluation."""

from sre_agent.eval.tool_call.dataset.create_and_populate import (
from evals.tool_call.dataset.create_and_populate import (
DEFAULT_DATASET_NAME,
create_and_populate_dataset,
)
from sre_agent.eval.tool_call.dataset.schema import ToolCallEvalCase
from evals.tool_call.dataset.schema import ToolCallEvalCase

__all__ = ["create_and_populate_dataset", "ToolCallEvalCase", "DEFAULT_DATASET_NAME"]
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

from opik import Opik

from sre_agent.eval.common.case_loader import load_json_case_models
from sre_agent.eval.tool_call.dataset.schema import ToolCallEvalCase
from evals.common.case_loader import load_json_case_models
from evals.tool_call.dataset.schema import ToolCallEvalCase

DEFAULT_DATASET_NAME = "sre-agent-tool-call"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,25 +9,25 @@
from opik.evaluation.evaluation_result import EvaluationResult
from pydantic_ai import Agent

from sre_agent.core.models import ErrorDiagnosis
from sre_agent.core.prompts import SYSTEM_PROMPT
from sre_agent.eval.tool_call.config import (
from evals.tool_call.config import (
DEFAULT_EXPERIMENT_NAME,
DEFAULT_MODEL,
DEFAULT_OPIK_PROJECT_NAME,
)
from sre_agent.eval.tool_call.dataset.create_and_populate import (
from evals.tool_call.dataset.create_and_populate import (
DEFAULT_DATASET_NAME,
create_and_populate_dataset,
)
from sre_agent.eval.tool_call.dataset.schema import ToolCallEvalCase
from sre_agent.eval.tool_call.github_toolset import build_github_toolset
from sre_agent.eval.tool_call.metrics.expected_tool_select_order import (
from evals.tool_call.dataset.schema import ToolCallEvalCase
from evals.tool_call.github_toolset import build_github_toolset
from evals.tool_call.metrics.expected_tool_select_order import (
ExpectedToolSelectOrder,
)
from sre_agent.eval.tool_call.metrics.expected_tool_selection import ExpectedToolSelection
from sre_agent.eval.tool_call.mocks import MockToolRuntime, build_mock_toolset
from sre_agent.eval.tool_call.prompts import render_agent_prompt
from evals.tool_call.metrics.expected_tool_selection import ExpectedToolSelection
from evals.tool_call.mocks import MockToolRuntime, build_mock_toolset
from evals.tool_call.prompts import render_agent_prompt
from sre_agent.core.models import ErrorDiagnosis
from sre_agent.core.prompts import SYSTEM_PROMPT


def evaluation_task(dataset_item: dict[str, Any]) -> dict[str, Any]:
Expand Down
6 changes: 6 additions & 0 deletions evals/tool_call/metrics/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""Metrics for tool call evaluation."""

from evals.tool_call.metrics.expected_tool_select_order import ExpectedToolSelectOrder
from evals.tool_call.metrics.expected_tool_selection import ExpectedToolSelection

__all__ = ["ExpectedToolSelection", "ExpectedToolSelectOrder"]
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from opik.evaluation.metrics import base_metric, score_result
from opik.message_processing.emulation.models import SpanModel

from sre_agent.eval.tool_call.metrics.span_tools import extract_tool_names
from evals.tool_call.metrics.span_tools import extract_tool_names


class ExpectedToolSelectOrder(base_metric.BaseMetric): # type: ignore[misc]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from opik.evaluation.metrics import base_metric, score_result
from opik.message_processing.emulation.models import SpanModel

from sre_agent.eval.tool_call.metrics.span_tools import extract_tool_names
from evals.tool_call.metrics.span_tools import extract_tool_names


class ExpectedToolSelection(base_metric.BaseMetric): # type: ignore[misc]
Expand Down
6 changes: 6 additions & 0 deletions evals/tool_call/mocks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""Mock tools for tool call evaluation."""

from evals.tool_call.mocks.runtime import MockToolRuntime
from evals.tool_call.mocks.toolset import build_mock_toolset

__all__ = ["MockToolRuntime", "build_mock_toolset"]
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

import opik

from evals.tool_call.mocks.runtime import MockToolRuntime
from sre_agent.core.models import LogEntry, LogQueryResult
from sre_agent.eval.tool_call.mocks.runtime import MockToolRuntime

MOCK_TIMESTAMP = "2026-01-01T00:00:00+00:00"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from dataclasses import dataclass

from sre_agent.eval.tool_call.dataset.schema import ToolCallEvalCase
from evals.tool_call.dataset.schema import ToolCallEvalCase


@dataclass
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@

from pydantic_ai import FunctionToolset

from evals.tool_call.mocks import cloudwatch as cloudwatch_mocks
from evals.tool_call.mocks import slack as slack_mocks
from evals.tool_call.mocks.runtime import MockToolRuntime
from sre_agent.core.models import LogQueryResult
from sre_agent.eval.tool_call.mocks import cloudwatch as cloudwatch_mocks
from sre_agent.eval.tool_call.mocks import slack as slack_mocks
from sre_agent.eval.tool_call.mocks.runtime import MockToolRuntime


def build_mock_toolset(runtime: MockToolRuntime) -> FunctionToolset:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
"""Prompt rendering for tool call evaluation."""

from sre_agent.core.prompts import DIAGNOSIS_PROMPT_TEMPLATE
from sre_agent.eval.tool_call.config import (
from evals.tool_call.config import (
DEFAULT_SLACK_CHANNEL_ID,
DEFAULT_TIME_RANGE_MINUTES,
)
from sre_agent.eval.tool_call.dataset.schema import ToolCallEvalCase
from evals.tool_call.dataset.schema import ToolCallEvalCase
from sre_agent.core.prompts import DIAGNOSIS_PROMPT_TEMPLATE


def render_agent_prompt(case: ToolCallEvalCase) -> str:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

from pydantic_ai.exceptions import UserError

from sre_agent.eval.tool_call.config import DEFAULT_EXPERIMENT_NAME
from sre_agent.eval.tool_call.dataset.create_and_populate import DEFAULT_DATASET_NAME
from sre_agent.eval.tool_call.experiment import run_experiment
from evals.tool_call.config import DEFAULT_EXPERIMENT_NAME
from evals.tool_call.dataset.create_and_populate import DEFAULT_DATASET_NAME
from evals.tool_call.experiment import run_experiment


def main() -> None:
Expand Down
2 changes: 0 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@ dependencies = [

[project.scripts]
sre-agent = "sre_agent.cli.main:main"
sre-agent-run-tool-call-eval = "sre_agent.eval.tool_call.run:main"
sre-agent-run-diagnosis-quality-eval = "sre_agent.eval.diagnosis_quality.run:main"

[dependency-groups]
dev = [
Expand Down
9 changes: 0 additions & 9 deletions src/sre_agent/eval/diagnosis_quality/mocks/__init__.py

This file was deleted.

6 changes: 0 additions & 6 deletions src/sre_agent/eval/tool_call/metrics/__init__.py

This file was deleted.

6 changes: 0 additions & 6 deletions src/sre_agent/eval/tool_call/mocks/__init__.py

This file was deleted.