Skip to content

Commit 04b9a9c

Browse files
committed
feat(sdk): add BenchmarkRun and AsyncBenchmarkRun classes
1 parent dd03616 commit 04b9a9c

File tree

8 files changed

+750
-0
lines changed

8 files changed

+750
-0
lines changed

src/runloop_api_client/sdk/_types.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from ..lib.polling import PollingConfig
66
from ..types.devboxes import DiskSnapshotListParams, DiskSnapshotUpdateParams
77
from ..types.scenarios import ScorerListParams, ScorerCreateParams, ScorerUpdateParams, ScorerValidateParams
8+
from ..types.benchmarks import RunListScenarioRunsParams
89
from ..types.input_context import InputContext
910
from ..types.scenario_view import ScenarioView
1011
from ..types.agent_list_params import AgentListParams
@@ -203,3 +204,8 @@ class ScenarioPreview(ScenarioView):
203204

204205
input_context: InputContextPreview # type: ignore[assignment]
205206
"""The input context for the Scenario."""
207+
208+
209+
# Benchmark Run params
210+
class SDKBenchmarkRunListScenarioRunsParams(RunListScenarioRunsParams, BaseRequestOptions):
    """Keyword arguments for listing the scenario runs of a benchmark run.

    Merges the fields of ``RunListScenarioRunsParams`` with those of
    ``BaseRequestOptions``; this class declares no keys of its own.
    """
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
"""AsyncBenchmarkRun resource class for asynchronous operations."""
2+
3+
from __future__ import annotations
4+
5+
from typing import List
6+
from typing_extensions import Unpack, override
7+
8+
from ..types import ScenarioRunView, BenchmarkRunView
9+
from ._types import BaseRequestOptions, LongRequestOptions, SDKBenchmarkRunListScenarioRunsParams
10+
from .._client import AsyncRunloop
11+
12+
13+
class AsyncBenchmarkRun:
14+
"""A benchmark run for evaluating agent performance across scenarios (async).
15+
16+
Provides async methods for monitoring run status, managing the run lifecycle,
17+
and accessing scenario run results. Obtain instances via
18+
``benchmark.run()`` or ``benchmark.list_runs()``.
19+
20+
Example:
21+
>>> benchmark = runloop.benchmark.from_id("bench-xxx")
22+
>>> run = await benchmark.run(run_name="evaluation-v1")
23+
>>> info = await run.get_info()
24+
>>> scenario_runs = await run.list_scenario_runs()
25+
"""
26+
27+
def __init__(self, client: AsyncRunloop, run_id: str, benchmark_id: str) -> None:
28+
"""Create an AsyncBenchmarkRun instance.
29+
30+
:param client: AsyncRunloop client instance
31+
:type client: AsyncRunloop
32+
:param run_id: Benchmark run ID
33+
:type run_id: str
34+
:param benchmark_id: Parent benchmark ID
35+
:type benchmark_id: str
36+
"""
37+
self._client = client
38+
self._id = run_id
39+
self._benchmark_id = benchmark_id
40+
41+
@override
42+
def __repr__(self) -> str:
43+
return f"<AsyncBenchmarkRun id={self._id!r}>"
44+
45+
@property
46+
def id(self) -> str:
47+
"""Return the benchmark run ID.
48+
49+
:return: Unique benchmark run ID
50+
:rtype: str
51+
"""
52+
return self._id
53+
54+
@property
55+
def benchmark_id(self) -> str:
56+
"""Return the parent benchmark ID.
57+
58+
:return: Parent benchmark ID
59+
:rtype: str
60+
"""
61+
return self._benchmark_id
62+
63+
async def get_info(
64+
self,
65+
**options: Unpack[BaseRequestOptions],
66+
) -> BenchmarkRunView:
67+
"""Retrieve current benchmark run status and metadata.
68+
69+
:param options: See :typeddict:`~runloop_api_client.sdk._types.BaseRequestOptions` for available options
70+
:return: Current benchmark run state info
71+
:rtype: BenchmarkRunView
72+
"""
73+
return await self._client.benchmarks.runs.retrieve(
74+
self._id,
75+
**options,
76+
)
77+
78+
async def cancel(
79+
self,
80+
**options: Unpack[LongRequestOptions],
81+
) -> BenchmarkRunView:
82+
"""Cancel the benchmark run.
83+
84+
Stops all running scenarios and marks the run as canceled.
85+
86+
:param options: See :typeddict:`~runloop_api_client.sdk._types.LongRequestOptions` for available options
87+
:return: Updated benchmark run state
88+
:rtype: BenchmarkRunView
89+
"""
90+
return await self._client.benchmarks.runs.cancel(
91+
self._id,
92+
**options,
93+
)
94+
95+
async def complete(
96+
self,
97+
**options: Unpack[LongRequestOptions],
98+
) -> BenchmarkRunView:
99+
"""Complete the benchmark run.
100+
101+
Marks the run as completed. Call this after all scenarios have finished.
102+
103+
:param options: See :typeddict:`~runloop_api_client.sdk._types.LongRequestOptions` for available options
104+
:return: Completed benchmark run state
105+
:rtype: BenchmarkRunView
106+
"""
107+
return await self._client.benchmarks.runs.complete(
108+
self._id,
109+
**options,
110+
)
111+
112+
async def list_scenario_runs(
113+
self,
114+
**params: Unpack[SDKBenchmarkRunListScenarioRunsParams],
115+
) -> List[ScenarioRunView]:
116+
"""List all scenario runs for this benchmark run.
117+
118+
:param params: See :typeddict:`~runloop_api_client.sdk._types.SDKBenchmarkRunListScenarioRunsParams` for available parameters
119+
:return: List of scenario run views
120+
:rtype: List[ScenarioRunView]
121+
"""
122+
page = self._client.benchmarks.runs.list_scenario_runs(
123+
self._id,
124+
**params,
125+
)
126+
return [item async for item in page]
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
"""BenchmarkRun resource class for synchronous operations."""
2+
3+
from __future__ import annotations
4+
5+
from typing import List
6+
from typing_extensions import Unpack, override
7+
8+
from ..types import ScenarioRunView, BenchmarkRunView
9+
from ._types import BaseRequestOptions, LongRequestOptions, SDKBenchmarkRunListScenarioRunsParams
10+
from .._client import Runloop
11+
12+
13+
class BenchmarkRun:
14+
"""A benchmark run for evaluating agent performance across scenarios.
15+
16+
Provides methods for monitoring run status, managing the run lifecycle,
17+
and accessing scenario run results. Obtain instances via
18+
``benchmark.run()`` or ``benchmark.list_runs()``.
19+
20+
Example:
21+
>>> benchmark = runloop.benchmark.from_id("bench-xxx")
22+
>>> run = benchmark.run(run_name="evaluation-v1")
23+
>>> info = run.get_info()
24+
>>> scenario_runs = run.list_scenario_runs()
25+
"""
26+
27+
def __init__(self, client: Runloop, run_id: str, benchmark_id: str) -> None:
28+
"""Create a BenchmarkRun instance.
29+
30+
:param client: Runloop client instance
31+
:type client: Runloop
32+
:param run_id: Benchmark run ID
33+
:type run_id: str
34+
:param benchmark_id: Parent benchmark ID
35+
:type benchmark_id: str
36+
"""
37+
self._client = client
38+
self._id = run_id
39+
self._benchmark_id = benchmark_id
40+
41+
@override
42+
def __repr__(self) -> str:
43+
return f"<BenchmarkRun id={self._id!r}>"
44+
45+
@property
46+
def id(self) -> str:
47+
"""Return the benchmark run ID.
48+
49+
:return: Unique benchmark run ID
50+
:rtype: str
51+
"""
52+
return self._id
53+
54+
@property
55+
def benchmark_id(self) -> str:
56+
"""Return the parent benchmark ID.
57+
58+
:return: Parent benchmark ID
59+
:rtype: str
60+
"""
61+
return self._benchmark_id
62+
63+
def get_info(
64+
self,
65+
**options: Unpack[BaseRequestOptions],
66+
) -> BenchmarkRunView:
67+
"""Retrieve current benchmark run status and metadata.
68+
69+
:param options: See :typeddict:`~runloop_api_client.sdk._types.BaseRequestOptions` for available options
70+
:return: Current benchmark run state info
71+
:rtype: BenchmarkRunView
72+
"""
73+
return self._client.benchmarks.runs.retrieve(
74+
self._id,
75+
**options,
76+
)
77+
78+
def cancel(
79+
self,
80+
**options: Unpack[LongRequestOptions],
81+
) -> BenchmarkRunView:
82+
"""Cancel the benchmark run.
83+
84+
Stops all running scenarios and marks the run as canceled.
85+
86+
:param options: See :typeddict:`~runloop_api_client.sdk._types.LongRequestOptions` for available options
87+
:return: Updated benchmark run state
88+
:rtype: BenchmarkRunView
89+
"""
90+
return self._client.benchmarks.runs.cancel(
91+
self._id,
92+
**options,
93+
)
94+
95+
def complete(
96+
self,
97+
**options: Unpack[LongRequestOptions],
98+
) -> BenchmarkRunView:
99+
"""Complete the benchmark run.
100+
101+
Marks the run as completed. Call this after all scenarios have finished.
102+
103+
:param options: See :typeddict:`~runloop_api_client.sdk._types.LongRequestOptions` for available options
104+
:return: Completed benchmark run state
105+
:rtype: BenchmarkRunView
106+
"""
107+
return self._client.benchmarks.runs.complete(
108+
self._id,
109+
**options,
110+
)
111+
112+
def list_scenario_runs(
113+
self,
114+
**params: Unpack[SDKBenchmarkRunListScenarioRunsParams],
115+
) -> List[ScenarioRunView]:
116+
"""List all scenario runs for this benchmark run.
117+
118+
:param params: See :typeddict:`~runloop_api_client.sdk._types.SDKBenchmarkRunListScenarioRunsParams` for available parameters
119+
:return: List of scenario run views
120+
:rtype: List[ScenarioRunView]
121+
"""
122+
page = self._client.benchmarks.runs.list_scenario_runs(
123+
self._id,
124+
**params,
125+
)
126+
return list(page)

tests/sdk/conftest.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,30 @@ class MockScenarioRunView:
129129
scoring_contract_result: object = None
130130

131131

132+
@dataclass
class MockBenchmarkRunView:
    """Stand-in for ``BenchmarkRunView`` used by the SDK tests.

    Every field has a default, so ``MockBenchmarkRunView()`` describes a run in
    the ``"running"`` state with no duration and no score.
    """

    # Run ID and the ID of the benchmark the run belongs to.
    id: str = "bench_run_123"
    benchmark_id: str = "bench_123"
    state: str = "running"
    # default_factory gives each instance its own dict instead of a shared one.
    metadata: Dict[str, str] = field(default_factory=dict)
    start_time_ms: int = 1234567890000
    # Both remain None unless a test sets them.
    duration_ms: int | None = None
    score: float | None = None
143+
144+
145+
class AsyncIterableMock:
    """Minimal async-iterable wrapper around a list, for faking paginated responses in tests."""

    def __init__(self, items: list[Any]) -> None:
        # The list is stored by reference, not copied.
        self._items = items

    async def __aiter__(self):
        # Async generator: ``async for`` over an instance yields the stored items in order.
        for element in self._items:
            yield element
154+
155+
132156
def create_mock_httpx_client(methods: dict[str, Any] | None = None) -> AsyncMock:
133157
"""
134158
Create a mock httpx.AsyncClient with proper context manager setup.
@@ -237,6 +261,12 @@ def scenario_run_view() -> MockScenarioRunView:
237261
return MockScenarioRunView()
238262

239263

264+
@pytest.fixture
def benchmark_run_view() -> MockBenchmarkRunView:
    """Provide a ``MockBenchmarkRunView`` built with its default field values."""
    view = MockBenchmarkRunView()
    return view
268+
269+
240270
@pytest.fixture
241271
def mock_httpx_response() -> Mock:
242272
"""Create a mock httpx.Response."""

0 commit comments

Comments
 (0)