Skip to content

Commit 4aab259

Browse files
committed
feat(sdk): add Benchmark and AsyncBenchmark classes
1 parent aa97c59 commit 4aab259

File tree

10 files changed

+979
-29
lines changed

10 files changed

+979
-29
lines changed

src/runloop_api_client/sdk/_types.py

Lines changed: 38 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,39 @@
11
from typing import Union, Callable, Optional
22
from typing_extensions import TypedDict
33

4+
from ..types import (
5+
InputContext,
6+
ScenarioView,
7+
AgentListParams,
8+
DevboxListParams,
9+
ObjectListParams,
10+
AgentCreateParams,
11+
DevboxCreateParams,
12+
ObjectCreateParams,
13+
ScenarioListParams,
14+
BlueprintListParams,
15+
ObjectDownloadParams,
16+
ScenarioUpdateParams,
17+
BenchmarkUpdateParams,
18+
BlueprintCreateParams,
19+
DevboxUploadFileParams,
20+
DevboxCreateTunnelParams,
21+
DevboxDownloadFileParams,
22+
DevboxRemoveTunnelParams,
23+
DevboxSnapshotDiskParams,
24+
DevboxReadFileContentsParams,
25+
DevboxWriteFileContentsParams,
26+
)
427
from .._types import Body, Query, Headers, Timeout, NotGiven
528
from ..lib.polling import PollingConfig
629
from ..types.devboxes import DiskSnapshotListParams, DiskSnapshotUpdateParams
730
from ..types.scenarios import ScorerListParams, ScorerCreateParams, ScorerUpdateParams, ScorerValidateParams
831
from ..types.benchmarks import RunListScenarioRunsParams
9-
from ..types.input_context import InputContext
10-
from ..types.scenario_view import ScenarioView
11-
from ..types.agent_list_params import AgentListParams
12-
from ..types.devbox_list_params import DevboxListParams
13-
from ..types.object_list_params import ObjectListParams
14-
from ..types.agent_create_params import AgentCreateParams
15-
from ..types.devbox_create_params import DevboxCreateParams, DevboxBaseCreateParams
16-
from ..types.object_create_params import ObjectCreateParams
17-
from ..types.scenario_list_params import ScenarioListParams
18-
from ..types.blueprint_list_params import BlueprintListParams
19-
from ..types.object_download_params import ObjectDownloadParams
20-
from ..types.scenario_update_params import ScenarioUpdateParams
21-
from ..types.blueprint_create_params import BlueprintCreateParams
22-
from ..types.devbox_upload_file_params import DevboxUploadFileParams
32+
from ..types.devbox_create_params import DevboxBaseCreateParams
2333
from ..types.scenario_start_run_params import ScenarioStartRunBaseParams
24-
from ..types.devbox_create_tunnel_params import DevboxCreateTunnelParams
25-
from ..types.devbox_download_file_params import DevboxDownloadFileParams
34+
from ..types.benchmark_start_run_params import BenchmarkSelfStartRunParams
35+
from ..types.benchmarks.run_list_params import RunSelfListParams
2636
from ..types.devbox_execute_async_params import DevboxNiceExecuteAsyncParams
27-
from ..types.devbox_remove_tunnel_params import DevboxRemoveTunnelParams
28-
from ..types.devbox_snapshot_disk_params import DevboxSnapshotDiskParams
29-
from ..types.devbox_read_file_contents_params import DevboxReadFileContentsParams
30-
from ..types.devbox_write_file_contents_params import DevboxWriteFileContentsParams
3137

3238
# Callable type that takes a single ``str`` argument and returns ``None``.
LogCallback = Callable[[str], None]
3339

@@ -206,6 +212,17 @@ class ScenarioPreview(ScenarioView):
206212
"""The input context for the Scenario."""
207213

208214

209-
# Benchmark Run params
215+
class SDKBenchmarkUpdateParams(BenchmarkUpdateParams, LongRequestOptions):
    """Keyword parameters combining ``BenchmarkUpdateParams`` with ``LongRequestOptions``.

    Adds no keys of its own; it only merges the two parent definitions.
    """
217+
218+
219+
class SDKBenchmarkStartRunParams(BenchmarkSelfStartRunParams, LongRequestOptions):
    """Keyword parameters combining ``BenchmarkSelfStartRunParams`` with ``LongRequestOptions``.

    Adds no keys of its own; it only merges the two parent definitions.
    """
221+
222+
223+
class SDKBenchmarkListRunsParams(RunSelfListParams, BaseRequestOptions):
    """Keyword parameters combining ``RunSelfListParams`` with ``BaseRequestOptions``.

    Adds no keys of its own; it only merges the two parent definitions.
    """
225+
226+
210227
class SDKBenchmarkRunListScenarioRunsParams(RunListScenarioRunsParams, BaseRequestOptions):
    """Keyword parameters combining ``RunListScenarioRunsParams`` with ``BaseRequestOptions``.

    Adds no keys of its own; it only merges the two parent definitions.
    """
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
"""AsyncBenchmark resource class for asynchronous operations."""
2+
3+
from __future__ import annotations
4+
5+
from typing import List
6+
from typing_extensions import Unpack, override
7+
8+
from ..types import BenchmarkView
9+
from ._types import (
10+
BaseRequestOptions,
11+
LongRequestOptions,
12+
SDKBenchmarkUpdateParams,
13+
SDKBenchmarkListRunsParams,
14+
SDKBenchmarkStartRunParams,
15+
)
16+
from .._types import SequenceNotStr
17+
from .._client import AsyncRunloop
18+
from .async_benchmark_run import AsyncBenchmarkRun
19+
20+
21+
class AsyncBenchmark:
    """Async handle for a single benchmark, addressed by its ID.

    Every method forwards to ``client.benchmarks`` (or ``client.benchmarks.runs``)
    using the benchmark ID stored on the instance. Obtain instances via
    ``runloop.benchmark.from_id()`` or ``runloop.benchmark.list()``.

    Example:
        >>> benchmark = runloop.benchmark.from_id("bmd_xxx")
        >>> info = await benchmark.get_info()
        >>> run = await benchmark.run(run_name="evaluation-v1")
    """

    def __init__(self, client: AsyncRunloop, benchmark_id: str) -> None:
        """Bind a benchmark ID to the client used for all later requests.

        :param client: AsyncRunloop client instance
        :type client: AsyncRunloop
        :param benchmark_id: Benchmark ID
        :type benchmark_id: str
        """
        self._id = benchmark_id
        self._client = client

    @override
    def __repr__(self) -> str:
        return f"<AsyncBenchmark id={self._id!r}>"

    @property
    def id(self) -> str:
        """Benchmark ID this instance was created with.

        :return: Unique benchmark ID
        :rtype: str
        """
        return self._id

    async def get_info(
        self,
        **options: Unpack[BaseRequestOptions],
    ) -> BenchmarkView:
        """Fetch the benchmark's current details.

        :param options: See :typeddict:`~runloop_api_client.sdk._types.BaseRequestOptions` for available options
        :return: Current benchmark info
        :rtype: BenchmarkView
        """
        benchmarks_api = self._client.benchmarks
        return await benchmarks_api.retrieve(self._id, **options)

    async def update(
        self,
        **params: Unpack[SDKBenchmarkUpdateParams],
    ) -> BenchmarkView:
        """Send an update for this benchmark.

        The given keyword parameters are forwarded unchanged; only provided
        fields will be updated.

        :param params: See :typeddict:`~runloop_api_client.sdk._types.SDKBenchmarkUpdateParams` for available parameters
        :return: Updated benchmark info
        :rtype: BenchmarkView
        """
        benchmarks_api = self._client.benchmarks
        return await benchmarks_api.update(self._id, **params)

    async def run(
        self,
        **params: Unpack[SDKBenchmarkStartRunParams],
    ) -> AsyncBenchmarkRun:
        """Start a new run of this benchmark.

        The started run's ``id`` and ``benchmark_id`` are wrapped, together with
        this instance's client, in an AsyncBenchmarkRun.

        :param params: See :typeddict:`~runloop_api_client.sdk._types.SDKBenchmarkStartRunParams` for available parameters
        :return: AsyncBenchmarkRun instance for managing the run
        :rtype: AsyncBenchmarkRun
        """
        benchmarks_api = self._client.benchmarks
        started = await benchmarks_api.start_run(benchmark_id=self._id, **params)
        return AsyncBenchmarkRun(self._client, started.id, started.benchmark_id)

    async def add_scenarios(
        self,
        scenario_ids: SequenceNotStr[str],
        **options: Unpack[LongRequestOptions],
    ) -> BenchmarkView:
        """Add scenarios to the benchmark.

        :param scenario_ids: Scenario IDs, passed through as ``scenarios_to_add``
        :type scenario_ids: SequenceNotStr[str]
        :param options: See :typeddict:`~runloop_api_client.sdk._types.LongRequestOptions` for available options
        :return: Updated benchmark info
        :rtype: BenchmarkView
        """
        benchmarks_api = self._client.benchmarks
        return await benchmarks_api.update_scenarios(
            self._id,
            scenarios_to_add=scenario_ids,
            **options,
        )

    async def remove_scenarios(
        self,
        scenario_ids: SequenceNotStr[str],
        **options: Unpack[LongRequestOptions],
    ) -> BenchmarkView:
        """Remove scenarios from the benchmark.

        :param scenario_ids: Scenario IDs, passed through as ``scenarios_to_remove``
        :type scenario_ids: SequenceNotStr[str]
        :param options: See :typeddict:`~runloop_api_client.sdk._types.LongRequestOptions` for available options
        :return: Updated benchmark info
        :rtype: BenchmarkView
        """
        benchmarks_api = self._client.benchmarks
        return await benchmarks_api.update_scenarios(
            self._id,
            scenarios_to_remove=scenario_ids,
            **options,
        )

    async def list_runs(
        self,
        **params: Unpack[SDKBenchmarkListRunsParams],
    ) -> List[AsyncBenchmarkRun]:
        """List runs of this benchmark.

        Performs one listing request filtered by this benchmark's ID and wraps
        each entry of the returned page's ``runs`` in an AsyncBenchmarkRun.

        :param params: See :typeddict:`~runloop_api_client.sdk._types.SDKBenchmarkListRunsParams` for available parameters
        :return: List of async benchmark runs
        :rtype: List[AsyncBenchmarkRun]
        """
        listing = await self._client.benchmarks.runs.list(
            benchmark_id=self._id,
            **params,
        )
        wrapped: List[AsyncBenchmarkRun] = []
        for entry in listing.runs:
            wrapped.append(AsyncBenchmarkRun(self._client, entry.id, entry.benchmark_id))
        return wrapped

0 commit comments

Comments
 (0)