diff --git a/pyproject.toml b/pyproject.toml index 2c90fa53a..93e6c93e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ dependencies = [ "anyio>=3.5.0, <5", "distro>=1.7.0, <2", "sniffio", - "uuid-utils>=0.11.0", + "uuid-utils>=0.11.0", ] requires-python = ">= 3.9" diff --git a/requirements-dev.lock b/requirements-dev.lock index b9f3f2862..c48025dbf 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -94,7 +94,7 @@ python-dateutil==2.9.0.post0 ; python_full_version < '3.10' # via time-machine respx==0.22.0 rich==14.2.0 -ruff==0.14.8 +ruff==0.14.9 six==1.17.0 ; python_full_version < '3.10' # via python-dateutil sniffio==1.3.1 diff --git a/src/runloop_api_client/sdk/_types.py b/src/runloop_api_client/sdk/_types.py index be09f6eed..6bf9da020 100644 --- a/src/runloop_api_client/sdk/_types.py +++ b/src/runloop_api_client/sdk/_types.py @@ -5,6 +5,7 @@ from ..lib.polling import PollingConfig from ..types.devboxes import DiskSnapshotListParams, DiskSnapshotUpdateParams from ..types.scenarios import ScorerListParams, ScorerCreateParams, ScorerUpdateParams, ScorerValidateParams +from ..types.benchmarks import RunListScenarioRunsParams from ..types.input_context import InputContext from ..types.scenario_view import ScenarioView from ..types.agent_list_params import AgentListParams @@ -203,3 +204,8 @@ class ScenarioPreview(ScenarioView): input_context: InputContextPreview # type: ignore[assignment] """The input context for the Scenario.""" + + +# Benchmark Run params +class SDKBenchmarkRunListScenarioRunsParams(RunListScenarioRunsParams, BaseRequestOptions): + pass diff --git a/src/runloop_api_client/sdk/async_benchmark_run.py b/src/runloop_api_client/sdk/async_benchmark_run.py new file mode 100644 index 000000000..fed2f5b00 --- /dev/null +++ b/src/runloop_api_client/sdk/async_benchmark_run.py @@ -0,0 +1,127 @@ +"""AsyncBenchmarkRun resource class for asynchronous operations.""" + +from __future__ import annotations + +from typing import List +from typing_extensions import Unpack, override + +from ..types import BenchmarkRunView +from ._types import BaseRequestOptions, LongRequestOptions, SDKBenchmarkRunListScenarioRunsParams +from .._client import AsyncRunloop +from .async_scenario_run import AsyncScenarioRun + + +class AsyncBenchmarkRun: + """A benchmark run for evaluating agent performance across scenarios (async). + + Provides async methods for monitoring run status, managing the run lifecycle, + and accessing scenario run results. Obtain instances via + ``benchmark.run()`` or ``benchmark.list_runs()``. + + Example: + >>> benchmark = runloop.benchmark.from_id("bench-xxx") + >>> run = await benchmark.run(run_name="evaluation-v1") + >>> info = await run.get_info() + >>> scenario_runs = await run.list_scenario_runs() + """ + + def __init__(self, client: AsyncRunloop, run_id: str, benchmark_id: str) -> None: + """Create an AsyncBenchmarkRun instance. + + :param client: AsyncRunloop client instance + :type client: AsyncRunloop + :param run_id: Benchmark run ID + :type run_id: str + :param benchmark_id: Parent benchmark ID + :type benchmark_id: str + """ + self._client = client + self._id = run_id + self._benchmark_id = benchmark_id + + @override + def __repr__(self) -> str: + return f"" + + @property + def id(self) -> str: + """Return the benchmark run ID. + + :return: Unique benchmark run ID + :rtype: str + """ + return self._id + + @property + def benchmark_id(self) -> str: + """Return the parent benchmark ID. + + :return: Parent benchmark ID + :rtype: str + """ + return self._benchmark_id + + async def get_info( + self, + **options: Unpack[BaseRequestOptions], + ) -> BenchmarkRunView: + """Retrieve current benchmark run status and metadata. + + :param options: See :typeddict:`~runloop_api_client.sdk._types.BaseRequestOptions` for available options + :return: Current benchmark run state info + :rtype: BenchmarkRunView + """ + return await self._client.benchmarks.runs.retrieve( + self._id, + **options, + ) + + async def cancel( + self, + **options: Unpack[LongRequestOptions], + ) -> BenchmarkRunView: + """Cancel the benchmark run. + + Stops all running scenarios and marks the run as canceled. + + :param options: See :typeddict:`~runloop_api_client.sdk._types.LongRequestOptions` for available options + :return: Updated benchmark run state + :rtype: BenchmarkRunView + """ + return await self._client.benchmarks.runs.cancel( + self._id, + **options, + ) + + async def complete( + self, + **options: Unpack[LongRequestOptions], + ) -> BenchmarkRunView: + """Complete the benchmark run. + + Marks the run as completed. Call this after all scenarios have finished. + + :param options: See :typeddict:`~runloop_api_client.sdk._types.LongRequestOptions` for available options + :return: Completed benchmark run state + :rtype: BenchmarkRunView + """ + return await self._client.benchmarks.runs.complete( + self._id, + **options, + ) + + async def list_scenario_runs( + self, + **params: Unpack[SDKBenchmarkRunListScenarioRunsParams], + ) -> List[AsyncScenarioRun]: + """List all scenario runs for this benchmark run. + + :param params: See :typeddict:`~runloop_api_client.sdk._types.SDKBenchmarkRunListScenarioRunsParams` for available parameters + :return: List of async scenario run objects + :rtype: List[AsyncScenarioRun] + """ + page = await self._client.benchmarks.runs.list_scenario_runs( + self._id, + **params, + ) + return [AsyncScenarioRun(self._client, run.id, run.devbox_id) for run in page.runs] diff --git a/src/runloop_api_client/sdk/benchmark_run.py b/src/runloop_api_client/sdk/benchmark_run.py new file mode 100644 index 000000000..ff6ed567a --- /dev/null +++ b/src/runloop_api_client/sdk/benchmark_run.py @@ -0,0 +1,127 @@ +"""BenchmarkRun resource class for synchronous operations.""" + +from __future__ import annotations + +from typing import List +from typing_extensions import Unpack, override + +from ..types import BenchmarkRunView +from ._types import BaseRequestOptions, LongRequestOptions, SDKBenchmarkRunListScenarioRunsParams +from .._client import Runloop +from .scenario_run import ScenarioRun + + +class BenchmarkRun: + """A benchmark run for evaluating agent performance across scenarios. + + Provides methods for monitoring run status, managing the run lifecycle, + and accessing scenario run results. Obtain instances via + ``benchmark.run()`` or ``benchmark.list_runs()``. + + Example: + >>> benchmark = runloop.benchmark.from_id("bench-xxx") + >>> run = benchmark.run(run_name="evaluation-v1") + >>> info = run.get_info() + >>> scenario_runs = run.list_scenario_runs() + """ + + def __init__(self, client: Runloop, run_id: str, benchmark_id: str) -> None: + """Create a BenchmarkRun instance. + + :param client: Runloop client instance + :type client: Runloop + :param run_id: Benchmark run ID + :type run_id: str + :param benchmark_id: Parent benchmark ID + :type benchmark_id: str + """ + self._client = client + self._id = run_id + self._benchmark_id = benchmark_id + + @override + def __repr__(self) -> str: + return f"" + + @property + def id(self) -> str: + """Return the benchmark run ID. + + :return: Unique benchmark run ID + :rtype: str + """ + return self._id + + @property + def benchmark_id(self) -> str: + """Return the parent benchmark ID. + + :return: Parent benchmark ID + :rtype: str + """ + return self._benchmark_id + + def get_info( + self, + **options: Unpack[BaseRequestOptions], + ) -> BenchmarkRunView: + """Retrieve current benchmark run status and metadata. + + :param options: See :typeddict:`~runloop_api_client.sdk._types.BaseRequestOptions` for available options + :return: Current benchmark run state info + :rtype: BenchmarkRunView + """ + return self._client.benchmarks.runs.retrieve( + self._id, + **options, + ) + + def cancel( + self, + **options: Unpack[LongRequestOptions], + ) -> BenchmarkRunView: + """Cancel the benchmark run. + + Stops all running scenarios and marks the run as canceled. + + :param options: See :typeddict:`~runloop_api_client.sdk._types.LongRequestOptions` for available options + :return: Updated benchmark run state + :rtype: BenchmarkRunView + """ + return self._client.benchmarks.runs.cancel( + self._id, + **options, + ) + + def complete( + self, + **options: Unpack[LongRequestOptions], + ) -> BenchmarkRunView: + """Complete the benchmark run. + + Marks the run as completed. Call this after all scenarios have finished. + + :param options: See :typeddict:`~runloop_api_client.sdk._types.LongRequestOptions` for available options + :return: Completed benchmark run state + :rtype: BenchmarkRunView + """ + return self._client.benchmarks.runs.complete( + self._id, + **options, + ) + + def list_scenario_runs( + self, + **params: Unpack[SDKBenchmarkRunListScenarioRunsParams], + ) -> List[ScenarioRun]: + """List all scenario runs for this benchmark run. + + :param params: See :typeddict:`~runloop_api_client.sdk._types.SDKBenchmarkRunListScenarioRunsParams` for available parameters + :return: List of scenario run objects + :rtype: List[ScenarioRun] + """ + page = self._client.benchmarks.runs.list_scenario_runs( + self._id, + **params, + ) + return [ScenarioRun(self._client, run.id, run.devbox_id) for run in page.runs] diff --git a/tests/sdk/conftest.py b/tests/sdk/conftest.py index c5546fe55..10ddf6254 100644 --- a/tests/sdk/conftest.py +++ b/tests/sdk/conftest.py @@ -129,6 +129,30 @@ class MockScenarioRunView: scoring_contract_result: object = None +@dataclass +class MockBenchmarkRunView: + """Mock BenchmarkRunView for testing.""" + + id: str = "bench_run_123" + benchmark_id: str = "bench_123" + state: str = "running" + metadata: Dict[str, str] = field(default_factory=dict) + start_time_ms: int = 1234567890000 + duration_ms: int | None = None + score: float | None = None + + +class AsyncIterableMock: + """A simple async iterable mock for testing paginated responses.""" + + def __init__(self, items: list[Any]) -> None: + self._items = items + + async def __aiter__(self): + for item in self._items: + yield item + + def create_mock_httpx_client(methods: dict[str, Any] | None = None) -> AsyncMock: """ Create a mock httpx.AsyncClient with proper context manager setup. @@ -237,6 +261,12 @@ def scenario_run_view() -> MockScenarioRunView: return MockScenarioRunView() +@pytest.fixture +def benchmark_run_view() -> MockBenchmarkRunView: + """Create a mock BenchmarkRunView.""" + return MockBenchmarkRunView() + + @pytest.fixture def mock_httpx_response() -> Mock: """Create a mock httpx.Response.""" diff --git a/tests/sdk/test_async_benchmark_run.py b/tests/sdk/test_async_benchmark_run.py new file mode 100644 index 000000000..1785f683a --- /dev/null +++ b/tests/sdk/test_async_benchmark_run.py @@ -0,0 +1,120 @@ +"""Comprehensive tests for async AsyncBenchmarkRun class.""" + +from __future__ import annotations + +from types import SimpleNamespace +from unittest.mock import AsyncMock + +from tests.sdk.conftest import MockScenarioRunView, MockBenchmarkRunView +from runloop_api_client.sdk.async_scenario_run import AsyncScenarioRun +from runloop_api_client.sdk.async_benchmark_run import AsyncBenchmarkRun + + +class TestAsyncBenchmarkRun: + """Tests for AsyncBenchmarkRun class.""" + + def test_init(self, mock_async_client: AsyncMock) -> None: + """Test AsyncBenchmarkRun initialization.""" + run = AsyncBenchmarkRun(mock_async_client, "bench_run_123", "bench_123") + assert run.id == "bench_run_123" + assert run.benchmark_id == "bench_123" + + def test_repr(self, mock_async_client: AsyncMock) -> None: + """Test AsyncBenchmarkRun string representation.""" + run = AsyncBenchmarkRun(mock_async_client, "bench_run_123", "bench_123") + assert repr(run) == "" + + async def test_get_info(self, mock_async_client: AsyncMock, benchmark_run_view: MockBenchmarkRunView) -> None: + """Test get_info method.""" + mock_async_client.benchmarks.runs.retrieve = AsyncMock(return_value=benchmark_run_view) + + run = AsyncBenchmarkRun(mock_async_client, "bench_run_123", "bench_123") + result = await run.get_info() + + assert result == benchmark_run_view + mock_async_client.benchmarks.runs.retrieve.assert_awaited_once_with("bench_run_123") + + async def test_cancel(self, mock_async_client: AsyncMock, benchmark_run_view: MockBenchmarkRunView) -> None: + """Test cancel method.""" + benchmark_run_view.state = "canceled" + mock_async_client.benchmarks.runs.cancel = AsyncMock(return_value=benchmark_run_view) + + run = AsyncBenchmarkRun(mock_async_client, "bench_run_123", "bench_123") + result = await run.cancel() + + assert result == benchmark_run_view + assert result.state == "canceled" + mock_async_client.benchmarks.runs.cancel.assert_awaited_once_with("bench_run_123") + + async def test_complete(self, mock_async_client: AsyncMock, benchmark_run_view: MockBenchmarkRunView) -> None: + """Test complete method.""" + benchmark_run_view.state = "completed" + mock_async_client.benchmarks.runs.complete = AsyncMock(return_value=benchmark_run_view) + + run = AsyncBenchmarkRun(mock_async_client, "bench_run_123", "bench_123") + result = await run.complete() + + assert result == benchmark_run_view + assert result.state == "completed" + mock_async_client.benchmarks.runs.complete.assert_awaited_once_with("bench_run_123") + + async def test_list_scenario_runs_empty(self, mock_async_client: AsyncMock) -> None: + """Test list_scenario_runs method with empty results.""" + page = SimpleNamespace(runs=[]) + mock_async_client.benchmarks.runs.list_scenario_runs = AsyncMock(return_value=page) + + run = AsyncBenchmarkRun(mock_async_client, "bench_run_123", "bench_123") + result = await run.list_scenario_runs() + + assert len(result) == 0 + mock_async_client.benchmarks.runs.list_scenario_runs.assert_awaited_once_with("bench_run_123") + + async def test_list_scenario_runs_single( + self, mock_async_client: AsyncMock, scenario_run_view: MockScenarioRunView + ) -> None: + """Test list_scenario_runs method with single result.""" + page = SimpleNamespace(runs=[scenario_run_view]) + mock_async_client.benchmarks.runs.list_scenario_runs = AsyncMock(return_value=page) + + run = AsyncBenchmarkRun(mock_async_client, "bench_run_123", "bench_123") + result = await run.list_scenario_runs() + + assert len(result) == 1 + assert isinstance(result[0], AsyncScenarioRun) + assert result[0].id == scenario_run_view.id + assert result[0].devbox_id == scenario_run_view.devbox_id + mock_async_client.benchmarks.runs.list_scenario_runs.assert_awaited_once_with("bench_run_123") + + async def test_list_scenario_runs_multiple(self, mock_async_client: AsyncMock) -> None: + """Test list_scenario_runs method with multiple results.""" + scenario_run_view1 = MockScenarioRunView(id="run_001", devbox_id="dev_001") + scenario_run_view2 = MockScenarioRunView(id="run_002", devbox_id="dev_002") + page = SimpleNamespace(runs=[scenario_run_view1, scenario_run_view2]) + mock_async_client.benchmarks.runs.list_scenario_runs = AsyncMock(return_value=page) + + run = AsyncBenchmarkRun(mock_async_client, "bench_run_123", "bench_123") + result = await run.list_scenario_runs() + + assert len(result) == 2 + assert isinstance(result[0], AsyncScenarioRun) + assert isinstance(result[1], AsyncScenarioRun) + assert result[0].id == "run_001" + assert result[1].id == "run_002" + mock_async_client.benchmarks.runs.list_scenario_runs.assert_awaited_once_with("bench_run_123") + + async def test_list_scenario_runs_with_params( + self, mock_async_client: AsyncMock, scenario_run_view: MockScenarioRunView + ) -> None: + """Test list_scenario_runs method with filtering parameters.""" + page = SimpleNamespace(runs=[scenario_run_view]) + mock_async_client.benchmarks.runs.list_scenario_runs = AsyncMock(return_value=page) + + run = AsyncBenchmarkRun(mock_async_client, "bench_run_123", "bench_123") + result = await run.list_scenario_runs(limit=10, state="completed") + + assert len(result) == 1 + assert isinstance(result[0], AsyncScenarioRun) + assert result[0].id == scenario_run_view.id + mock_async_client.benchmarks.runs.list_scenario_runs.assert_awaited_once_with( + "bench_run_123", limit=10, state="completed" + ) diff --git a/tests/sdk/test_benchmark_run.py b/tests/sdk/test_benchmark_run.py new file mode 100644 index 000000000..d54fb9432 --- /dev/null +++ b/tests/sdk/test_benchmark_run.py @@ -0,0 +1,116 @@ +"""Comprehensive tests for sync BenchmarkRun class.""" + +from __future__ import annotations + +from types import SimpleNamespace +from unittest.mock import Mock + +from tests.sdk.conftest import MockScenarioRunView, MockBenchmarkRunView +from runloop_api_client.sdk.scenario_run import ScenarioRun +from runloop_api_client.sdk.benchmark_run import BenchmarkRun + + +class TestBenchmarkRun: + """Tests for BenchmarkRun class.""" + + def test_init(self, mock_client: Mock) -> None: + """Test BenchmarkRun initialization.""" + run = BenchmarkRun(mock_client, "bench_run_123", "bench_123") + assert run.id == "bench_run_123" + assert run.benchmark_id == "bench_123" + + def test_repr(self, mock_client: Mock) -> None: + """Test BenchmarkRun string representation.""" + run = BenchmarkRun(mock_client, "bench_run_123", "bench_123") + assert repr(run) == "" + + def test_get_info(self, mock_client: Mock, benchmark_run_view: MockBenchmarkRunView) -> None: + """Test get_info method.""" + mock_client.benchmarks.runs.retrieve.return_value = benchmark_run_view + + run = BenchmarkRun(mock_client, "bench_run_123", "bench_123") + result = run.get_info() + + assert result == benchmark_run_view + mock_client.benchmarks.runs.retrieve.assert_called_once_with("bench_run_123") + + def test_cancel(self, mock_client: Mock, benchmark_run_view: MockBenchmarkRunView) -> None: + """Test cancel method.""" + benchmark_run_view.state = "canceled" + mock_client.benchmarks.runs.cancel.return_value = benchmark_run_view + + run = BenchmarkRun(mock_client, "bench_run_123", "bench_123") + result = run.cancel() + + assert result == benchmark_run_view + assert result.state == "canceled" + mock_client.benchmarks.runs.cancel.assert_called_once_with("bench_run_123") + + def test_complete(self, mock_client: Mock, benchmark_run_view: MockBenchmarkRunView) -> None: + """Test complete method.""" + benchmark_run_view.state = "completed" + mock_client.benchmarks.runs.complete.return_value = benchmark_run_view + + run = BenchmarkRun(mock_client, "bench_run_123", "bench_123") + result = run.complete() + + assert result == benchmark_run_view + assert result.state == "completed" + mock_client.benchmarks.runs.complete.assert_called_once_with("bench_run_123") + + def test_list_scenario_runs_empty(self, mock_client: Mock) -> None: + """Test list_scenario_runs method with empty results.""" + page = SimpleNamespace(runs=[]) + mock_client.benchmarks.runs.list_scenario_runs.return_value = page + + run = BenchmarkRun(mock_client, "bench_run_123", "bench_123") + result = run.list_scenario_runs() + + assert len(result) == 0 + mock_client.benchmarks.runs.list_scenario_runs.assert_called_once_with("bench_run_123") + + def test_list_scenario_runs_single(self, mock_client: Mock, scenario_run_view: MockScenarioRunView) -> None: + """Test list_scenario_runs method with single result.""" + page = SimpleNamespace(runs=[scenario_run_view]) + mock_client.benchmarks.runs.list_scenario_runs.return_value = page + + run = BenchmarkRun(mock_client, "bench_run_123", "bench_123") + result = run.list_scenario_runs() + + assert len(result) == 1 + assert isinstance(result[0], ScenarioRun) + assert result[0].id == scenario_run_view.id + assert result[0].devbox_id == scenario_run_view.devbox_id + mock_client.benchmarks.runs.list_scenario_runs.assert_called_once_with("bench_run_123") + + def test_list_scenario_runs_multiple(self, mock_client: Mock) -> None: + """Test list_scenario_runs method with multiple results.""" + scenario_run_view1 = MockScenarioRunView(id="run_001", devbox_id="dev_001") + scenario_run_view2 = MockScenarioRunView(id="run_002", devbox_id="dev_002") + page = SimpleNamespace(runs=[scenario_run_view1, scenario_run_view2]) + mock_client.benchmarks.runs.list_scenario_runs.return_value = page + + run = BenchmarkRun(mock_client, "bench_run_123", "bench_123") + result = run.list_scenario_runs() + + assert len(result) == 2 + assert isinstance(result[0], ScenarioRun) + assert isinstance(result[1], ScenarioRun) + assert result[0].id == "run_001" + assert result[1].id == "run_002" + mock_client.benchmarks.runs.list_scenario_runs.assert_called_once_with("bench_run_123") + + def test_list_scenario_runs_with_params(self, mock_client: Mock, scenario_run_view: MockScenarioRunView) -> None: + """Test list_scenario_runs method with filtering parameters.""" + page = SimpleNamespace(runs=[scenario_run_view]) + mock_client.benchmarks.runs.list_scenario_runs.return_value = page + + run = BenchmarkRun(mock_client, "bench_run_123", "bench_123") + result = run.list_scenario_runs(limit=10, state="completed") + + assert len(result) == 1 + assert isinstance(result[0], ScenarioRun) + assert result[0].id == scenario_run_view.id + mock_client.benchmarks.runs.list_scenario_runs.assert_called_once_with( + "bench_run_123", limit=10, state="completed" + ) diff --git a/tests/smoketests/sdk/test_async_benchmark_run.py b/tests/smoketests/sdk/test_async_benchmark_run.py new file mode 100644 index 000000000..3eab471b5 --- /dev/null +++ b/tests/smoketests/sdk/test_async_benchmark_run.py @@ -0,0 +1,145 @@ +"""Asynchronous SDK smoke tests for AsyncBenchmarkRun operations. + +These tests validate the AsyncBenchmarkRun class against the real API. +Until AsyncBenchmarkOps is available (PR3), we use the raw async API client +to find or create benchmark runs for testing. +""" + +from __future__ import annotations + +import pytest + +from runloop_api_client.sdk import AsyncRunloopSDK +from runloop_api_client.sdk.async_scenario_run import AsyncScenarioRun +from runloop_api_client.sdk.async_benchmark_run import AsyncBenchmarkRun + +pytestmark = [pytest.mark.smoketest] + +TWO_MINUTE_TIMEOUT = 120 + + +class TestAsyncBenchmarkRunRetrieval: + """Test AsyncBenchmarkRun retrieval operations.""" + + @pytest.mark.timeout(TWO_MINUTE_TIMEOUT) + async def test_benchmark_run_from_existing(self, async_sdk_client: AsyncRunloopSDK) -> None: + """Test creating AsyncBenchmarkRun from existing benchmark run. + + This test: + 1. Lists benchmark runs via raw async API + 2. Creates an AsyncBenchmarkRun wrapper + 3. Validates get_info returns correct data + """ + # List existing benchmark runs via raw API + runs_page = await async_sdk_client.api.benchmarks.runs.list(limit=1) + runs = runs_page.runs + + if not runs: + pytest.skip("No benchmark runs available to test") + + run_data = runs[0] + + # Create AsyncBenchmarkRun wrapper + benchmark_run = AsyncBenchmarkRun( + client=async_sdk_client.api, + run_id=run_data.id, + benchmark_id=run_data.benchmark_id, + ) + + assert benchmark_run.id == run_data.id + assert benchmark_run.benchmark_id == run_data.benchmark_id + + # Test get_info + info = await benchmark_run.get_info() + assert info.id == run_data.id + assert info.benchmark_id == run_data.benchmark_id + + @pytest.mark.timeout(TWO_MINUTE_TIMEOUT) + async def test_benchmark_run_list_scenario_runs(self, async_sdk_client: AsyncRunloopSDK) -> None: + """Test AsyncBenchmarkRun.list_scenario_runs method. + + This test: + 1. Finds an existing benchmark run + 2. Lists its scenario runs + """ + # List existing benchmark runs via raw API + runs_page = await async_sdk_client.api.benchmarks.runs.list(limit=1) + runs = runs_page.runs + + if not runs: + pytest.skip("No benchmark runs available to test") + + run_data = runs[0] + + # Create AsyncBenchmarkRun wrapper + benchmark_run = AsyncBenchmarkRun( + client=async_sdk_client.api, + run_id=run_data.id, + benchmark_id=run_data.benchmark_id, + ) + + # List scenario runs (might be empty, that's okay) + scenario_runs = await benchmark_run.list_scenario_runs() + assert isinstance(scenario_runs, list) + + # Verify returned items are AsyncScenarioRun objects + for scenario_run in scenario_runs: + assert isinstance(scenario_run, AsyncScenarioRun) + assert scenario_run.id is not None + assert scenario_run.devbox_id is not None + + +class TestAsyncBenchmarkRunLifecycle: + """Test AsyncBenchmarkRun lifecycle operations.""" + + @pytest.mark.timeout(TWO_MINUTE_TIMEOUT) + async def test_benchmark_run_create_and_cancel(self, async_sdk_client: AsyncRunloopSDK) -> None: + """Test creating a benchmark run and canceling it. + + This test: + 1. Finds an existing benchmark + 2. Starts a new benchmark run + 3. Creates an AsyncBenchmarkRun wrapper + 4. Cancels the run + """ + # Find an existing benchmark via raw API + benchmarks_page = await async_sdk_client.api.benchmarks.list(limit=1) + benchmarks = benchmarks_page.benchmarks + + if not benchmarks: + pytest.skip("No benchmarks available to test") + + benchmark = benchmarks[0] + + # Start a new benchmark run + run_data = await async_sdk_client.api.benchmarks.start_run( + benchmark_id=benchmark.id, + run_name="sdk-smoketest-async-benchmark-run", + ) + + try: + # Create AsyncBenchmarkRun wrapper + benchmark_run = AsyncBenchmarkRun( + client=async_sdk_client.api, + run_id=run_data.id, + benchmark_id=run_data.benchmark_id, + ) + + assert benchmark_run.id == run_data.id + + # Get info + info = await benchmark_run.get_info() + assert info.id == run_data.id + assert info.state in ["running", "completed", "canceled"] + + # Cancel the run + result = await benchmark_run.cancel() + assert result.state in ["canceled", "completed"] # May already be completed + + except Exception: + # Ensure cleanup on any error + try: + await async_sdk_client.api.benchmarks.runs.cancel(run_data.id) + except Exception: + pass + raise diff --git a/tests/smoketests/sdk/test_benchmark_run.py b/tests/smoketests/sdk/test_benchmark_run.py new file mode 100644 index 000000000..f21e9e87e --- /dev/null +++ b/tests/smoketests/sdk/test_benchmark_run.py @@ -0,0 +1,142 @@ +"""Synchronous SDK smoke tests for BenchmarkRun operations. + +These tests validate the BenchmarkRun class against the real API. +Until BenchmarkOps is available (PR3), we use the raw API client to +find or create benchmark runs for testing. +""" + +from __future__ import annotations + +import pytest + +from runloop_api_client.sdk import RunloopSDK +from runloop_api_client.sdk.scenario_run import ScenarioRun +from runloop_api_client.sdk.benchmark_run import BenchmarkRun + +pytestmark = [pytest.mark.smoketest] + +TWO_MINUTE_TIMEOUT = 120 + + +class TestBenchmarkRunRetrieval: + """Test BenchmarkRun retrieval operations.""" + + @pytest.mark.timeout(TWO_MINUTE_TIMEOUT) + def test_benchmark_run_from_existing(self, sdk_client: RunloopSDK) -> None: + """Test creating BenchmarkRun from existing benchmark run. + + This test: + 1. Lists benchmark runs via raw API + 2. Creates a BenchmarkRun wrapper + 3. Validates get_info returns correct data + """ + # List existing benchmark runs via raw API + runs = sdk_client.api.benchmarks.runs.list(limit=1).runs + + if not runs: + pytest.skip("No benchmark runs available to test") + + run_data = runs[0] + + # Create BenchmarkRun wrapper + benchmark_run = BenchmarkRun( + client=sdk_client.api, + run_id=run_data.id, + benchmark_id=run_data.benchmark_id, + ) + + assert benchmark_run.id == run_data.id + assert benchmark_run.benchmark_id == run_data.benchmark_id + + # Test get_info + info = benchmark_run.get_info() + assert info.id == run_data.id + assert info.benchmark_id == run_data.benchmark_id + + @pytest.mark.timeout(TWO_MINUTE_TIMEOUT) + def test_benchmark_run_list_scenario_runs(self, sdk_client: RunloopSDK) -> None: + """Test BenchmarkRun.list_scenario_runs method. + + This test: + 1. Finds an existing benchmark run + 2. Lists its scenario runs + """ + # List existing benchmark runs via raw API + runs = sdk_client.api.benchmarks.runs.list(limit=1).runs + + if not runs: + pytest.skip("No benchmark runs available to test") + + run_data = runs[0] + + # Create BenchmarkRun wrapper + benchmark_run = BenchmarkRun( + client=sdk_client.api, + run_id=run_data.id, + benchmark_id=run_data.benchmark_id, + ) + + # List scenario runs (might be empty, that's okay) + scenario_runs = benchmark_run.list_scenario_runs() + assert isinstance(scenario_runs, list) + + # Verify returned items are ScenarioRun objects + for scenario_run in scenario_runs: + assert isinstance(scenario_run, ScenarioRun) + assert scenario_run.id is not None + assert scenario_run.devbox_id is not None + + +class TestBenchmarkRunLifecycle: + """Test BenchmarkRun lifecycle operations.""" + + @pytest.mark.timeout(TWO_MINUTE_TIMEOUT) + def test_benchmark_run_create_and_cancel(self, sdk_client: RunloopSDK) -> None: + """Test creating a benchmark run and canceling it. + + This test: + 1. Finds an existing benchmark + 2. Starts a new benchmark run + 3. Creates a BenchmarkRun wrapper + 4. Cancels the run + """ + # Find an existing benchmark via raw API + benchmarks = sdk_client.api.benchmarks.list(limit=1).benchmarks + + if not benchmarks: + pytest.skip("No benchmarks available to test") + + benchmark = benchmarks[0] + + # Start a new benchmark run + run_data = sdk_client.api.benchmarks.start_run( + benchmark_id=benchmark.id, + run_name="sdk-smoketest-benchmark-run", + ) + + try: + # Create BenchmarkRun wrapper + benchmark_run = BenchmarkRun( + client=sdk_client.api, + run_id=run_data.id, + benchmark_id=run_data.benchmark_id, + ) + + assert benchmark_run.id == run_data.id + + # Get info + info = benchmark_run.get_info() + assert info.id == run_data.id + assert info.state in ["running", "completed", "canceled"] + + # Cancel the run + result = benchmark_run.cancel() + assert result.state in ["canceled", "completed"] # May already be completed + + except Exception: + # Ensure cleanup on any error + try: + sdk_client.api.benchmarks.runs.cancel(run_data.id) + except Exception: + pass + raise