From aaccd9d5e5e2fea50321e54b621a838abd54b6c9 Mon Sep 17 00:00:00 2001 From: Takumi Yanagawa Date: Sat, 16 Aug 2025 04:32:43 +0900 Subject: [PATCH 1/3] fix: missing exception handling on error Signed-off-by: Takumi Yanagawa --- itbench_utilities/agent_harness/agent.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/itbench_utilities/agent_harness/agent.py b/itbench_utilities/agent_harness/agent.py index 9c990fe..1516908 100644 --- a/itbench_utilities/agent_harness/agent.py +++ b/itbench_utilities/agent_harness/agent.py @@ -166,13 +166,13 @@ async def run_benchmark(self, benchmark_id, agent_id): return False async def run_agent(self, target_bundle: Bundle, benchmark_id: str, agent_id: str): - response = self.rest_client.get(f"/benchmarks/{benchmark_id}/agents/{agent_id}") - agent = Agent.model_validate(response.json()) - agent_info = AgentInfo(id=agent.metadata.id, name=agent.spec.name, directory=self.agent_directory) - ao = AgentOperator(agent_info=agent_info) - self.rest_client.assign(benchmark_id, agent_id, target_bundle.metadata.id) - self.rest_client.push_agent_status(benchmark_id, agent_id, AgentPhaseEnum.Executing) try: + response = self.rest_client.get(f"/benchmarks/{benchmark_id}/agents/{agent_id}") + agent = Agent.model_validate(response.json()) + agent_info = AgentInfo(id=agent.metadata.id, name=agent.spec.name, directory=self.agent_directory) + ao = AgentOperator(agent_info=agent_info) + self.rest_client.assign(benchmark_id, agent_id, target_bundle.metadata.id) + self.rest_client.push_agent_status(benchmark_id, agent_id, AgentPhaseEnum.Executing) shared_workspace = Path("/tmp") / "shared_workspace" / agent.metadata.id / target_bundle.spec.name shared_workspace.mkdir(parents=True, exist_ok=True) output_dir_per_bundle = Path("/tmp") / "output" / agent.metadata.id / target_bundle.spec.name @@ -191,7 +191,10 @@ async def run_agent(self, target_bundle: Bundle, benchmark_id: str, agent_id: st except Exception as e: err = traceback.format_exc() logger.error(err) - self.rest_client.push_agent_status(benchmark_id, agent_id, AgentPhaseEnum.Error, message=f"{e}") + try: + self.rest_client.push_agent_status(benchmark_id, agent_id, AgentPhaseEnum.Error, message=f"{e}") + except Exception as e2: + logger.error(f"Failed to update agent status to 'Error' for benchmark {benchmark_id!r} (agent {agent_id!r}): {e2}") def wait_bundle_finished(): logger.info(f"Wait for bundle to finish...") From 341f5b89e96fb93dc220f62c6cfb14f2f6258efa Mon Sep 17 00:00:00 2001 From: Takumi Yanagawa Date: Sat, 16 Aug 2025 05:48:02 +0900 Subject: [PATCH 2/3] fix: add retry and top-level exception handling to prevent unexpected crashes Signed-off-by: Takumi Yanagawa --- docker/agent-harness/entrypoint.sh | 6 +++ itbench_utilities/agent_harness/agent.py | 52 +++++++++++++++++++----- itbench_utilities/agent_harness/main.py | 2 + 3 files changed, 49 insertions(+), 11 deletions(-) diff --git a/docker/agent-harness/entrypoint.sh b/docker/agent-harness/entrypoint.sh index ee837ca..cae2e05 100755 --- a/docker/agent-harness/entrypoint.sh +++ b/docker/agent-harness/entrypoint.sh @@ -5,6 +5,8 @@ cd /etc/agent-benchmark port="443" root_path="/bench-server" benchmark_timeout="300" +benchmark_exec_max_attempts="3" +benchmark_exec_retry_interval="5" while [[ $# -gt 0 ]]; do case "$1" in @@ -12,6 +14,8 @@ while [[ $# -gt 0 ]]; do --port) port="$2"; shift 2 ;; --root_path) root_path="$2"; shift 2 ;; --benchmark_timeout) benchmark_timeout="$2"; shift 2 ;; + --benchmark_exec_max_attempts) benchmark_exec_max_attempts="$2"; shift 2 ;; + --benchmark_exec_retry_interval) benchmark_exec_retry_interval="$2"; shift 2 ;; *) echo "Unknown option: $1"; exit 1 ;; esac done @@ -27,4 +31,6 @@ python itbench_utilities/agent_harness/main.py \ --root_path $root_path \ --ssl \ --benchmark_timeout $benchmark_timeout \ + --benchmark_exec_max_attempts $benchmark_exec_max_attempts \ + --benchmark_exec_retry_interval $benchmark_exec_retry_interval \ --single_run \ No newline at end of file diff --git a/itbench_utilities/agent_harness/agent.py b/itbench_utilities/agent_harness/agent.py index 1516908..2557dde 100644 --- a/itbench_utilities/agent_harness/agent.py +++ b/itbench_utilities/agent_harness/agent.py @@ -47,6 +47,11 @@ class AgentHarnessConfig(BaseModel): path_to_data_pushed_to_scenario: Optional[str] = None +class AgentHarnessOpts(BaseModel): + benchmark_exec_max_attempts: int = 3 + benchmark_exec_retry_interval: int = 5 + + class AgentHarness: def __init__( @@ -62,6 +67,7 @@ def __init__( single_run=False, interval=5, benchmark_timeout=300, + opts: Optional[AgentHarnessOpts] = AgentHarnessOpts(), ) -> None: self.agent_manifest = agent_manifest self.agent_directory = agent_directory @@ -81,6 +87,7 @@ def __init__( ) self.stop_event = asyncio.Event() self.task_history = [] + self.opts = opts async def run(self): @@ -107,17 +114,12 @@ async def run(self): benchmark_id = benchmark_entry.benchmark_id logger.info(f"Take the benchmark '{benchmark_entry.benchmark_id}'") self.add_history(benchmark_id) - self.rest_client.put( - f"{self.agent_manifest.manifest_endpoint}/benchmark-entries/{benchmark_id}", - Status(phase=AgentPhaseEnum.Executing).model_dump_json(), - ) - is_completed = await self.run_benchmark(benchmark_id, benchmark_entry.agent_access_info.id) - if is_completed: - phase = AgentPhaseEnum.Finished - else: - phase = AgentPhaseEnum.TimeedOut - self.rest_client.put( - f"{self.agent_manifest.manifest_endpoint}/benchmark-entries/{benchmark_id}", Status(phase=phase).model_dump_json() + await run_with_retry( + self.run_benchmark_with_status_update, + retries=self.opts.benchmark_exec_max_attempts, + delay=self.opts.benchmark_exec_retry_interval, + benchmark_id=benchmark_id, + benchmark_entry=benchmark_entry, ) if self.single_run: logger.info("Task completed. Exiting due to run-once mode.") @@ -127,6 +129,18 @@ async def run(self): await asyncio.sleep(self.interval) elapsed_time += self.interval + async def run_benchmark_with_status_update(self, benchmark_id, benchmark_entry: AgentBenchmarkEntry): + self.rest_client.put( + f"{self.agent_manifest.manifest_endpoint}/benchmark-entries/{benchmark_id}", + Status(phase=AgentPhaseEnum.Executing).model_dump_json(), + ) + is_completed = await self.run_benchmark(benchmark_id, benchmark_entry.agent_access_info.id) + if is_completed: + phase = AgentPhaseEnum.Finished + else: + phase = AgentPhaseEnum.TimeedOut + self.rest_client.put(f"{self.agent_manifest.manifest_endpoint}/benchmark-entries/{benchmark_id}", Status(phase=phase).model_dump_json()) + async def run_benchmark(self, benchmark_id, agent_id): timeout = self.benchmark_timeout @@ -231,6 +245,17 @@ def add_history(self, benchmark_id: str, bundle: Optional[Bundle] = None, agent_ self.task_history.append(item) +async def run_with_retry(func, retries=3, delay=5, *args, **kwargs): + for attempt in range(1, retries + 1): + try: + return await func(*args, **kwargs) + except Exception as e: + logger.error(f"Attempt {attempt}/{retries} failed for {func.__name__}: {e}") + if attempt < retries: + await asyncio.sleep(delay) + raise RuntimeError(f"{func.__name__} failed after {retries} attempts") + + def run(args): with open(args.input) as f: agent_manifest = AgentManifest.model_validate_json(f.read()) @@ -241,6 +266,10 @@ def run(args): data = yaml.safe_load(f.read()) config = AgentHarnessConfig.model_validate(data) + opts = AgentHarnessOpts( + benchmark_exec_retry_interval=args.benchmark_exec_retry_interval, + benchmark_exec_max_attempts=args.benchmark_exec_max_attempts, + ) agent_harness = AgentHarness( agent_manifest, args.agent_directory, @@ -252,5 +281,6 @@ def run(args): benchmark_timeout=args.benchmark_timeout, config=config, single_run=args.single_run, + opts=opts, ) asyncio.run(agent_harness.run()) diff --git a/itbench_utilities/agent_harness/main.py b/itbench_utilities/agent_harness/main.py index 247c444..46da31d 100644 --- a/itbench_utilities/agent_harness/main.py +++ b/itbench_utilities/agent_harness/main.py @@ -53,6 +53,8 @@ def main(): action="store_true", help="Process one benchmark job and exit", ) + parser.add_argument("--benchmark_exec_max_attempts", type=int, default=3, help=f"Maximum number of attempts to run the benchmark with status updates (default: 3).") + parser.add_argument("--benchmark_exec_retry_interval", type=int, default=5, help=f"Seconds to wait between retry attempts for benchmark execution (default: 5).") args = parser.parse_args() From 4185975c021a1cfb9ed94909b83c3d86c97c2597 Mon Sep 17 00:00:00 2001 From: Takumi Yanagawa Date: Sat, 16 Aug 2025 06:13:53 +0900 Subject: [PATCH 3/3] fix: add missing PyYAML dependency to pyproject.toml Signed-off-by: Takumi Yanagawa --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index c56b916..373e194 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ dependencies = [ "pandas==2.2.3", "pydantic==2.9.1", "pydantic-settings==2.7.1", + "PyYAML==6.0.2", "requests==2.32.3", "tabulate==0.9.0", "urllib3>=2.2.2",