Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docker/agent-harness/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,17 @@ cd /etc/agent-benchmark
port="443"
root_path="/bench-server"
benchmark_timeout="300"
benchmark_exec_max_attempts="3"
benchmark_exec_retry_interval="5"

while [[ $# -gt 0 ]]; do
case "$1" in
--host) host="$2"; shift 2 ;;
--port) port="$2"; shift 2 ;;
--root_path) root_path="$2"; shift 2 ;;
--benchmark_timeout) benchmark_timeout="$2"; shift 2 ;;
--benchmark_exec_max_attempts) benchmark_exec_max_attempts="$2"; shift 2 ;;
--benchmark_exec_retry_interval) benchmark_exec_retry_interval="$2"; shift 2 ;;
*) echo "Unknown option: $1"; exit 1 ;;
esac
done
Expand All @@ -27,4 +31,6 @@ python itbench_utilities/agent_harness/main.py \
--root_path $root_path \
--ssl \
--benchmark_timeout $benchmark_timeout \
--benchmark_exec_max_attempts $benchmark_exec_max_attempts \
--benchmark_exec_retry_interval $benchmark_exec_retry_interval \
--single_run
69 changes: 51 additions & 18 deletions itbench_utilities/agent_harness/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ class AgentHarnessConfig(BaseModel):
path_to_data_pushed_to_scenario: Optional[str] = None


class AgentHarnessOpts(BaseModel):
benchmark_exec_max_attempts: int = 3
benchmark_exec_retry_interval: int = 5


class AgentHarness:

def __init__(
Expand All @@ -62,6 +67,7 @@ def __init__(
single_run=False,
interval=5,
benchmark_timeout=300,
opts: Optional[AgentHarnessOpts] = AgentHarnessOpts(),
) -> None:
self.agent_manifest = agent_manifest
self.agent_directory = agent_directory
Expand All @@ -81,6 +87,7 @@ def __init__(
)
self.stop_event = asyncio.Event()
self.task_history = []
self.opts = opts

async def run(self):

Expand All @@ -107,17 +114,12 @@ async def run(self):
benchmark_id = benchmark_entry.benchmark_id
logger.info(f"Take the benchmark '{benchmark_entry.benchmark_id}'")
self.add_history(benchmark_id)
self.rest_client.put(
f"{self.agent_manifest.manifest_endpoint}/benchmark-entries/{benchmark_id}",
Status(phase=AgentPhaseEnum.Executing).model_dump_json(),
)
is_completed = await self.run_benchmark(benchmark_id, benchmark_entry.agent_access_info.id)
if is_completed:
phase = AgentPhaseEnum.Finished
else:
phase = AgentPhaseEnum.TimeedOut
self.rest_client.put(
f"{self.agent_manifest.manifest_endpoint}/benchmark-entries/{benchmark_id}", Status(phase=phase).model_dump_json()
await run_with_retry(
self.run_benchmark_with_status_update,
retries=self.opts.benchmark_exec_max_attempts,
delay=self.opts.benchmark_exec_retry_interval,
benchmark_id=benchmark_id,
benchmark_entry=benchmark_entry,
)
if self.single_run:
logger.info("Task completed. Exiting due to run-once mode.")
Expand All @@ -127,6 +129,18 @@ async def run(self):
await asyncio.sleep(self.interval)
elapsed_time += self.interval

async def run_benchmark_with_status_update(self, benchmark_id, benchmark_entry: AgentBenchmarkEntry):
self.rest_client.put(
f"{self.agent_manifest.manifest_endpoint}/benchmark-entries/{benchmark_id}",
Status(phase=AgentPhaseEnum.Executing).model_dump_json(),
)
is_completed = await self.run_benchmark(benchmark_id, benchmark_entry.agent_access_info.id)
if is_completed:
phase = AgentPhaseEnum.Finished
else:
phase = AgentPhaseEnum.TimeedOut
self.rest_client.put(f"{self.agent_manifest.manifest_endpoint}/benchmark-entries/{benchmark_id}", Status(phase=phase).model_dump_json())

async def run_benchmark(self, benchmark_id, agent_id):

timeout = self.benchmark_timeout
Expand Down Expand Up @@ -166,13 +180,13 @@ async def run_benchmark(self, benchmark_id, agent_id):
return False

async def run_agent(self, target_bundle: Bundle, benchmark_id: str, agent_id: str):
response = self.rest_client.get(f"/benchmarks/{benchmark_id}/agents/{agent_id}")
agent = Agent.model_validate(response.json())
agent_info = AgentInfo(id=agent.metadata.id, name=agent.spec.name, directory=self.agent_directory)
ao = AgentOperator(agent_info=agent_info)
self.rest_client.assign(benchmark_id, agent_id, target_bundle.metadata.id)
self.rest_client.push_agent_status(benchmark_id, agent_id, AgentPhaseEnum.Executing)
try:
response = self.rest_client.get(f"/benchmarks/{benchmark_id}/agents/{agent_id}")
agent = Agent.model_validate(response.json())
agent_info = AgentInfo(id=agent.metadata.id, name=agent.spec.name, directory=self.agent_directory)
ao = AgentOperator(agent_info=agent_info)
self.rest_client.assign(benchmark_id, agent_id, target_bundle.metadata.id)
self.rest_client.push_agent_status(benchmark_id, agent_id, AgentPhaseEnum.Executing)
shared_workspace = Path("/tmp") / "shared_workspace" / agent.metadata.id / target_bundle.spec.name
shared_workspace.mkdir(parents=True, exist_ok=True)
output_dir_per_bundle = Path("/tmp") / "output" / agent.metadata.id / target_bundle.spec.name
Expand All @@ -191,7 +205,10 @@ async def run_agent(self, target_bundle: Bundle, benchmark_id: str, agent_id: st
except Exception as e:
err = traceback.format_exc()
logger.error(err)
self.rest_client.push_agent_status(benchmark_id, agent_id, AgentPhaseEnum.Error, message=f"{e}")
try:
self.rest_client.push_agent_status(benchmark_id, agent_id, AgentPhaseEnum.Error, message=f"{e}")
except Exception as e2:
logger.error(f"Failed to update agent status to 'Error' for benchmark {benchmark_id!r} (agent {agent_id!r}): {e2}")

def wait_bundle_finished():
logger.info(f"Wait for bundle to finish...")
Expand Down Expand Up @@ -228,6 +245,17 @@ def add_history(self, benchmark_id: str, bundle: Optional[Bundle] = None, agent_
self.task_history.append(item)


async def run_with_retry(func, retries=3, delay=5, *args, **kwargs):
for attempt in range(1, retries + 1):
try:
return await func(*args, **kwargs)
except Exception as e:
logger.error(f"Attempt {attempt}/{retries} failed for {func.__name__}: {e}")
if attempt < retries:
await asyncio.sleep(delay)
raise RuntimeError(f"{func.__name__} failed after {retries} attempts")


def run(args):
with open(args.input) as f:
agent_manifest = AgentManifest.model_validate_json(f.read())
Expand All @@ -238,6 +266,10 @@ def run(args):
data = yaml.safe_load(f.read())
config = AgentHarnessConfig.model_validate(data)

opts = AgentHarnessOpts(
benchmark_exec_retry_interval=args.benchmark_exec_retry_interval,
benchmark_exec_max_attempts=args.benchmark_exec_max_attempts,
)
agent_harness = AgentHarness(
agent_manifest,
args.agent_directory,
Expand All @@ -249,5 +281,6 @@ def run(args):
benchmark_timeout=args.benchmark_timeout,
config=config,
single_run=args.single_run,
opts=opts,
)
asyncio.run(agent_harness.run())
2 changes: 2 additions & 0 deletions itbench_utilities/agent_harness/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ def main():
action="store_true",
help="Process one benchmark job and exit",
)
parser.add_argument("--benchmark_exec_max_attempts", type=int, default=3, help=f"Maximum number of attempts to run the benchmark with status updates (default: 3).")
parser.add_argument("--benchmark_exec_retry_interval", type=int, default=5, help=f"Seconds to wait between retry attempts for benchmark execution (default: 5).")

args = parser.parse_args()

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ dependencies = [
"pandas==2.2.3",
"pydantic==2.9.1",
"pydantic-settings==2.7.1",
"PyYAML==6.0.2",
"requests==2.32.3",
"tabulate==0.9.0",
"urllib3>=2.2.2",
Expand Down