Skip to content

Commit 699736f

Browse files
Persist Conversation Trajectory from the remote runtime (#108)
* Update SDK submodule to 49c42ee (main)
* Capture conversation archive from remote workspace during eval
* Capture remote conversations dir via base64 tar
* Fix import order to pass pre-commit checks

  Co-authored-by: openhands <[email protected]>
* Move conversation archive capture to Evaluation base class

  This change moves the conversation trajectory persistence logic from the SWE-Bench-specific implementation to the Evaluation base class, making it automatically available to all benchmarks.

  Benefits:
  - Automatic conversation capture for all benchmarks (SWE-Bench, GAIA, etc.)
  - Consistent behavior across all evaluation workflows
  - Better reproducibility and debugging capabilities
  - Per-instance archives support parallel execution
  - No need to manually add this to each new benchmark

  The logic is now called in _process_one_mp after successful evaluation, ensuring that conversation archives are captured for every instance across all benchmarks. Archives are stored per-instance as conversations/{instance_id}.tar.gz to support parallel workers without race conditions.

  Co-authored-by: openhands <[email protected]>
* Revert sdk submodule to main

---------

Co-authored-by: openhands <[email protected]>
1 parent 5793428 commit 699736f

File tree

1 file changed

+60
-0
lines changed

1 file changed

+60
-0
lines changed

benchmarks/utils/evaluation.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,14 @@
22
Evaluation orchestrator.
33
"""
44

5+
import base64
56
import json
67
import os
78
import sys
89
from abc import ABC, abstractmethod
910
from concurrent.futures import ProcessPoolExecutor, as_completed
1011
from contextlib import contextmanager
12+
from pathlib import Path
1113
from typing import Callable, List, Optional, Tuple
1214

1315
from pydantic import BaseModel, Field
@@ -99,6 +101,60 @@ def _create_error_output(
99101
instance=instance.data,
100102
)
101103

104+
def _capture_conversation_archive(
    self,
    workspace: RemoteWorkspace,
    instance: EvalInstance,
) -> None:
    """Capture conversation trajectory from the remote runtime.

    Runs a shell command in the remote workspace that tars and gzips
    ``/workspace/conversations`` and base64-encodes the result to stdout,
    then decodes that output and writes it to
    ``<eval_output_dir>/conversations/<instance id>.tar.gz``.

    One file is written per instance so that parallel workers never
    write to the same path.

    This is best-effort: any failure is logged and swallowed so that a
    problem capturing the archive never fails the evaluation itself.

    Args:
        workspace: The remote workspace to capture from.
        instance: The evaluation instance being processed.
    """
    try:
        # Tar + gzip the conversations directory and base64-encode it so the
        # binary archive can travel back through the command's text stdout.
        # NOTE: the exit status of the pipeline is that of `base64`, so a
        # failure inside `tar` alone is not reflected in `exit_code`.
        conv_cmd = (
            "cd / && "
            "if [ -d workspace/conversations ]; then "
            "tar -czf - workspace/conversations | base64; "
            "else echo ''; fi"
        )
        result = workspace.execute_command(conv_cmd)

        # A non-zero exit code is a real failure, not "nothing to capture";
        # report it as such instead of hiding it behind a debug message.
        if result.exit_code != 0:
            logger.warning(
                "[child] Failed to capture conversation trajectory for %s: %s",
                instance.id,
                f"archive command exited with code {result.exit_code}",
            )
            return

        encoded = result.stdout.strip()
        if not encoded:
            logger.debug(
                "[child] No conversation archive for %s (directory not found or empty)",
                instance.id,
            )
            return

        # Save to instance-specific file to support parallel execution
        conversations_dir = Path(self.metadata.eval_output_dir) / "conversations"
        conversations_dir.mkdir(parents=True, exist_ok=True)

        # Path separators in the id would point into a (non-existent)
        # subdirectory or outside conversations_dir; flatten them so the
        # archive always lands directly inside conversations_dir.
        safe_id = str(instance.id).replace("/", "_").replace("\\", "_")
        conv_tar_path = conversations_dir / f"{safe_id}.tar.gz"

        # Decode and write the tar.gz file
        conv_tar_path.write_bytes(base64.b64decode(encoded))
        logger.info(
            "[child] Saved conversation archive for %s to %s",
            instance.id,
            conv_tar_path,
        )
    except Exception as e:
        # Deliberately broad: archive capture must never break the run.
        logger.warning(
            "[child] Failed to capture conversation trajectory for %s: %s",
            instance.id,
            e,
        )
157+
102158
# --- Runner ---
103159
def run(
104160
self,
@@ -350,6 +406,10 @@ def _process_one_mp(
350406
try:
351407
workspace = self.prepare_workspace(instance)
352408
out = self.evaluate_instance(instance, workspace)
409+
410+
# Capture conversation archive after successful evaluation
411+
self._capture_conversation_archive(workspace, instance)
412+
353413
logger.info("[child] done id=%s", instance.id)
354414
return instance, out
355415
except Exception as e:

0 commit comments

Comments
 (0)