Persist Conversation Trajectory from the remote runtime (#108)

simonrosenberg · openhands-agent · web-flow · commit 699736f6d318 · 2025-11-22T21:58:27.000+01:00
* Update SDK submodule to 49c42ee (main)

* Capture conversation archive from remote workspace during eval

* Capture remote conversations dir via base64 tar

* Fix import order to pass pre-commit checks

Co-authored-by: openhands <openhands@all-hands.dev>

* Move conversation archive capture to Evaluation base class

This change moves the conversation trajectory persistence logic from the
SWE-Bench-specific implementation to the Evaluation base class, making it
automatically available to all benchmarks.

Benefits:
- Automatic conversation capture for all benchmarks (SWE-Bench, GAIA, etc.)
- Consistent behavior across all evaluation workflows
- Better reproducibility and debugging capabilities
- Per-instance archives support parallel execution
- No need to manually add this to each new benchmark

The logic is now called in _process_one_mp after successful evaluation,
ensuring that conversation archives are captured for every instance across
all benchmarks. Archives are stored per-instance as conversations/{instance_id}.tar.gz
to support parallel workers without race conditions.

Co-authored-by: openhands <openhands@all-hands.dev>

* Revert sdk submodule to main

---------

Co-authored-by: openhands <openhands@all-hands.dev>
diff --git a/benchmarks/utils/evaluation.py b/benchmarks/utils/evaluation.py
@@ -2,12 +2,14 @@
 Evaluation orchestrator.
 """
 
+import base64
 import json
 import os
 import sys
 from abc import ABC, abstractmethod
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from contextlib import contextmanager
+from pathlib import Path
 from typing import Callable, List, Optional, Tuple
 
 from pydantic import BaseModel, Field
@@ -99,6 +101,60 @@ def _create_error_output(
             instance=instance.data,
         )
 
+    def _capture_conversation_archive(
+        self,
+        workspace: RemoteWorkspace,
+        instance: EvalInstance,
+    ) -> None:
+        """Capture conversation trajectory from the remote runtime.
+
+        Archives /workspace/conversations on the remote runtime (tar.gz, sent
+        back base64-encoded over stdout) and writes it to
+        <eval_output_dir>/conversations/<instance id>.tar.gz. One file per
+        instance keeps parallel workers from writing to the same path.
+
+        Best-effort: any failure is logged as a warning and never raised.
+
+        Args:
+            workspace: The remote workspace to capture from
+            instance: The evaluation instance being processed
+        """
+        try:
+            # Blank stdout means there was no conversations directory to archive.
+            conv_cmd = (
+                "cd / && "
+                "if [ -d workspace/conversations ]; then "
+                "tar -czf - workspace/conversations | base64; "
+                "else echo ''; fi"
+            )
+            tar_cmd = workspace.execute_command(conv_cmd)
+            if tar_cmd.exit_code != 0:
+                # A failed command is not the same as a missing directory.
+                raise RuntimeError(f"archive command exited {tar_cmd.exit_code}")
+            if tar_cmd.stdout.strip():
+                out_dir = Path(self.metadata.eval_output_dir) / "conversations"
+                out_dir.mkdir(parents=True, exist_ok=True)
+                # Flatten path separators so the id cannot escape out_dir.
+                safe_id = str(instance.id).replace("/", "_").replace("\\", "_")
+                conv_tar_path = out_dir / f"{safe_id}.tar.gz"
+                conv_tar_path.write_bytes(base64.b64decode(tar_cmd.stdout))
+                logger.info(
+                    "[child] Saved conversation archive for %s to %s",
+                    instance.id,
+                    conv_tar_path,
+                )
+            else:
+                logger.debug(
+                    "[child] No conversation archive for %s (directory not found or empty)",
+                    instance.id,
+                )
+        except Exception as e:
+            logger.warning(
+                "[child] Failed to capture conversation trajectory for %s: %s",
+                instance.id,
+                e,
+            )
+
     # --- Runner ---
     def run(
         self,
@@ -350,6 +406,10 @@ def _process_one_mp(
                 try:
                     workspace = self.prepare_workspace(instance)
                     out = self.evaluate_instance(instance, workspace)
+
+                    # Capture conversation archive after successful evaluation
+                    self._capture_conversation_archive(workspace, instance)
+
                     logger.info("[child] done id=%s", instance.id)
                     return instance, out
                 except Exception as e: