From a8f8a08b0dc3b122650da2a5f68e7b02600532b5 Mon Sep 17 00:00:00 2001
From: yyDing1 <yyding.me@gmail.com>
Date: Fri, 5 Jun 2026 15:44:59 +0800
Subject: [PATCH 1/5] Add SWE-bench Multilingual preprocessing and reward spec

---
 .../data_preprocess/swe_bench_multilingual.py | 188 +++++++++++++++
 uni_agent/reward/__init__.py                  |   2 +
 uni_agent/reward/registry.py                  |   1 +
 uni_agent/reward/swe_bench_multilingual.py    | 214 ++++++++++++++++++
 4 files changed, 405 insertions(+)
 create mode 100644 examples/data_preprocess/swe_bench_multilingual.py
 create mode 100644 uni_agent/reward/swe_bench_multilingual.py
diff --git a/examples/data_preprocess/swe_bench_multilingual.py b/examples/data_preprocess/swe_bench_multilingual.py
new file mode 100644
index 00000000..17f4b948
--- /dev/null
+++ b/examples/data_preprocess/swe_bench_multilingual.py
@@ -0,0 +1,188 @@
+# ruff: noqa: E501
+"""Preprocess SWE-bench/SWE-bench_Multilingual into the uni-agent SWE-agent format.
+
+The dataset has 300 instances across 41 repos in 7 non-Python languages
+(c/go/java/js/php/ruby/rust). Grading is the official ``swebench`` harness, wired up
+by the ``swe_bench_multilingual`` reward spec, so we only keep the fields that
+``make_test_spec`` + grading need: ``instance_id``, ``repo``, ``version``,
+``base_commit``, ``test_patch``, ``patch`` (gold, for verifiers),
+``problem_statement``, ``FAIL_TO_PASS``, ``PASS_TO_PASS``.
+
+The repo lives at ``/testbed`` in the published ``swebench/sweb.eval.x86_64.<id>``
+images. The image already holds the repo at ``base_commit`` plus the harness's
+build-time ``pre_install``/``build`` edits (e.g. apache/lucene injects a gradle
+``testLogging`` block the parser relies on). Those edits are uncommitted, so
+post-setup *commits* them (instead of ``git reset --hard``, which would revert them
+and break grading) -- giving the agent a clean tree while preserving the build
+config.
+
+Example::
+
+    DEPLOYMENT=modal python examples/data_preprocess/swe_bench_multilingual.py \
+        --local-save-dir ~/data/swe_agent
+"""
+
+import argparse
+import os
+
+from datasets import load_dataset
+from swebench.harness.constants import MAP_REPO_TO_EXT
+
+impl = os.getenv("DEPLOYMENT", "modal").lower()
+if impl != "modal":
+    # Only the public ``swebench/`` Docker Hub images cover the multilingual set.
+    raise ValueError("SWE-bench_Multilingual preprocessing only supports modal deployment")
+
+
+def get_image_name(instance_id: str) -> str:
+    """Return the published image ref for *instance_id* (same scheme as swebench's ``instance_image_key``)."""
+    return "swebench/sweb.eval.x86_64." + instance_id.lower().replace("__", "_1776_")
+
+
+# Map swebench file-extension code -> human language name for the prompt.
+EXT_TO_LANGUAGE = {
+    "c": "C",
+    "go": "Go",
+    "java": "Java",
+    "js": "JavaScript",
+    "php": "PHP",
+    "rb": "Ruby",
+    "rs": "Rust",
+}
+
+
+SYSTEM_PROMPT = """
+You are a helpful assistant that can interact with a computer to solve tasks.
+""".strip()
+
+USER_PROMPT = """
+<uploaded_files>
+/testbed
+</uploaded_files>
+I have uploaded a code repository in the /testbed directory (primary language: {language}). You can explore and modify files using the available tools. Consider the following issue description:
+
+<issue_description>
+{problem_statement}
+</issue_description>
+
+Can you help me implement the necessary changes to the repository to fix the <issue_description>?
+I have already taken care of all changes to any of the test files described in the <issue_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!
+Also the development environment is already set up for you (i.e., all dependencies are already installed and the project is already built), so you don't need to install other packages.
+Your task is to make the minimal changes to non-test files in the /testbed directory to ensure the <issue_description> is satisfied.
+
+Follow these steps to resolve the issue:
+1. First, explore the codebase to locate and understand the code relevant to the <issue_description>.
+- Use efficient search commands to identify key files and functions.
+- Build your understanding of how the code works, the expected behaviors and edge cases, and the potential root causes for the given issue.
+
+2. Assess whether you can reproduce the issue:
+- Create a small script (e.g. at '/testbed/reproduce_issue.*') that demonstrates the error, using the repository's own language/runtime.
+- Execute this script to confirm the error behavior before fixing it.
+- Your reproduction script should also assert the expected behavior for the fixed code.
+
+3. Analyze the root cause:
+- Identify the underlying problem based on your code exploration and reproduction results.
+- Reason about multiple potential approaches and pick the most elegant and effective one, considering correctness, generality, and side effects.
+
+4. Implement your solution:
+- Make targeted changes to the necessary files following idiomatic code patterns once you determine the root cause.
+
+5. Verify your solution:
+- Rerun your reproduction script to confirm the error is fixed, iterating until successful.
+
+6. Run unit tests:
+- Find and run the relevant unit tests using the project's own test runner to ensure your solution is correct and does not cause regressions.
+- DO NOT MODIFY any of the existing unit tests. You can add new edge test cases in a separate file if needed BUT DO NOT MODIFY THE EXISTING TESTS.
+
+7. Test edge cases:
+- Identify potential edge cases that might challenge your solution, create additional tests in a separate file, and verify robustness.
+
+8. Submit your solution:
+- Once you have verified your solution, submit it using the `submit` tool.
+
+A successful resolution means:
+- The specific error/issue described no longer occurs
+- Your changes maintain compatibility with existing functionality
+- Edge cases are properly handled
+""".strip()
+
+
+def build_swe_bench_multilingual():
+    def process(example):
+        repo = example["repo"]
+        instance_id = example["instance_id"]
+        language = EXT_TO_LANGUAGE.get(MAP_REPO_TO_EXT[repo], "the project's")
+
+        metadata = {
+            "instance_id": instance_id,
+            "repo": repo,
+            "version": str(example["version"]),
+            "base_commit": example["base_commit"],
+            "patch": example["patch"],
+            "test_patch": example["test_patch"],
+            "problem_statement": example["problem_statement"],
+            "FAIL_TO_PASS": example["FAIL_TO_PASS"],
+            "PASS_TO_PASS": example["PASS_TO_PASS"],
+        }
+
+        reset_script = " && ".join(
+            [
+                "git tag -d $(git tag -l)",
+                "git reflog expire --expire=now --all",
+                "git gc --prune=now",
+                "git config --global user.email setup@swebench.config",
+                "git config --global user.name SWE-bench",
+                "git commit --allow-empty -am SWE-bench",
+                f"git checkout {metadata['base_commit']}",
+                "git clean -fdq",
+            ]
+        )
+
+        return {
+            "prompt": [
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {
+                    "role": "user",
+                    "content": USER_PROMPT.format(
+                        language=language,
+                        problem_statement=example["problem_statement"],
+                    ),
+                },
+            ],
+            "agent_name": "swe_agent",
+            "extra_info": {
+                "tools_kwargs": {
+                    "env": {
+                        "deployment": {"image": get_image_name(instance_id)},
+                        "post_setup_cmd": reset_script,
+                    },
+                    "reward": {
+                        "name": "swe_bench_multilingual",
+                        "metadata": metadata,
+                    },
+                },
+            },
+        }
+
+    data_source = "SWE-bench/SWE-bench_Multilingual"
+    print(f"Loading the {data_source} dataset from huggingface...", flush=True)
+    dataset = load_dataset(data_source, split="test")
+    print(f"Loaded {len(dataset)} raw instances", flush=True)
+
+    # Test set: keep every instance (no filtering).
+    dataset = dataset.map(process, remove_columns=dataset.column_names)
+    return dataset
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--local-save-dir", default="~/data/swe_agent")
+    args = parser.parse_args()
+
+    save_dir = os.path.expanduser(args.local_save_dir)
+    os.makedirs(save_dir, exist_ok=True)
+
+    sbm_dataset = build_swe_bench_multilingual()
+    out_path = f"{save_dir}/swe_bench_multilingual_{impl}.parquet"
+    sbm_dataset.to_parquet(out_path)
+    print(f"Wrote {len(sbm_dataset)} instances to {out_path}", flush=True)
diff --git a/uni_agent/reward/__init__.py b/uni_agent/reward/__init__.py
index 245d75d1..755eefaf 100644
--- a/uni_agent/reward/__init__.py
+++ b/uni_agent/reward/__init__.py
@@ -3,6 +3,7 @@
 _LAZY_EXPORTS = {
     "SearchRewardSpec": ".search",
     "SWEBenchRewardSpec": ".swe_bench",
+    "SWEBenchMultilingualRewardSpec": ".swe_bench_multilingual",
     "R2EGymRewardSpec": ".r2e_gym",
     "SWEREBenchRewardSpec": ".swe_rebench",
 }
@@ -11,6 +12,7 @@
     "load_reward_spec",
     "SearchRewardSpec",
     "SWEBenchRewardSpec",
+    "SWEBenchMultilingualRewardSpec",
     "R2EGymRewardSpec",
     "SWEREBenchRewardSpec",
 ]
diff --git a/uni_agent/reward/registry.py b/uni_agent/reward/registry.py
index 1ed16ae4..1d464fcc 100644
--- a/uni_agent/reward/registry.py
+++ b/uni_agent/reward/registry.py
@@ -11,6 +11,7 @@
 REWARD_SPEC_MODULES: dict[str, str] = {
     "search": "uni_agent.reward.search",
     "swe_bench": "uni_agent.reward.swe_bench",
+    "swe_bench_multilingual": "uni_agent.reward.swe_bench_multilingual",
     "swe_rebench": "uni_agent.reward.swe_rebench",
     "r2e_gym": "uni_agent.reward.r2e_gym",
     "terminal_bench": "uni_agent.reward.terminal_bench",
diff --git a/uni_agent/reward/swe_bench_multilingual.py b/uni_agent/reward/swe_bench_multilingual.py
new file mode 100644
index 00000000..435f3c44
--- /dev/null
+++ b/uni_agent/reward/swe_bench_multilingual.py
@@ -0,0 +1,214 @@
+"""Reward spec for SWE-bench/SWE-bench_Multilingual.
+
+The dataset spans 7 non-Python languages (c/go/java/js/php/ruby/rust) and is fully
+covered by the official ``swebench`` harness, so we grade with it directly instead
+of re-implementing per-language logic:
+
+* The eval script is built explicitly in ``_make_eval_script_list`` -- a transcription
+  of swebench's ``make_eval_script_list_common`` (+ the JS image-asset step): reset
+  *only the test files* to ``base_commit`` -> ``git apply`` the gold ``test_patch`` ->
+  optional per-repo ``build`` -> run the repo's ``test_cmd`` wrapped in ``START``/
+  ``END`` markers -> reset the test files again. Crucially it never ``git reset --hard``
+  the whole repo, so build-time edits and the agent's solution in ``/testbed`` survive.
+* The per-repo parser (``MAP_REPO_TO_PARSER``) + official resolution metric
+  (``get_eval_tests_report`` / ``get_resolution_status``) decide ``resolved``;
+  ``make_test_spec`` still provides those grading inputs (F2P/P2P, parser key).
+
+Instance images are the published ``swebench/sweb.eval.x86_64.<id>`` containers with
+the repo checked out at ``/testbed``.
+"""
+
+import time
+import uuid
+from pathlib import Path
+
+from swebench.harness.constants import (
+    END_TEST_OUTPUT,
+    FAIL_ONLY_REPOS,
+    MAP_REPO_TO_EXT,
+    MAP_REPO_VERSION_TO_SPECS,
+    START_TEST_OUTPUT,
+    EvalType,
+    ResolvedStatus,
+)
+from swebench.harness.grading import get_eval_tests_report, get_resolution_status
+from swebench.harness.log_parsers import MAP_REPO_TO_PARSER
+from swebench.harness.test_spec.javascript import get_download_img_commands
+from swebench.harness.test_spec.test_spec import make_test_spec
+
+from uni_agent.async_logging import get_logger
+from uni_agent.interaction import AgentEnv
+from uni_agent.reward.base import AbstractRewardSpec
+from uni_agent.reward.registry import register_reward_spec
+from uni_agent.utils import auto_await
+
+# Heredoc delimiter the upstream harness uses to inline the test patch into the script.
+HEREDOC_DELIMITER = "EOF_114329324912"
+
+
+@register_reward_spec("swe_bench_multilingual")
+class SWEBenchMultilingualRewardSpec(AbstractRewardSpec):
+    def __init__(self, *, run_id: str, metadata: dict, env: AgentEnv, eval_timeout: int = 1800):
+        self.run_id = run_id
+        self.metadata = metadata
+        self.env = env
+        self.logger = get_logger("reward_spec", run_id=run_id)
+        self.eval_timeout = eval_timeout
+        self.repo = metadata["repo"]
+        self.version = metadata["version"]
+        # Still used for grading inputs (F2P/P2P, parser key, language); pure-CPU.
+        self.test_spec = make_test_spec(metadata)
+
+    @auto_await
+    async def apply_gold_patch(self) -> None:
+        """Apply the dataset gold patch to the working tree (used by verifiers)."""
+        await self._apply_patch(self.metadata["patch"])
+
+    @auto_await
+    async def compute_reward(self, **kwargs) -> tuple[bool, dict]:
+        result = {
+            "eval_completed": False,
+            "eval_execution_time": None,
+            "eval_report": None,
+            "resolved": False,
+        }
+        try:
+            script_path = Path(f"/tmp/sbm_eval_{uuid.uuid4().hex}.sh")
+            await self.env.write_file(script_path, self._build_eval_script())
+
+            t0 = time.perf_counter()
+            # `| cat` makes stdout a pipe (non-TTY), standing in for the official
+            # harness's non-TTY `docker exec` so runners don't emit colored/TUI output.
+            output = await self.env.communicate(
+                f"bash {script_path} 2>&1 | cat",
+                timeout=self.eval_timeout,
+                check="ignore",
+            )
+            result["eval_execution_time"] = time.perf_counter() - t0
+            result["eval_completed"] = True
+
+            eval_report = self._grade(output)
+            result["eval_report"] = eval_report
+            result["resolved"] = eval_report["resolved"]
+            self.logger.info(
+                f"SWE-bench-Multilingual eval: instance={self.test_spec.instance_id} "
+                f"repo={self.test_spec.repo} lang={self.test_spec.language} "
+                f"resolved={eval_report['resolved']} found={eval_report['found_eval_status']} "
+                f"time={result['eval_execution_time']:.1f}s"
+            )
+        except Exception as exc:
+            self.logger.error(f"Failed to evaluate SWE-bench-Multilingual instance: {exc}")
+            result["error"] = str(exc)
+        return result["resolved"], result
+
+    def _make_eval_script_list(self) -> list[str]:
+        """Build the eval commands: a transcription of swebench's ``make_eval_script_list_common``
+        (+ the JS image-asset step), kept inline so the flow is visible and tweakable.
+
+        Steps: reset *only the test files* to ``base_commit`` (never the whole repo, so
+        build-time ``pre_install`` edits and the agent's solution survive) -> ``git
+        apply`` the gold ``test_patch`` -> optional per-repo ``build`` -> run ``test_cmd``
+        between the ``START``/``END`` markers the parser keys off -> reset the test files
+        again so they can't be tampered with.
+        """
+        instance = self.metadata
+        repo_directory = "/testbed"
+        test_patch = instance["test_patch"]
+        specs = MAP_REPO_VERSION_TO_SPECS[self.repo][self.version]
+
+        # Pre-existing files show up as ``--- a/<path>``; files the patch creates use
+        # ``--- /dev/null`` and are skipped because they cannot be checked out.
+        test_files = [line[6:] for line in test_patch.splitlines() if line.startswith("--- a/")]
+        reset_tests_command = 'echo "No test files to reset"'
+        if test_files:
+            reset_tests_command = f"git checkout {instance['base_commit']} -- {' '.join(test_files)}"
+        build_commands = list(specs.get("build", []))
+        apply_test_patch_command = (
+            f"git apply --verbose --reject - <<'{HEREDOC_DELIMITER}'\n{test_patch}\n{HEREDOC_DELIMITER}"
+        )
+        test_cmd = specs["test_cmd"]
+        test_commands = [test_cmd] if isinstance(test_cmd, str) else list(test_cmd)
+        eval_commands = [
+            f"cd {repo_directory}",
+            f"git config --global --add safe.directory {repo_directory}",
+            f"cd {repo_directory}",
+            reset_tests_command,
+            apply_test_patch_command,
+            *build_commands,
+            f": '{START_TEST_OUTPUT}'",
+            *test_commands,
+            f": '{END_TEST_OUTPUT}'",
+            reset_tests_command,
+        ]
+        # JS only: fetch test image fixtures right after the reset (no-op without ``image_assets``).
+        if MAP_REPO_TO_EXT[self.repo] == "js":
+            eval_commands[4:4] = get_download_img_commands(instance)
+        return eval_commands
+
+    def _build_eval_script(self) -> str:
+        """Assemble the eval script (same header as ``TestSpec.eval_script``; no
+        ``set -e`` on purpose, so the trailing test-file reset always runs)."""
+        return "\n".join(["#!/bin/bash", "set -uxo pipefail", *self._make_eval_script_list()]) + "\n"
+
+    def _grade(self, output: str) -> dict:
+        """Parse the test region and grade against FAIL_TO_PASS / PASS_TO_PASS."""
+        report = {"resolved": False, "found_eval_status": False, "test_status": None}
+
+        parser = MAP_REPO_TO_PARSER[self.test_spec.repo]
+        if START_TEST_OUTPUT in output and END_TEST_OUTPUT in output:
+            region = output.split(START_TEST_OUTPUT)[1].split(END_TEST_OUTPUT)[0]
+        else:
+            region = output
+        status_map = parser(region, self.test_spec)
+        # Fallback: some runners write results outside the markers (e.g. stderr).
+        if not status_map:
+            status_map = parser(output, self.test_spec)
+        if not status_map:
+            self.logger.warning(
+                "SWE-bench-Multilingual parser matched 0 tests -- the test command likely "
+                f"failed to run. Output tail:\n{output[-3000:]}"
+            )
+            return report
+
+        report["found_eval_status"] = True
+        eval_ref = {
+            "instance_id": self.test_spec.instance_id,
+            "FAIL_TO_PASS": self.test_spec.FAIL_TO_PASS,
+            "PASS_TO_PASS": self.test_spec.PASS_TO_PASS,
+        }
+        eval_type = EvalType.FAIL_ONLY if self.test_spec.repo in FAIL_ONLY_REPOS else EvalType.PASS_AND_FAIL
+        tests_status = get_eval_tests_report(status_map, eval_ref, eval_type=eval_type)
+        report["test_status"] = tests_status
+        report["resolved"] = get_resolution_status(tests_status) == ResolvedStatus.FULL.value
+        if not report["resolved"]:
+            f2p_missing = tests_status["FAIL_TO_PASS"]["failure"]
+            p2p_failed = tests_status["PASS_TO_PASS"]["failure"]
+            self.logger.warning(
+                f"SWE-bench-Multilingual NOT resolved: FAIL_TO_PASS still failing={f2p_missing[:25]} "
+                f"PASS_TO_PASS broke={p2p_failed[:25]}"
+            )
+        return report
+
+    @auto_await
+    async def _apply_patch(self, patch: str) -> None:
+        """Apply a patch string to the env. Tries multiple apply strategies in order."""
+        if not patch or not patch.strip():
+            self.logger.info("Empty patch, nothing to apply.")
+            return
+        patch_path = Path(f"/tmp/patch_{uuid.uuid4()}.diff")
+        await self.env.write_file(patch_path, patch)
+        commands = [
+            f"cd /testbed && git apply --whitespace=fix {patch_path.as_posix()}",
+            f"cd /testbed && git apply --reject --whitespace=nowarn {patch_path.as_posix()}",
+            f"cd /testbed && patch --batch --fuzz=5 -p1 -i {patch_path.as_posix()}",
+        ]
+        last_error: Exception | None = None
+        for cmd in commands:
+            try:
+                await self.env.communicate(cmd, check="raise")
+                self.logger.info("Applied patch successfully!")
+                return
+            except RuntimeError as e:
+                last_error = e
+                continue
+        raise RuntimeError("Failed to apply patch with any command") from last_error

From 772a98276cefa36a22439896b11a9d847fd00ceb Mon Sep 17 00:00:00 2001
From: yyDing1 <yyding.me@gmail.com>
Date: Sat, 6 Jun 2026 18:03:33 +0800
Subject: [PATCH 2/5] update

---
 docs/source/start/agent_interaction.md        |   1 +
 examples/agent_env/demo.py                    |   6 +
 .../agent_interaction/parallel_verify_swe.py  | 136 ++++++++++++------
 .../data_preprocess/swe_bench_multilingual.py |  13 +-
 .../data_preprocess/swe_bench_verified.py     |   8 +-
 uni_agent/deployment/modal/deployment.py      |   8 +-
 uni_agent/reward/swe_bench.py                 |   9 +-
 uni_agent/reward/swe_bench_multilingual.py    |  14 +-
 8 files changed, 130 insertions(+), 65 deletions(-)

diff --git a/docs/source/start/agent_interaction.md b/docs/source/start/agent_interaction.md
index 810b1e59..c2372ade 100644
--- a/docs/source/start/agent_interaction.md
+++ b/docs/source/start/agent_interaction.md
@@ -15,6 +15,7 @@ The inference and verification scripts for this page live under `examples/agent_
 | Qwen3-Coder-Next               | temp=0.8, topp=0.9, tp=16, 300 turns, 128K context | **67.6** (Avg@4) |
 | Qwen3.5-4B                     | temp=0.8, topp=0.9, tp=4, 100 turns, 64K context   | **45.2** (Avg@1) |
 | Qwen3.5-9B                     | temp=1.0, topp=0.7, tp=4, 100 turns, 64K context   | **53.8** (Avg@1) |
+| Qwen3.5-9B                     | temp=1.0, topp=0.95, tp=4, 200 turns, 128K context | **65.6** (Avg@1) |
 | Qwen3.5-35B-A3B                | temp=1.0, topp=0.7, tp=4, 300 turns, 128K context  | **68.4** (Avg@1) |
 
 **Reference results on Terminal-Bench v2 with Uni-Agent:**
diff --git a/examples/agent_env/demo.py b/examples/agent_env/demo.py
index 3a33dee5..8be97b92 100644
--- a/examples/agent_env/demo.py
+++ b/examples/agent_env/demo.py
@@ -83,6 +83,12 @@
     "deployment": deployment_config,
     "env_variables": {
         "PIP_PROGRESS_BAR": "off",
+        "PIP_CACHE_DIR": "~/.cache/pip",
+        "PAGER": "cat",
+        "MANPAGER": "cat",
+        "LESS": "-R",
+        "TQDM_DISABLE": "1",
+        "GIT_PAGER": "cat",
     },
 }
 env_config = AgentEnvConfig(**env_config)
diff --git a/examples/agent_interaction/parallel_verify_swe.py b/examples/agent_interaction/parallel_verify_swe.py
index d9533efb..8fcde11b 100644
--- a/examples/agent_interaction/parallel_verify_swe.py
+++ b/examples/agent_interaction/parallel_verify_swe.py
@@ -7,42 +7,41 @@
 from pathlib import Path
 
 import ray
-from datasets import load_dataset
+from tqdm import tqdm
 
+from datasets import load_dataset
 from uni_agent.async_logging import add_file_handler, cleanup_handlers
 from uni_agent.interaction import AgentEnv, AgentEnvConfig
 from uni_agent.reward import load_reward_spec
 
-logger = logging.getLogger(__file__)
+logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
+logger = logging.getLogger(__name__)
 logger.setLevel("INFO")
 
+GLOBAL_CONCURRENCY = int(os.getenv("GLOBAL_CONCURRENCY", 512))
+NUM_WORKERS = int(os.getenv("NUM_WORKERS", 8))
+DATA_PATH = os.getenv("DATA_PATH", "/home/tiger/data/swe_agent/swe_bench_multilingual_modal.parquet")
+
 
 async def run_sample(sample):
     run_id = str(uuid.uuid4())
     instance = sample["extra_info"]["tools_kwargs"]
     impl = os.getenv("DEPLOYMENT", "vefaas").lower()
 
-    # SWE preprocessors emit ``env.deployment.image`` (nested, matching
-    # ``AgentEnvConfig`` / ``DeployConfig``). Older parquets used flat
-    # ``env.image``; accept both so a stale parquet doesn't silently break.
-    instance_image = instance["env"].get("deployment", {}).get("image") or instance["env"].get("image")
-    if instance_image is None:
+    case_deployment = dict(instance["env"].get("deployment", {}))
+    if not case_deployment.get("image"):
         raise KeyError("No image found in instance.env.deployment.image or instance.env.image")
 
     if impl == "vefaas":
-        deployment_config = {
+        defaults = {
             "type": "vefaas",
-            "image": instance_image,
             "command": "curl -fsSL https://vefaas-swe.tos-cn-beijing.ivolces.com/swe-rex/install_1.4.0.sh | bash -s -- {token}",
             "timeout": 600.0,
             "startup_timeout": 180.0,
-            "function_id": os.getenv("VEFAAS_FUNCTION_ID"),
-            "function_route": os.getenv("VEFAAS_FUNCTION_ROUTE"),
         }
     elif impl == "modal":
-        deployment_config = {
+        defaults = {
             "type": "modal",
-            "image": instance_image,
             "startup_timeout": 600.0,
             "runtime_timeout": 600.0,
             "deployment_timeout": 3600.0,
@@ -52,6 +51,9 @@ async def run_sample(sample):
     else:
         raise ValueError(f"Invalid environment implementation: {impl}")
 
+    # Case config wins; defaults fill in whatever the case didn't specify.
+    deployment_config = {**defaults, **case_deployment}
+
     env_config = {
         "deployment": deployment_config,
         "env_variables": {
@@ -78,51 +80,84 @@ async def run_sample(sample):
     reward_spec = load_reward_spec(reward_config)
     add_file_handler(Path(f"/tmp/eval_gold_patch/{run_id}.log"), run_id)
 
-    await env.start()
-    await reward_spec.apply_gold_patch()
-    _, result = await reward_spec.compute_reward()
-    await env.close()
-    cleanup_handlers(run_id)
+    try:
+        await env.start()
+        await reward_spec.apply_gold_patch()
+        _, result = await reward_spec.compute_reward()
+    except Exception as e:
+        logger.error(f"Error running sample {run_id}: {e}")
+        result = {"resolved": False, "eval_completed": False, "eval_execution_time": None}
+    finally:
+        await env.close()
+        cleanup_handlers(run_id)
     return result
 
 
 @ray.remote
 class TestEvalActor:
-    _semaphore = asyncio.Semaphore(64)
-
-    async def run_batch(self, samples):
-        tasks = [self.run_single(sample) for sample in samples]
-        return await asyncio.gather(*tasks)
+    _semaphore = asyncio.Semaphore(max(1, GLOBAL_CONCURRENCY // NUM_WORKERS))
 
     async def run_single(self, sample):
         async with self._semaphore:
             return await run_sample(sample)
 
 
+def _rule(text: str = "", width: int = 50, ch: str = "─") -> str:
+    """Build a horizontal rule of ``ch``, with ``text`` centered in it when given.
+    Width is counted with ``len``; a title longer than ``width - 2`` gets no padding."""
+    if text:
+        fill = max(0, width - len(text) - 2)
+        return f"{ch * (fill // 2)} {text} {ch * (fill - fill // 2)}"
+    return ch * width
+
+
 def main():
     ray.init()
     # data_path = "/home/tiger/data/swe_agent/swe_rebench_filtered.parquet"
     # data_path = "/home/tiger/data/swe_agent/r2e_gym_subset.parquet"
-    data_path = "/home/tiger/data/swe_agent/swe_bench_verified_modal.parquet"
-    dataset = load_dataset("parquet", data_files=data_path, split="train")
+    dataset = load_dataset("parquet", data_files=DATA_PATH, split="train")
     samples = dataset.to_list()
-    workers = [TestEvalActor.remote() for _ in range(8)]
-    futures = []
-    chunk_size = (len(samples) - 1) // len(workers) + 1
-    for i in range(len(workers)):
-        chunk = samples[i * chunk_size : (i + 1) * chunk_size]
-        futures.append(workers[i].run_batch.remote(chunk))
-    # each future returns a list of per-sample results (one chunk per worker)
+    logger.info(f"loaded {len(samples)} samples from {DATA_PATH}")
+    logger.info(f"deployment={os.getenv('DEPLOYMENT', 'vefaas')} workers={NUM_WORKERS} concurrency={GLOBAL_CONCURRENCY}")
+
+    workers = [TestEvalActor.remote() for _ in range(NUM_WORKERS)]
+    # one future per sample (round-robin across workers) so we can stream
+    # per-sample progress; the actor semaphore still bounds real concurrency.
+    futures = [workers[i % len(workers)].run_single.remote(s) for i, s in enumerate(samples)]
+    fut_to_idx = {f: i for i, f in enumerate(futures)}
 
     begin_time = time.time()
-    results_chunk = ray.get(futures)
+    results: list = [None] * len(futures)
+    ok = wa = tle = 0
+    remaining = list(futures)
+    with tqdm(
+        total=len(futures),
+        desc="🚀 eval",
+        colour="green",
+        unit="inst",
+        dynamic_ncols=True,
+        bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]{postfix}",
+    ) as pbar:
+        while remaining:
+            done, remaining = ray.wait(remaining, num_returns=1)
+            for d in done:
+                res = ray.get(d)
+                results[fut_to_idx[d]] = res
+                if res.get("resolved"):
+                    ok += 1
+                elif res.get("eval_completed"):
+                    wa += 1
+                else:
+                    tle += 1
+                rate = ok / (ok + wa + tle) * 100  # pbar.n lags by one until update() below
+                pbar.set_postfix_str(f"✅{ok} ❌WA{wa} ⏱TLE{tle} | {rate:.0f}% pass")
+                pbar.update(1)
     end_time = time.time()
-    logger.info(f"time cost: {end_time - begin_time:.2f}s")
-    results = [item for chunk in results_chunk for item in chunk]
+
     all_num = len(results)
-    success_num = len([item for item in results if item["resolved"]])
-    fail_wa_num = len([item for item in results if not item["resolved"] and item["eval_completed"]])
-    fail_tle_num = len([item for item in results if not item["resolved"] and not item["eval_completed"]])
+    success_num = len([r for r in results if r.get("resolved")])
+    fail_wa_num = len([r for r in results if not r.get("resolved") and r.get("eval_completed")])
+    fail_tle_num = len([r for r in results if not r.get("resolved") and not r.get("eval_completed")])
 
     def instance_name(sample):
         return sample["extra_info"]["tools_kwargs"]["reward"]["metadata"]["instance_id"]
@@ -130,21 +165,32 @@ def instance_name(sample):
     fail_wa_names = [
         instance_name(sample)
         for sample, item in zip(samples, results, strict=False)
-        if not item["resolved"] and item["eval_completed"]
+        if not item.get("resolved") and item.get("eval_completed")
     ]
     fail_tle_names = [
         instance_name(sample)
         for sample, item in zip(samples, results, strict=False)
-        if not item["resolved"] and not item["eval_completed"]
+        if not item.get("resolved") and not item.get("eval_completed")
     ]
 
     exec_times = [r["eval_execution_time"] for r in results if r.get("eval_execution_time") is not None]
-    avg_exec_time = sum(exec_times) / len(exec_times)
-
-    logger.info(
-        f"all_num: {all_num}, success_num: {success_num}, fail_wa_num: {fail_wa_num}, fail_tle_num: {fail_tle_num}"
+    avg_exec_time = sum(exec_times) / len(exec_times) if exec_times else 0.0
+    pass_rate = success_num / all_num * 100 if all_num else 0.0
+    wall = end_time - begin_time
+
+    summary = "\n".join(
+        [
+            "",
+            _rule("🧪 eval summary"),
+            f"  ✅ resolved   {success_num:>4}   ({pass_rate:.1f}%)",
+            f"  ❌ wrong-ans  {fail_wa_num:>4}",
+            f"  ⏱  timeout    {fail_tle_num:>4}",
+            f"  Σ  total      {all_num:>4}",
+            _rule(f"avg {avg_exec_time:.1f}s · wall {wall:.1f}s · n={len(exec_times)}"),
+            "",
+        ]
     )
-    logger.info(f"avg_execution_time: {avg_exec_time:.2f}s (n={len(exec_times)})")
+    print(summary)
 
     logger.info(f"fail_wa instance names: {fail_wa_names}")
     logger.info(f"fail_tle instance names: {fail_tle_names}")
diff --git a/examples/data_preprocess/swe_bench_multilingual.py b/examples/data_preprocess/swe_bench_multilingual.py
index 17f4b948..0d942b14 100644
--- a/examples/data_preprocess/swe_bench_multilingual.py
+++ b/examples/data_preprocess/swe_bench_multilingual.py
@@ -130,11 +130,6 @@ def process(example):
                 "git tag -d $(git tag -l)",
                 "git reflog expire --expire=now --all",
                 "git gc --prune=now",
-                "git config --global user.email setup@swebench.config",
-                "git config --global user.name SWE-bench",
-                "git commit --allow-empty -am SWE-bench",
-                f"git checkout {metadata['base_commit']}",
-                "git clean -fdq",
             ]
         )
 
@@ -153,7 +148,13 @@ def process(example):
             "extra_info": {
                 "tools_kwargs": {
                     "env": {
-                        "deployment": {"image": get_image_name(instance_id)},
+                        "deployment": {
+                            "image": get_image_name(instance_id),
+                            "modal_sandbox_kwargs": {
+                                "cpu": (0.5, 4.0),
+                                "memory": (1024, 8192),
+                            },
+                        },
                         "post_setup_cmd": reset_script,
                     },
                     "reward": {
diff --git a/examples/data_preprocess/swe_bench_verified.py b/examples/data_preprocess/swe_bench_verified.py
index 662ccbc7..8f26ad16 100644
--- a/examples/data_preprocess/swe_bench_verified.py
+++ b/examples/data_preprocess/swe_bench_verified.py
@@ -109,13 +109,7 @@ def process_swe_bench_verified(example):
         dataset_id = "swe-bench-verified"
         instance_id = example["instance_id"]
         image_name = get_image_name(dataset_id, instance_id)
-        reset_cmds = [
-            "cd /testbed",
-            "git restore .",
-            "git reset --hard",
-            f"git checkout {example['base_commit']}",
-            "git clean -fdq",
-        ]
+        reset_cmds = []
         reset_script = " && ".join(reset_cmds)
         sample = {
             "prompt": [
diff --git a/uni_agent/deployment/modal/deployment.py b/uni_agent/deployment/modal/deployment.py
index 7168f132..72b4fa85 100644
--- a/uni_agent/deployment/modal/deployment.py
+++ b/uni_agent/deployment/modal/deployment.py
@@ -28,7 +28,7 @@
 # The semaphore is process-local, so MODAL_MAX_STARTING_PER_WORKER is a
 # per-worker cap (size it as fleet-wide target / num rollout workers).
 # MODAL_INIT_WALL_BUDGET caps a single trajectory's total init wall-clock.
-_DEFAULT_MAX_STARTING_PER_WORKER = 8
+_DEFAULT_MAX_STARTING_PER_WORKER = 64
 _DEFAULT_INIT_WALL_BUDGET = 900.0
 _STARTING_SEMA: asyncio.Semaphore | None = None
 
@@ -112,7 +112,7 @@ def from_ecr(self, image: str) -> modal.Image:
             raise ValueError(msg) from e
 
     def ensure_pipx_installed(self, image: modal.Image) -> modal.Image:
-        image = image.apt_install("pipx")
+        image = image.apt_install("pipx", env={"DEBIAN_FRONTEND": "noninteractive"})
         return image.run_commands("pipx ensurepath")
 
     def auto(self, image_spec: str | modal.Image | PurePath) -> modal.Image:
@@ -167,6 +167,10 @@ def __init__(
         self._proxy = proxy
         if modal_sandbox_kwargs is None:
             modal_sandbox_kwargs = {}
+        modal_sandbox_kwargs = dict(modal_sandbox_kwargs)
+        for _key in ("cpu", "memory"):
+            if isinstance(modal_sandbox_kwargs.get(_key), list):
+                modal_sandbox_kwargs[_key] = tuple(modal_sandbox_kwargs[_key])
         self._modal_kwargs = modal_sandbox_kwargs
         self._hooks = CombinedDeploymentHook()
 
diff --git a/uni_agent/reward/swe_bench.py b/uni_agent/reward/swe_bench.py
index 483e1452..46aac4c4 100644
--- a/uni_agent/reward/swe_bench.py
+++ b/uni_agent/reward/swe_bench.py
@@ -15,6 +15,7 @@
 from swebench.harness.grading import get_eval_tests_report, get_resolution_status
 from swebench.harness.log_parsers import MAP_REPO_TO_PARSER
 from swebench.harness.test_spec.python import get_test_directives
+from swebench.harness.utils import get_modified_files
 
 from uni_agent.async_logging import get_logger
 from uni_agent.interaction import AgentEnv
@@ -31,8 +32,12 @@ def _make_eval_script_list(instance, specs, env_name, repo_directory, base_commi
     which resets the whole repo (e.g. reverts tox.ini). We use no-op instead.
     """
     _HEREDOC_DELIMITER = "EOF_114329324912"
-
-    reset_tests_command = "git checkout master 2>/dev/null || git checkout main"
+    base_commit = instance["base_commit"]
+    test_files = get_modified_files(test_patch)
+    if test_files:
+        reset_tests_command = f"git checkout {base_commit} {' '.join(test_files)}"
+    else:
+        reset_tests_command = f"echo 'skip reset'"
 
     apply_test_patch_command = f"git apply -v - <<'{_HEREDOC_DELIMITER}'\n{test_patch}\n{_HEREDOC_DELIMITER}"
     test_cmd = MAP_REPO_VERSION_TO_SPECS[instance["repo"]][instance["version"]]["test_cmd"]
diff --git a/uni_agent/reward/swe_bench_multilingual.py b/uni_agent/reward/swe_bench_multilingual.py
index 435f3c44..298b690d 100644
--- a/uni_agent/reward/swe_bench_multilingual.py
+++ b/uni_agent/reward/swe_bench_multilingual.py
@@ -35,6 +35,7 @@
 from swebench.harness.log_parsers import MAP_REPO_TO_PARSER
 from swebench.harness.test_spec.javascript import get_download_img_commands
 from swebench.harness.test_spec.test_spec import make_test_spec
+from swebench.harness.utils import get_modified_files
 
 from uni_agent.async_logging import get_logger
 from uni_agent.interaction import AgentEnv
@@ -115,10 +116,15 @@ def _make_eval_script_list(self) -> list[str]:
         """
         instance = self.metadata
         repo_directory = "/testbed"
+        base_commit = instance["base_commit"]
         test_patch = instance["test_patch"]
         specs = MAP_REPO_VERSION_TO_SPECS[self.repo][self.version]
 
-        reset_tests_command = "git checkout master 2>/dev/null || git checkout main"
+        test_files = get_modified_files(test_patch)
+        if test_files:
+            reset_tests_command = f"git checkout {base_commit} {' '.join(test_files)}"
+        else:
+            reset_tests_command = f"echo 'skip reset'"
 
         build_commands = list(specs.get("build", []))
         apply_test_patch_command = (
@@ -128,6 +134,7 @@ def _make_eval_script_list(self) -> list[str]:
         test_commands = [test_cmd] if isinstance(test_cmd, str) else list(test_cmd)
 
         eval_commands = [
+            "chmod 1777 /tmp 2>/dev/null || true",
             f"cd {repo_directory}",
             f"git config --global --add safe.directory {repo_directory}",
             f"cd {repo_directory}",
@@ -142,7 +149,8 @@ def _make_eval_script_list(self) -> list[str]:
         # JS instances may ship test image fixtures pulled in right after the reset
         # (a no-op unless the instance carries ``image_assets``).
         if MAP_REPO_TO_EXT[self.repo] == "js":
-            eval_commands[4:4] = get_download_img_commands(instance)
+            idx = eval_commands.index(apply_test_patch_command)
+            eval_commands[idx:idx] = get_download_img_commands(instance)
         return eval_commands
 
     def _build_eval_script(self) -> str:
@@ -205,7 +213,7 @@ async def _apply_patch(self, patch: str) -> None:
         last_error: Exception | None = None
         for cmd in commands:
             try:
-                await self.env.communicate(cmd, check="raise")
+                await self.env.communicate(cmd, check="ignore")
                 self.logger.info("Applied patch successfully!")
                 return
             except RuntimeError as e:

From d363197e6f6164b62bdd5388cb18fa425a76896d Mon Sep 17 00:00:00 2001
From: yyDing1 <yyding.me@gmail.com>
Date: Sat, 6 Jun 2026 18:04:53 +0800
Subject: [PATCH 3/5] update

---
 examples/agent_interaction/parallel_verify_swe.py | 6 ++++--
 uni_agent/reward/swe_bench.py                     | 2 +-
 uni_agent/reward/swe_bench_multilingual.py        | 2 +-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/examples/agent_interaction/parallel_verify_swe.py b/examples/agent_interaction/parallel_verify_swe.py
index 8fcde11b..4be8d7fa 100644
--- a/examples/agent_interaction/parallel_verify_swe.py
+++ b/examples/agent_interaction/parallel_verify_swe.py
@@ -7,9 +7,9 @@
 from pathlib import Path
 
 import ray
+from datasets import load_dataset
 from tqdm import tqdm
 
-from datasets import load_dataset
 from uni_agent.async_logging import add_file_handler, cleanup_handlers
 from uni_agent.interaction import AgentEnv, AgentEnvConfig
 from uni_agent.reward import load_reward_spec
@@ -118,7 +118,9 @@ def main():
     dataset = load_dataset("parquet", data_files=DATA_PATH, split="train")
     samples = dataset.to_list()
     logger.info(f"loaded {len(samples)} samples from {DATA_PATH}")
-    logger.info(f"deployment={os.getenv('DEPLOYMENT', 'vefaas')} workers={NUM_WORKERS} concurrency={GLOBAL_CONCURRENCY}")
+    logger.info(
+        f"deployment={os.getenv('DEPLOYMENT', 'vefaas')} workers={NUM_WORKERS} concurrency={GLOBAL_CONCURRENCY}"
+    )
 
     workers = [TestEvalActor.remote() for _ in range(NUM_WORKERS)]
     # one future per sample (round-robin across workers) so we can stream
diff --git a/uni_agent/reward/swe_bench.py b/uni_agent/reward/swe_bench.py
index 46aac4c4..108ca866 100644
--- a/uni_agent/reward/swe_bench.py
+++ b/uni_agent/reward/swe_bench.py
@@ -37,7 +37,7 @@ def _make_eval_script_list(instance, specs, env_name, repo_directory, base_commi
     if test_files:
         reset_tests_command = f"git checkout {base_commit} {' '.join(test_files)}"
     else:
-        reset_tests_command = f"echo 'skip reset'"
+        reset_tests_command = "echo 'skip reset'"
 
     apply_test_patch_command = f"git apply -v - <<'{_HEREDOC_DELIMITER}'\n{test_patch}\n{_HEREDOC_DELIMITER}"
     test_cmd = MAP_REPO_VERSION_TO_SPECS[instance["repo"]][instance["version"]]["test_cmd"]
diff --git a/uni_agent/reward/swe_bench_multilingual.py b/uni_agent/reward/swe_bench_multilingual.py
index 298b690d..502a4d30 100644
--- a/uni_agent/reward/swe_bench_multilingual.py
+++ b/uni_agent/reward/swe_bench_multilingual.py
@@ -124,7 +124,7 @@ def _make_eval_script_list(self) -> list[str]:
         if test_files:
             reset_tests_command = f"git checkout {base_commit} {' '.join(test_files)}"
         else:
-            reset_tests_command = f"echo 'skip reset'"
+            reset_tests_command = "echo 'skip reset'"
 
         build_commands = list(specs.get("build", []))
         apply_test_patch_command = (

From 25fedad12965024a08456d12e975af5e4bbb511d Mon Sep 17 00:00:00 2001
From: yyDing1 <yyding.me@gmail.com>
Date: Sun, 7 Jun 2026 14:57:23 +0800
Subject: [PATCH 4/5] update

---
 docs/source/start/agent_interaction.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/docs/source/start/agent_interaction.md b/docs/source/start/agent_interaction.md
index c2372ade..48403dd7 100644
--- a/docs/source/start/agent_interaction.md
+++ b/docs/source/start/agent_interaction.md
@@ -18,6 +18,12 @@ The inference and verification scripts for this page live under `examples/agent_
 | Qwen3.5-9B                     | temp=1.0, topp=0.95, tp=4, 200 turns, 128k context | **65.6** (Avg@1) |
 | Qwen3.5-35B-A3B                | temp=1.0, topp=0.7, tp=4, 300 turns, 128K context  | **68.4** (Avg@1) |
 
+**Reference results on SWE-bench Multilingual with Uni-Agent:**
+
+| **Model**                    | Inference Config        | **Uni-Agent** |
+| ---------------------------- | ----------------------- |:-------------:|
+| Qwen3-Coder-30B-A3B-Instruct | 200 turns, 128K context | **32.3** (Avg@1) |
+
 **Reference results on Terminal-Bench v2 with Uni-Agent:**
 
 | **Model**          | Inference Config                                    | **Uni-Agent** |

From e743727930f2352344c6c033857d7f01a308a42a Mon Sep 17 00:00:00 2001
From: yyDing1 <yyding.me@gmail.com>
Date: Sun, 7 Jun 2026 15:27:21 +0800
Subject: [PATCH 5/5] update

---
 uni_agent/reward/swe_bench_multilingual.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/uni_agent/reward/swe_bench_multilingual.py b/uni_agent/reward/swe_bench_multilingual.py
index 502a4d30..b593cb77 100644
--- a/uni_agent/reward/swe_bench_multilingual.py
+++ b/uni_agent/reward/swe_bench_multilingual.py
@@ -213,7 +213,7 @@ async def _apply_patch(self, patch: str) -> None:
         last_error: Exception | None = None
         for cmd in commands:
             try:
-                await self.env.communicate(cmd, check="ignore")
+                await self.env.communicate(cmd, check="raise")
                 self.logger.info("Applied patch successfully!")
                 return
             except RuntimeError as e: