From a8f8a08b0dc3b122650da2a5f68e7b02600532b5 Mon Sep 17 00:00:00 2001 From: yyDing1 Date: Fri, 5 Jun 2026 15:44:59 +0800 Subject: [PATCH 1/5] update --- .../data_preprocess/swe_bench_multilingual.py | 188 +++++++++++++++ uni_agent/reward/__init__.py | 2 + uni_agent/reward/registry.py | 1 + uni_agent/reward/swe_bench_multilingual.py | 214 ++++++++++++++++++ 4 files changed, 405 insertions(+) create mode 100644 examples/data_preprocess/swe_bench_multilingual.py create mode 100644 uni_agent/reward/swe_bench_multilingual.py diff --git a/examples/data_preprocess/swe_bench_multilingual.py b/examples/data_preprocess/swe_bench_multilingual.py new file mode 100644 index 00000000..17f4b948 --- /dev/null +++ b/examples/data_preprocess/swe_bench_multilingual.py @@ -0,0 +1,188 @@ +# ruff: noqa: E501 +"""Preprocess SWE-bench/SWE-bench_Multilingual into the uni-agent SWE-agent format. + +The dataset has 300 instances across 41 repos in 7 non-Python languages +(c/go/java/js/php/ruby/rust). Grading is the official ``swebench`` harness, wired up +by the ``swe_bench_multilingual`` reward spec, so we only keep the fields that +``make_test_spec`` + grading need: ``instance_id``, ``repo``, ``version``, +``base_commit``, ``test_patch``, ``patch`` (gold, for verifiers), +``problem_statement``, ``FAIL_TO_PASS``, ``PASS_TO_PASS``. + +The repo lives at ``/testbed`` in the published ``swebench/sweb.eval.x86_64.`` +images. The image already holds the repo at ``base_commit`` plus the harness's +build-time ``pre_install``/``build`` edits (e.g. apache/lucene injects a gradle +``testLogging`` block the parser relies on). Those edits are uncommitted, so +post-setup *commits* them (instead of ``git reset --hard``, which would revert them +and break grading) -- giving the agent a clean tree while preserving the build +config. + +Example:: + + DEPLOYMENT=modal python examples/data_preprocess/swe_bench_multilingual.py \ + --local-save-dir ~/data/swe_agent +""" + +import argparse +import os + +from datasets import load_dataset +from swebench.harness.constants import MAP_REPO_TO_EXT + +impl = os.getenv("DEPLOYMENT", "modal").lower() +if impl != "modal": + # Only the public ``swebench/`` Docker Hub images cover the multilingual set. + raise ValueError("SWE-bench_Multilingual preprocessing only supports modal deployment") + + +def get_image_name(instance_id: str) -> str: + """Published image ref, mirroring swebench's ``instance_image_key``.""" + return f"swebench/sweb.eval.x86_64.{instance_id.lower().replace('__', '_1776_')}" + + +# Map swebench file-extension code -> human language name for the prompt. +EXT_TO_LANGUAGE = { + "c": "C", + "go": "Go", + "java": "Java", + "js": "JavaScript", + "php": "PHP", + "rb": "Ruby", + "rs": "Rust", +} + + +SYSTEM_PROMPT = """ +You are a helpful assistant that can interact with a computer to solve tasks. +""".strip() + +USER_PROMPT = """ + +/testbed + +I have uploaded a code repository in the /testbed directory (primary language: {language}). You can explore and modify files using the available tools. Consider the following issue description: + + +{problem_statement} + + +Can you help me implement the necessary changes to the repository to fix the ? +I have already taken care of all changes to any of the test files described in the . This means you DON'T have to modify the testing logic or any of the tests in any way! +Also the development environment is already set up for you (i.e., all dependencies are already installed and the project is already built), so you don't need to install other packages. +Your task is to make the minimal changes to non-test files in the /testbed directory to ensure the is satisfied. + +Follow these steps to resolve the issue: +1. First, explore the codebase to locate and understand the code relevant to the . +- Use efficient search commands to identify key files and functions. +- Build your understanding of how the code works, the expected behaviors and edge cases, and the potential root causes for the given issue. + +2. Assess whether you can reproduce the issue: +- Create a small script (e.g. at '/testbed/reproduce_issue.*') that demonstrates the error, using the repository's own language/runtime. +- Execute this script to confirm the error behavior before fixing it. +- Your reproduction script should also assert the expected behavior for the fixed code. + +3. Analyze the root cause: +- Identify the underlying problem based on your code exploration and reproduction results. +- Reason about multiple potential approaches and pick the most elegant and effective one, considering correctness, generality, and side effects. + +4. Implement your solution: +- Make targeted changes to the necessary files following idiomatic code patterns once you determine the root cause. + +5. Verify your solution: +- Rerun your reproduction script to confirm the error is fixed, iterating until successful. + +6. Run unit tests: +- Find and run the relevant unit tests using the project's own test runner to ensure your solution is correct and does not cause regressions. +- DO NOT MODIFY any of the existing unit tests. You can add new edge test cases in a separate file if needed BUT DO NOT MODIFY THE EXISTING TESTS. + +7. Test edge cases: +- Identify potential edge cases that might challenge your solution, create additional tests in a separate file, and verify robustness. + +8. Submit your solution: +- Once you have verified your solution, submit it using the `submit` tool. + +A successful resolution means: +- The specific error/issue described no longer occurs +- Your changes maintain compatibility with existing functionality +- Edge cases are properly handled +""".strip() + + +def build_swe_bench_multilingual(): + def process(example): + repo = example["repo"] + instance_id = example["instance_id"] + language = EXT_TO_LANGUAGE.get(MAP_REPO_TO_EXT[repo], "the project's") + + metadata = { + "instance_id": instance_id, + "repo": repo, + "version": str(example["version"]), + "base_commit": example["base_commit"], + "patch": example["patch"], + "test_patch": example["test_patch"], + "problem_statement": example["problem_statement"], + "FAIL_TO_PASS": example["FAIL_TO_PASS"], + "PASS_TO_PASS": example["PASS_TO_PASS"], + } + + reset_script = " && ".join( + [ + "git tag -d $(git tag -l)", + "git reflog expire --expire=now --all", + "git gc --prune=now", + "git config --global user.email setup@swebench.config", + "git config --global user.name SWE-bench", + "git commit --allow-empty -am SWE-bench", + f"git checkout {metadata['base_commit']}", + "git clean -fdq", + ] + ) + + return { + "prompt": [ + {"role": "system", "content": SYSTEM_PROMPT}, + { + "role": "user", + "content": USER_PROMPT.format( + language=language, + problem_statement=example["problem_statement"], + ), + }, + ], + "agent_name": "swe_agent", + "extra_info": { + "tools_kwargs": { + "env": { + "deployment": {"image": get_image_name(instance_id)}, + "post_setup_cmd": reset_script, + }, + "reward": { + "name": "swe_bench_multilingual", + "metadata": metadata, + }, + }, + }, + } + + data_source = "SWE-bench/SWE-bench_Multilingual" + print(f"Loading the {data_source} dataset from huggingface...", flush=True) + dataset = load_dataset(data_source, split="test") + print(f"Loaded {len(dataset)} raw instances", flush=True) + + # Test set: keep every instance (no filtering). + dataset = dataset.map(process, remove_columns=dataset.column_names) + return dataset + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--local-save-dir", default="~/data/swe_agent") + args = parser.parse_args() + + save_dir = os.path.expanduser(args.local_save_dir) + os.makedirs(save_dir, exist_ok=True) + + sbm_dataset = build_swe_bench_multilingual() + out_path = f"{save_dir}/swe_bench_multilingual_{impl}.parquet" + sbm_dataset.to_parquet(out_path) + print(f"Wrote {len(sbm_dataset)} instances to {out_path}", flush=True) diff --git a/uni_agent/reward/__init__.py b/uni_agent/reward/__init__.py index 245d75d1..755eefaf 100644 --- a/uni_agent/reward/__init__.py +++ b/uni_agent/reward/__init__.py @@ -3,6 +3,7 @@ _LAZY_EXPORTS = { "SearchRewardSpec": ".search", "SWEBenchRewardSpec": ".swe_bench", + "SWEBenchMultilingualRewardSpec": ".swe_bench_multilingual", "R2EGymRewardSpec": ".r2e_gym", "SWEREBenchRewardSpec": ".swe_rebench", } @@ -11,6 +12,7 @@ "load_reward_spec", "SearchRewardSpec", "SWEBenchRewardSpec", + "SWEBenchMultilingualRewardSpec", "R2EGymRewardSpec", "SWEREBenchRewardSpec", ] diff --git a/uni_agent/reward/registry.py b/uni_agent/reward/registry.py index 1ed16ae4..1d464fcc 100644 --- a/uni_agent/reward/registry.py +++ b/uni_agent/reward/registry.py @@ -11,6 +11,7 @@ REWARD_SPEC_MODULES: dict[str, str] = { "search": "uni_agent.reward.search", "swe_bench": "uni_agent.reward.swe_bench", + "swe_bench_multilingual": "uni_agent.reward.swe_bench_multilingual", "swe_rebench": "uni_agent.reward.swe_rebench", "r2e_gym": "uni_agent.reward.r2e_gym", "terminal_bench": "uni_agent.reward.terminal_bench", diff --git a/uni_agent/reward/swe_bench_multilingual.py b/uni_agent/reward/swe_bench_multilingual.py new file mode 100644 index 00000000..435f3c44 --- /dev/null +++ b/uni_agent/reward/swe_bench_multilingual.py @@ -0,0 +1,214 @@ +"""Reward spec for SWE-bench/SWE-bench_Multilingual. + +The dataset spans 7 non-Python languages (c/go/java/js/php/ruby/rust) and is fully +covered by the official ``swebench`` harness, so we grade with it directly instead +of re-implementing per-language logic: + +* The eval script is built explicitly in ``_make_eval_script_list`` -- a transcription + of swebench's ``make_eval_script_list_common`` (+ the JS image-asset step): reset + *only the test files* to ``base_commit`` -> ``git apply`` the gold ``test_patch`` -> + optional per-repo ``build`` -> run the repo's ``test_cmd`` wrapped in ``START``/ + ``END`` markers -> reset the test files again. Crucially it never ``git reset --hard`` + the whole repo, so build-time edits and the agent's solution in ``/testbed`` survive. +* The per-repo parser (``MAP_REPO_TO_PARSER``) + official resolution metric + (``get_eval_tests_report`` / ``get_resolution_status``) decide ``resolved``; + ``make_test_spec`` still provides those grading inputs (F2P/P2P, parser key). + +Instance images are the published ``swebench/sweb.eval.x86_64.`` containers with +the repo checked out at ``/testbed``. +""" + +import time +import uuid +from pathlib import Path + +from swebench.harness.constants import ( + END_TEST_OUTPUT, + FAIL_ONLY_REPOS, + MAP_REPO_TO_EXT, + MAP_REPO_VERSION_TO_SPECS, + START_TEST_OUTPUT, + EvalType, + ResolvedStatus, +) +from swebench.harness.grading import get_eval_tests_report, get_resolution_status +from swebench.harness.log_parsers import MAP_REPO_TO_PARSER +from swebench.harness.test_spec.javascript import get_download_img_commands +from swebench.harness.test_spec.test_spec import make_test_spec + +from uni_agent.async_logging import get_logger +from uni_agent.interaction import AgentEnv +from uni_agent.reward.base import AbstractRewardSpec +from uni_agent.reward.registry import register_reward_spec +from uni_agent.utils import auto_await + +# Heredoc delimiter the upstream harness uses to inline the test patch into the script. +HEREDOC_DELIMITER = "EOF_114329324912" + + +@register_reward_spec("swe_bench_multilingual") +class SWEBenchMultilingualRewardSpec(AbstractRewardSpec): + def __init__(self, *, run_id: str, metadata: dict, env: AgentEnv, eval_timeout: int = 1800): + self.run_id = run_id + self.metadata = metadata + self.env = env + self.logger = get_logger("reward_spec", run_id=run_id) + self.eval_timeout = eval_timeout + self.repo = metadata["repo"] + self.version = metadata["version"] + # Still used for grading inputs (F2P/P2P, parser key, language); pure-CPU. + self.test_spec = make_test_spec(metadata) + + @auto_await + async def apply_gold_patch(self) -> None: + """Apply the dataset gold patch to the working tree (used by verifiers).""" + await self._apply_patch(self.metadata["patch"]) + + @auto_await + async def compute_reward(self, **kwargs) -> tuple[bool, dict]: + result = { + "eval_completed": False, + "eval_execution_time": None, + "eval_report": None, + "resolved": False, + } + try: + script_path = Path(f"/tmp/sbm_eval_{uuid.uuid4().hex}.sh") + await self.env.write_file(script_path, self._build_eval_script()) + + t0 = time.perf_counter() + # `| cat` makes stdout a pipe (non-TTY), standing in for the official + # harness's non-TTY `docker exec` so runners don't emit colored/TUI output. + output = await self.env.communicate( + f"bash {script_path} 2>&1 | cat", + timeout=self.eval_timeout, + check="ignore", + ) + result["eval_execution_time"] = time.perf_counter() - t0 + result["eval_completed"] = True + + eval_report = self._grade(output) + result["eval_report"] = eval_report + result["resolved"] = eval_report["resolved"] + self.logger.info( + f"SWE-bench-Multilingual eval: instance={self.test_spec.instance_id} " + f"repo={self.test_spec.repo} lang={self.test_spec.language} " + f"resolved={eval_report['resolved']} found={eval_report['found_eval_status']} " + f"time={result['eval_execution_time']:.1f}s" + ) + except Exception as exc: + self.logger.error(f"Failed to evaluate SWE-bench-Multilingual instance: {exc}") + result["error"] = str(exc) + return result["resolved"], result + + def _make_eval_script_list(self) -> list[str]: + """Explicit transcription of swebench's ``make_eval_script_list`` for this + dataset's languages (``make_eval_script_list_common`` + the JS image-asset + step), kept inline -- like ``swe_bench.py`` -- so the eval flow is visible and + tweakable instead of hidden behind ``TestSpec.eval_script``. + + Steps: reset *only the test files* to ``base_commit`` (never the whole repo, so + build-time ``pre_install`` edits and the agent's solution survive) -> ``git + apply`` the gold ``test_patch`` -> optional per-repo ``build`` -> run ``test_cmd`` + between the ``START``/``END`` markers the parser keys off -> reset the test files + again so they can't be tampered with. + """ + instance = self.metadata + repo_directory = "/testbed" + test_patch = instance["test_patch"] + specs = MAP_REPO_VERSION_TO_SPECS[self.repo][self.version] + + reset_tests_command = "git checkout master 2>/dev/null || git checkout main" + + build_commands = list(specs.get("build", [])) + apply_test_patch_command = ( + f"git apply --verbose --reject - <<'{HEREDOC_DELIMITER}'\n{test_patch}\n{HEREDOC_DELIMITER}" + ) + test_cmd = specs["test_cmd"] + test_commands = [test_cmd] if isinstance(test_cmd, str) else list(test_cmd) + + eval_commands = [ + f"cd {repo_directory}", + f"git config --global --add safe.directory {repo_directory}", + f"cd {repo_directory}", + reset_tests_command, + apply_test_patch_command, + *build_commands, + f": '{START_TEST_OUTPUT}'", + *test_commands, + f": '{END_TEST_OUTPUT}'", + reset_tests_command, + ] + # JS instances may ship test image fixtures pulled in right after the reset + # (a no-op unless the instance carries ``image_assets``). + if MAP_REPO_TO_EXT[self.repo] == "js": + eval_commands[4:4] = get_download_img_commands(instance) + return eval_commands + + def _build_eval_script(self) -> str: + """Assemble the eval script (same header as ``TestSpec.eval_script``; no + ``set -e`` on purpose, so the trailing test-file reset always runs).""" + return "\n".join(["#!/bin/bash", "set -uxo pipefail", *self._make_eval_script_list()]) + "\n" + + def _grade(self, output: str) -> dict: + """Parse the test region and grade against FAIL_TO_PASS / PASS_TO_PASS.""" + report = {"resolved": False, "found_eval_status": False, "test_status": None} + + parser = MAP_REPO_TO_PARSER[self.test_spec.repo] + if START_TEST_OUTPUT in output and END_TEST_OUTPUT in output: + region = output.split(START_TEST_OUTPUT)[1].split(END_TEST_OUTPUT)[0] + else: + region = output + status_map = parser(region, self.test_spec) + # Fallback: some runners write results outside the markers (e.g. stderr). + if not status_map: + status_map = parser(output, self.test_spec) + if not status_map: + self.logger.warning( + "SWE-bench-Multilingual parser matched 0 tests -- the test command likely " + f"failed to run. Output tail:\n{output[-3000:]}" + ) + return report + + report["found_eval_status"] = True + eval_ref = { + "instance_id": self.test_spec.instance_id, + "FAIL_TO_PASS": self.test_spec.FAIL_TO_PASS, + "PASS_TO_PASS": self.test_spec.PASS_TO_PASS, + } + eval_type = EvalType.FAIL_ONLY if self.test_spec.repo in FAIL_ONLY_REPOS else EvalType.PASS_AND_FAIL + tests_status = get_eval_tests_report(status_map, eval_ref, eval_type=eval_type) + report["test_status"] = tests_status + report["resolved"] = get_resolution_status(tests_status) == ResolvedStatus.FULL.value + if not report["resolved"]: + f2p_missing = tests_status["FAIL_TO_PASS"]["failure"] + p2p_failed = tests_status["PASS_TO_PASS"]["failure"] + self.logger.warning( + f"SWE-bench-Multilingual NOT resolved: FAIL_TO_PASS still failing={f2p_missing[:25]} " + f"PASS_TO_PASS broke={p2p_failed[:25]}" + ) + return report + + @auto_await + async def _apply_patch(self, patch: str) -> None: + """Apply a patch string to the env. Tries multiple apply strategies in order.""" + if not patch or not patch.strip(): + self.logger.info("Empty patch, nothing to apply.") + return + patch_path = Path(f"/tmp/patch_{uuid.uuid4()}.diff") + await self.env.write_file(patch_path, patch) + commands = [ + f"cd /testbed && git apply --whitespace=fix {patch_path.as_posix()}", + f"cd /testbed && git apply --reject --whitespace=nowarn {patch_path.as_posix()}", + f"cd /testbed && patch --batch --fuzz=5 -p1 -i {patch_path.as_posix()}", + ] + last_error: Exception | None = None + for cmd in commands: + try: + await self.env.communicate(cmd, check="raise") + self.logger.info("Applied patch successfully!") + return + except RuntimeError as e: + last_error = e + continue + raise RuntimeError("Failed to apply patch with any command") from last_error From 772a98276cefa36a22439896b11a9d847fd00ceb Mon Sep 17 00:00:00 2001 From: yyDing1 Date: Sat, 6 Jun 2026 18:03:33 +0800 Subject: [PATCH 2/5] update --- docs/source/start/agent_interaction.md | 1 + examples/agent_env/demo.py | 6 + .../agent_interaction/parallel_verify_swe.py | 136 ++++++++++++------ .../data_preprocess/swe_bench_multilingual.py | 13 +- .../data_preprocess/swe_bench_verified.py | 8 +- uni_agent/deployment/modal/deployment.py | 8 +- uni_agent/reward/swe_bench.py | 9 +- uni_agent/reward/swe_bench_multilingual.py | 14 +- 8 files changed, 130 insertions(+), 65 deletions(-) diff --git a/docs/source/start/agent_interaction.md b/docs/source/start/agent_interaction.md index 810b1e59..c2372ade 100644 --- a/docs/source/start/agent_interaction.md +++ b/docs/source/start/agent_interaction.md @@ -15,6 +15,7 @@ The inference and verification scripts for this page live under `examples/agent_ | Qwen3-Coder-Next | temp=0.8, topp=0.9, tp=16, 300 turns, 128K context | **67.6** (Avg@4) | | Qwen3.5-4B | temp=0.8, topp=0.9, tp=4, 100 turns, 64K context | **45.2** (Avg@1) | | Qwen3.5-9B | temp=1.0, topp=0.7, tp=4, 100 turns, 64K context | **53.8** (Avg@1) | +| Qwen3.5-9B | temp=1.0, topp=0.95, tp=4, 200 turns, 128k context | **65.6** (Avg@1) | | Qwen3.5-35B-A3B | temp=1.0, topp=0.7, tp=4, 300 turns, 128K context | **68.4** (Avg@1) | **Reference results on Terminal-Bench v2 with Uni-Agent:** diff --git a/examples/agent_env/demo.py b/examples/agent_env/demo.py index 3a33dee5..8be97b92 100644 --- a/examples/agent_env/demo.py +++ b/examples/agent_env/demo.py @@ -83,6 +83,12 @@ "deployment": deployment_config, "env_variables": { "PIP_PROGRESS_BAR": "off", + "PIP_CACHE_DIR": "~/.cache/pip", + "PAGER": "cat", + "MANPAGER": "cat", + "LESS": "-R", + "TQDM_DISABLE": "1", + "GIT_PAGER": "cat", }, } env_config = AgentEnvConfig(**env_config) diff --git a/examples/agent_interaction/parallel_verify_swe.py b/examples/agent_interaction/parallel_verify_swe.py index d9533efb..8fcde11b 100644 --- a/examples/agent_interaction/parallel_verify_swe.py +++ b/examples/agent_interaction/parallel_verify_swe.py @@ -7,42 +7,41 @@ from pathlib import Path import ray -from datasets import load_dataset +from tqdm import tqdm +from datasets import load_dataset from uni_agent.async_logging import add_file_handler, cleanup_handlers from uni_agent.interaction import AgentEnv, AgentEnvConfig from uni_agent.reward import load_reward_spec -logger = logging.getLogger(__file__) +logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s") +logger = logging.getLogger(__name__) logger.setLevel("INFO") +GLOBAL_CONCURRENCY = int(os.getenv("GLOBAL_CONCURRENCY", 512)) +NUM_WORKERS = int(os.getenv("NUM_WORKERS", 8)) +DATA_PATH = os.getenv("DATA_PATH", "/home/tiger/data/swe_agent/swe_bench_multilingual_modal.parquet") + async def run_sample(sample): run_id = str(uuid.uuid4()) instance = sample["extra_info"]["tools_kwargs"] impl = os.getenv("DEPLOYMENT", "vefaas").lower() - # SWE preprocessors emit ``env.deployment.image`` (nested, matching - # ``AgentEnvConfig`` / ``DeployConfig``). Older parquets used flat - # ``env.image``; accept both so a stale parquet doesn't silently break. - instance_image = instance["env"].get("deployment", {}).get("image") or instance["env"].get("image") - if instance_image is None: + case_deployment = dict(instance["env"].get("deployment", {})) + if not case_deployment.get("image"): raise KeyError("No image found in instance.env.deployment.image or instance.env.image") if impl == "vefaas": - deployment_config = { + defaults = { "type": "vefaas", - "image": instance_image, "command": "curl -fsSL https://vefaas-swe.tos-cn-beijing.ivolces.com/swe-rex/install_1.4.0.sh | bash -s -- {token}", "timeout": 600.0, "startup_timeout": 180.0, - "function_id": os.getenv("VEFAAS_FUNCTION_ID"), - "function_route": os.getenv("VEFAAS_FUNCTION_ROUTE"), } elif impl == "modal": - deployment_config = { + defaults = { "type": "modal", - "image": instance_image, "startup_timeout": 600.0, "runtime_timeout": 600.0, "deployment_timeout": 3600.0, @@ -52,6 +51,9 @@ async def run_sample(sample): else: raise ValueError(f"Invalid environment implementation: {impl}") + # Case config wins; defaults fill in whatever the case didn't specify. + deployment_config = {**defaults, **case_deployment} + env_config = { "deployment": deployment_config, "env_variables": { @@ -78,51 +80,84 @@ async def run_sample(sample): reward_spec = load_reward_spec(reward_config) add_file_handler(Path(f"/tmp/eval_gold_patch/{run_id}.log"), run_id) - await env.start() - await reward_spec.apply_gold_patch() - _, result = await reward_spec.compute_reward() - await env.close() - cleanup_handlers(run_id) + try: + await env.start() + await reward_spec.apply_gold_patch() + _, result = await reward_spec.compute_reward() + except Exception as e: + logger.error(f"Error running sample {run_id}: {e}") + result = {"resolved": False, "eval_completed": False, "eval_execution_time": None} + finally: + await env.close() + cleanup_handlers(run_id) return result @ray.remote class TestEvalActor: - _semaphore = asyncio.Semaphore(64) - - async def run_batch(self, samples): - tasks = [self.run_single(sample) for sample in samples] - return await asyncio.gather(*tasks) + _semaphore = asyncio.Semaphore(max(1, GLOBAL_CONCURRENCY // NUM_WORKERS)) async def run_single(self, sample): async with self._semaphore: return await run_sample(sample) +def _rule(text: str = "", width: int = 50, ch: str = "─") -> str: + """A centered-title horizontal rule. Emoji-safe (left-aligned rows below it + carry the values, so we never depend on monospace emoji width).""" + if not text: + return ch * width + pad = max(0, width - len(text) - 2) + return f"{ch * (pad // 2)} {text} {ch * (pad - pad // 2)}" + + def main(): ray.init() # data_path = "/home/tiger/data/swe_agent/swe_rebench_filtered.parquet" # data_path = "/home/tiger/data/swe_agent/r2e_gym_subset.parquet" - data_path = "/home/tiger/data/swe_agent/swe_bench_verified_modal.parquet" - dataset = load_dataset("parquet", data_files=data_path, split="train") + dataset = load_dataset("parquet", data_files=DATA_PATH, split="train") samples = dataset.to_list() - workers = [TestEvalActor.remote() for _ in range(8)] - futures = [] - chunk_size = (len(samples) - 1) // len(workers) + 1 - for i in range(len(workers)): - chunk = samples[i * chunk_size : (i + 1) * chunk_size] - futures.append(workers[i].run_batch.remote(chunk)) - # each future returns a list of per-sample results (one chunk per worker) + logger.info(f"loaded {len(samples)} samples from {DATA_PATH}") + logger.info(f"deployment={os.getenv('DEPLOYMENT', 'vefaas')} workers={NUM_WORKERS} concurrency={GLOBAL_CONCURRENCY}") + + workers = [TestEvalActor.remote() for _ in range(NUM_WORKERS)] + # one future per sample (round-robin across workers) so we can stream + # per-sample progress; the actor semaphore still bounds real concurrency. + futures = [workers[i % len(workers)].run_single.remote(s) for i, s in enumerate(samples)] + fut_to_idx = {f: i for i, f in enumerate(futures)} begin_time = time.time() - results_chunk = ray.get(futures) + results: list = [None] * len(futures) + ok = wa = tle = 0 + remaining = list(futures) + with tqdm( + total=len(futures), + desc="🚀 eval", + colour="green", + unit="inst", + dynamic_ncols=True, + bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]{postfix}", + ) as pbar: + while remaining: + done, remaining = ray.wait(remaining, num_returns=1) + for d in done: + res = ray.get(d) + results[fut_to_idx[d]] = res + if res.get("resolved"): + ok += 1 + elif res.get("eval_completed"): + wa += 1 + else: + tle += 1 + rate = ok / pbar.n * 100 if pbar.n else 0.0 + pbar.set_postfix_str(f"✅{ok} ❌WA{wa} ⏱TLE{tle} | {rate:.0f}% pass") + pbar.update(1) end_time = time.time() - logger.info(f"time cost: {end_time - begin_time:.2f}s") - results = [item for chunk in results_chunk for item in chunk] + all_num = len(results) - success_num = len([item for item in results if item["resolved"]]) - fail_wa_num = len([item for item in results if not item["resolved"] and item["eval_completed"]]) - fail_tle_num = len([item for item in results if not item["resolved"] and not item["eval_completed"]]) + success_num = len([r for r in results if r.get("resolved")]) + fail_wa_num = len([r for r in results if not r.get("resolved") and r.get("eval_completed")]) + fail_tle_num = len([r for r in results if not r.get("resolved") and not r.get("eval_completed")]) def instance_name(sample): return sample["extra_info"]["tools_kwargs"]["reward"]["metadata"]["instance_id"] @@ -130,21 +165,32 @@ def instance_name(sample): fail_wa_names = [ instance_name(sample) for sample, item in zip(samples, results, strict=False) - if not item["resolved"] and item["eval_completed"] + if not item.get("resolved") and item.get("eval_completed") ] fail_tle_names = [ instance_name(sample) for sample, item in zip(samples, results, strict=False) - if not item["resolved"] and not item["eval_completed"] + if not item.get("resolved") and not item.get("eval_completed") ] exec_times = [r["eval_execution_time"] for r in results if r.get("eval_execution_time") is not None] - avg_exec_time = sum(exec_times) / len(exec_times) - - logger.info( - f"all_num: {all_num}, success_num: {success_num}, fail_wa_num: {fail_wa_num}, fail_tle_num: {fail_tle_num}" + avg_exec_time = sum(exec_times) / len(exec_times) if exec_times else 0.0 + pass_rate = success_num / all_num * 100 if all_num else 0.0 + wall = end_time - begin_time + + summary = "\n".join( + [ + "", + _rule("🧪 eval summary"), + f" ✅ resolved {success_num:>4} ({pass_rate:.1f}%)", + f" ❌ wrong-ans {fail_wa_num:>4}", + f" ⏱ timeout {fail_tle_num:>4}", + f" Σ total {all_num:>4}", + _rule(f"avg {avg_exec_time:.1f}s · wall {wall:.1f}s · n={len(exec_times)}"), + "", + ] ) - logger.info(f"avg_execution_time: {avg_exec_time:.2f}s (n={len(exec_times)})") + print(summary) logger.info(f"fail_wa instance names: {fail_wa_names}") logger.info(f"fail_tle instance names: {fail_tle_names}") diff --git a/examples/data_preprocess/swe_bench_multilingual.py b/examples/data_preprocess/swe_bench_multilingual.py index 17f4b948..0d942b14 100644 --- a/examples/data_preprocess/swe_bench_multilingual.py +++ b/examples/data_preprocess/swe_bench_multilingual.py @@ -130,11 +130,6 @@ def process(example): "git tag -d $(git tag -l)", "git reflog expire --expire=now --all", "git gc --prune=now", - "git config --global user.email setup@swebench.config", - "git config --global user.name SWE-bench", - "git commit --allow-empty -am SWE-bench", - f"git checkout {metadata['base_commit']}", - "git clean -fdq", ] ) @@ -153,7 +148,13 @@ def process(example): "extra_info": { "tools_kwargs": { "env": { - "deployment": {"image": get_image_name(instance_id)}, + "deployment": { + "image": get_image_name(instance_id), + "modal_sandbox_kwargs": { + "cpu": (0.5, 4.0), + "memory": (1024, 8192), + }, + }, "post_setup_cmd": reset_script, }, "reward": { diff --git a/examples/data_preprocess/swe_bench_verified.py b/examples/data_preprocess/swe_bench_verified.py index 662ccbc7..8f26ad16 100644 --- a/examples/data_preprocess/swe_bench_verified.py +++ b/examples/data_preprocess/swe_bench_verified.py @@ -109,13 +109,7 @@ def process_swe_bench_verified(example): dataset_id = "swe-bench-verified" instance_id = example["instance_id"] image_name = get_image_name(dataset_id, instance_id) - reset_cmds = [ - "cd /testbed", - "git restore .", - "git reset --hard", - f"git checkout {example['base_commit']}", - "git clean -fdq", - ] + reset_cmds = [] reset_script = " && ".join(reset_cmds) sample = { "prompt": [ diff --git a/uni_agent/deployment/modal/deployment.py b/uni_agent/deployment/modal/deployment.py index 7168f132..72b4fa85 100644 --- a/uni_agent/deployment/modal/deployment.py +++ b/uni_agent/deployment/modal/deployment.py @@ -28,7 +28,7 @@ # The semaphore is process-local, so MODAL_MAX_STARTING_PER_WORKER is a # per-worker cap (size it as fleet-wide target / num rollout workers). # MODAL_INIT_WALL_BUDGET caps a single trajectory's total init wall-clock. -_DEFAULT_MAX_STARTING_PER_WORKER = 8 +_DEFAULT_MAX_STARTING_PER_WORKER = 64 _DEFAULT_INIT_WALL_BUDGET = 900.0 _STARTING_SEMA: asyncio.Semaphore | None = None @@ -112,7 +112,7 @@ def from_ecr(self, image: str) -> modal.Image: raise ValueError(msg) from e def ensure_pipx_installed(self, image: modal.Image) -> modal.Image: - image = image.apt_install("pipx") + image = image.apt_install("pipx", env={"DEBIAN_FRONTEND": "noninteractive"}) return image.run_commands("pipx ensurepath") def auto(self, image_spec: str | modal.Image | PurePath) -> modal.Image: @@ -167,6 +167,10 @@ def __init__( self._proxy = proxy if modal_sandbox_kwargs is None: modal_sandbox_kwargs = {} + modal_sandbox_kwargs = dict(modal_sandbox_kwargs) + for _key in ("cpu", "memory"): + if isinstance(modal_sandbox_kwargs.get(_key), list): + modal_sandbox_kwargs[_key] = tuple(modal_sandbox_kwargs[_key]) self._modal_kwargs = modal_sandbox_kwargs self._hooks = CombinedDeploymentHook() diff --git a/uni_agent/reward/swe_bench.py b/uni_agent/reward/swe_bench.py index 483e1452..46aac4c4 100644 --- a/uni_agent/reward/swe_bench.py +++ b/uni_agent/reward/swe_bench.py @@ -15,6 +15,7 @@ from swebench.harness.grading import get_eval_tests_report, get_resolution_status from swebench.harness.log_parsers import MAP_REPO_TO_PARSER from swebench.harness.test_spec.python import get_test_directives +from swebench.harness.utils import get_modified_files from uni_agent.async_logging import get_logger from uni_agent.interaction import AgentEnv @@ -31,8 +32,12 @@ def _make_eval_script_list(instance, specs, env_name, repo_directory, base_commi which resets the whole repo (e.g. reverts tox.ini). We use no-op instead. """ _HEREDOC_DELIMITER = "EOF_114329324912" - - reset_tests_command = "git checkout master 2>/dev/null || git checkout main" + base_commit = instance["base_commit"] + test_files = get_modified_files(test_patch) + if test_files: + reset_tests_command = f"git checkout {base_commit} {' '.join(test_files)}" + else: + reset_tests_command = f"echo 'skip reset'" apply_test_patch_command = f"git apply -v - <<'{_HEREDOC_DELIMITER}'\n{test_patch}\n{_HEREDOC_DELIMITER}" test_cmd = MAP_REPO_VERSION_TO_SPECS[instance["repo"]][instance["version"]]["test_cmd"] diff --git a/uni_agent/reward/swe_bench_multilingual.py b/uni_agent/reward/swe_bench_multilingual.py index 435f3c44..298b690d 100644 --- a/uni_agent/reward/swe_bench_multilingual.py +++ b/uni_agent/reward/swe_bench_multilingual.py @@ -35,6 +35,7 @@ from swebench.harness.log_parsers import MAP_REPO_TO_PARSER from swebench.harness.test_spec.javascript import get_download_img_commands from swebench.harness.test_spec.test_spec import make_test_spec +from swebench.harness.utils import get_modified_files from uni_agent.async_logging import get_logger from uni_agent.interaction import AgentEnv @@ -115,10 +116,15 @@ def _make_eval_script_list(self) -> list[str]: """ instance = self.metadata repo_directory = "/testbed" + base_commit = instance["base_commit"] test_patch = instance["test_patch"] specs = MAP_REPO_VERSION_TO_SPECS[self.repo][self.version] - reset_tests_command = "git checkout master 2>/dev/null || git checkout main" + test_files = get_modified_files(test_patch) + if test_files: + reset_tests_command = f"git checkout {base_commit} {' '.join(test_files)}" + else: + reset_tests_command = f"echo 'skip reset'" build_commands = list(specs.get("build", [])) apply_test_patch_command = ( @@ -128,6 +134,7 @@ def _make_eval_script_list(self) -> list[str]: test_commands = [test_cmd] if isinstance(test_cmd, str) else list(test_cmd) eval_commands = [ + "chmod 1777 /tmp 2>/dev/null || true", f"cd {repo_directory}", f"git config --global --add safe.directory {repo_directory}", f"cd {repo_directory}", @@ -142,7 +149,8 @@ def _make_eval_script_list(self) -> list[str]: # JS instances may ship test image fixtures pulled in right after the reset # (a no-op unless the instance carries ``image_assets``). if MAP_REPO_TO_EXT[self.repo] == "js": - eval_commands[4:4] = get_download_img_commands(instance) + idx = eval_commands.index(apply_test_patch_command) + eval_commands[idx:idx] = get_download_img_commands(instance) return eval_commands def _build_eval_script(self) -> str: @@ -205,7 +213,7 @@ async def _apply_patch(self, patch: str) -> None: last_error: Exception | None = None for cmd in commands: try: - await self.env.communicate(cmd, check="raise") + await self.env.communicate(cmd, check="ignore") self.logger.info("Applied patch successfully!") return except RuntimeError as e: From d363197e6f6164b62bdd5388cb18fa425a76896d Mon Sep 17 00:00:00 2001 From: yyDing1 Date: Sat, 6 Jun 2026 18:04:53 +0800 Subject: [PATCH 3/5] update --- examples/agent_interaction/parallel_verify_swe.py | 6 ++++-- uni_agent/reward/swe_bench.py | 2 +- uni_agent/reward/swe_bench_multilingual.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/agent_interaction/parallel_verify_swe.py b/examples/agent_interaction/parallel_verify_swe.py index 8fcde11b..4be8d7fa 100644 --- a/examples/agent_interaction/parallel_verify_swe.py +++ b/examples/agent_interaction/parallel_verify_swe.py @@ -7,9 +7,9 @@ from pathlib import Path import ray +from datasets import load_dataset from tqdm import tqdm -from datasets import load_dataset from uni_agent.async_logging import add_file_handler, cleanup_handlers from uni_agent.interaction import AgentEnv, AgentEnvConfig from uni_agent.reward import load_reward_spec @@ -118,7 +118,9 @@ def main(): dataset = load_dataset("parquet", data_files=DATA_PATH, split="train") samples = dataset.to_list() logger.info(f"loaded {len(samples)} samples from {DATA_PATH}") - logger.info(f"deployment={os.getenv('DEPLOYMENT', 'vefaas')} workers={NUM_WORKERS} concurrency={GLOBAL_CONCURRENCY}") + logger.info( + f"deployment={os.getenv('DEPLOYMENT', 'vefaas')} workers={NUM_WORKERS} concurrency={GLOBAL_CONCURRENCY}" + ) workers = [TestEvalActor.remote() for _ in range(NUM_WORKERS)] # one future per sample (round-robin across workers) so we can stream diff --git a/uni_agent/reward/swe_bench.py b/uni_agent/reward/swe_bench.py index 46aac4c4..108ca866 100644 --- a/uni_agent/reward/swe_bench.py +++ b/uni_agent/reward/swe_bench.py @@ -37,7 +37,7 @@ def _make_eval_script_list(instance, specs, env_name, repo_directory, base_commi if test_files: reset_tests_command = f"git checkout {base_commit} {' '.join(test_files)}" else: - reset_tests_command = f"echo 'skip reset'" + reset_tests_command = "echo 'skip reset'" apply_test_patch_command = f"git apply -v - <<'{_HEREDOC_DELIMITER}'\n{test_patch}\n{_HEREDOC_DELIMITER}" test_cmd = MAP_REPO_VERSION_TO_SPECS[instance["repo"]][instance["version"]]["test_cmd"] diff --git a/uni_agent/reward/swe_bench_multilingual.py b/uni_agent/reward/swe_bench_multilingual.py index 298b690d..502a4d30 100644 --- a/uni_agent/reward/swe_bench_multilingual.py +++ b/uni_agent/reward/swe_bench_multilingual.py @@ -124,7 +124,7 @@ def _make_eval_script_list(self) -> list[str]: if test_files: reset_tests_command = f"git checkout {base_commit} {' '.join(test_files)}" else: - reset_tests_command = f"echo 'skip reset'" + reset_tests_command = "echo 'skip reset'" build_commands = list(specs.get("build", [])) apply_test_patch_command = ( From 25fedad12965024a08456d12e975af5e4bbb511d Mon Sep 17 00:00:00 2001 From: yyDing1 Date: Sun, 7 Jun 2026 14:57:23 +0800 Subject: [PATCH 4/5] update --- docs/source/start/agent_interaction.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/source/start/agent_interaction.md b/docs/source/start/agent_interaction.md index c2372ade..48403dd7 100644 --- a/docs/source/start/agent_interaction.md +++ b/docs/source/start/agent_interaction.md @@ -18,6 +18,12 @@ The inference and verification scripts for this page live under `examples/agent_ | Qwen3.5-9B | temp=1.0, topp=0.95, tp=4, 200 turns, 128k context | **65.6** (Avg@1) | | Qwen3.5-35B-A3B | temp=1.0, topp=0.7, tp=4, 300 turns, 128K context | **68.4** (Avg@1) | +**Reference results on SWE-bench Multilingual with Uni-Agent:** + +| **Model** | Inference Config | **Uni-Agent** | +| ---------------------------- | ----------------------- |:-------------:| +| Qwen3-Coder-30B-A3B-Instruct | 200 turns, 128K context | **32.3** (Avg@1) | + **Reference results on Terminal-Bench v2 with Uni-Agent:** | **Model** | Inference Config | **Uni-Agent** | From e743727930f2352344c6c033857d7f01a308a42a Mon Sep 17 00:00:00 2001 From: yyDing1 Date: Sun, 7 Jun 2026 15:27:21 +0800 Subject: [PATCH 5/5] updayte --- uni_agent/reward/swe_bench_multilingual.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uni_agent/reward/swe_bench_multilingual.py b/uni_agent/reward/swe_bench_multilingual.py index 502a4d30..b593cb77 100644 --- a/uni_agent/reward/swe_bench_multilingual.py +++ b/uni_agent/reward/swe_bench_multilingual.py @@ -213,7 +213,7 @@ async def _apply_patch(self, patch: str) -> None: last_error: Exception | None = None for cmd in commands: try: - await self.env.communicate(cmd, check="ignore") + await self.env.communicate(cmd, check="raise") self.logger.info("Applied patch successfully!") return except RuntimeError as e: