diff --git a/src/minisweagent/config/geak.yaml b/src/minisweagent/config/geak.yaml index ff5729ad1..3f6e49243 100644 --- a/src/minisweagent/config/geak.yaml +++ b/src/minisweagent/config/geak.yaml @@ -43,7 +43,7 @@ run: kill_buffer_s: 360 # forced os._exit() this long after opt_deadline full: total_s: 7200 # 2 hours total wall-clock - preprocess_soft_cap_s: 900 + preprocess_soft_cap_s: 2400 # 40 min: translation + multi-round harness-gen + baseline preprocess_hard_cap_fraction: 0.5 # -> 3600 s ceiling finalize_grace_s: 300 kill_buffer_s: 360 diff --git a/src/minisweagent/run/mini.py b/src/minisweagent/run/mini.py index 836b14711..a53f1a362 100644 --- a/src/minisweagent/run/mini.py +++ b/src/minisweagent/run/mini.py @@ -894,6 +894,16 @@ def _hard_kill_handler() -> None: test_command = preprocess_ctx["test_command"] if preprocess_ctx.get("repo_root") and repo is None: repo = Path(preprocess_ctx["repo_root"]) + elif preprocess_ctx.get("repo_root"): + # A PyTorch->FlyDSL translation retargets the optimization root to the + # per-run ``_opt_repo`` (where the translated kernel + staged reference + # live). Honor that even when ``--repo`` was passed, otherwise + # optimization/preflight root at the source repo (which has NO + # translated kernel) and the harness fails to import it. 
+ _pp_root = Path(preprocess_ctx["repo_root"]) + if _pp_root.name == "_opt_repo" and (repo is None or Path(repo).resolve() != _pp_root.resolve()): + logger.info("Using per-run _opt_repo as optimization root (translation run): %s", _pp_root) + repo = _pp_root # Resolve max_rounds via the documented precedence chain: # CLI --max-rounds (if any future flag added) > config (mode preset) > diff --git a/src/minisweagent/run/preprocess_v3/baseline.py b/src/minisweagent/run/preprocess_v3/baseline.py index 0b8ce951d..2c796e90e 100644 --- a/src/minisweagent/run/preprocess_v3/baseline.py +++ b/src/minisweagent/run/preprocess_v3/baseline.py @@ -35,6 +35,7 @@ import logging import os +import re import shlex import statistics import subprocess @@ -77,16 +78,42 @@ #: Short timeout for the correctness gate that runs before baseline collection. #: Goal: fail fast on a broken kernel rather than spending minutes running the -#: full benchmark loop. Override via ``GEAK_BENCH_TIMEOUT`` (or the legacy -#: ``GEAK_CORRECTNESS_GATE_TIMEOUT``). +#: full benchmark loop. ``GEAK_CORRECTNESS_GATE_TIMEOUT`` takes precedence; for +#: compiled kernels whose first ``--correctness`` run also builds the extension, +#: the larger ``GEAK_BENCH_TIMEOUT`` is honored as a fallback. Default 120s. _CORRECTNESS_GATE_TIMEOUT_S = int( os.environ.get( - "GEAK_BENCH_TIMEOUT", - os.environ.get("GEAK_CORRECTNESS_GATE_TIMEOUT", "120"), + "GEAK_CORRECTNESS_GATE_TIMEOUT", + os.environ.get("GEAK_BENCH_TIMEOUT", "120"), ) ) +#: Exception names in harness stderr/stdout that mean the harness could not +#: resolve (import/open) the kernel-under-test — a broken-harness failure that +#: yields an empty baseline. Surfaced precisely so a no-latency baseline reads +#: as "kernel not found at <path>" instead of a silent "produced no latency". 
+_KERNEL_RESOLUTION_MARKERS = ("FileNotFoundError", "ModuleNotFoundError", "ImportError") + + +def detect_kernel_resolution_failure(raw_outputs: list[dict[str, Any]]) -> str | None: + """Return the first kernel-resolution error line from harness output, or ``None``. + + Scans each run's stderr/stdout for an import / file-not-found error (the + signature of a harness pointing at a non-existent kernel path) and returns + that line verbatim — e.g. ``FileNotFoundError: [Errno 2] No such file or + directory: '<path>'`` — so callers can report exactly which path failed to + resolve rather than a generic "no latency" message. + """ + for out in raw_outputs: + blob = f"{out.get('stderr') or ''}\n{out.get('stdout') or ''}" + for marker in _KERNEL_RESOLUTION_MARKERS: + idx = blob.find(marker) + if idx != -1: + return blob[idx:].splitlines()[0].strip() + return None + + @dataclass(frozen=True) class BaselineMetrics: """Wall-clock benchmark statistics for a harness run. @@ -327,6 +354,7 @@ def collect_baseline_metrics( repeats: int = 5, work_dir: Path | None = None, gpu_id: int = 0, + skip_correctness_gate: bool = False, ) -> BaselineMetrics: """Run the harness ``repeats`` times in ``--benchmark`` mode. @@ -349,6 +377,20 @@ def collect_baseline_metrics( gpu_id: ``HIP_VISIBLE_DEVICES`` value for each invocation. Defaults to GPU 0 to match the legacy default. + skip_correctness_gate: + When ``True``, skip the up-front ``--correctness`` gate and go + straight to the benchmark loop. Use this when correctness has + already been validated upstream on an authoritative harness — + notably after a successful PyTorch→FlyDSL translation, which runs + its own correctness + performance-regression check. The gate + re-checks correctness on the (stricter) harness-generator harness + and trips on *any* non-zero exit (timeout / env / multi-shape + miss), not just real numeric mismatches, so re-gating an + already-validated kernel discards good candidates. 
The global + ``GEAK_SKIP_CORRECTNESS_GATE=1`` env var still forces a skip + regardless of this flag; this parameter scopes the skip to a + single call (e.g. translation runs) without disabling the gate + for user-supplied harnesses. Returns: A :class:`BaselineMetrics` summarising the run. @@ -378,10 +420,13 @@ def collect_baseline_metrics( # Correctness gate: a quick ``--correctness`` invocation up front so that a # broken kernel fails in ~5-30 s rather than after a full benchmark + profile - # cycle (~5+ min). Mirrors the legacy harness validation shape; can be - # disabled via ``GEAK_SKIP_CORRECTNESS_GATE=1`` when you explicitly want - # baseline numbers from a correctness-failing kernel. - if not os.environ.get("GEAK_SKIP_CORRECTNESS_GATE"): + # cycle (~5+ min). Mirrors the legacy harness validation shape. Skipped when: + # * ``skip_correctness_gate=True`` — correctness already validated upstream + # (e.g. translation, which runs its own correctness + perf-regression + # gate). Scoped to this call, so user-supplied harnesses still gate. + # * ``GEAK_SKIP_CORRECTNESS_GATE=1`` — global override for when you + # explicitly want baseline numbers from a correctness-failing kernel. 
+ if not skip_correctness_gate and not os.environ.get("GEAK_SKIP_CORRECTNESS_GATE"): gate = _run_benchmark_once( harness_path, work_dir=work_dir, @@ -689,4 +734,5 @@ def collect_profile( "ProfileResult", "collect_baseline_metrics", "collect_profile", + "detect_kernel_resolution_failure", ] diff --git a/src/minisweagent/run/preprocess_v3/tools.py b/src/minisweagent/run/preprocess_v3/tools.py index 0632a73e3..467abf38c 100644 --- a/src/minisweagent/run/preprocess_v3/tools.py +++ b/src/minisweagent/run/preprocess_v3/tools.py @@ -52,6 +52,7 @@ import os import shlex import shutil +import subprocess import time from collections.abc import Callable from dataclasses import dataclass, replace @@ -64,6 +65,7 @@ ProfileResult, collect_baseline_metrics, collect_profile, + detect_kernel_resolution_failure, ) from minisweagent.run.preprocess_v3.commandment import ( CommandmentContext, @@ -306,19 +308,26 @@ def _copy_repo_sandbox(repo_root: Path, sandbox_root: Path, output_dir: Path) -> repo_root = repo_root.resolve() output_dir = output_dir.resolve() + # The output-dir guard exists to avoid recursively copying the active GEAK + # output directory when it lives INSIDE the repo being copied. It must NOT + # fire when the repo itself lives under output_dir (the per-run ``_opt_repo`` + # staged for a translation run): there, output_dir is the parent and the + # repo's own files are descendants of output_dir, so guarding would ignore + # everything and produce an empty sandbox. + guard_output_dir = output_dir == repo_root or repo_root in output_dir.parents + def _ignore(dir_path: str, names: list[str]) -> set[str]: ignored = {"__pycache__", ".pytest_cache", ".ruff_cache"} - current = Path(dir_path).resolve() - for name in names: - child = current / name - try: - child_resolved = child.resolve() - except OSError: - continue - # Avoid recursively copying the active GEAK output directory - # when users place outputs under the target repo. 
- if child_resolved == output_dir or output_dir in child_resolved.parents: - ignored.add(name) + if guard_output_dir: + current = Path(dir_path).resolve() + for name in names: + child = current / name + try: + child_resolved = child.resolve() + except OSError: + continue + if child_resolved == output_dir or output_dir in child_resolved.parents: + ignored.add(name) return ignored shutil.copytree(repo_root, sandbox_root, symlinks=True, ignore=_ignore) @@ -1148,6 +1157,32 @@ def _impl(source_path: str, output_dir: str) -> dict[str, Any]: if ref_src.exists() and ref_dest != ref_src: ref_dest.parent.mkdir(parents=True, exist_ok=True) shutil.copy2(ref_src, ref_dest) + # Initialise _opt_repo as a git repo with a committed + # baseline so the optimization agent's per-slot worktrees and + # the eval/preflight path have a real git root to diff + # against (the #275 git-diff patch-capture contract). The + # staged dir is a fresh per-run copy, so this never touches + # the user's source repo. Non-fatal on failure — the eval + # path also inits a temp git repo for non-git roots. 
+ try: + if not (opt_repo / ".git").exists(): + _git_env = { + **os.environ, + "GIT_AUTHOR_NAME": "geak", + "GIT_AUTHOR_EMAIL": "geak@local", + "GIT_COMMITTER_NAME": "geak", + "GIT_COMMITTER_EMAIL": "geak@local", + } + for _git_cmd in ( + ["git", "init", "-q"], + ["git", "add", "-A"], + ["git", "commit", "-q", "-m", "geak: translated FlyDSL baseline"], + ): + subprocess.run( + _git_cmd, cwd=str(opt_repo), env=_git_env, capture_output=True, check=True + ) + except Exception as _git_exc: # noqa: BLE001 — patch capture has a non-git fallback + logger.warning("Could not git-init per-run _opt_repo (non-fatal): %s", _git_exc) result = replace(result, translated_kernel_path=staged) logger.info( "Staged translated kernel into per-run optimization repo for patch capture: %s -> %s", @@ -1161,6 +1196,24 @@ def _impl(source_path: str, output_dir: str) -> dict[str, Any]: exc, ) agent._collected["translation"] = result + # Align downstream preprocess with where optimization will run. The + # adapter roots optimization at the per-run ``_opt_repo`` (the staged + # translated kernel's parent); point the orchestrator's kernel_path / + # repo_root there too so the harness subagent sandbox, the injected + # kernel_relpath, and the baseline work_dir all resolve the TRANSLATED + # kernel (which lives only in ``_opt_repo``) instead of the source repo + # (which has no translated kernel). Without this the harness is + # generated/verified against the wrong tree and the baseline cannot + # import the kernel. 
+ if ( + result.success + and result.translated_kernel_path is not None + and hasattr(agent, "_extra_template_vars") + ): + staged_kernel = Path(result.translated_kernel_path).resolve() + if staged_kernel.parent.name == "_opt_repo": + agent._extra_template_vars["kernel_path"] = str(staged_kernel) + agent._extra_template_vars["repo_root"] = str(staged_kernel.parent) return { "ok": result.success, "translated_kernel_path": str(result.translated_kernel_path) if result.translated_kernel_path else None, @@ -1246,6 +1299,30 @@ def _impl(name: str, task: str | None = None, context: Any = None, **_extra_igno context.setdefault("attempt", generator_attempts) elif name == "harness-verifier": context.setdefault("attempt", max(generator_attempts, 1)) + + # Deterministic kernel-path injection: the orchestrator already knows + # where the kernel lives, so hand the harness subagents the exact + # worktree-relative path instead of letting the LLM infer it from the + # source tree. Inferring it is the root cause of harnesses that build a + # wrong path (e.g. a spurious doubled directory segment) -> the kernel + # import raises FileNotFoundError -> empty baseline -> FAIL_PREPROCESS. + # The harness must resolve the kernel as os.path.join(WORK_DIR, + # kernel_relpath); see the harness-generator's worktree-path discipline. + if name in ("harness-generator", "harness-verifier"): + _tv = getattr(agent, "_extra_template_vars", {}) or {} + _kp = _tv.get("kernel_path") + _rr = _tv.get("repo_root") + if _kp: + context.setdefault("kernel_path", str(_kp)) + if _rr: + try: + _rel = Path(str(_kp)).resolve().relative_to(Path(str(_rr)).resolve()) + context.setdefault("kernel_relpath", str(_rel)) + except ValueError: + # Kernel lives outside repo_root (e.g. a staged translated + # kernel) — the absolute kernel_path above is the signal. 
+ pass + codebase_ctx = agent._collected.get("codebase_context") if ( codebase_ctx is not None @@ -1287,6 +1364,47 @@ def _impl(name: str, task: str | None = None, context: Any = None, **_extra_igno agent._collected["harness_path"] = stripped.split(":", 1)[1].strip() elif stripped.startswith("HARNESS_PATH="): agent._collected["harness_path"] = stripped.split("=", 1)[1].strip() + + # Deterministic verification backstop. The LLM-driven harness-verifier + # sometimes fails to emit HARNESS_VERIFIED=true even for a harness that + # actually runs (flaky subagent behavior), which leaves the orchestrator + # looping on harness-generator until the preprocess cap — never reaching + # collect_baseline. If the verifier did not confirm but the produced + # harness PASSES --correctness against the effective work_dir (the same + # one collect_baseline uses), mark it verified so the run can proceed. + if ( + name == "harness-verifier" + and not result.get("success") + and agent._collected.get("harness_path") + ): + hp = Path(str(agent._collected["harness_path"])) + _rr = agent._extra_template_vars.get("repo_root") if hasattr(agent, "_extra_template_vars") else None + _wd = Path(str(_rr)) if _rr else (Path(agent.config.repo) if agent.config.repo else None) + if hp.is_file(): + try: + from minisweagent.run.preprocess_v3.baseline import _run_benchmark_once + + check = _run_benchmark_once( + hp, work_dir=_wd, gpu_id=agent.config.gpu_id, timeout_s=300, flag="--correctness" + ) + if check["returncode"] == 0: + logger.info( + "harness-verifier: deterministic --correctness backstop PASSED for %s; " + "marking HARNESS_VERIFIED (LLM verifier did not confirm)", + hp, + ) + result["success"] = True + result["output"] = (result.get("output") or "") + ( + "\nHARNESS_VERIFIED=true\nVERIFIED_BY=deterministic_correctness_backstop\n" + ) + else: + logger.info( + "harness-verifier: deterministic --correctness backstop FAILED for %s (rc=%s)", + hp, + check["returncode"], + ) + except Exception as 
exc: # noqa: BLE001 — backstop must never crash dispatch + logger.debug("harness-verifier backstop errored (non-fatal): %s", exc) return result return _impl @@ -1323,7 +1441,39 @@ def _impl( eval_command = saved_eval resolved_gpu = gpu_id if gpu_id is not None else agent.config.gpu_id - resolved_work_dir = Path(work_dir) if work_dir else None + # Default the harness work_dir to the EFFECTIVE repo root (set on + # _extra_template_vars; retargeted to ``_opt_repo`` after a translation), + # falling back to the source repo. A None work_dir leaves GEAK_WORK_DIR + # unset, so the harness resolves paths against its own dir and cannot + # find the kernel (silent "no latency"). The orchestrator prompt does + # not pass work_dir, so this default is what makes the kernel resolvable. + if work_dir: + resolved_work_dir: Path | None = Path(work_dir) + else: + _effective_repo = ( + agent._extra_template_vars.get("repo_root") if hasattr(agent, "_extra_template_vars") else None + ) + if _effective_repo: + resolved_work_dir = Path(_effective_repo) + elif agent.config.repo: + resolved_work_dir = Path(agent.config.repo) + else: + resolved_work_dir = None + + # Skip the up-front correctness gate when translation already ran and + # succeeded. Translation validates correctness + perf-regression on its + # own harness; the baseline gate re-checks on the stricter + # harness-generator harness and trips on any non-zero exit (timeout / + # env / multi-shape miss), discarding kernels translation already + # accepted. This is scoped to translation runs — user-supplied + # harnesses (no translation, eval_command, Path A) still gate. 
+ translation = agent._collected.get("translation") + skip_correctness_gate = bool(translation is not None and getattr(translation, "success", False)) + if skip_correctness_gate: + logger.info( + "collect_baseline: skipping correctness gate (translation succeeded; " + "correctness already validated upstream)" + ) if harness_path: baseline: BaselineMetrics = collect_baseline_metrics( @@ -1331,18 +1481,20 @@ def _impl( repeats=repeats, work_dir=resolved_work_dir, gpu_id=resolved_gpu, + skip_correctness_gate=skip_correctness_gate, ) agent._collected["baseline"] = baseline - from minisweagent.run.preprocess_v3.baseline import capture_full_benchmark_stdout + if baseline.success: + from minisweagent.run.preprocess_v3.baseline import capture_full_benchmark_stdout - fb_stdout = capture_full_benchmark_stdout( - Path(harness_path), - work_dir=resolved_work_dir, - gpu_id=resolved_gpu, - ) - if fb_stdout: - agent._collected["full_benchmark_stdout"] = fb_stdout + fb_stdout = capture_full_benchmark_stdout( + Path(harness_path), + work_dir=resolved_work_dir, + gpu_id=resolved_gpu, + ) + if fb_stdout: + agent._collected["full_benchmark_stdout"] = fb_stdout else: from minisweagent.run.preprocess_v3.baseline import collect_baseline_from_eval_command @@ -1354,6 +1506,45 @@ def _impl( ) agent._collected["baseline"] = baseline + if not baseline.success: + # Surface WHY the baseline is empty. A harness that points at a + # non-existent kernel fails every mode with FileNotFoundError / + # ImportError; report that precise path instead of a silent + # "produced no latency" so the failure is diagnosable and the + # regenerated harness can be fixed. 
+ reason = detect_kernel_resolution_failure(baseline.raw_outputs) or ( + "harness ran but produced no GEAK_RESULT_LATENCY_MS marker" + ) + logger.error( + "collect_baseline: no baseline produced for %s — %s", + harness_path or eval_command, + reason, + ) + # Fail-closed: when the harness-generator retry budget is already + # exhausted and the harness still cannot produce a baseline, the + # harness is unusable. Terminate with a clear error rather than + # running on a known-broken harness or letting the orchestrator spin + # to its step limit retrying. + attempts = int(agent._collected.get("_harness_generator_attempts", 0) or 0) + if harness_path and attempts >= 3: + raise FinishedSuccessfully( + { + "harness_path": agent._collected.get("harness_path"), + "commandment_path": agent._collected.get("commandment_path"), + "errors": [f"harness unusable after {attempts} generator attempts: {reason}"], + "summary": "", + } + ) + return { + "ok": False, + "median_ms": None, + "samples_ms": [], + "stdev_ms": baseline.stdev_ms, + "repeats": baseline.repeats, + "command": baseline.command, + "error": reason, + } + return { "ok": baseline.success, "median_ms": baseline.median_ms, diff --git a/src/minisweagent/tools/rag_postprocessor.py b/src/minisweagent/tools/rag_postprocessor.py index 5d0afde5e..ac456f88d 100644 --- a/src/minisweagent/tools/rag_postprocessor.py +++ b/src/minisweagent/tools/rag_postprocessor.py @@ -88,7 +88,17 @@ def model(self): config = copy.deepcopy(self.config.model_config) self._model = get_model(config=config) else: - self._model = get_model() + # No explicit model config: load the GEAK gateway model (honors + # model_class, e.g. ``amd_llm``). A bare ``get_model()`` defaults + # to a provider-less ``LitellmModel`` that 400s on + # gateway-routed names like ``claude-opus-4.6`` ("LLM Provider + # NOT provided"), which would crash the calling agent. 
+ try: + from minisweagent.run.pipeline_helpers import load_geak_model + + self._model = load_geak_model(None) + except Exception: # noqa: BLE001 — last-resort fallback + self._model = get_model() model_impl = getattr(self._model, "_impl", self._model) if hasattr(model_impl, "tools"): model_impl.tools = [] @@ -116,9 +126,16 @@ def process(self, rag_result: str, query: str = "") -> str: logger.debug("RAG postprocessor processing %d chars", len(rag_result)) - response = self.model.query( - [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_content}] - ) + # RAG post-processing is advisory polish on retrieved knowledge — it + # must NEVER crash the calling agent. On any model/transport failure, + # fall back to the raw retrieval result. + try: + response = self.model.query( + [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_content}] + ) + except Exception as exc: # noqa: BLE001 + logger.warning("RAG postprocessor LLM call failed (%s); returning raw retrieval result", exc) + return rag_result result = response["content"] logger.debug("RAG postprocessor output %d chars", len(result)) diff --git a/subagents/preprocess/harness-generator/SUBAGENT.yaml b/subagents/preprocess/harness-generator/SUBAGENT.yaml index d9b43aa89..7b6ac9736 100644 --- a/subagents/preprocess/harness-generator/SUBAGENT.yaml +++ b/subagents/preprocess/harness-generator/SUBAGENT.yaml @@ -53,7 +53,8 @@ system_prompt: | ## Worktree Path Discipline (MANDATORY — read before writing any import/build path) - At optimization time GEAK applies each candidate patch inside a PER-SLOT git worktree exported as `$GEAK_WORK_DIR` (placed first on PYTHONPATH). The harness MUST resolve EVERY repository path from `$GEAK_WORK_DIR` so it imports/compiles the PATCHED candidate — never the original source tree. If it reads the baseline, correctness always PASSes and every measured speedup is ~1.00x with no error, and the optimizer trains on a flat signal. 
- Derive once at the top: `WORK_DIR = os.environ.get("GEAK_WORK_DIR", os.path.dirname(os.path.abspath(__file__)))`. - - Python imports: prepend the worktree's package dir to `sys.path` — e.g. `sys.path.insert(0, os.path.join(WORK_DIR, "python"))` (adjust `"python"` to wherever the package lives in the repo). Do this ONCE. + - The Context block provides `kernel_relpath` — the kernel-under-test's path RELATIVE to `$GEAK_WORK_DIR`. Resolve the kernel EXACTLY as `os.path.join(WORK_DIR, kernel_relpath)`. Do NOT infer, guess, or reconstruct the subdirectory from the source tree or the absolute kernel path — using the provided `kernel_relpath` verbatim is mandatory (a wrong path makes every mode fail with FileNotFoundError and produces no baseline). + - Python imports: prepend the worktree's package dir to `sys.path` — e.g. `sys.path.insert(0, os.path.join(WORK_DIR, "python"))` (adjust `"python"` to wherever the package lives in the repo, consistent with `kernel_relpath`). Do this ONCE. - NEVER hardcode an absolute source-repo path (e.g. `"/sgl-workspace/sglang/python"`), and NEVER add one as a fallback element in a sys.path candidate list/tuple — with `sys.path.insert(0, ...)` in a loop it would land AHEAD of the worktree and silently import the baseline. The worktree-derived entry must be the ONLY one you insert. The sole permitted absolute literal anywhere is the default arg of `os.environ.get("GEAK_WORK_DIR", )`. - C/C++/HIP/CUDA: build include flags as `f"-I{WORK_DIR}/"`, compile into a DETERMINISTIC fixed-name dir under `WORK_DIR` (e.g. `f"{WORK_DIR}/_geak_build"`; it is already per-slot-isolated because WORK_DIR differs per worktree), and do an INCREMENTAL rebuild keyed on source mtime/hash — rebuild only when the kernel source is newer than the artifact (a patched kernel has a newer mtime so it still recompiles), otherwise reuse the cache. Never do an unconditional cold rebuild every run (turns validation into a multi-hour recompile loop). 
- SELF-CONTAINED BUILD (MANDATORY): the harness MUST build the artifact ITSELF, from scratch, whenever the build dir / compiled `.so` is MISSING. Every candidate runs in a FRESH per-slot git worktree that does NOT contain any prior build — `_geak_build/` is untracked and will NOT be present. So the build cache is an OPTIMIZATION, never a PRECONDITION: `if artifact missing -> generate build files + compile; elif source newer -> recompile; else -> reuse`. NEVER `sys.exit`/raise demanding that a "preprocess seed" or pre-existing `_geak_build/` be present — that makes correctness fail on every fresh candidate worktree, so the optimizer sees a flat ~1.00x signal (the same failure mode as the worktree-bypass bug). You may reuse SHARED, READ-ONLY prebuilt third-party deps (e.g. composable_kernel / flashinfer headers) to speed the compile, but the kernel-under-test extension itself must always be buildable from only `$GEAK_WORK_DIR` + the toolchain. diff --git a/tests/run/test_preprocess_v3_bugfixes.py b/tests/run/test_preprocess_v3_bugfixes.py index 8472d1ad9..b332ee5b9 100644 --- a/tests/run/test_preprocess_v3_bugfixes.py +++ b/tests/run/test_preprocess_v3_bugfixes.py @@ -10,9 +10,11 @@ PreprocessOrchestratorConfig, ) from minisweagent.run.preprocess_v3.tools import ( + _make_tool_collect_baseline, _make_tool_commandment_from_user_command, _make_tool_dispatch_subagent, _make_tool_finish_preprocess, + _make_tool_translate_to_flydsl, ) @@ -233,3 +235,304 @@ def test_legacy_context_recovers_harness_path_from_promoted_command(tmp_path: Pa assert ctx["full_benchmark_baseline"] == str(output_dir / "full_benchmark_baseline.txt") assert (output_dir / "benchmark_baseline.txt").read_text() == "GEAK_RESULT_LATENCY_MS=1.25\n" assert ctx["v3_path_taken"] == "A" + + +@pytest.mark.parametrize( + "translation, expected_skip", + [ + (SimpleNamespace(success=True), True), # translation validated -> skip gate + (SimpleNamespace(success=False), False), # translation failed -> keep gate + 
(None, False), # user-supplied harness -> keep gate + ], +) +def test_collect_baseline_skips_gate_only_on_translation_success( + monkeypatch, tmp_path: Path, translation, expected_skip +) -> None: + """The baseline correctness gate is skipped iff a translation succeeded. + + Translation runs its own correctness + perf-regression check, so re-gating + on the stricter harness-generator harness discards already-validated kernels + (the FAIL_PREPROCESS-on-translation bug). The skip must stay scoped to + translation runs: user-supplied harnesses (no translation, or a failed one) + must still be gated. + """ + import minisweagent.run.preprocess_v3.tools as tools_module + + harness = tmp_path / "harness.py" + harness.write_text("print('GEAK_RESULT_LATENCY_MS=1.0')\n") + + captured: dict[str, object] = {} + + def fake_collect_baseline_metrics(harness_path, *, repeats, work_dir, gpu_id, skip_correctness_gate=False): + captured["skip_correctness_gate"] = skip_correctness_gate + return SimpleNamespace( + success=True, median_ms=1.0, samples_ms=[1.0], stdev_ms=0.0, + repeats=repeats, harness_path=harness_path, command="", + ) + + monkeypatch.setattr(tools_module, "collect_baseline_metrics", fake_collect_baseline_metrics) + import minisweagent.run.preprocess_v3.baseline as baseline_module + + monkeypatch.setattr(baseline_module, "capture_full_benchmark_stdout", lambda *a, **k: None) + + agent = PreprocessOrchestratorAgent( + model=object(), + config=PreprocessOrchestratorConfig(repo=tmp_path), + ) + if translation is not None: + agent._collected["translation"] = translation + + tool = _make_tool_collect_baseline(agent) + tool(harness_path=str(harness), repeats=1) + + assert captured["skip_correctness_gate"] is expected_skip + + +def test_dispatch_subagent_injects_deterministic_kernel_path(monkeypatch, tmp_path: Path) -> None: + """The orchestrator hands the harness subagents the exact worktree-relative + kernel path so they never have to guess it from the source tree.""" + 
import minisweagent.run.preprocess_v3.tools as tools_module + + # Keep the test focused on injection — no real sandbox copy. + monkeypatch.setattr(tools_module, "_ensure_preprocess_subagent_sandbox", lambda agent: (None, {})) + + repo = tmp_path / "repo" + (repo / "level3").mkdir(parents=True) + kernel = repo / "level3" / "1_MLP.py" + kernel.write_text("# kernel\n") + + captured: dict[str, object] = {} + + def fake_dispatcher(*, name, task, model, cwd=None, context=None): + captured["context"] = context + return {"name": name, "success": True, "output": "HARNESS_PATH: /tmp/harness.py"} + + agent = PreprocessOrchestratorAgent(model=object(), config=PreprocessOrchestratorConfig(repo=repo)) + agent._extra_template_vars = {"kernel_path": str(kernel), "repo_root": str(repo)} + + tool = _make_tool_dispatch_subagent(agent, fake_dispatcher) + tool(name="harness-generator", task="make a harness") + + assert captured["context"]["kernel_relpath"] == "level3/1_MLP.py" + assert captured["context"]["kernel_path"] == str(kernel) + + +def _failed_baseline(stderr: str) -> SimpleNamespace: + return SimpleNamespace( + success=False, median_ms=None, samples_ms=[], stdev_ms=None, + repeats=0, command="cmd", raw_outputs=[{"stderr": stderr, "stdout": ""}], + ) + + +def test_detect_kernel_resolution_failure() -> None: + from minisweagent.run.preprocess_v3.baseline import detect_kernel_resolution_failure + + raw = [{"stderr": "Traceback\nFileNotFoundError: [Errno 2] No such file or directory: '/x/k.py'\n", "stdout": ""}] + msg = detect_kernel_resolution_failure(raw) + assert msg is not None and "/x/k.py" in msg and "FileNotFoundError" in msg + assert detect_kernel_resolution_failure([{"stderr": "TIMEOUT after 600s", "stdout": ""}]) is None + + +def test_collect_baseline_fail_closed_after_retry_budget(monkeypatch, tmp_path: Path) -> None: + """An empty baseline after the generator retry budget is exhausted terminates + the run with a precise error instead of spinning / running on a broken 
harness.""" + import minisweagent.run.preprocess_v3.tools as tools_module + from minisweagent.run.preprocess_v3.orchestrator import FinishedSuccessfully + + harness = tmp_path / "harness.py" + harness.write_text("x") + monkeypatch.setattr( + tools_module, "collect_baseline_metrics", + lambda *a, **k: _failed_baseline("FileNotFoundError: No such file or directory: '/x/k.py'"), + ) + + agent = PreprocessOrchestratorAgent(model=object(), config=PreprocessOrchestratorConfig(repo=tmp_path)) + agent._collected["_harness_generator_attempts"] = 3 + tool = _make_tool_collect_baseline(agent) + + with pytest.raises(FinishedSuccessfully) as exc_info: + tool(harness_path=str(harness), repeats=1) + assert "/x/k.py" in exc_info.value.payload["errors"][0] + + +def test_collect_baseline_returns_precise_error_within_budget(monkeypatch, tmp_path: Path) -> None: + """Before the retry budget is exhausted, an empty baseline returns ok=False + with the precise kernel-resolution reason (so the generator can be retried).""" + import minisweagent.run.preprocess_v3.tools as tools_module + + harness = tmp_path / "harness.py" + harness.write_text("x") + monkeypatch.setattr( + tools_module, "collect_baseline_metrics", + lambda *a, **k: _failed_baseline("FileNotFoundError: No such file or directory: '/x/k.py'"), + ) + + agent = PreprocessOrchestratorAgent(model=object(), config=PreprocessOrchestratorConfig(repo=tmp_path)) + agent._collected["_harness_generator_attempts"] = 1 + tool = _make_tool_collect_baseline(agent) + + res = tool(harness_path=str(harness), repeats=1) + assert res["ok"] is False + assert "/x/k.py" in res["error"] + + +def test_translate_retargets_preprocess_state_to_opt_repo(monkeypatch, tmp_path: Path) -> None: + """After translation, the orchestrator's kernel_path/repo_root point at the + per-run _opt_repo (where optimization runs), not the source repo — so the + harness sandbox + baseline resolve the translated kernel.""" + import minisweagent.run.preprocess_v3.tools as 
tools_module + from minisweagent.run.preprocess_v3.translate import TranslationResult + + src_repo = tmp_path / "src" + src_repo.mkdir() + orig = src_repo / "k.py" + orig.write_text("# orig\n") + out = tmp_path / "out" + out.mkdir() + cand_dir = tmp_path / "cand" + cand_dir.mkdir() + cand_file = cand_dir / "k_flydsl.py" + cand_file.write_text("# flydsl\n") + + result = TranslationResult( + success=True, target_language="flydsl", translated_kernel_path=cand_file, + speedup=None, self_review="", errors=[], elapsed_s=0.0, raw={}, + ) + monkeypatch.setattr(tools_module, "translate_to_flydsl", lambda **k: result) + + agent = PreprocessOrchestratorAgent(model=object(), config=PreprocessOrchestratorConfig(repo=src_repo)) + agent._extra_template_vars = {"kernel_path": str(orig), "repo_root": str(src_repo)} + + tool = _make_tool_translate_to_flydsl(agent) + tool(source_path=str(orig), output_dir=str(out)) + + opt_repo = (out / "_opt_repo").resolve() + assert agent._extra_template_vars["repo_root"] == str(opt_repo) + assert agent._extra_template_vars["kernel_path"] == str((opt_repo / "k_flydsl.py").resolve()) + + +def test_collect_baseline_defaults_work_dir_to_effective_repo_root(monkeypatch, tmp_path: Path) -> None: + """collect_baseline runs the harness with work_dir = the effective repo root + (retargeted to _opt_repo after translation) so the kernel is resolvable.""" + import minisweagent.run.preprocess_v3.baseline as baseline_module + import minisweagent.run.preprocess_v3.tools as tools_module + + captured: dict[str, object] = {} + + def fake_collect_baseline_metrics(harness_path, *, repeats, work_dir, gpu_id, skip_correctness_gate=False): + captured["work_dir"] = work_dir + return SimpleNamespace( + success=True, median_ms=1.0, samples_ms=[1.0], stdev_ms=0.0, + repeats=repeats, harness_path=harness_path, command="", raw_outputs=[], + ) + + monkeypatch.setattr(tools_module, "collect_baseline_metrics", fake_collect_baseline_metrics) + monkeypatch.setattr(baseline_module, 
"capture_full_benchmark_stdout", lambda *a, **k: None) + + harness = tmp_path / "h.py" + harness.write_text("x") + opt_repo = tmp_path / "_opt_repo" + opt_repo.mkdir() + + agent = PreprocessOrchestratorAgent(model=object(), config=PreprocessOrchestratorConfig(repo=tmp_path / "src")) + agent._extra_template_vars = {"repo_root": str(opt_repo)} + + tool = _make_tool_collect_baseline(agent) + res = tool(harness_path=str(harness), repeats=1) + + assert res["ok"] is True + assert captured["work_dir"] == opt_repo + + +def test_copy_repo_sandbox_copies_repo_living_under_output_dir(tmp_path: Path) -> None: + """When the repo to sandbox is the per-run _opt_repo (which lives UNDER + output_dir), its own files must be copied — not ignored by the output-dir + recursion guard (which would leave an empty sandbox).""" + from minisweagent.run.preprocess_v3.tools import _copy_repo_sandbox + + output_dir = tmp_path / "out" + opt_repo = output_dir / "_opt_repo" + opt_repo.mkdir(parents=True) + (opt_repo / "1_MLP_flydsl.py").write_text("# flydsl\n") + (opt_repo / "1_MLP.py").write_text("# ref\n") + sandbox = output_dir / "_preprocess_subagent_worktree" + + _copy_repo_sandbox(opt_repo, sandbox, output_dir) + + assert (sandbox / "1_MLP_flydsl.py").is_file() + assert (sandbox / "1_MLP.py").is_file() + + +def test_copy_repo_sandbox_still_skips_nested_output_dir(tmp_path: Path) -> None: + """The recursion guard still fires when output_dir lives INSIDE the repo: + the output tree must not be copied into the sandbox.""" + from minisweagent.run.preprocess_v3.tools import _copy_repo_sandbox + + repo = tmp_path / "repo" + repo.mkdir() + (repo / "kernel.py").write_text("# k\n") + output_dir = repo / "optimization_logs" / "run1" + output_dir.mkdir(parents=True) + (output_dir / "log.txt").write_text("noise\n") + sandbox = tmp_path / "sandbox" + + _copy_repo_sandbox(repo, sandbox, output_dir) + + assert (sandbox / "kernel.py").is_file() + assert not (sandbox / "optimization_logs" / "run1" / 
"log.txt").exists() + + +def test_verifier_backstop_marks_verified_when_correctness_passes(monkeypatch, tmp_path: Path) -> None: + """If the LLM verifier fails to confirm but the harness passes --correctness, + the deterministic backstop marks it HARNESS_VERIFIED so the orchestrator + proceeds to baseline instead of looping the generator.""" + import minisweagent.run.preprocess_v3.baseline as baseline_module + import minisweagent.run.preprocess_v3.tools as tools_module + + monkeypatch.setattr(tools_module, "_ensure_preprocess_subagent_sandbox", lambda agent: (None, {})) + monkeypatch.setattr( + baseline_module, "_run_benchmark_once", + lambda *a, **k: {"returncode": 0, "stdout": "", "stderr": "", "duration_s": 1.0, "latency_ms": None}, + ) + + def fake_dispatcher(*, name, task, model, cwd=None, context=None): + return {"name": name, "success": False, "output": "could not confirm"} + + harness = tmp_path / "harness.py" + harness.write_text("x") + agent = PreprocessOrchestratorAgent(model=object(), config=PreprocessOrchestratorConfig(repo=tmp_path)) + agent._collected["harness_path"] = str(harness) + agent._extra_template_vars = {"repo_root": str(tmp_path)} + + tool = _make_tool_dispatch_subagent(agent, fake_dispatcher) + res = tool(name="harness-verifier", task="verify") + + assert res["success"] is True + assert "HARNESS_VERIFIED=true" in res["output"] + + +def test_verifier_backstop_no_false_positive_when_correctness_fails(monkeypatch, tmp_path: Path) -> None: + """The backstop must NOT mark a harness verified when --correctness fails.""" + import minisweagent.run.preprocess_v3.baseline as baseline_module + import minisweagent.run.preprocess_v3.tools as tools_module + + monkeypatch.setattr(tools_module, "_ensure_preprocess_subagent_sandbox", lambda agent: (None, {})) + monkeypatch.setattr( + baseline_module, "_run_benchmark_once", + lambda *a, **k: {"returncode": 1, "stdout": "", "stderr": "FileNotFoundError", "duration_s": 1.0, "latency_ms": None}, + ) + + def 
fake_dispatcher(*, name, task, model, cwd=None, context=None): + return {"name": name, "success": False, "output": "nope"} + + harness = tmp_path / "harness.py" + harness.write_text("x") + agent = PreprocessOrchestratorAgent(model=object(), config=PreprocessOrchestratorConfig(repo=tmp_path)) + agent._collected["harness_path"] = str(harness) + agent._extra_template_vars = {"repo_root": str(tmp_path)} + + tool = _make_tool_dispatch_subagent(agent, fake_dispatcher) + res = tool(name="harness-verifier", task="verify") + + assert res["success"] is False + assert "HARNESS_VERIFIED=true" not in (res.get("output") or "")