AMD-AGI · iraj465 · Jun 10, 2026
@@ -468,10 +468,21 @@ def parse_action(self, response: dict) -> dict:
         }
         content = response.get("content", "")
         actions = re.findall(r"```bash\s*\n(.*?)\n```", content, re.DOTALL) if content else []
+        # Track whether ANY real action dispatched (bash / tool / skill). A
+        # prose-only turn — no fenced bash, no native tool call — must NOT be
+        # silently accepted as a successful no-op: the model has stalled (it
+        # often *believes* it called ``submit`` and narrates the result in
+        # prose), and returning {"output":"","returncode":0} gives it no signal
+        # to correct, so it repeats "Done." every step until the step limit
+        # (observed in the heterogeneous task-planner: 143 prose turns -> 0
+        # tool calls -> LimitsExceeded). Raise FormatError instead so the model
+        # is nudged to emit a real action / tool call.
+        acted = False
         if len(actions) == 1:
             bash_action = self.execute_action({"action": actions[0].strip(), **response})
             all_action["output"] += bash_action["output"]
             all_action["returncode"] = max(all_action["returncode"], bash_action["returncode"])
+            acted = True
         if response.get("tools"):
             from minisweagent.tools.submit import Submitted as ToolSubmitted
 
@@ -484,10 +495,16 @@ def parse_action(self, response: dict) -> dict:
             tool_action = self._handle_tool_result(result)
             all_action["output"] += tool_action["output"]
             all_action["returncode"] = max(all_action["returncode"], tool_action["returncode"])
+            acted = True
         if self.config.use_skills:
             skills_action = self.skillruntime.load_skill(response)
             all_action["output"] += skills_action["output"]
             all_action["returncode"] = max(all_action["returncode"], skills_action["returncode"])
+            if skills_action.get("output") or skills_action.get("returncode"):
+                acted = True
+        if not acted:
+            # No single bash action, no tool, no skill — a prose-only stall (or several fenced bash blocks). Nudge the model.
+            raise FormatError(self.render_template(self.config.format_error_template, actions=actions))
         if all_action["output"] or all_action["returncode"] == 0:
             return all_action
         else:

@@ -139,6 +139,52 @@ def run_preprocess_v3(
     source_language = detected_language.name
     target_lang_name = (target_language or detected_language.name).lower()
 
+    # Deterministic Path-A bypass for a PRE-VALIDATED harness.
+    #
+    # When the caller hands us a harness it already validated end-to-end, the
+    # entire A1 preprocess (collect_baseline -> collect_profile ->
+    # render COMMANDMENT) is deterministic — there is nothing for the LLM
+    # orchestrator to decide. Driving it through the LLM loop anyway is not just
+    # wasteful: the classifier can misroute (e.g. divert a shape-bearing task to
+    # the harness-GENERATOR) or simply fail to converge, burning the whole
+    # preprocess budget without ever producing a baseline. Run the deterministic
+    # sequence directly and skip the LLM entirely. Opt-out: GEAK_NO_PREVALIDATED_BYPASS=1.
+    _bypass_disabled = os.environ.get("GEAK_NO_PREVALIDATED_BYPASS", "").strip().lower() in ("1", "true", "yes", "on")
+    if harness and not translate_only and not _bypass_disabled and Path(harness).is_file():
+        t0 = time.monotonic()
+        result = _run_prevalidated_path_a(
+            harness=Path(harness),
+            kernel_path=kernel_path,
+            repo_root=repo_root,
+            kernel_language=detected_language,
+            output_dir=output_dir,
+            gpu_id=gpu_id,
+            correctness_command=correctness_command,
+            performance_command=performance_command,
+        )
+        # PreprocessResult is a frozen dataclass; stamp elapsed via replace().
+        from dataclasses import replace as _dc_replace
+        result = _dc_replace(result, elapsed_s=time.monotonic() - t0)
+        logger.info(
+            "v3 preprocess (pre-validated Path-A bypass) completed in %.1fs (success=%s, errors=%d)",
+            result.elapsed_s, result.success, len(result.errors),
+        )
+        if not result.success and not _can_proceed_despite_failure(result):
+            raise RuntimeError(
+                "v3 preprocess (pre-validated bypass) failed: "
+                + ("; ".join(result.errors) if result.errors else "no artefacts produced")
+            )
+        return _preprocess_result_to_legacy_context(
+            result=result,
+            repo_root=repo_root,
+            output_dir=output_dir,
+            kernel_path_input=kernel_path,
+            harness=harness,
+            eval_command=eval_command,
+            correctness_command=correctness_command,
+            performance_command=performance_command,
+        )
+
     config = PreprocessOrchestratorConfig(
         gpu_id=gpu_id,
         repo=Path(repo_root) if repo_root else None,
@@ -210,6 +256,130 @@ def run_preprocess_v3(
     )
 
 
+def _run_prevalidated_path_a(
+    *,
+    harness: Path,
+    kernel_path: Path,
+    repo_root: str | None,
+    kernel_language: KernelLanguage,
+    output_dir: Path,
+    gpu_id: int,
+    correctness_command: str | list[str] | None,
+    performance_command: str | list[str] | None,
+) -> PreprocessResult:
+    """Run the deterministic A1 preprocess for a pre-validated harness — no LLM.
+
+    Runs ``collect_baseline`` -> ``collect_profile`` -> ``render_commandment``
+    as direct function calls, so a pre-validated harness never depends on the
+    LLM classifier converging. The worktree-bypass gate
+    (``contract.validate_harness``) is applied first, so a harness that
+    hardcodes the source-repo path is still rejected (it would otherwise
+    measure the unpatched baseline at ~1.00x).
+
+    Args:
+        harness: Harness file to validate, benchmark and profile.
+        kernel_path: Kernel path recorded in the result and passed to the
+            commandment context.
+        repo_root: Repository root. Used as ``work_dir`` for the baseline and
+            profile runs and as the validator's ``repo_root``. When falsy, the
+            worktree gate is skipped and ``work_dir`` is ``None``.
+        kernel_language: Language recorded in the result and passed to
+            ``render_commandment``.
+        output_dir: Directory under which ``COMMANDMENT.md`` is rendered.
+        gpu_id: GPU index forwarded to baseline and profile collection.
+        correctness_command: Forwarded unchanged to ``CommandmentContext``.
+        performance_command: Forwarded unchanged to ``CommandmentContext``.
+
+    Returns:
+        A ``PreprocessResult`` with ``path_taken="A"``. ``success`` is true
+        only when a baseline was collected, that baseline reports success, and
+        the commandment was rendered. Stage failures are recorded in
+        ``errors`` (baseline, commandment) or ``warnings`` (profile, validator
+        crash) rather than raised.
+    """
+    from minisweagent.run.preprocess_v3.baseline import (
+        BaselineMetrics,
+        ProfileResult,
+        capture_full_benchmark_stdout,
+        collect_baseline_metrics,
+        collect_profile,
+    )
+    from minisweagent.run.preprocess_v3.commandment import (
+        CommandmentContext,
+        render_commandment,
+    )
+
+    work_dir = Path(repo_root) if repo_root else None
+    errors: list[str] = []
+    warnings: list[str] = []
+
+    # Worktree-bypass gate (deterministic, final) — identical contract to the
+    # render_commandment tool. A harness that imports the source repo directly
+    # silently evaluates the UNPATCHED baseline, so refuse it up front.
+    if not os.environ.get("GEAK_ALLOW_HARDCODED_PATHS") and repo_root:
+        try:
+            from minisweagent.kernel_languages.contract import (
+                ContractViolation,
+                validate_harness,
+            )
+
+            validate_harness(harness, repo_root=repo_root)
+        except ContractViolation as exc:
+            logger.error("pre-validated bypass REJECTED harness (worktree bypass): %s", exc)
+            return PreprocessResult(
+                success=False,
+                kernel_language=kernel_language,
+                kernel_path=kernel_path,
+                harness_path=harness,
+                path_taken="A",
+                errors=[f"worktree_bypass: {exc}"],
+            )
+        except Exception as exc:  # noqa: BLE001 — never let the gate crash the bypass
+            # Still best-effort, but a skipped safety gate must be visible: it
+            # is surfaced at WARNING level and recorded on the result instead
+            # of disappearing into a debug log.
+            warnings.append(f"worktree gate skipped (validator error): {exc}")
+            logger.warning("pre-validated bypass: worktree gate skipped (validator error): %s", exc)
+
+    baseline: BaselineMetrics | None = None
+    full_benchmark_stdout: str | None = None
+    try:
+        baseline = collect_baseline_metrics(
+            harness, work_dir=work_dir, gpu_id=gpu_id,
+        )
+        full_benchmark_stdout = capture_full_benchmark_stdout(
+            harness, work_dir=work_dir, gpu_id=gpu_id,
+        )
+    except Exception as exc:  # noqa: BLE001
+        errors.append(f"collect_baseline failed: {exc}")
+        logger.error("pre-validated bypass: collect_baseline failed: %s", exc)
+    else:
+        # A baseline that ran but reported failure raises nothing. Record it so
+        # a caller joining ``errors`` reports the real cause rather than
+        # falling back to a generic "nothing produced" message.
+        if baseline is None or not baseline.success:
+            errors.append("collect_baseline reported failure (no successful baseline)")
+            logger.error("pre-validated bypass: collect_baseline reported failure")
+
+    # Profiling is advisory (matches the orchestrator escape-hatch contract: a
+    # run with a verified harness + baseline is salvageable even if profile fails).
+    profile: ProfileResult | None = None
+    try:
+        profile = collect_profile(harness, work_dir=work_dir, gpu_id=gpu_id)
+    except Exception as exc:  # noqa: BLE001
+        warnings.append(f"collect_profile failed (non-fatal): {exc}")
+        logger.warning("pre-validated bypass: collect_profile failed (non-fatal): %s", exc)
+
+    commandment_path: Path | None = None
+    try:
+        ctx = CommandmentContext(
+            kernel_path=kernel_path,
+            harness_path=harness,
+            repo_root=work_dir,
+            correctness_command=correctness_command,
+            performance_command=performance_command,
+        )
+        out_path = output_dir / "COMMANDMENT.md"
+        render_commandment(kernel_language, ctx, out_path=out_path)
+        # Only publish the path once rendering finished without raising.
+        commandment_path = out_path
+    except Exception as exc:  # noqa: BLE001
+        errors.append(f"render_commandment failed: {exc}")
+        logger.error("pre-validated bypass: render_commandment failed: %s", exc)
+
+    success = (
+        baseline is not None
+        and baseline.success
+        and commandment_path is not None
+    )
+    return PreprocessResult(
+        success=success,
+        kernel_language=kernel_language,
+        kernel_path=kernel_path,
+        harness_path=harness,
+        baseline=baseline,
+        full_benchmark_stdout=full_benchmark_stdout,
+        profile=profile,
+        commandment_path=commandment_path,
+        path_taken="A",
+        tool_calls=[
+            {"name": "collect_baseline", "args": {"harness_path": str(harness)}},
+            {"name": "collect_profile", "args": {"harness_path": str(harness)}},
+            {"name": "render_commandment", "args": {"harness_path": str(harness)}},
+        ],
+        errors=errors,
+        warnings=warnings,
+    )
+
+
 # ---------------------------------------------------------------------------
 # Codebase-explore kernel discovery
 # ---------------------------------------------------------------------------

@@ -117,7 +117,9 @@ class LimitsExceeded(TerminatingException):
 **Case A — user provided explicit run instructions / commands.**
 Indicators: a literal command-line invocation (``python <script>``, ``pytest ... -k ...``, ``make ...``, shell script, existing custom harness command). The command is opaque: it may NOT support GEAK's four harness flags.
 
-**Shapes pre-check (decide this FIRST, before A1 vs A2).** If the task prompt carries shapes (a ``Shapes:`` line, explicit dims, or a dtype/quant tuple) **and** the user's command does not already pin those exact shapes (e.g. it has no matching ``-m/-n``/dim arguments), route to **A2-with-shapes** — *regardless of whether the command contains a GEAK mode flag*. A flag-aware command whose shapes are not pinned would otherwise be classified A1 and run the harness's full default sweep, ignoring the prompt shapes, until it times out. When you are **unsure** whether the command already pins the prompt's exact shapes, choose A2-with-shapes: needlessly sending a shape-pinning command to the generator only costs time / a possible regeneration (a perf cost), whereas wrongly keeping it in A1 silently drops the shapes and hits the default-sweep timeout (a correctness bug). Decide the shapes question BEFORE deciding the command is directly runnable.
+**Pre-validated-harness exemption (check this BEFORE the shapes pre-check).** If the "Hints from the call site" section marks the harness **pre-validated** for the four standard modes, the shapes pre-check below does NOT apply: a pre-validated harness already encodes its authoritative shapes internally (the caller validated it end-to-end), so there are no "unpinned prompt shapes" to fix and there is nothing to regenerate. Route it straight to **A1** and call ``commandment_from_user_command`` with the harness invocation. Regenerating a pre-validated harness via ``harness-generator`` is always wrong: it discards a working harness and the generator loop can burn the entire preprocess budget without ever producing a baseline.
+
+**Shapes pre-check (decide this FIRST, before A1 vs A2 — but AFTER the pre-validated-harness exemption above).** If the task prompt carries shapes (a ``Shapes:`` line, explicit dims, or a dtype/quant tuple) **and** the user's command does not already pin those exact shapes (e.g. it has no matching ``-m/-n``/dim arguments) **and the harness is NOT marked pre-validated**, route to **A2-with-shapes** — *regardless of whether the command contains a GEAK mode flag*. A flag-aware command whose shapes are not pinned would otherwise be classified A1 and run the harness's full default sweep, ignoring the prompt shapes, until it times out. When you are **unsure** whether the command already pins the prompt's exact shapes, choose A2-with-shapes: needlessly sending a shape-pinning command to the generator only costs time / a possible regeneration (a perf cost), whereas wrongly keeping it in A1 silently drops the shapes and hits the default-sweep timeout (a correctness bug). Decide the shapes question BEFORE deciding the command is directly runnable.
 
 Split the remaining Case A (no unpinned prompt shapes) on whether the command already speaks GEAK's harness contract:
 

@@ -333,3 +333,43 @@ def test_empty_actions_handling(model_factory):
     assert exit_status == "Submitted"
     assert submission == "done\n"
     assert agent.model.n_calls == 2
+
+
+def test_prose_only_turn_is_nudged_not_silently_accepted(model_factory):
+    """A turn with prose but no action must yield a format-error nudge.
+
+    The scripted model first answers with prose only (no fenced bash, no tool
+    call) and then with a real submitting command. The run must still end as
+    ``Submitted``, and at least one user-role message must carry the
+    format-error text, showing the prose-only turn was answered with a
+    corrective observation rather than an empty no-op.
+
+    Regression: ``parse_action`` previously returned ``{"output":"",
+    "returncode":0}`` for a prose-only response, which passed the
+    ``returncode == 0`` check and produced an empty observation, so a model
+    narrating "Done." kept repeating it until the step limit (observed:
+    heterogeneous task-planner, 143 prose turns -> 0 tool calls ->
+    LimitsExceeded).
+    """
+    factory, config = model_factory
+    scripted_turns = [
+        # Prose only — the model thinks it is done but took no action.
+        ("The tasks have been successfully submitted. No further action is needed.", []),
+        ("Actually finishing now", [{"command": "echo 'COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT'\necho 'done'"}]),
+    ]
+    agent = DefaultAgent(
+        model=factory(scripted_turns),
+        env=LocalEnvironment(),
+        **config,
+    )
+
+    exit_status, submission = agent.run("Test prose-only nudge")
+    assert exit_status == "Submitted"
+    assert submission == "done\n"
+
+    # Scan user-role messages for the nudge text, stopping at the first hit.
+    nudge_seen = False
+    for message in agent.messages:
+        if message.get("role") == "user" and "EXACTLY ONE action" in get_text(message):
+            nudge_seen = True
+            break
+    assert nudge_seen, "prose-only turn should surface a format-error nudge to the model"
@@ -426,3 +426,94 @@ def test_legacy_context_recovers_harness_path_from_promoted_command(tmp_path: Pa
     assert ctx["full_benchmark_baseline"] == str(output_dir / "full_benchmark_baseline.txt")
     assert (output_dir / "benchmark_baseline.txt").read_text() == "GEAK_RESULT_LATENCY_MS=1.25\n"
     assert ctx["v3_path_taken"] == "A"
+
+
+def test_prevalidated_harness_bypasses_llm_orchestrator(tmp_path: Path, monkeypatch) -> None:
+    """A pre-validated harness must run the deterministic Path-A sequence
+    (collect_baseline -> collect_profile -> render_commandment) WITHOUT ever
+    invoking the LLM orchestrator.
+
+    Regression: the orchestrator's LLM classifier could misroute a shape-bearing
+    task to the harness-generator or fail to converge, burning the whole
+    preprocess budget with no baseline. ``_run_prevalidated_path_a`` short-circuits
+    that. Here the model is a sentinel that raises if queried, and constructing
+    the orchestrator agent raises too — proving the LLM loop is skipped entirely.
+    """
+    import minisweagent.run.preprocess_v3.adapter as adapter_module
+
+    repo = tmp_path / "repo"
+    repo.mkdir()
+    (repo / "kernel.py").write_text("# kernel\n")
+    harness = tmp_path / "harness.py"
+    harness.write_text("print('GEAK_RESULT_LATENCY_MS=2.0')\n")
+    output_dir = tmp_path / "out"
+
+    class _ExplodingModel:
+        def query(self, *a, **k):  # pragma: no cover - must never be called
+            raise AssertionError("LLM orchestrator was invoked for a pre-validated harness")
+
+    # Stub the deterministic building blocks so the test runs no real subprocess.
+    def fake_collect_baseline_metrics(harness_path, *, work_dir=None, gpu_id=0, repeats=5):
+        return SimpleNamespace(
+            success=True, median_ms=2.0, samples_ms=[2.0], stdev_ms=0.0,
+            repeats=repeats, harness_path=harness_path, command="python harness",
+            raw_outputs=[{"stdout": "GEAK_RESULT_LATENCY_MS=2.0", "returncode": 0, "latency_ms": 2.0}],
+        )
+
+    monkeypatch.setattr(adapter_module, "PreprocessOrchestratorAgent",
+                        lambda *a, **k: (_ for _ in ()).throw(AssertionError("orchestrator constructed")))
+    # The bypass has an opt-out env var; clear it so an ambient value in the
+    # developer's or CI environment cannot disable the path under test.
+    monkeypatch.delenv("GEAK_NO_PREVALIDATED_BYPASS", raising=False)
+    # The worktree-bypass gate (contract.validate_harness) is a separate concern;
+    # disable it here so the test exercises the bypass control-flow, not harness
+    # contract validation (covered elsewhere).
+    monkeypatch.setenv("GEAK_ALLOW_HARDCODED_PATHS", "1")
+    import minisweagent.run.preprocess_v3.baseline as baseline_module
+    monkeypatch.setattr(baseline_module, "collect_baseline_metrics", fake_collect_baseline_metrics)
+    monkeypatch.setattr(baseline_module, "capture_full_benchmark_stdout", lambda *a, **k: "GEAK_RESULT_LATENCY_MS=2.0")
+    monkeypatch.setattr(baseline_module, "collect_profile",
+                        lambda *a, **k: SimpleNamespace(success=False, profile=None, command="", backend="metrix", profile_path=None))
+
+    ctx = adapter_module.run_preprocess_v3(
+        kernel_url=str(repo / "kernel.py"),
+        output_dir=output_dir,
+        gpu_id=0,
+        model=_ExplodingModel(),
+        harness=str(harness),
+        repo=str(repo),
+    )
+
+    # Deterministic path produced the baseline + commandment, LLM never ran.
+    assert ctx["v3_path_taken"] == "A"
+    assert (output_dir / "COMMANDMENT.md").is_file()
+    assert ctx["benchmark_baseline"] == str(output_dir / "benchmark_baseline.txt")
+
+
+def test_prevalidated_bypass_opt_out_env(tmp_path: Path, monkeypatch) -> None:
+    """With GEAK_NO_PREVALIDATED_BYPASS=1 the deterministic short-circuit is
+    not taken and ``run_preprocess_v3`` reaches the LLM orchestrator path."""
+    import minisweagent.run.preprocess_v3.adapter as adapter_module
+
+    repo_dir = tmp_path / "repo"
+    repo_dir.mkdir()
+    kernel_file = repo_dir / "kernel.py"
+    kernel_file.write_text("# kernel\n")
+    harness_file = tmp_path / "harness.py"
+    harness_file.write_text("print('ok')\n")
+
+    monkeypatch.setenv("GEAK_NO_PREVALIDATED_BYPASS", "1")
+
+    # Every call to the bypass is recorded; the list must stay empty.
+    bypass_calls: list[dict] = []
+
+    def record_bypass(**kwargs):
+        bypass_calls.append(kwargs)
+
+    # Constructing the orchestrator raises at once, which is how the test
+    # detects that control reached the orchestrator path.
+    def orchestrator_sentinel(*args, **kwargs):
+        raise RuntimeError("reached-orchestrator")
+
+    monkeypatch.setattr(adapter_module, "_run_prevalidated_path_a", record_bypass)
+    monkeypatch.setattr(adapter_module, "PreprocessOrchestratorAgent", orchestrator_sentinel)
+
+    with pytest.raises(RuntimeError, match="reached-orchestrator"):
+        adapter_module.run_preprocess_v3(
+            kernel_url=str(kernel_file),
+            output_dir=tmp_path / "out",
+            model=object(),
+            harness=str(harness_file),
+            repo=str(repo_dir),
+        )
+    assert not bypass_calls