Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions src/minisweagent/agents/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,10 +468,21 @@ def parse_action(self, response: dict) -> dict:
}
content = response.get("content", "")
actions = re.findall(r"```bash\s*\n(.*?)\n```", content, re.DOTALL) if content else []
# Track whether ANY real action dispatched (bash / tool / skill). A
# prose-only turn — no fenced bash, no native tool call — must NOT be
# silently accepted as a successful no-op: the model has stalled (it
# often *believes* it called ``submit`` and narrates the result in
# prose), and returning {"output":"","returncode":0} gives it no signal
# to correct, so it repeats "Done." every step until the step limit
# (observed in the heterogeneous task-planner: 143 prose turns -> 0
# tool calls -> LimitsExceeded). Raise FormatError instead so the model
# is nudged to emit a real action / tool call.
acted = False
if len(actions) == 1:
bash_action = self.execute_action({"action": actions[0].strip(), **response})
all_action["output"] += bash_action["output"]
all_action["returncode"] = max(all_action["returncode"], bash_action["returncode"])
acted = True
if response.get("tools"):
from minisweagent.tools.submit import Submitted as ToolSubmitted

Expand All @@ -484,10 +495,16 @@ def parse_action(self, response: dict) -> dict:
tool_action = self._handle_tool_result(result)
all_action["output"] += tool_action["output"]
all_action["returncode"] = max(all_action["returncode"], tool_action["returncode"])
acted = True
if self.config.use_skills:
skills_action = self.skillruntime.load_skill(response)
all_action["output"] += skills_action["output"]
all_action["returncode"] = max(all_action["returncode"], skills_action["returncode"])
if skills_action.get("output") or skills_action.get("returncode"):
acted = True
if not acted:
# No bash, no tool, no skill — prose-only stall. Nudge the model.
raise FormatError(self.render_template(self.config.format_error_template, actions=actions))
if all_action["output"] or all_action["returncode"] == 0:
return all_action
else:
Expand Down
170 changes: 170 additions & 0 deletions src/minisweagent/run/preprocess_v3/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,52 @@ def run_preprocess_v3(
source_language = detected_language.name
target_lang_name = (target_language or detected_language.name).lower()

# Deterministic Path-A bypass for a PRE-VALIDATED harness.
#
# When the caller hands us a harness it already validated end-to-end, the
# entire A1 preprocess (collect_baseline -> collect_profile ->
# render COMMANDMENT) is deterministic — there is nothing for the LLM
# orchestrator to decide. Driving it through the LLM loop anyway is not just
# wasteful: the classifier can misroute (e.g. divert a shape-bearing task to
# the harness-GENERATOR) or simply fail to converge, burning the whole
# preprocess budget without ever producing a baseline. Run the deterministic
# sequence directly and skip the LLM entirely. Opt-out: GEAK_NO_PREVALIDATED_BYPASS=1.
_bypass_disabled = os.environ.get("GEAK_NO_PREVALIDATED_BYPASS", "").strip().lower() in ("1", "true", "yes", "on")
if harness and not translate_only and not _bypass_disabled and Path(harness).is_file():
t0 = time.monotonic()
result = _run_prevalidated_path_a(
harness=Path(harness),
kernel_path=kernel_path,
repo_root=repo_root,
kernel_language=detected_language,
output_dir=output_dir,
gpu_id=gpu_id,
correctness_command=correctness_command,
performance_command=performance_command,
)
# PreprocessResult is a frozen dataclass; stamp elapsed via replace().
from dataclasses import replace as _dc_replace
result = _dc_replace(result, elapsed_s=time.monotonic() - t0)
logger.info(
"v3 preprocess (pre-validated Path-A bypass) completed in %.1fs (success=%s, errors=%d)",
result.elapsed_s, result.success, len(result.errors),
)
if not result.success and not _can_proceed_despite_failure(result):
raise RuntimeError(
"v3 preprocess (pre-validated bypass) failed: "
+ ("; ".join(result.errors) if result.errors else "no artefacts produced")
)
return _preprocess_result_to_legacy_context(
result=result,
repo_root=repo_root,
output_dir=output_dir,
kernel_path_input=kernel_path,
harness=harness,
eval_command=eval_command,
correctness_command=correctness_command,
performance_command=performance_command,
)

config = PreprocessOrchestratorConfig(
gpu_id=gpu_id,
repo=Path(repo_root) if repo_root else None,
Expand Down Expand Up @@ -210,6 +256,130 @@ def run_preprocess_v3(
)


def _run_prevalidated_path_a(
    *,
    harness: Path,
    kernel_path: Path,
    repo_root: str | None,
    kernel_language: KernelLanguage,
    output_dir: Path,
    gpu_id: int,
    correctness_command: str | list[str] | None,
    performance_command: str | list[str] | None,
) -> PreprocessResult:
    """Run the deterministic A1 preprocess for a pre-validated harness — no LLM.

    The deterministic building blocks are called directly, in this order:
    ``collect_baseline_metrics`` (plus ``capture_full_benchmark_stdout``) ->
    ``collect_profile`` -> ``render_commandment``, so a pre-validated harness
    never depends on the LLM classifier converging. Before any of them runs,
    the worktree-bypass gate (``validate_harness``) is applied so a harness
    that hardcodes the source-repo path is still rejected (it would otherwise
    measure the unpatched baseline at ~1.00x).

    Args:
        harness: Path to the pre-validated harness script.
        kernel_path: Path of the kernel being preprocessed; recorded on the
            result and passed to the commandment context.
        repo_root: Repository root. Used as the working directory for the
            baseline/profile runs and as the reference for the worktree gate;
            when falsy, the gate is skipped and no working directory is set.
        kernel_language: Language object handed to ``render_commandment`` and
            recorded on the result.
        output_dir: Directory that receives ``COMMANDMENT.md``.
        gpu_id: GPU index forwarded to the baseline and profile collectors.
        correctness_command: Forwarded verbatim to ``CommandmentContext``.
        performance_command: Forwarded verbatim to ``CommandmentContext``.

    Returns:
        A ``PreprocessResult`` with ``path_taken="A"``. ``success`` is true
        only when a baseline object was collected, its ``success`` attribute
        is truthy, and the commandment was rendered. Baseline and commandment
        failures are recorded in ``errors``; profile failures are advisory and
        recorded in ``warnings``. A gate rejection returns early with a single
        ``worktree_bypass: ...`` error and no baseline/profile/commandment.
    """
    from minisweagent.run.preprocess_v3.baseline import (
        BaselineMetrics,
        ProfileResult,
        capture_full_benchmark_stdout,
        collect_baseline_metrics,
        collect_profile,
    )
    from minisweagent.run.preprocess_v3.commandment import (
        CommandmentContext,
        render_commandment,
    )

    work_dir = Path(repo_root) if repo_root else None
    errors: list[str] = []
    warnings: list[str] = []

    # Worktree-bypass gate (deterministic, final) — identical contract to the
    # render_commandment tool. A harness that imports the source repo directly
    # silently evaluates the UNPATCHED baseline, so refuse it up front.
    if not os.environ.get("GEAK_ALLOW_HARDCODED_PATHS") and repo_root:
        # The validator import is guarded on its own. If it lived inside the
        # same ``try`` as the ``except ContractViolation`` clause, a failing
        # import would leave ``ContractViolation`` unbound and evaluating that
        # clause would raise UnboundLocalError, crashing the bypass instead of
        # skipping the gate.
        try:
            from minisweagent.kernel_languages.contract import (
                ContractViolation,
                validate_harness,
            )
        except Exception as exc:  # noqa: BLE001 — never let the gate crash the bypass
            logger.debug("pre-validated bypass: worktree gate skipped (validator error): %s", exc)
        else:
            try:
                validate_harness(harness, repo_root=repo_root)
            except ContractViolation as exc:
                logger.error("pre-validated bypass REJECTED harness (worktree bypass): %s", exc)
                return PreprocessResult(
                    success=False,
                    kernel_language=kernel_language,
                    kernel_path=kernel_path,
                    harness_path=harness,
                    path_taken="A",
                    errors=[f"worktree_bypass: {exc}"],
                )
            except Exception as exc:  # noqa: BLE001 — never let the gate crash the bypass
                logger.debug("pre-validated bypass: worktree gate skipped (validator error): %s", exc)

    # Baseline metrics and the full benchmark stdout are collected together;
    # a failure in either is recorded as a baseline error and the run goes on
    # so the remaining artefacts can still be produced.
    baseline: BaselineMetrics | None = None
    full_benchmark_stdout: str | None = None
    try:
        baseline = collect_baseline_metrics(
            harness, work_dir=work_dir, gpu_id=gpu_id,
        )
        full_benchmark_stdout = capture_full_benchmark_stdout(
            harness, work_dir=work_dir, gpu_id=gpu_id,
        )
    except Exception as exc:  # noqa: BLE001
        errors.append(f"collect_baseline failed: {exc}")
        logger.error("pre-validated bypass: collect_baseline failed: %s", exc)

    # Profiling is advisory (matches the orchestrator escape-hatch contract: a
    # run with a verified harness + baseline is salvageable even if profile fails).
    profile: ProfileResult | None = None
    try:
        profile = collect_profile(harness, work_dir=work_dir, gpu_id=gpu_id)
    except Exception as exc:  # noqa: BLE001
        warnings.append(f"collect_profile failed (non-fatal): {exc}")
        logger.warning("pre-validated bypass: collect_profile failed (non-fatal): %s", exc)

    # ``commandment_path`` is only set after render_commandment returns, so a
    # failed render leaves it None and forces ``success`` to False below.
    commandment_path: Path | None = None
    try:
        ctx = CommandmentContext(
            kernel_path=kernel_path,
            harness_path=harness,
            repo_root=Path(repo_root) if repo_root else None,
            correctness_command=correctness_command,
            performance_command=performance_command,
        )
        out_path = output_dir / "COMMANDMENT.md"
        render_commandment(kernel_language, ctx, out_path=out_path)
        commandment_path = out_path
    except Exception as exc:  # noqa: BLE001
        errors.append(f"render_commandment failed: {exc}")
        logger.error("pre-validated bypass: render_commandment failed: %s", exc)

    success = (
        baseline is not None
        and baseline.success
        and commandment_path is not None
    )
    return PreprocessResult(
        success=success,
        kernel_language=kernel_language,
        kernel_path=kernel_path,
        harness_path=harness,
        baseline=baseline,
        full_benchmark_stdout=full_benchmark_stdout,
        profile=profile,
        commandment_path=commandment_path,
        path_taken="A",
        # Synthetic record of the steps this bypass performs, in call order.
        tool_calls=[
            {"name": "collect_baseline", "args": {"harness_path": str(harness)}},
            {"name": "collect_profile", "args": {"harness_path": str(harness)}},
            {"name": "render_commandment", "args": {"harness_path": str(harness)}},
        ],
        errors=errors,
        warnings=warnings,
    )


# ---------------------------------------------------------------------------
# Codebase-explore kernel discovery
# ---------------------------------------------------------------------------
Expand Down
4 changes: 3 additions & 1 deletion src/minisweagent/run/preprocess_v3/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,9 @@ class LimitsExceeded(TerminatingException):
**Case A — user provided explicit run instructions / commands.**
Indicators: a literal command-line invocation (``python <script>``, ``pytest ... -k ...``, ``make ...``, shell script, existing custom harness command). The command is opaque: it may NOT support GEAK's four harness flags.

**Shapes pre-check (decide this FIRST, before A1 vs A2).** If the task prompt carries shapes (a ``Shapes:`` line, explicit dims, or a dtype/quant tuple) **and** the user's command does not already pin those exact shapes (e.g. it has no matching ``-m/-n``/dim arguments), route to **A2-with-shapes** — *regardless of whether the command contains a GEAK mode flag*. A flag-aware command whose shapes are not pinned would otherwise be classified A1 and run the harness's full default sweep, ignoring the prompt shapes, until it times out. When you are **unsure** whether the command already pins the prompt's exact shapes, choose A2-with-shapes: needlessly sending a shape-pinning command to the generator only costs time / a possible regeneration (a perf cost), whereas wrongly keeping it in A1 silently drops the shapes and hits the default-sweep timeout (a correctness bug). Decide the shapes question BEFORE deciding the command is directly runnable.
**Pre-validated-harness exemption (check this BEFORE the shapes pre-check).** If the "Hints from the call site" section marks the harness **pre-validated** for the four standard modes, the shapes pre-check below does NOT apply: a pre-validated harness already encodes its authoritative shapes internally (the caller validated it end-to-end), so there are no "unpinned prompt shapes" to fix and there is nothing to regenerate. Route it straight to **A1** and call ``commandment_from_user_command`` with the harness invocation. Regenerating a pre-validated harness via ``harness-generator`` is always wrong: it discards a working harness and the generator loop can burn the entire preprocess budget without ever producing a baseline.

**Shapes pre-check (decide this FIRST, before A1 vs A2 — but AFTER the pre-validated-harness exemption above).** If the task prompt carries shapes (a ``Shapes:`` line, explicit dims, or a dtype/quant tuple) **and** the user's command does not already pin those exact shapes (e.g. it has no matching ``-m/-n``/dim arguments) **and the harness is NOT marked pre-validated**, route to **A2-with-shapes** — *regardless of whether the command contains a GEAK mode flag*. A flag-aware command whose shapes are not pinned would otherwise be classified A1 and run the harness's full default sweep, ignoring the prompt shapes, until it times out. When you are **unsure** whether the command already pins the prompt's exact shapes, choose A2-with-shapes: needlessly sending a shape-pinning command to the generator only costs time / a possible regeneration (a perf cost), whereas wrongly keeping it in A1 silently drops the shapes and hits the default-sweep timeout (a correctness bug). Decide the shapes question BEFORE deciding the command is directly runnable.

Split the remaining Case A (no unpinned prompt shapes) on whether the command already speaks GEAK's harness contract:

Expand Down
40 changes: 40 additions & 0 deletions tests/agents/test_default.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,3 +333,43 @@ def test_empty_actions_handling(model_factory):
assert exit_status == "Submitted"
assert submission == "done\n"
assert agent.model.n_calls == 2


def test_prose_only_turn_is_nudged_not_silently_accepted(model_factory):
    """A turn with neither fenced bash nor a tool call must yield a nudge.

    Scenario: the scripted model first answers with prose only (it believes
    the work is already submitted), then issues a real submit command. The
    run must still end as ``Submitted``, and one of the user-role messages in
    the transcript must carry the format-error nudge text — i.e. the prose
    turn was not swallowed as a successful empty observation.

    Regression background: ``parse_action`` used to hand back
    ``{"output":"", "returncode":0}`` for such a response, which satisfied the
    ``returncode == 0`` check and gave the model nothing to correct, so it
    repeated "Done."-style prose until the step limit (observed in the
    heterogeneous task-planner: 143 prose turns -> 0 tool calls ->
    LimitsExceeded). The fix raises FormatError instead.
    """
    make_model, agent_kwargs = model_factory

    # Turn 1: prose only — the model thinks it is done but took no action.
    stalled_turn = ("The tasks have been successfully submitted. No further action is needed.", [])
    # Turn 2: a genuine action that completes the task.
    finishing_turn = (
        "Actually finishing now",
        [{"command": "echo 'COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT'\necho 'done'"}],
    )
    scripted_model = make_model([stalled_turn, finishing_turn])
    agent = DefaultAgent(model=scripted_model, env=LocalEnvironment(), **agent_kwargs)

    status, final_output = agent.run("Test prose-only nudge")
    assert status == "Submitted"
    assert final_output == "done\n"

    # Walk the transcript and stop at the first user message carrying the
    # corrective nudge; an empty no-op observation would never contain it.
    nudged = False
    for message in agent.messages:
        if message.get("role") == "user" and "EXACTLY ONE action" in get_text(message):
            nudged = True
            break
    assert nudged, "prose-only turn should surface a format-error nudge to the model"
91 changes: 91 additions & 0 deletions tests/run/test_preprocess_v3_bugfixes.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,3 +426,94 @@ def test_legacy_context_recovers_harness_path_from_promoted_command(tmp_path: Pa
assert ctx["full_benchmark_baseline"] == str(output_dir / "full_benchmark_baseline.txt")
assert (output_dir / "benchmark_baseline.txt").read_text() == "GEAK_RESULT_LATENCY_MS=1.25\n"
assert ctx["v3_path_taken"] == "A"


def test_prevalidated_harness_bypasses_llm_orchestrator(tmp_path: Path, monkeypatch) -> None:
    """An existing harness file takes the deterministic Path-A route, LLM-free.

    The model handed to ``run_preprocess_v3`` raises if it is ever queried and
    the orchestrator agent constructor is replaced by a stub that raises, so
    the test can only pass when the deterministic sequence
    (collect_baseline -> collect_profile -> render_commandment) runs without
    touching the LLM loop.

    Regression background: the orchestrator's LLM classifier could misroute a
    shape-bearing task to the harness-generator or fail to converge, burning
    the whole preprocess budget with no baseline; ``_run_prevalidated_path_a``
    short-circuits that.
    """
    import minisweagent.run.preprocess_v3.adapter as adapter_module
    import minisweagent.run.preprocess_v3.baseline as baseline_module

    # --- filesystem fixture: a repo with a kernel, plus a harness script ---
    repo = tmp_path / "repo"
    repo.mkdir()
    kernel_file = repo / "kernel.py"
    kernel_file.write_text("# kernel\n")
    harness = tmp_path / "harness.py"
    harness.write_text("print('GEAK_RESULT_LATENCY_MS=2.0')\n")
    output_dir = tmp_path / "out"

    class _ExplodingModel:
        """Sentinel model: any query means the LLM loop was entered."""

        def query(self, *a, **k):  # pragma: no cover - must never be called
            raise AssertionError("LLM orchestrator was invoked for a pre-validated harness")

    def _forbid_orchestrator(*a, **k):
        raise AssertionError("orchestrator constructed")

    # Stubs for the deterministic building blocks so no real subprocess runs.
    def fake_collect_baseline_metrics(harness_path, *, work_dir=None, gpu_id=0, repeats=5):
        raw_run = {"stdout": "GEAK_RESULT_LATENCY_MS=2.0", "returncode": 0, "latency_ms": 2.0}
        return SimpleNamespace(
            success=True,
            median_ms=2.0,
            samples_ms=[2.0],
            stdev_ms=0.0,
            repeats=repeats,
            harness_path=harness_path,
            command="python harness",
            raw_outputs=[raw_run],
        )

    def fake_capture_full_benchmark_stdout(*a, **k):
        return "GEAK_RESULT_LATENCY_MS=2.0"

    def fake_collect_profile(*a, **k):
        return SimpleNamespace(
            success=False, profile=None, command="", backend="metrix", profile_path=None,
        )

    monkeypatch.setattr(adapter_module, "PreprocessOrchestratorAgent", _forbid_orchestrator)
    # The worktree-bypass gate (contract.validate_harness) is a separate
    # concern covered elsewhere; switch it off so only the bypass control
    # flow is exercised here.
    monkeypatch.setenv("GEAK_ALLOW_HARDCODED_PATHS", "1")
    monkeypatch.setattr(baseline_module, "collect_baseline_metrics", fake_collect_baseline_metrics)
    monkeypatch.setattr(baseline_module, "capture_full_benchmark_stdout", fake_capture_full_benchmark_stdout)
    monkeypatch.setattr(baseline_module, "collect_profile", fake_collect_profile)

    ctx = adapter_module.run_preprocess_v3(
        kernel_url=str(kernel_file),
        output_dir=output_dir,
        gpu_id=0,
        model=_ExplodingModel(),
        harness=str(harness),
        repo=str(repo),
    )

    # Deterministic path produced the baseline + commandment, LLM never ran.
    assert ctx["v3_path_taken"] == "A"
    assert (output_dir / "COMMANDMENT.md").is_file()
    assert ctx["benchmark_baseline"] == str(output_dir / "benchmark_baseline.txt")


def test_prevalidated_bypass_opt_out_env(tmp_path: Path, monkeypatch) -> None:
    """With GEAK_NO_PREVALIDATED_BYPASS=1 the deterministic shortcut is off.

    The bypass helper is replaced by a recorder and the orchestrator agent
    constructor by a stub that raises a recognisable error; reaching that
    error while the recorder stays untouched shows the call fell through to
    the LLM orchestrator path.
    """
    import minisweagent.run.preprocess_v3.adapter as adapter_module

    repo = tmp_path / "repo"
    repo.mkdir()
    kernel_file = repo / "kernel.py"
    kernel_file.write_text("# kernel\n")
    harness = tmp_path / "harness.py"
    harness.write_text("print('ok')\n")

    monkeypatch.setenv("GEAK_NO_PREVALIDATED_BYPASS", "1")

    called = {"bypass": False}

    def _record_bypass(**k):
        called["bypass"] = True

    def _orchestrator_reached(*a, **k):
        # Fail fast so the test can tell the orchestrator path was entered
        # (and the bypass was NOT taken).
        raise RuntimeError("reached-orchestrator")

    monkeypatch.setattr(adapter_module, "_run_prevalidated_path_a", _record_bypass)
    monkeypatch.setattr(adapter_module, "PreprocessOrchestratorAgent", _orchestrator_reached)

    with pytest.raises(RuntimeError, match="reached-orchestrator"):
        adapter_module.run_preprocess_v3(
            kernel_url=str(kernel_file),
            output_dir=tmp_path / "out",
            model=object(),
            harness=str(harness),
            repo=str(repo),
        )
    assert called["bypass"] is False