Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion src/minisweagent/run/mini.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
apply_mode_presets,
resolve_max_rounds,
)
from minisweagent.run.preprocess.contract_normalize import is_amalgamation_command
from minisweagent.run.preprocess_v3.adapter import run_preprocess_v3 as run_preprocessor
from minisweagent.run.state import (
PreprocessState,
Expand Down Expand Up @@ -682,7 +683,16 @@ def _sigint_handler(_signum, _frame): # noqa: ANN001
else:
raise
else:
if isinstance(test_command, str) and "&&" in test_command:
# Only pre-split a compound ``cmd_a && cmd_b`` into correctness /
# performance hints when it is a genuine build-bearing contract
# (mirrors the preprocessor's _try_synthesize_shell_contract_harness
# split). A non-build ``&&`` is an amalgamation (same script run twice
# with different settings, or two different tests chained); splitting it
# left=correctness / right=performance silently drops one metric. Pass
# it through whole as eval_command so the preprocessor's deterministic
# amalgamation guard fires (PATH_A_FLAG_MISSING) and routes it to the
# harness generator (Case A2), which resolves it into one metric.
if isinstance(test_command, str) and "&&" in test_command and not is_amalgamation_command(test_command):
left, right = test_command.rsplit("&&", 1)
correctness_command = left.strip() or None
performance_command = right.strip() or None
Expand Down
24 changes: 24 additions & 0 deletions src/minisweagent/run/preprocess/contract_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,30 @@ def infer_compile_command_from_eval(eval_command: str | None) -> str | None:
return None


def is_amalgamation_command(cmd: str) -> bool:
    """Tell whether *cmd* is a ``&&`` chain with no inferable compile prefix.

    A command is classified as a joint/amalgamation command when both hold:

    * it contains the ``&&`` operator, and
    * :func:`infer_compile_command_from_eval` returns ``None`` for it, i.e. no
      confident leading compile/build prefix was found.

    Typical amalgamations are the same script run twice with different
    settings, or two different tests chained together. Such a command has to
    be resolved into a single latency value by the harness generator
    (Case A2); splitting it blindly into left=correctness / right=performance
    silently drops one metric.

    The decision is delegated to :func:`infer_compile_command_from_eval`
    instead of a raw build-token substring scan, so the keep-vs-refuse verdict
    stays consistent with the compile prefix the split paths re-prepend and
    whole-string false positives are avoided. This reduces false positives but
    does not eliminate them: a build substring that lives only inside a flag
    value of the first segment (e.g. ``--mode compile_fwd``) still reads as a
    compile prefix, so that command is treated as build-bearing.

    Shared by the preprocessor (``commandment_from_user_command``) and the CLI
    entry point (``mini.py``) so a compound ``--test-command`` is classified
    the same way regardless of which layer sees it first.

    Args:
        cmd: Shell command string to classify.

    Returns:
        ``True`` for a ``&&`` chain without an inferable compile prefix,
        ``False`` otherwise (including every command that has no ``&&``).
    """
    chains_segments = "&&" in cmd
    # Short-circuit: the compile-prefix inference only runs for `&&` commands.
    return chains_segments and infer_compile_command_from_eval(cmd) is None


def discovery_digest(discovery: dict[str, Any] | None, *, max_chars: int = 6000) -> dict[str, Any]:
"""Return a JSON-serializable, size-capped snapshot for contract.json."""
if not discovery:
Expand Down
9 changes: 9 additions & 0 deletions src/minisweagent/run/preprocess_v3/baseline.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,15 @@ def _build_env(
if work_dir is not None:
existing = env.get("PYTHONPATH", "")
env["PYTHONPATH"] = f"{work_dir}:{existing}" if existing else str(work_dir)
# Worktree-awareness contract (mirrors run_harness._build_env): a
# contract-compliant harness resolves every repo path from GEAK_WORK_DIR
# (e.g. ``os.environ.get("GEAK_WORK_DIR", "<fallback>")``). Without this
# the harness falls back to its own directory and cannot find the kernel
# source, so it runs nothing and emits no GEAK_RESULT_LATENCY_MS marker
# (silent "produced no latency"). GEAK_REPO_ROOT uses setdefault so an
# already-exported source root (set by the adapter) is preserved.
env["GEAK_WORK_DIR"] = str(work_dir)
env.setdefault("GEAK_REPO_ROOT", str(work_dir))
env["HIP_VISIBLE_DEVICES"] = str(gpu_id)
env["PYTHONUNBUFFERED"] = "1"
if extra:
Expand Down
24 changes: 20 additions & 4 deletions src/minisweagent/run/preprocess_v3/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,23 +117,39 @@ class LimitsExceeded(TerminatingException):
**Case A — user provided explicit run instructions / commands.**
Indicators: a literal command-line invocation (``python <script>``, ``pytest ... -k ...``, ``make ...``, shell script, existing custom harness command). The command is opaque: it may NOT support GEAK's four harness flags.

Action: **skip ``run_discovery``** — the user already told you what to run, so test discovery is unnecessary and wastes time. Go directly to ``commandment_from_user_command`` with the extracted user command. Do not generate a harness.
**Shapes pre-check (decide this FIRST, before A1 vs A2).** If the task prompt carries shapes (a ``Shapes:`` line, explicit dims, or a dtype/quant tuple) **and** the user's command does not already pin those exact shapes (e.g. it has no matching ``-m/-n``/dim arguments), route to **A2-with-shapes** — *regardless of whether the command contains a GEAK mode flag*. A flag-aware command whose shapes are not pinned would otherwise be classified A1 and run the harness's full default sweep, ignoring the prompt shapes, until it times out. When you are **unsure** whether the command already pins the prompt's exact shapes, choose A2-with-shapes: needlessly sending a shape-pinning command to the generator only costs time / a possible regeneration (a perf cost), whereas wrongly keeping it in A1 silently drops the shapes and hits the default-sweep timeout (a correctness bug). Decide the shapes question BEFORE deciding the command is directly runnable.

Split the remaining Case A (no unpinned prompt shapes) on whether the command already speaks GEAK's harness contract:

- **A1 — flag-aware command.** The command text literally contains one of ``--correctness`` / ``--benchmark`` / ``--full-benchmark`` / ``--profile`` (or the "Hints from the call site" section marks the harness **pre-validated** for the four standard modes), **and** the shapes pre-check above did not divert it to A2. Follow the A1 action below.
- **A2 — flag-less or composite command.** Otherwise (a plain ``python test.py`` / ``make ...`` with no GEAK flag, a request to cover several op/shape/quant facets of one kernel, or any command diverted by the shapes pre-check). Follow the A2 action below.

You do not have to classify A1 vs A2 perfectly: ``commandment_from_user_command`` has a deterministic backstop that refuses a flag-less command and returns ``ok: False`` with ``PATH_A_FLAG_MISSING`` — see the recovery rule under A2.

**A1 action — flag-aware command.** **skip ``run_discovery``** — the user already told you what to run, so test discovery is unnecessary and wastes time. Go directly to ``commandment_from_user_command`` with the extracted user command. Do not generate a harness. (Reminder: this applies only when the shapes pre-check did **not** divert the command — a flag-aware command whose prompt carries shapes the command does not already pin is the A2-with-shapes path, not A1.)

**STRICT keyword-argument names for ``commandment_from_user_command``** (do NOT use synonyms — the tool will TypeError):

```
commandment_from_user_command(
run_command="<user's verbatim shell command>", # NOT command/cmd/user_command/raw_command/harness_command
run_command="<user's verbatim shell command, which already contains a GEAK mode flag>", # NOT command/cmd/user_command/raw_command/harness_command
out_path="<output_dir>/COMMANDMENT.md", # NOT output/output_path/path/commandment_path
modes_covered=["correctness","profile","benchmark","full_benchmark"],
inferred_modes=[],
notes="<short audit note>"
)
```

**Important exception**: if the "Hints from the call site" section says the harness is **pre-validated** and supports the four standard modes (``--correctness``, ``--benchmark``, ``--full-benchmark``, ``--profile``), you MUST list all four modes in ``modes_covered`` when calling ``commandment_from_user_command``. The tool will substitute the correct flag for each COMMANDMENT section automatically. Do NOT put all modes in ``inferred_modes`` — use ``modes_covered``.
Listing all four in ``modes_covered`` is correct **only for A1** (the command/harness already supports the four flags, so the tool substitutes the right flag per section). Do NOT use this all-four call for a flag-less command — that is the A2 path.

**A2 action — flag-less or composite command.** Do **NOT** call ``commandment_from_user_command`` with a flag-less command. Instead route on whether the **task prompt carries shapes** (a ``Shapes:`` line, explicit dims, or a dtype/quant tuple):

- **A2-with-shapes:** **skip ``run_discovery``** (the prompt already gives the authoritative shapes) and dispatch ``harness-generator`` (then ``harness-verifier``, then ``render_commandment``) with the **prompt shapes ONLY**, mapped to the harness's CLI params (e.g. for the rmsnorm harness, weight ``(n,)`` → ``-n``, activation ``(m,n)`` → ``-m``, dtype → ``-d``, op → ``--mode``). The generated harness must use ONLY the prompt-provided shapes (same authoritative-override contract as Case B). A composite task (several facets of one source kernel) produces **one** harness that internally iterates all facets and emits **one** aggregate metric. This bullet covers every shape-bearing command the pre-check diverts here, including: (a) a **single-flag** command whose prompt carries shapes it does not pin, and (b) a **build-bearing ``&&``** command whose prompt carries shapes. In all cases do **NOT** inject the shapes into the user's command yourself — the generator owns shape handling (there is no single way to add shapes; some harnesses take ``-m/-n`` flags, others need the shapes edited inside the file). A build-bearing ``&&`` command with **no** prompt shapes is not diverted and keeps the deterministic compile + correctness + performance route via ``commandment_from_user_command``.
- **A2-no-shapes:** the command is flag-less AND the prompt names no shapes/dims. Do **NOT** dispatch ``harness-generator`` blind — with neither flags nor shapes it would fall back to the harness's full default sweep (the timeout that motivated this rule). Instead fall through to ``run_discovery`` and proceed exactly as **Case C** (discovery's ATD is authoritative for source/shapes). Record the user's flag-less command in ``notes`` for audit; it is not executed as the eval contract.

**A2 recovery (do not omit):** if you do call ``commandment_from_user_command`` and it returns ``ok: False`` / ``PATH_A_FLAG_MISSING``, the command was either flag-less or a non-build ``&&`` amalgamation. **Switch to the A2 action** (route on shapes as above). Do NOT retry ``commandment_from_user_command`` with the same command — it will return ``ok: False`` again.

**After ``commandment_from_user_command`` succeeds**, you **MUST** call ``collect_baseline`` before calling ``finish_preprocess``. Baseline is **required** for downstream verified-speedup evaluation:
**After ``commandment_from_user_command`` succeeds (A1)**, you **MUST** call ``collect_baseline`` before calling ``finish_preprocess``. Baseline is **required** for downstream verified-speedup evaluation:

- If the return value includes a ``harness_path``, call ``collect_baseline(harness_path=<path>)`` and ``collect_profile(harness_path=<path>)``.
- If ``harness_path`` is null/absent, call ``collect_baseline(eval_command="<eval_command from the return value>")`` — use the ``eval_command`` field from ``commandment_from_user_command``'s return value (NOT the original ``run_command`` you passed in, because the tool sanitizes the command to add GEAK metric markers). This runs the eval command directly and parses ``GEAK_METRIC`` / ``GEAK_RESULT_LATENCY_MS`` markers from stdout.
Expand Down
106 changes: 105 additions & 1 deletion src/minisweagent/run/preprocess_v3/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,11 @@ def _substitute_mode_flag(cmd: str, target_mode: str) -> str:
for other_mode, other_flag in _MODE_TO_FLAG.items():
if other_mode != target_mode and other_flag in cmd:
return cmd.replace(other_flag, dst_flag)
# Case 3: no known flag at all -> the command is returned unchanged. This silent
# no-op is a latent trap for a flag-less command (every mode collapses to one
# body). The caller relevant to issue #258,
# ``_make_tool_commandment_from_user_command``, neutralizes it with its
# deterministic ``is_flagless`` backstop. Do not rely on this helper to signal a
# missing flag.
return cmd


Expand Down Expand Up @@ -181,6 +186,28 @@ def _extract_harness_from_command(cmd: str) -> str | None:
return None


def _is_amalgamation_command(cmd: str) -> bool:
"""Thin wrapper over :func:`contract_normalize.is_amalgamation_command`.

Kept local so this module's call sites read clearly and to preserve the safe
ImportError fallback: if the shared module cannot be imported, treat a ``&&``
command as an amalgamation and route to the harness generator (A2) rather than
risk a blind split. Flag-independent by design — it must catch flag-bearing
amalgamations too, since those would otherwise yield a harness path via
``_extract_harness_from_command`` and slip past the flag-less backstop in
``commandment_from_user_command``.
"""
if "&&" not in cmd:
return False
try:
from minisweagent.run.preprocess.contract_normalize import (
is_amalgamation_command,
)
except ImportError:
return True
return is_amalgamation_command(cmd)


def _try_synthesize_shell_contract_harness(
cmd: str,
*,
Expand Down Expand Up @@ -216,6 +243,18 @@ def _try_synthesize_shell_contract_harness(
return None
if not repo_root_str:
return None
# Only synthesize for a genuine build-bearing command. A non-build `&&` is an
# amalgamation (same script twice with different settings, or two different
# tests chained); its resolution is delegated to the harness generator (A2) —
# the deterministic split below is intentionally NOT used for it. We do not
# distinguish same-script-twice from different-scripts here: the generator
# handles both, so neither is a "limitation" of this path.
if _is_amalgamation_command(cmd):
return None
# The split below hard-codes left=correctness / right=performance ordering;
# this is an intentional assumption that holds for the canonical
# "compile && correctness && performance" pattern but degrades for 3+ mixed
# segments (e.g. "make && python a && python b"). Kept deliberately simple.
# Only split when the command has multiple meaningful segments, not
# when && is just directory navigation (e.g. "cd /dir && bash script.sh").
# A cd-only left half is not a correctness command.
Expand Down Expand Up @@ -1234,7 +1273,19 @@ def _impl(
eval_command = saved_eval

resolved_gpu = gpu_id if gpu_id is not None else agent.config.gpu_id
resolved_work_dir = Path(work_dir) if work_dir else None
# Fall back to the orchestrator's source repo when the subagent omits
# work_dir (it has no schema default and is frequently not passed).
# A None work_dir means _build_env sets neither PYTHONPATH nor
# GEAK_WORK_DIR, so the harness can't find the kernel source and silently
# produces no latency. At preprocess time no per-slot worktree exists yet,
# so the source repo is the correct target; optimization-time runs pass
# their per-slot worktree explicitly via the legacy run_harness path.
if work_dir:
resolved_work_dir = Path(work_dir)
elif agent.config.repo:
resolved_work_dir = Path(agent.config.repo)
else:
resolved_work_dir = None

if harness_path:
baseline: BaselineMetrics = collect_baseline_metrics(
Expand Down Expand Up @@ -1373,6 +1424,11 @@ def _impl(
except Exception as exc: # noqa: BLE001 — never let the gate crash finalize
logger.debug("render_commandment bypass-gate skipped (validator error): %s", exc)

# compile_command is intentionally left unset: for HIP the harness owns
# its own build (the harness-generator contract requires it to self-build
# via subprocess.run(make/hipcc), mtime-keyed on the kernel source). So the
# rendered COMMANDMENT carries no compile step — that is by design, not a
# gap. A changed CK/HIP source is still recompiled by the harness/JIT layer.
ctx = CommandmentContext(
kernel_path=Path(kernel_path),
harness_path=Path(harness_path),
Expand Down Expand Up @@ -1462,6 +1518,28 @@ def _impl(
agent.model,
)

# Deterministic amalgamation backstop (must run BEFORE harness extraction).
# A non-build ``&&`` command is a joint/amalgamation instruction (same
# script run twice with different settings, or two different tests chained).
# Splitting it left=correctness / right=performance silently drops one
# latency number. This guard is flag-INDEPENDENT on purpose: a flag-bearing
# amalgamation (e.g. "python t.py --benchmark --a && python t.py --benchmark
# --b") would otherwise yield a harness_path via
# ``_extract_harness_from_command`` and slip past the flag-less backstop
# below, running only the first half. Refuse here and route to A2.
if _is_amalgamation_command(cmd):
return {
"ok": False,
"error": "PATH_A_FLAG_MISSING",
"warnings": [
"PATH_A_FLAG_MISSING: command chains segments with '&&' but has "
"no build/compile prefix (amalgamation); a deterministic split "
"would drop one metric. Dispatch harness-generator (Case A2) to "
"resolve it into a single test emitting one metric. Do NOT retry "
"commandment_from_user_command with the same command."
],
}

# Extract the harness path from the (possibly sanitized) command
# so collect_baseline/collect_profile can use the real filesystem path.
original_harness_path = _extract_harness_from_command(cmd)
Expand All @@ -1487,6 +1565,32 @@ def _impl(
original_harness_path = None
if original_harness_path:
agent._collected["harness_path"] = original_harness_path

# Deterministic backstop for the flag-less Path-A case (issue #258).
# A command that (a) exposes none of GEAK's four harness flags, (b) is
# not a compound ``&&`` shell contract, and (c) yielded no usable
# harness_path carries no per-mode contract: every COMMANDMENT section
# would receive the SAME command (``_substitute_mode_flag`` no-ops on a
# flag-less command, see its case 3), and the correctness preflight would
# then run the harness's full default sweep until it times out. Refuse to
# render that all-modes-identical artifact and signal the orchestrator to
# switch to harness synthesis (Case A2). This guard is independent of the
# ``modes_covered`` the LLM passed, so it cannot be defeated by the LLM
# listing all four modes as covered.
is_flagless = _extract_harness_from_command(cmd) is None and "&&" not in cmd and original_harness_path is None
if is_flagless:
return {
"ok": False,
"error": "PATH_A_FLAG_MISSING",
"warnings": [
"PATH_A_FLAG_MISSING: flag-less command exposes none of "
"--correctness/--benchmark/--full-benchmark/--profile and has no "
"harness; dispatch harness-generator (Case A2) instead of copying "
"the command into all four modes. Do NOT retry "
"commandment_from_user_command with the same command."
],
}

# Preserve the sanitized command (with real paths) for
# collect_baseline before we replace repo_root with ${GEAK_WORK_DIR}.
eval_command_for_baseline = cmd
Expand Down
Loading