AMD-AGI · sdubagun-amd · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026
@@ -34,6 +34,7 @@
     apply_mode_presets,
     resolve_max_rounds,
 )
+from minisweagent.run.preprocess.contract_normalize import is_amalgamation_command
 from minisweagent.run.preprocess_v3.adapter import run_preprocess_v3 as run_preprocessor
 from minisweagent.run.state import (
     PreprocessState,
@@ -682,7 +683,16 @@ def _sigint_handler(_signum, _frame):  # noqa: ANN001
                 else:
                     raise
         else:
-            if isinstance(test_command, str) and "&&" in test_command:
+            # Only pre-split a compound ``cmd_a && cmd_b`` into correctness /
+            # performance hints when it is a genuine build-bearing contract
+            # (mirrors the preprocessor's _try_synthesize_shell_contract_harness
+            # split). A non-build ``&&`` is an amalgamation (same script run twice
+            # with different settings, or two different tests chained); splitting it
+            # left=correctness / right=performance silently drops one metric. Pass
+            # it through whole as eval_command so the preprocessor's deterministic
+            # amalgamation guard fires (PATH_A_FLAG_MISSING) and routes it to the
+            # harness generator (Case A2), which resolves it into one metric.
+            if isinstance(test_command, str) and "&&" in test_command and not is_amalgamation_command(test_command):
                 left, right = test_command.rsplit("&&", 1)
                 correctness_command = left.strip() or None
                 performance_command = right.strip() or None

@@ -102,6 +102,30 @@ def infer_compile_command_from_eval(eval_command: str | None) -> str | None:
     return None
 
 
+def is_amalgamation_command(cmd: str) -> bool:
+    """Report whether *cmd* is a joint ("amalgamation") ``&&`` command.
+
+    A command qualifies when it chains segments with ``&&`` and
+    :func:`infer_compile_command_from_eval` finds no compile/build prefix in it,
+    e.g. the same script run twice with different settings, or two different
+    tests chained. Callers hand such a command to the harness generator
+    (Case A2) instead of splitting it left=correctness / right=performance,
+    because that split silently drops one metric.
+
+    The decision reuses :func:`infer_compile_command_from_eval` instead of a raw
+    build-token substring scan so it stays consistent with the compile prefix
+    the split paths re-prepend. False positives are reduced, not eliminated: a
+    build substring that only appears inside a flag value of the first segment
+    (e.g. ``--mode compile_fwd``) still reads as a compile prefix.
+
+    NOTE(review): a navigation-only chain such as ``cd <dir> && <cmd>`` also
+    contains ``&&``; confirm it is classified the way callers expect.
+
+    Shared by the preprocessor and the CLI entry point (``mini.py``).
+    """
+    return "&&" in cmd and infer_compile_command_from_eval(cmd) is None
+
+
 def discovery_digest(discovery: dict[str, Any] | None, *, max_chars: int = 6000) -> dict[str, Any]:
     """Return a JSON-serializable, size-capped snapshot for contract.json."""
     if not discovery:

@@ -202,6 +202,15 @@ def _build_env(
     if work_dir is not None:
         existing = env.get("PYTHONPATH", "")
         env["PYTHONPATH"] = f"{work_dir}:{existing}" if existing else str(work_dir)
+        # Worktree-awareness contract (mirrors run_harness._build_env): a
+        # contract-compliant harness resolves every repo path from GEAK_WORK_DIR
+        # (e.g. ``os.environ.get("GEAK_WORK_DIR", "<fallback>")``). Without this
+        # the harness falls back to its own directory and cannot find the kernel
+        # source, so it runs nothing and emits no GEAK_RESULT_LATENCY_MS marker
+        # (silent "produced no latency"). GEAK_REPO_ROOT uses setdefault so an
+        # already-exported source root (set by the adapter) is preserved.
+        env["GEAK_WORK_DIR"] = str(work_dir)
+        env.setdefault("GEAK_REPO_ROOT", str(work_dir))
     env["HIP_VISIBLE_DEVICES"] = str(gpu_id)
     env["PYTHONUNBUFFERED"] = "1"
     if extra:

@@ -117,23 +117,39 @@ class LimitsExceeded(TerminatingException):
 **Case A — user provided explicit run instructions / commands.**
 Indicators: a literal command-line invocation (``python <script>``, ``pytest ... -k ...``, ``make ...``, shell script, existing custom harness command). The command is opaque: it may NOT support GEAK's four harness flags.
 
-Action: **skip ``run_discovery``** — the user already told you what to run, so test discovery is unnecessary and wastes time. Go directly to ``commandment_from_user_command`` with the extracted user command. Do not generate a harness.
+**Shapes pre-check (decide this FIRST, before A1 vs A2).** If the task prompt carries shapes (a ``Shapes:`` line, explicit dims, or a dtype/quant tuple) **and** the user's command does not already pin those exact shapes (e.g. it has no matching ``-m/-n``/dim arguments), route to **A2-with-shapes** — *regardless of whether the command contains a GEAK mode flag*. A flag-aware command whose shapes are not pinned would otherwise be classified A1 and run the harness's full default sweep, ignoring the prompt shapes, until it times out. When you are **unsure** whether the command already pins the prompt's exact shapes, choose A2-with-shapes: needlessly sending a shape-pinning command to the generator only costs time / a possible regeneration (a perf cost), whereas wrongly keeping it in A1 silently drops the shapes and hits the default-sweep timeout (a correctness bug). Decide the shapes question BEFORE deciding the command is directly runnable.
+
+Split the remaining Case A (no unpinned prompt shapes) on whether the command already speaks GEAK's harness contract:
+
+- **A1 — flag-aware command.** The command text literally contains one of ``--correctness`` / ``--benchmark`` / ``--full-benchmark`` / ``--profile`` (or the "Hints from the call site" section marks the harness **pre-validated** for the four standard modes), **and** the shapes pre-check above did not divert it to A2. Follow the A1 action below.
+- **A2 — flag-less or composite command.** Otherwise (a plain ``python test.py`` / ``make ...`` with no GEAK flag, a request to cover several op/shape/quant facets of one kernel, or any command diverted by the shapes pre-check). Follow the A2 action below.
+
+You do not have to classify A1 vs A2 perfectly: ``commandment_from_user_command`` has a deterministic backstop that refuses a flag-less command and returns ``ok: False`` with ``PATH_A_FLAG_MISSING`` — see the recovery rule under A2.
+
+**A1 action — flag-aware command.** **skip ``run_discovery``** — the user already told you what to run, so test discovery is unnecessary and wastes time. Go directly to ``commandment_from_user_command`` with the extracted user command. Do not generate a harness. (Reminder: this applies only when the shapes pre-check did **not** divert the command — a flag-aware command whose prompt carries shapes the command does not already pin is the A2-with-shapes path, not A1.)
 
 **STRICT keyword-argument names for ``commandment_from_user_command``** (do NOT use synonyms — the tool will TypeError):
 
 ```
 commandment_from_user_command(
-    run_command="<user's verbatim shell command>",      # NOT command/cmd/user_command/raw_command/harness_command
+    run_command="<user's verbatim shell command, which already contains a GEAK mode flag>",  # NOT command/cmd/user_command/raw_command/harness_command
     out_path="<output_dir>/COMMANDMENT.md",             # NOT output/output_path/path/commandment_path
     modes_covered=["correctness","profile","benchmark","full_benchmark"],
     inferred_modes=[],
     notes="<short audit note>"
 )
 ```
 
-**Important exception**: if the "Hints from the call site" section says the harness is **pre-validated** and supports the four standard modes (``--correctness``, ``--benchmark``, ``--full-benchmark``, ``--profile``), you MUST list all four modes in ``modes_covered`` when calling ``commandment_from_user_command``. The tool will substitute the correct flag for each COMMANDMENT section automatically. Do NOT put all modes in ``inferred_modes`` — use ``modes_covered``.
+Listing all four in ``modes_covered`` is correct **only for A1** (the command/harness already supports the four flags, so the tool substitutes the right flag per section). Do NOT use this all-four call for a flag-less command — that is the A2 path.
+
+**A2 action — flag-less or composite command.** Do **NOT** call ``commandment_from_user_command`` with a flag-less command. Instead route on whether the **task prompt carries shapes** (a ``Shapes:`` line, explicit dims, or a dtype/quant tuple):
+
+- **A2-with-shapes:** **skip ``run_discovery``** (the prompt already gives the authoritative shapes) and dispatch ``harness-generator`` (then ``harness-verifier``, then ``render_commandment``) with the **prompt shapes ONLY**, mapped to the harness's CLI params (e.g. for the rmsnorm harness, weight ``(n,)`` → ``-n``, activation ``(m,n)`` → ``-m``, dtype → ``-d``, op → ``--mode``). The generated harness must use ONLY the prompt-provided shapes (same authoritative-override contract as Case B). A composite task (several facets of one source kernel) produces **one** harness that internally iterates all facets and emits **one** aggregate metric. This bullet covers every shape-bearing command the pre-check diverts here, including: (a) a **single-flag** command whose prompt carries shapes it does not pin, and (b) a **build-bearing ``&&``** command whose prompt carries shapes. In all cases do **NOT** inject the shapes into the user's command yourself — the generator owns shape handling (there is no single way to add shapes; some harnesses take ``-m/-n`` flags, others need the shapes edited inside the file). A build-bearing ``&&`` command with **no** prompt shapes is not diverted and keeps the deterministic compile + correctness + performance route via ``commandment_from_user_command``.
+- **A2-no-shapes:** the command is flag-less AND the prompt names no shapes/dims. Do **NOT** dispatch ``harness-generator`` blind — with neither flags nor shapes it would fall back to the harness's full default sweep (the timeout that motivated this rule). Instead fall through to ``run_discovery`` and proceed exactly as **Case C** (discovery's ATD is authoritative for source/shapes). Record the user's flag-less command in ``notes`` for audit; it is not executed as the eval contract.
+
+**A2 recovery (do not omit):** if you do call ``commandment_from_user_command`` and it returns ``ok: False`` / ``PATH_A_FLAG_MISSING``, the command was either flag-less or a non-build ``&&`` amalgamation. **Switch to the A2 action** and route on shapes as above: the warning text names ``harness-generator``, but dispatch it only on the A2-with-shapes path; with no prompt shapes follow A2-no-shapes (``run_discovery``, then Case C). Do NOT retry ``commandment_from_user_command`` with the same command — it will return ``ok: False`` again.
 
-**After ``commandment_from_user_command`` succeeds**, you **MUST** call ``collect_baseline`` before calling ``finish_preprocess``. Baseline is **required** for downstream verified-speedup evaluation:
+**After ``commandment_from_user_command`` succeeds (A1)**, you **MUST** call ``collect_baseline`` before calling ``finish_preprocess``. Baseline is **required** for downstream verified-speedup evaluation:
 
 - If the return value includes a ``harness_path``, call ``collect_baseline(harness_path=<path>)`` and ``collect_profile(harness_path=<path>)``.
 - If ``harness_path`` is null/absent, call ``collect_baseline(eval_command="<eval_command from the return value>")`` — use the ``eval_command`` field from ``commandment_from_user_command``'s return value (NOT the original ``run_command`` you passed in, because the tool sanitizes the command to add GEAK metric markers). This runs the eval command directly and parses ``GEAK_METRIC`` / ``GEAK_RESULT_LATENCY_MS`` markers from stdout.

@@ -124,6 +124,11 @@ def _substitute_mode_flag(cmd: str, target_mode: str) -> str:
     for other_mode, other_flag in _MODE_TO_FLAG.items():
         if other_mode != target_mode and other_flag in cmd:
             return cmd.replace(other_flag, dst_flag)
+    # Case 3: no known flag at all -> returned unchanged. This silent no-op is a
+    # latent trap for a flag-less command (every mode collapses to one body). The
+    # caller relevant to #258, ``_make_tool_commandment_from_user_command``,
+    # refuses such a command up front via its deterministic ``is_flagless``
+    # backstop. Do not rely on this helper to signal a missing flag.
     return cmd
 
 
@@ -181,6 +186,28 @@ def _extract_harness_from_command(cmd: str) -> str | None:
     return None
 
 
+def _is_amalgamation_command(cmd: str) -> bool:
+    """Classify *cmd* via :func:`contract_normalize.is_amalgamation_command`.
+
+    A command without ``&&`` is never an amalgamation. Otherwise the shared
+    classifier decides; it is imported lazily so an ImportError degrades safely:
+    the command counts as an amalgamation and goes to the harness generator (A2).
+
+    Flag-independent by design: a flag-bearing amalgamation would otherwise yield
+    a harness path via ``_extract_harness_from_command`` and slip past the
+    flag-less backstop in ``commandment_from_user_command``.
+    """
+    if "&&" in cmd:
+        try:
+            from minisweagent.run.preprocess.contract_normalize import (
+                is_amalgamation_command as classify,
+            )
+        except ImportError:
+            return True
+        return classify(cmd)
+    return False
+
+
 def _try_synthesize_shell_contract_harness(
     cmd: str,
     *,
@@ -216,6 +243,18 @@ def _try_synthesize_shell_contract_harness(
         return None
     if not repo_root_str:
         return None
+    # Only synthesize for a genuine build-bearing command. A non-build `&&` is an
+    # amalgamation (same script twice with different settings, or two different
+    # tests chained); its resolution is delegated to the harness generator (A2) —
+    # the deterministic split below is intentionally NOT used for it. We do not
+    # distinguish same-script-twice from different-scripts here: the generator
+    # handles both, so neither is a "limitation" of this path.
+    if _is_amalgamation_command(cmd):
+        return None
+    # The split below hard-codes left=correctness / right=performance ordering;
+    # this is an intentional assumption that holds for the canonical
+    # "compile && correctness && performance" pattern but degrades for 3+ mixed
+    # segments (e.g. "make && python a && python b"). Kept deliberately simple.
     # Only split when the command has multiple meaningful segments, not
     # when && is just directory navigation (e.g. "cd /dir && bash script.sh").
     # A cd-only left half is not a correctness command.
@@ -1234,7 +1273,19 @@ def _impl(
                 eval_command = saved_eval
 
         resolved_gpu = gpu_id if gpu_id is not None else agent.config.gpu_id
-        resolved_work_dir = Path(work_dir) if work_dir else None
+        # Fall back to the orchestrator's source repo when the subagent omits
+        # work_dir (it has no schema default and is frequently not passed).
+        # A None work_dir means _build_env sets neither PYTHONPATH nor
+        # GEAK_WORK_DIR, so the harness can't find the kernel source and silently
+        # produces no latency. At preprocess time no per-slot worktree exists yet,
+        # so the source repo is the correct target; optimization-time runs pass
+        # their per-slot worktree explicitly via the legacy run_harness path.
+        if work_dir:
+            resolved_work_dir = Path(work_dir)
+        elif agent.config.repo:
+            resolved_work_dir = Path(agent.config.repo)
+        else:
+            resolved_work_dir = None
 
         if harness_path:
             baseline: BaselineMetrics = collect_baseline_metrics(
@@ -1373,6 +1424,11 @@ def _impl(
             except Exception as exc:  # noqa: BLE001 — never let the gate crash finalize
                 logger.debug("render_commandment bypass-gate skipped (validator error): %s", exc)
 
+        # compile_command is intentionally left unset: for HIP the harness owns
+        # its own build (the harness-generator contract requires it to self-build
+        # via subprocess.run(make/hipcc), mtime-keyed on the kernel source). So the
+        # rendered COMMANDMENT carries no compile step — that is by design, not a
+        # gap. A changed CK/HIP source is still recompiled by the harness/JIT layer.
         ctx = CommandmentContext(
             kernel_path=Path(kernel_path),
             harness_path=Path(harness_path),
@@ -1462,6 +1518,28 @@ def _impl(
                 agent.model,
             )
 
+        # Deterministic amalgamation backstop (must run BEFORE harness extraction).
+        # A non-build ``&&`` command is a joint/amalgamation instruction (same
+        # script run twice with different settings, or two different tests chained).
+        # Splitting it left=correctness / right=performance silently drops one
+        # latency number. This guard is flag-INDEPENDENT on purpose: a flag-bearing
+        # amalgamation (e.g. "python t.py --benchmark --a && python t.py --benchmark
+        # --b") would otherwise yield a harness_path via
+        # ``_extract_harness_from_command`` and slip past the flag-less backstop
+        # below, running only the first half. Refuse here and route to A2.
+        if _is_amalgamation_command(cmd):
+            return {
+                "ok": False,
+                "error": "PATH_A_FLAG_MISSING",
+                "warnings": [
+                    "PATH_A_FLAG_MISSING: command chains segments with '&&' but has "
+                    "no build/compile prefix (amalgamation); a deterministic split "
+                    "would drop one metric. Dispatch harness-generator (Case A2) to "
+                    "resolve it into a single test emitting one metric. Do NOT retry "
+                    "commandment_from_user_command with the same command."
+                ],
+            }
+
         # Extract the harness path from the (possibly sanitized) command
         # so collect_baseline/collect_profile can use the real filesystem path.
         original_harness_path = _extract_harness_from_command(cmd)
@@ -1487,6 +1565,32 @@ def _impl(
             original_harness_path = None
         if original_harness_path:
             agent._collected["harness_path"] = original_harness_path
+
+        # Deterministic backstop for the flag-less Path-A case (issue #258).
+        # A command that (a) exposes none of GEAK's four harness flags, (b) is
+        # not a compound ``&&`` shell contract, and (c) yielded no usable
+        # harness_path carries no per-mode contract: every COMMANDMENT section
+        # would receive the SAME command (``_substitute_mode_flag`` no-ops on a
+        # flag-less command, see its case 3), and the correctness preflight would
+        # then run the harness's full default sweep until it times out. Refuse to
+        # render that all-modes-identical artifact and signal the orchestrator to
+        # switch to harness synthesis (Case A2). This guard is independent of the
+        # ``modes_covered`` the LLM passed, so it cannot be defeated by the LLM
+        # listing all four modes as covered.
+        is_flagless = _extract_harness_from_command(cmd) is None and "&&" not in cmd and original_harness_path is None
+        if is_flagless:
+            return {
+                "ok": False,
+                "error": "PATH_A_FLAG_MISSING",
+                "warnings": [
+                    "PATH_A_FLAG_MISSING: flag-less command exposes none of "
+                    "--correctness/--benchmark/--full-benchmark/--profile and has no "
+                    "harness; dispatch harness-generator (Case A2) instead of copying "
+                    "the command into all four modes. Do NOT retry "
+                    "commandment_from_user_command with the same command."
+                ],
+            }
+
         # Preserve the sanitized command (with real paths) for
         # collect_baseline before we replace repo_root with ${GEAK_WORK_DIR}.
         eval_command_for_baseline = cmd