From eb8576a4dc375dc9ba7a7609bd35de9c04aec836 Mon Sep 17 00:00:00 2001 From: Kristoffer Peyron Date: Mon, 8 Jun 2026 13:57:00 +0200 Subject: [PATCH 1/5] feat(translate): median-over-N latency + configurable PyTorch reference mode Replace the translation harness's single timed forward (after 3 warmups) with a median over N timed passes using CUDA events (no Triton), to remove the run-to-run speedup noise. Configured via the existing translation YAML agent: section (bench_warmup=10, bench_iters=30, reference_mode), with no new env vars; bench_iters defaults to the shared optimization constant DEFAULT_EVAL_BENCHMARK_ITERATIONS when omitted so the two stages can't drift. reference_mode (reference only; candidate unchanged): compile_fallback (default, torch.compile then fall back to eager on failure - PyTorch at its best), compile, or eager (reproduces historical numbers). Print/parse/speedup contracts preserved. --- .../config/mini_kernel_pytorch_to_flydsl.yaml | 5 + src/minisweagent/run/preprocess/translate.py | 132 ++++++++++++------ 2 files changed, 91 insertions(+), 46 deletions(-) diff --git a/src/minisweagent/run/preprocess/config/mini_kernel_pytorch_to_flydsl.yaml b/src/minisweagent/run/preprocess/config/mini_kernel_pytorch_to_flydsl.yaml index 4cff5628d..bf10c4a22 100644 --- a/src/minisweagent/run/preprocess/config/mini_kernel_pytorch_to_flydsl.yaml +++ b/src/minisweagent/run/preprocess/config/mini_kernel_pytorch_to_flydsl.yaml @@ -106,6 +106,11 @@ agent: step_limit: 200 use_skills: true tool_profile: translation + # Latency benchmarking (median over N timed passes; no env vars). bench_iters + # omitted -> inherits the optimization default (DEFAULT_EVAL_BENCHMARK_ITERATIONS). + bench_warmup: 10 + bench_iters: 30 + reference_mode: compile_fallback # PyTorch at its best; eager | compile also valid model: model_class: amd_llm diff --git a/src/minisweagent/run/preprocess/translate.py b/src/minisweagent/run/preprocess/translate.py index a6aec6e3b..ce741dc85 100644 --- a/src/minisweagent/run/preprocess/translate.py +++ b/src/minisweagent/run/preprocess/translate.py @@ -170,6 +170,21 @@ def _print(msg: str) -> None: _print(f" [red]{msg}[/red]" if console else f" ERROR: {msg}") return result + # -- Benchmark / reference settings (from the translation YAML, no env vars) -- + # bench_iters defaults to the shared optimization constant so the two stages + # cannot drift; the generated harness itself reads no environment. + try: + from minisweagent.run.preprocess.harness_utils import ( + DEFAULT_EVAL_BENCHMARK_ITERATIONS as _DEFAULT_BENCH_ITERS, + ) + except Exception: + _DEFAULT_BENCH_ITERS = 30 + bench_warmup = int(agent_config_dict.get("bench_warmup", 10)) + bench_iters = int(agent_config_dict.get("bench_iters", _DEFAULT_BENCH_ITERS)) + reference_mode = str(agent_config_dict.get("reference_mode", "compile_fallback")).strip().lower() + _print(f" Latency bench: warmup={bench_warmup} iters={bench_iters} (median), " + f"reference_mode={reference_mode}") + # -- Resolve model -- # Precedence: explicit model object > explicit model_name > YAML config > factory default _model = model @@ -219,6 +234,9 @@ def _print(msg: str) -> None: model=_model, repo_root=repo_root, output_dir=output_dir, + bench_warmup=bench_warmup, + bench_iters=bench_iters, + reference_mode=reference_mode, ) except Exception as exc: msg = f"Failed to create translation harness: {exc}" @@ -743,6 +761,9 @@ def _create_translation_harness( model, repo_root: Path, output_dir: Path, + bench_warmup: int = 10, + bench_iters: int = 30, + reference_mode: str = "compile_fallback", ) -> Path: """Create a comparison harness for translation validation. @@ -755,6 +776,9 @@ def _create_translation_harness( kernel_path=kernel_path, candidate_path=candidate_path, candidate_flag=pair.harness_candidate_flag, + bench_warmup=bench_warmup, + bench_iters=bench_iters, + reference_mode=reference_mode, ) harness_path.write_text(harness_code) logger.info("Created translation harness: %s", harness_path) @@ -766,6 +790,9 @@ def _generate_minimal_translation_harness( kernel_path: Path, candidate_path: Path, candidate_flag: str, + bench_warmup: int = 10, + bench_iters: int = 30, + reference_mode: str = "compile_fallback", ) -> str: """Generate a minimal Python harness that validates translation correctness. @@ -817,27 +844,65 @@ def _is_native_pattern(module): and not hasattr(module, "Model")) +# -- Benchmark settings (baked in from the translation YAML; no env reads) -- +_BENCH_WARMUP = {bench_warmup} +_BENCH_ITERS = {bench_iters} +_REFERENCE_MODE = "{reference_mode}" + + +def _bench_median_ms(run_fn, warmup=_BENCH_WARMUP, iters=_BENCH_ITERS): + """Median latency (ms) over ``iters`` timed calls after ``warmup`` warmups. + + Uses CUDA events per iteration (no Triton). Returns (last_output, median_ms). + """ + out = None + with torch.no_grad(): + for _ in range(warmup): + run_fn() + torch.cuda.synchronize() + samples = [] + for _ in range(iters): + s = torch.cuda.Event(enable_timing=True) + e = torch.cuda.Event(enable_timing=True) + s.record() + out = run_fn() + e.record() + torch.cuda.synchronize() + samples.append(s.elapsed_time(e)) + samples.sort() + return out, samples[len(samples) // 2] + + +def _make_reference_callable(model, inputs): + """Return (callable, mode_label) for the PyTorch reference, honoring _REFERENCE_MODE. + + eager -> raw eager forward. + compile -> torch.compile, errors surface. + compile_fallback -> torch.compile, fall back to eager on any failure (PyTorch at its best). + """ + eager_fn = lambda: model(*inputs) + if _REFERENCE_MODE == "eager": + return eager_fn, "eager" + try: + cmodel = torch.compile(model) + with torch.no_grad(): + cmodel(*inputs) # probe: triggers compilation outside the timed loop + return (lambda: cmodel(*inputs)), "compile" + except Exception as exc: + if _REFERENCE_MODE == "compile": + raise + print(f"Reference mode: compile failed ({{type(exc).__name__}}: {{exc}}); falling back to eager") + return eager_fn, "eager (compile fallback)" + + def _run_native(module, inputs): """Run a native-pattern module (build_model + forward).""" get_init_inputs = getattr(module, "get_init_inputs", None) init_inputs = get_init_inputs() if get_init_inputs else [] state = module.build_model(*init_inputs) - # Warmup - with torch.no_grad(): - for _ in range(3): - module.forward(state, *inputs) - torch.cuda.synchronize() - - # Timed run - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - with torch.no_grad(): - start.record() - output = module.forward(state, *inputs) - end.record() - torch.cuda.synchronize() - latency_ms = start.elapsed_time(end) + run_fn = lambda: module.forward(state, *inputs) + output, latency_ms = _bench_median_ms(run_fn) return output, latency_ms @@ -858,21 +923,9 @@ def run_reference(): model = model.half() inputs = [x.cuda().half() if isinstance(x, torch.Tensor) else x for x in inputs] - # Warmup - with torch.no_grad(): - for _ in range(3): - model(*inputs) - torch.cuda.synchronize() - - # Timed run - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - with torch.no_grad(): - start.record() - ref_output = model(*inputs) - end.record() - torch.cuda.synchronize() - latency_ms = start.elapsed_time(end) + run_fn, _ref_mode = _make_reference_callable(model, inputs) + print(f"Reference mode: {{_ref_mode}}") + ref_output, latency_ms = _bench_median_ms(run_fn) return model, inputs, ref_output, latency_ms @@ -892,21 +945,8 @@ def run_candidate(candidate_path: str, ref_inputs): inputs = ref_inputs - # Warmup - with torch.no_grad(): - for _ in range(3): - model(*inputs) - torch.cuda.synchronize() - - # Timed run - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - with torch.no_grad(): - start.record() - cand_output = model(*inputs) - end.record() - torch.cuda.synchronize() - latency_ms = start.elapsed_time(end) + run_fn = lambda: model(*inputs) + cand_output, latency_ms = _bench_median_ms(run_fn) return cand_output, latency_ms @@ -967,7 +1007,7 @@ def main(): print("CORRECTNESS: PASS") speedup = ref_latency / cand_latency if cand_latency > 0 else float("inf") - print(f"Speedup: {{speedup:.2f}}x (ref={{ref_latency:.3f}}ms, cand={{cand_latency:.3f}}ms)") + print(f"Speedup: {{speedup:.2f}}x (ref={{ref_latency:.3f}}ms, cand={{cand_latency:.3f}}ms, median of {bench_iters})") if speedup < 0.5: print("WARNING: FlyDSL candidate is significantly slower than PyTorch reference") From f848a494234044192cacd8f9ec2874a512ac077f Mon Sep 17 00:00:00 2001 From: Kristoffer Peyron Date: Mon, 8 Jun 2026 14:53:28 +0200 Subject: [PATCH 2/5] feat(translate): persist cost + always record PyTorch reference latency translation_result.json now records spend and tokens regardless of outcome: - translation_pytorch_latency_ms is always set when the harness prints it, even when the candidate fails correctness (parsed before the success/fail branch; candidate latency + speedup stay success-only since they're meaningless for an incorrect kernel). - translation_cost_usd / translation_tokens / translation_model_calls / translation_cost_rates_per_mtok aggregated from the round trajectories (input/output/cache read+write), priced with configurable per-Mtok rates (model: cost_per_mtok_*, default public Claude Opus rates). --- .../config/mini_kernel_pytorch_to_flydsl.yaml | 7 ++ src/minisweagent/run/preprocess/translate.py | 110 +++++++++++++++++- 2 files changed, 115 insertions(+), 2 deletions(-) diff --git a/src/minisweagent/run/preprocess/config/mini_kernel_pytorch_to_flydsl.yaml b/src/minisweagent/run/preprocess/config/mini_kernel_pytorch_to_flydsl.yaml index bf10c4a22..caa59dfee 100644 --- a/src/minisweagent/run/preprocess/config/mini_kernel_pytorch_to_flydsl.yaml +++ b/src/minisweagent/run/preprocess/config/mini_kernel_pytorch_to_flydsl.yaml @@ -119,3 +119,10 @@ model: model_kwargs: temperature: 0.0 max_tokens: 16000 + # Cost accounting rates (USD per million tokens) used to populate + # translation_cost_usd in translation_result.json. Defaults below are public + # Claude Opus rates; override per model/gateway as needed. + cost_per_mtok_input: 15.0 + cost_per_mtok_output: 75.0 + cost_per_mtok_cache_write: 18.75 + cost_per_mtok_cache_read: 1.5 diff --git a/src/minisweagent/run/preprocess/translate.py b/src/minisweagent/run/preprocess/translate.py index ce741dc85..702d17bc3 100644 --- a/src/minisweagent/run/preprocess/translate.py +++ b/src/minisweagent/run/preprocess/translate.py @@ -70,6 +70,71 @@ def _parse_timing_from_harness_output( ) +# Default LLM pricing (USD per million tokens), Claude Opus public rates. +# Overridable per key via the model: section of the translation YAML +# (cost_per_mtok_input / _output / _cache_write / _cache_read). +_DEFAULT_COST_RATES_PER_MTOK = { + "input": 15.0, + "output": 75.0, + "cache_write": 18.75, + "cache_read": 1.50, +} + + +def _aggregate_trajectory_tokens(output_dir: Path) -> dict[str, int]: + """Sum token usage across all round trajectories under *output_dir*. + + Reads ``round_*/traj.json`` (JSON or concatenated JSONL) written by the + translation agent and accumulates Anthropic-style usage fields. Returns + zeros when no trajectory is found. + """ + agg = {"calls": 0, "input": 0, "output": 0, "cache_write": 0, "cache_read": 0} + decoder = json.JSONDecoder() + + def _walk(obj): + if isinstance(obj, dict): + if "output_tokens" in obj: + agg["calls"] += 1 + agg["input"] += int(obj.get("input_tokens") or 0) + agg["output"] += int(obj.get("output_tokens") or 0) + agg["cache_write"] += int(obj.get("cache_creation_input_tokens") or 0) + agg["cache_read"] += int(obj.get("cache_read_input_tokens") or 0) + for value in obj.values(): + _walk(value) + elif isinstance(obj, list): + for value in obj: + _walk(value) + + for traj in sorted(output_dir.glob("round_*/traj.json")): + try: + text = traj.read_text() + except OSError: + continue + idx, length = 0, len(text) + while idx < length: + while idx < length and text[idx] in " \t\r\n": + idx += 1 + if idx >= length: + break + try: + obj, idx = decoder.raw_decode(text, idx) + except ValueError: + break + _walk(obj) + return agg + + +def _estimate_cost_usd(tokens: dict, rates_per_mtok: dict) -> float: + """Estimate USD cost from a token breakdown and per-million-token rates.""" + return round( + (tokens.get("input", 0) * rates_per_mtok["input"] + + tokens.get("output", 0) * rates_per_mtok["output"] + + tokens.get("cache_write", 0) * rates_per_mtok["cache_write"] + + tokens.get("cache_read", 0) * rates_per_mtok["cache_read"]) / 1e6, + 4, + ) + + def run_translation( kernel_path: Path, output_dir: Path, @@ -144,6 +209,10 @@ def _print(msg: str) -> None: "translation_rounds_used": 0, "translation_pytorch_latency_ms": None, "translation_flydsl_latency_ms": None, + "translation_speedup": None, + "translation_cost_usd": None, + "translation_tokens": None, + "translation_model_calls": None, "translation_errors": [], } @@ -327,13 +396,25 @@ def _print(msg: str) -> None: ) assert isinstance(harness_result, dict) + # Always persist the PyTorch reference latency, even when the candidate + # is incorrect or the harness errors out. The harness prints the + # reference latency before running/comparing the candidate, so it is + # available in stdout regardless of correctness. (Candidate latency and + # speedup are only meaningful for a CORRECT candidate, so those are + # parsed in the success branch below.) + _ref_only = re.search( + r"PyTorch reference latency:\s*([\d.]+)\s*ms", + harness_result.get("stdout", ""), + ) + if _ref_only: + result["translation_pytorch_latency_ms"] = float(_ref_only.group(1)) + if harness_result["success"]: _print(f" Round {round_num}: CORRECT") result["translation_success"] = True result["translation_kernel_path"] = str(candidate_path) - # Parse timing from the validation run's stdout — the harness - # prints latencies and speedup when the candidate is tested. + # Parse full timing (reference + candidate + speedup) from stdout. _parse_timing_from_harness_output( harness_result.get("stdout", ""), result, @@ -445,6 +526,31 @@ def _print(msg: str) -> None: if result["translation_success"]: _print(f" Translation successful in {result['translation_rounds_used']} rounds ({elapsed:.1f}s)") + # -- Cost accounting (token-based estimate from the round trajectories) -- + # Persisted regardless of success so failed/partial runs still record spend. + try: + rates = dict(_DEFAULT_COST_RATES_PER_MTOK) + for _key, _cfg_key in ( + ("input", "cost_per_mtok_input"), + ("output", "cost_per_mtok_output"), + ("cache_write", "cost_per_mtok_cache_write"), + ("cache_read", "cost_per_mtok_cache_read"), + ): + if model_config.get(_cfg_key) is not None: + rates[_key] = float(model_config[_cfg_key]) + tokens = _aggregate_trajectory_tokens(output_dir) + result["translation_tokens"] = tokens + result["translation_model_calls"] = tokens["calls"] or getattr(_model, "n_calls", None) + result["translation_cost_usd"] = _estimate_cost_usd(tokens, rates) + result["translation_cost_rates_per_mtok"] = rates + _print( + f" Cost: ${result['translation_cost_usd']:.2f} " + f"({tokens['calls']} calls, in={tokens['input']} out={tokens['output']} " + f"cache_r={tokens['cache_read']} cache_w={tokens['cache_write']})" + ) + except Exception as exc: + _print(f" Warning: cost accounting failed: {exc}") + # Write result metadata (output_dir / "translation_result.json").write_text(json.dumps(result, indent=2, default=str)) From aa9a235bbb1a8eb0f45c16ad6f30d142af3b05c8 Mon Sep 17 00:00:00 2001 From: Kristoffer Peyron Date: Mon, 8 Jun 2026 15:16:01 +0200 Subject: [PATCH 3/5] fix(translate): pop bench/reference settings so they don't reach the agent bench_warmup/bench_iters/reference_mode live in the agent: YAML section but are translation-harness settings, not agent fields. run_translation_agent splats agent_config into TranslationAgentConfig(**kwargs), so reading them with .get() left them in the dict and crashed every round with "TranslationAgentConfig.__init__() got an unexpected keyword argument". Use .pop() to consume them before the agent config is built. --- src/minisweagent/run/preprocess/translate.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/minisweagent/run/preprocess/translate.py b/src/minisweagent/run/preprocess/translate.py index 702d17bc3..d53c03254 100644 --- a/src/minisweagent/run/preprocess/translate.py +++ b/src/minisweagent/run/preprocess/translate.py @@ -248,9 +248,11 @@ def _print(msg: str) -> None: ) except Exception: _DEFAULT_BENCH_ITERS = 30 - bench_warmup = int(agent_config_dict.get("bench_warmup", 10)) - bench_iters = int(agent_config_dict.get("bench_iters", _DEFAULT_BENCH_ITERS)) - reference_mode = str(agent_config_dict.get("reference_mode", "compile_fallback")).strip().lower() + # pop (not get): these are translation-harness settings, not agent fields, + # so they must not be splatted into TranslationAgentConfig(**agent_config). + bench_warmup = int(agent_config_dict.pop("bench_warmup", 10)) + bench_iters = int(agent_config_dict.pop("bench_iters", _DEFAULT_BENCH_ITERS)) + reference_mode = str(agent_config_dict.pop("reference_mode", "compile_fallback")).strip().lower() _print(f" Latency bench: warmup={bench_warmup} iters={bench_iters} (median), " f"reference_mode={reference_mode}") From 973485c9c85844f9d5dfccb1722704a2aa351d56 Mon Sep 17 00:00:00 2001 From: Kristoffer Peyron Date: Mon, 8 Jun 2026 15:40:35 +0200 Subject: [PATCH 4/5] feat(translate): default translation model to claude-opus-4.8 All translation-bench arms now run on claude-opus-4.8 by default (verified accepted by the amd_llm gateway via cond48/med48 runs). --- .../run/preprocess/config/mini_kernel_pytorch_to_flydsl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/minisweagent/run/preprocess/config/mini_kernel_pytorch_to_flydsl.yaml b/src/minisweagent/run/preprocess/config/mini_kernel_pytorch_to_flydsl.yaml index caa59dfee..420482f29 100644 --- a/src/minisweagent/run/preprocess/config/mini_kernel_pytorch_to_flydsl.yaml +++ b/src/minisweagent/run/preprocess/config/mini_kernel_pytorch_to_flydsl.yaml @@ -114,7 +114,7 @@ agent: model: model_class: amd_llm - model_name: claude-opus-4.6 + model_name: claude-opus-4.8 api_key: null model_kwargs: temperature: 0.0 From 9927c6f4a4bbba8c795275d94e1c207b103bf866 Mon Sep 17 00:00:00 2001 From: Kristoffer Peyron Date: Mon, 8 Jun 2026 16:54:13 +0200 Subject: [PATCH 5/5] fix(translate): import re for reference-latency parse + ruff format run_translation parsed the PyTorch reference latency with re.search but re was never imported in that scope (the file uses function-local imports), so the always-record-reference-latency path raised NameError at runtime and ruff flagged F821. Add a local import and apply ruff format to the cost helper + bench log line. --- src/minisweagent/run/preprocess/translate.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/minisweagent/run/preprocess/translate.py b/src/minisweagent/run/preprocess/translate.py index d53c03254..122d22c22 100644 --- a/src/minisweagent/run/preprocess/translate.py +++ b/src/minisweagent/run/preprocess/translate.py @@ -127,10 +127,13 @@ def _walk(obj): def _estimate_cost_usd(tokens: dict, rates_per_mtok: dict) -> float: """Estimate USD cost from a token breakdown and per-million-token rates.""" return round( - (tokens.get("input", 0) * rates_per_mtok["input"] - + tokens.get("output", 0) * rates_per_mtok["output"] - + tokens.get("cache_write", 0) * rates_per_mtok["cache_write"] - + tokens.get("cache_read", 0) * rates_per_mtok["cache_read"]) / 1e6, + ( + tokens.get("input", 0) * rates_per_mtok["input"] + + tokens.get("output", 0) * rates_per_mtok["output"] + + tokens.get("cache_write", 0) * rates_per_mtok["cache_write"] + + tokens.get("cache_read", 0) * rates_per_mtok["cache_read"] + ) + / 1e6, 4, ) @@ -253,8 +256,7 @@ def _print(msg: str) -> None: bench_warmup = int(agent_config_dict.pop("bench_warmup", 10)) bench_iters = int(agent_config_dict.pop("bench_iters", _DEFAULT_BENCH_ITERS)) reference_mode = str(agent_config_dict.pop("reference_mode", "compile_fallback")).strip().lower() - _print(f" Latency bench: warmup={bench_warmup} iters={bench_iters} (median), " - f"reference_mode={reference_mode}") + _print(f" Latency bench: warmup={bench_warmup} iters={bench_iters} (median), reference_mode={reference_mode}") # -- Resolve model -- # Precedence: explicit model object > explicit model_name > YAML config > factory default @@ -404,6 +406,8 @@ def _print(msg: str) -> None: # available in stdout regardless of correctness. (Candidate latency and # speedup are only meaningful for a CORRECT candidate, so those are # parsed in the success branch below.) + import re + _ref_only = re.search( r"PyTorch reference latency:\s*([\d.]+)\s*ms", harness_result.get("stdout", ""),