From dc8f2e68a2f7ba98256316bfd1a4f0fd69851cff Mon Sep 17 00:00:00 2001 From: Sina Rafati Date: Wed, 29 Apr 2026 01:30:30 -0400 Subject: [PATCH] no overhead latency measurement --- Magpie/bench/__init__.py | 249 +++++ Magpie/bench/_runner.py | 318 +++++++ Magpie/bench/include/magpie_bench.hpp | 238 +++++ Magpie/config.yaml | 34 + Magpie/config/__init__.py | 11 + Magpie/config/kernel.py | 7 + Magpie/config/latency.py | 238 +++++ Magpie/config/pipeline.py | 14 + Magpie/core/executor.py | 2 + Magpie/core/scheduler.py | 6 + Magpie/core/task.py | 2 + Magpie/eval/__init__.py | 4 + Magpie/eval/evaluator.py | 63 +- Magpie/eval/latency.py | 899 ++++++++++++++++++ Magpie/eval/performance.py | 101 ++ Magpie/kernel_config.yaml.example | 52 + Magpie/main.py | 310 +++++- Magpie/modes/analyze_eval/analyzer.py | 10 + Magpie/modes/compare_eval/comparator.py | 10 + docs/analysis_compare.md | 2 +- docs/benchmark.md | 2 + docs/latency.md | 180 ++++ .../analyze_hipgraph_latency.yaml | 35 + examples/simple_hip_test/analyze_latency.yaml | 37 + examples/simple_hip_test/vector_add_bench | Bin 0 -> 41536 bytes examples/simple_hip_test/vector_add_bench.hip | 69 ++ examples/simple_triton_test/README.md | 71 ++ .../analyze_triton_latency.yaml | 58 ++ .../compare_triton_blocksize.yaml | 59 ++ .../simple_triton_test/triton_vector_add.py | 155 +++ tests/test_latency.py | 364 +++++++ tests/test_main_and_kernel_config.py | 68 +- 32 files changed, 3649 insertions(+), 19 deletions(-) create mode 100644 Magpie/bench/__init__.py create mode 100644 Magpie/bench/_runner.py create mode 100644 Magpie/bench/include/magpie_bench.hpp create mode 100644 Magpie/config/latency.py create mode 100644 Magpie/eval/latency.py create mode 100644 docs/latency.md create mode 100644 examples/simple_hip_test/analyze_hipgraph_latency.yaml create mode 100644 examples/simple_hip_test/analyze_latency.yaml create mode 100755 examples/simple_hip_test/vector_add_bench create mode 100644 examples/simple_hip_test/vector_add_bench.hip create mode 100644 examples/simple_triton_test/README.md create mode 100644 examples/simple_triton_test/analyze_triton_latency.yaml create mode 100644 examples/simple_triton_test/compare_triton_blocksize.yaml create mode 100644 examples/simple_triton_test/triton_vector_add.py create mode 100644 tests/test_latency.py diff --git a/Magpie/bench/__init__.py b/Magpie/bench/__init__.py new file mode 100644 index 0000000..7412be5 --- /dev/null +++ b/Magpie/bench/__init__.py @@ -0,0 +1,249 @@ +############################################################################### +# Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +""" +``magpie.bench`` — sanctioned 0-overhead kernel timing helpers. + +Two entry points: + +- ``do_bench_cudagraph(fn, rep=20, n_retries=5, estimate_reps=5)`` — + dispatch-inclusive wall-clock latency measured via CUDA graph + estimate-then-unrolled-replay. Works on AMD (HIP graphs through + ``torch.cuda.CUDAGraph``) and NVIDIA without code changes. +- ``LatencyStats`` — dataclass returned by ``do_bench_cudagraph`` carrying + median / p50 / p99 / min / max / std and the meta-parameters used. + +The marker line ``MAGPIE_LATENCY_JSON: { ... }`` printed by ``_runner.py`` +is the contract used by ``Magpie/eval/latency.py`` to ingest results from +either the bundled runner subprocess or any user-provided harness. +""" + +from __future__ import annotations + +import statistics +from dataclasses import dataclass, field, asdict +from typing import Any, Callable, Dict, List, Optional, cast + + +__all__ = [ + "LatencyStats", + "do_bench_cudagraph", + "MAGPIE_LATENCY_JSON_MARKER", +] + + +MAGPIE_LATENCY_JSON_MARKER = "MAGPIE_LATENCY_JSON:" + + +@dataclass +class LatencyStats: + """ + Summary statistics for a latency measurement. + + All time fields are in milliseconds. + + Attributes: + median_ms: Median across ``n_retries`` independent measurements. + p50_ms / p99_ms: Percentiles across the same set. + min_ms / max_ms: Min/max across the same set. + std_ms: Sample standard deviation (0.0 if ``n_retries < 2``). + samples_ms: Raw per-retry measurements (``len == n_retries``). + n_repeat: Number of unrolled ``fn()`` calls inside the timed graph. + n_retries: Number of independent graph-replay measurements taken. + estimate_ms: Per-call cost estimated from the small estimate graph + (used to size ``n_repeat``). + """ + + median_ms: float + p50_ms: float + p99_ms: float + min_ms: float + max_ms: float + std_ms: float + samples_ms: List[float] = field(default_factory=list) + n_repeat: int = 0 + n_retries: int = 0 + estimate_ms: float = 0.0 + + def to_dict(self) -> Dict[str, Any]: + return asdict(self) + + @classmethod + def from_dict(cls, data: Optional[Dict[str, Any]]) -> Optional["LatencyStats"]: + if not data: + return None + return cls( + median_ms=float(data.get("median_ms", 0.0)), + p50_ms=float(data.get("p50_ms", 0.0)), + p99_ms=float(data.get("p99_ms", 0.0)), + min_ms=float(data.get("min_ms", 0.0)), + max_ms=float(data.get("max_ms", 0.0)), + std_ms=float(data.get("std_ms", 0.0)), + samples_ms=list(data.get("samples_ms", []) or []), + n_repeat=int(data.get("n_repeat", 0)), + n_retries=int(data.get("n_retries", 0)), + estimate_ms=float(data.get("estimate_ms", 0.0)), + ) + + @classmethod + def from_samples( + cls, + samples_ms: List[float], + *, + n_repeat: int, + n_retries: int, + estimate_ms: float = 0.0, + ) -> "LatencyStats": + """Compute summary statistics from a list of per-retry latency samples.""" + if not samples_ms: + return cls( + median_ms=0.0, + p50_ms=0.0, + p99_ms=0.0, + min_ms=0.0, + max_ms=0.0, + std_ms=0.0, + samples_ms=[], + n_repeat=n_repeat, + n_retries=n_retries, + estimate_ms=estimate_ms, + ) + + sorted_ms = sorted(samples_ms) + n = len(sorted_ms) + # Linear-interpolation percentile (matches numpy's default) + def _pct(p: float) -> float: + if n == 1: + return sorted_ms[0] + rank = p * (n - 1) / 100.0 + lo = int(rank) + hi = min(lo + 1, n - 1) + frac = rank - lo + return sorted_ms[lo] + (sorted_ms[hi] - sorted_ms[lo]) * frac + + std_ms = statistics.stdev(samples_ms) if n >= 2 else 0.0 + + return cls( + median_ms=statistics.median(samples_ms), + p50_ms=_pct(50.0), + p99_ms=_pct(99.0), + min_ms=sorted_ms[0], + max_ms=sorted_ms[-1], + std_ms=std_ms, + samples_ms=list(samples_ms), + n_repeat=n_repeat, + n_retries=n_retries, + estimate_ms=estimate_ms, + ) + + +def do_bench_cudagraph( + fn: Callable[[], Any], + rep: int = 20, + n_retries: int = 5, + estimate_reps: int = 5, +) -> LatencyStats: + """ + Benchmark ``fn`` via CUDA-graph estimate-then-unrolled-replay. + + Algorithm (mirrors the user-attached snippet byte-for-byte): + + 1. Warmup: call ``fn()`` once on a side stream. + 2. Capture an "estimate" graph containing ``estimate_reps`` calls of + ``fn``; replay it once to get ``estimate_ms`` per-call. + 3. Compute ``n_repeat = max(1, int(rep / estimate_ms))`` so the timed + graph runs for roughly ``rep`` milliseconds. + 4. Capture a fresh graph with ``n_repeat`` unrolled calls. + 5. Replay the timed graph ``n_retries`` times, each time bracketed by + a fresh pair of ``torch.cuda.Event`` records, and divide the + elapsed time by ``n_repeat``. + 6. Report ``statistics.median`` of the ``n_retries`` per-call samples + along with min/max/p50/p99/std. + + The dispatch overhead of each ``fn()`` call is amortized across + ``n_repeat`` replays inside the captured graph, so per-call latency + closely tracks the kernel time *plus* one graph-launch's worth of + overhead (typically tens of microseconds). + + Args: + fn: Zero-arg callable that issues the workload onto the current + CUDA stream. Must be safe to capture inside ``torch.cuda.graph``. + rep: Target measurement window in milliseconds. + n_retries: Number of independent replay measurements. + estimate_reps: Number of ``fn()`` calls inside the small estimate graph. + + Returns: + ``LatencyStats`` with per-call median latency in ms. + + Raises: + ImportError: If ``torch`` is not installed. + RuntimeError: If CUDA / HIP is not available. + """ + try: + import torch + except ImportError as e: + raise ImportError( + "magpie.bench.do_bench_cudagraph requires PyTorch. " + "Install torch (or torch+rocm) and retry." + ) from e + + if not torch.cuda.is_available(): + raise RuntimeError( + "magpie.bench.do_bench_cudagraph requires a CUDA / HIP capable GPU; " + "torch.cuda.is_available() is False." + ) + + stream = cast(torch.cuda.Stream, torch.cuda.Stream()) + stream.wait_stream(cast(torch.cuda.Stream, torch.cuda.current_stream())) + with torch.cuda.stream(stream): + torch.cuda.synchronize() + # Warmup + fn() + + # Step 1: capture initial estimate graph + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g): + for _ in range(estimate_reps): + fn() + torch.cuda.synchronize() + + # Step 2: estimate per-call device time + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + start_event.record(stream) + g.replay() + end_event.record(stream) + torch.cuda.synchronize() + + estimate_ms = start_event.elapsed_time(end_event) / estimate_reps + if estimate_ms == 0: + n_repeat = 1000 + else: + n_repeat = max(1, int(rep / estimate_ms)) + + # Step 3: capture timed graph with n_repeat unrolled calls + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g): + for _ in range(n_repeat): + fn() + torch.cuda.synchronize() + + # Step 4: measure n_retries replays + samples_ms: List[float] = [] + for _ in range(n_retries): + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + start_event.record(stream) + g.replay() + end_event.record(stream) + torch.cuda.synchronize() + samples_ms.append(start_event.elapsed_time(end_event) / n_repeat) + + return LatencyStats.from_samples( + samples_ms, + n_repeat=n_repeat, + n_retries=n_retries, + estimate_ms=estimate_ms, + ) diff --git a/Magpie/bench/_runner.py b/Magpie/bench/_runner.py new file mode 100644 index 0000000..ad20c2a --- /dev/null +++ b/Magpie/bench/_runner.py @@ -0,0 +1,318 @@ +############################################################################### +# Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +""" +Subprocess harness for Magpie's import-based latency benchmarking. + +Spawned by ``Magpie/eval/latency.py``; never imported by the rest of Magpie. + +Two modes (selected by ``--profile``): + +1. (default) **CUDA-graph timing.** Runs ``magpie.bench.do_bench_cudagraph`` + on the user's callable and prints one line: + + MAGPIE_LATENCY_JSON: {"stats": {...}, "n_repeat": ..., ...} + +2. **--profile** (kernel-trace harness mode). Runs a tight + ``for _ in range(N): fn(); torch.cuda.synchronize()`` loop sized to + roughly ``rep_ms`` so the outer ``rocprofv3 --kernel-trace`` invocation + captures clean per-dispatch HW kernel durations. Still prints the + marker line, but ``stats`` is ``null`` (the wrapper parses the rocprofv3 + CSV instead). + +Inputs are read from environment variables (kept on env, not argv, so the +runner is invoked uniformly under both ``python _runner.py`` and +``rocprofv3 ... -- python _runner.py --profile``): + + - MAGPIE_BENCH_MODULE: importable module path (required) + - MAGPIE_BENCH_CALLABLE: attribute of the callable (required) + - MAGPIE_BENCH_INPUTS_FUNC: attribute of the inputs factory + (default: "get_inputs") + - MAGPIE_BENCH_REP_MS: target measurement window in ms (default: 20) + - MAGPIE_BENCH_N_RETRIES: retries (default: 5) + - MAGPIE_BENCH_ESTIMATE_REPS: estimate-graph reps (default: 5) + - MAGPIE_BENCH_WARMUP_ITERS: eager warmup iters before timing (default: 5) + - MAGPIE_BENCH_SEED: torch.manual_seed value (default: 42) + - MAGPIE_BENCH_PROFILE_REP_MS: only used with --profile (default: rep_ms*5) + +Reproducibility note: the seed is set BEFORE inputs are materialized so +tensor shapes/values are stable across runs. +""" + +from __future__ import annotations + +import argparse +import importlib +import json +import os +import sys +import time +import traceback +from typing import Any, Callable, Tuple + + +def _env_int(name: str, default: int) -> int: + raw = os.environ.get(name) + if raw is None or raw == "": + return default + try: + return int(raw) + except ValueError: + return default + + +def _env_str(name: str, default: str) -> str: + raw = os.environ.get(name) + if raw is None or raw == "": + return default + return raw + + +def _seed_everything(seed: int) -> None: + """Make tensor shapes/values reproducible BEFORE inputs are materialized.""" + try: + import torch + + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) + except Exception: + pass + + try: + import random + + random.seed(seed) + except Exception: + pass + + try: + import numpy as np # noqa: F401 + + np.random.seed(seed) + except Exception: + pass + + +def _resolve_target( + module_name: str, callable_name: str, inputs_func_name: str +) -> Tuple[Callable[..., Any], Callable[[], Any]]: + """Import the user's module and look up the callable + inputs factory.""" + module = importlib.import_module(module_name) + if not hasattr(module, callable_name): + raise AttributeError( + f"module {module_name!r} has no attribute {callable_name!r}" + ) + if not hasattr(module, inputs_func_name): + raise AttributeError( + f"module {module_name!r} has no attribute {inputs_func_name!r} " + f"(expected an inputs factory)" + ) + fn = getattr(module, callable_name) + inputs_factory = getattr(module, inputs_func_name) + if not callable(fn): + raise TypeError(f"{module_name}.{callable_name} is not callable") + if not callable(inputs_factory): + raise TypeError(f"{module_name}.{inputs_func_name} is not callable") + return fn, inputs_factory + + +def _normalize_inputs(raw: Any) -> Tuple[tuple, dict]: + """Coerce the inputs factory return value into ``(args, kwargs)``.""" + if raw is None: + return tuple(), {} + # Accept (args, kwargs) + if isinstance(raw, tuple) and len(raw) == 2 and isinstance(raw[1], dict): + first = raw[0] + if isinstance(first, (list, tuple)): + return tuple(first), dict(raw[1]) + # Accept positional iterable + if isinstance(raw, (list, tuple)): + return tuple(raw), {} + # Accept dict-as-kwargs + if isinstance(raw, dict): + return tuple(), dict(raw) + # Single positional value + return (raw,), {} + + +def _emit(payload: dict) -> None: + """Print the canonical marker line.""" + sys.stdout.write( + "MAGPIE_LATENCY_JSON: " + json.dumps(payload, default=str) + "\n" + ) + sys.stdout.flush() + + +def main(argv: list) -> int: + parser = argparse.ArgumentParser( + prog="magpie.bench._runner", + description="Magpie 0-overhead latency benchmark runner", + ) + parser.add_argument( + "--profile", + action="store_true", + help="Run in kernel-trace harness mode (tight loop, no graph capture). " + "Intended to be wrapped by rocprofv3 --kernel-trace.", + ) + parser.add_argument( + "--module", type=str, default=None, + help="Override MAGPIE_BENCH_MODULE", + ) + parser.add_argument( + "--callable", type=str, default=None, + help="Override MAGPIE_BENCH_CALLABLE", + ) + parser.add_argument( + "--get-inputs", type=str, default=None, + help="Override MAGPIE_BENCH_INPUTS_FUNC", + ) + args = parser.parse_args(argv) + + module_name = args.module or _env_str("MAGPIE_BENCH_MODULE", "") + callable_name = args.callable or _env_str("MAGPIE_BENCH_CALLABLE", "") + inputs_func_name = ( + args.get_inputs or _env_str("MAGPIE_BENCH_INPUTS_FUNC", "get_inputs") + ) + + if not module_name or not callable_name: + _emit({ + "stats": None, + "error": "MAGPIE_BENCH_MODULE and MAGPIE_BENCH_CALLABLE are required", + }) + return 2 + + rep_ms = _env_int("MAGPIE_BENCH_REP_MS", 20) + n_retries = _env_int("MAGPIE_BENCH_N_RETRIES", 5) + estimate_reps = _env_int("MAGPIE_BENCH_ESTIMATE_REPS", 5) + warmup_iters = _env_int("MAGPIE_BENCH_WARMUP_ITERS", 5) + seed = _env_int("MAGPIE_BENCH_SEED", 42) + profile_rep_ms = _env_int("MAGPIE_BENCH_PROFILE_REP_MS", max(rep_ms * 5, 50)) + + try: + _seed_everything(seed) + fn_raw, inputs_factory = _resolve_target( + module_name, callable_name, inputs_func_name + ) + raw_inputs = inputs_factory() + args_tuple, kwargs = _normalize_inputs(raw_inputs) + + def call_fn() -> None: + fn_raw(*args_tuple, **kwargs) + + for _ in range(max(0, warmup_iters)): + call_fn() + + try: + import torch + + if torch.cuda.is_available(): + torch.cuda.synchronize() + except Exception: + pass + + if args.profile: + # Kernel-trace harness mode: tight loop, no graph capture. + # Pre-size N from a quick wall-clock estimate so the loop runs + # roughly profile_rep_ms milliseconds without CUDA event overhead. + est_calls = max(estimate_reps, 1) + t0 = time.perf_counter() + for _ in range(est_calls): + call_fn() + try: + import torch + + if torch.cuda.is_available(): + torch.cuda.synchronize() + except Exception: + pass + t1 = time.perf_counter() + per_call_ms = ((t1 - t0) * 1000.0) / est_calls + if per_call_ms <= 0: + n_iter = 1000 + else: + n_iter = max(1, int(profile_rep_ms / per_call_ms)) + + # Tight, dispatch-only loop. No CUDA events. The outer rocprofv3 + # --kernel-trace is what produces timing. + t_start = time.perf_counter() + for _ in range(n_iter): + call_fn() + try: + import torch + + if torch.cuda.is_available(): + torch.cuda.synchronize() + except Exception: + pass + t_end = time.perf_counter() + + _emit( + { + "mode": "profile", + "stats": None, + "n_iter": n_iter, + "per_call_estimate_ms": per_call_ms, + "wall_loop_ms": (t_end - t_start) * 1000.0, + "module": module_name, + "callable": callable_name, + "seed": seed, + } + ) + return 0 + + # Default mode: CUDA-graph based wall-clock timing. + from Magpie.bench import do_bench_cudagraph # type: ignore + + stats = do_bench_cudagraph( + call_fn, + rep=rep_ms, + n_retries=n_retries, + estimate_reps=estimate_reps, + ) + + _emit( + { + "mode": "cuda_graph", + "stats": stats.to_dict(), + "module": module_name, + "callable": callable_name, + "seed": seed, + } + ) + return 0 + + except SystemExit: + raise + except BaseException as e: + _emit( + { + "stats": None, + "error": f"{type(e).__name__}: {e}", + "traceback": traceback.format_exc(), + "module": module_name, + "callable": callable_name, + } + ) + return 1 + + +if __name__ == "__main__": + # Allow ``python -m Magpie.bench._runner`` and ``python _runner.py``. + # The fallback handles being launched directly (no parent package) by + # adding the repo root to sys.path so ``import Magpie.bench`` works. + try: + from Magpie.bench import do_bench_cudagraph # noqa: F401 + except ImportError: + # When invoked as ``python /path/to/Magpie/bench/_runner.py``, the + # parent ``Magpie`` package may not be on sys.path. Walk up two dirs + # and prepend so the import succeeds. + repo_root = os.path.abspath( + os.path.join(os.path.dirname(__file__), os.pardir, os.pardir) + ) + if repo_root not in sys.path: + sys.path.insert(0, repo_root) + + sys.exit(main(sys.argv[1:])) diff --git a/Magpie/bench/include/magpie_bench.hpp b/Magpie/bench/include/magpie_bench.hpp new file mode 100644 index 0000000..87ea352 --- /dev/null +++ b/Magpie/bench/include/magpie_bench.hpp @@ -0,0 +1,238 @@ +// Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. +// See LICENSE for license information. +// +// Header-only HIP-graph latency helper that mirrors the Python +// ``magpie.bench.do_bench_cudagraph`` algorithm so HIP testcases produce +// directly comparable wall-clock numbers (same warmup -> estimate -> +// unrolled replay -> median across retries math). +// +// Usage from a .hip / .cpp testcase: +// +// #define MAGPIE_BENCH_IMPLEMENTATION +// #include "magpie_bench.hpp" +// +// int main() { +// ... setup ... +// auto stats = magpie::bench::do_bench_hipgraph( +// [&]() { my_kernel<<>>(...); }, +// /*rep_ms=*/20, /*n_retries=*/5, /*estimate_reps=*/5); +// magpie::bench::print_marker(stats); +// return 0; +// } +// +// Output (one line, parsed by Magpie's user-harness sub-mode): +// +// MAGPIE_LATENCY_JSON: {"stats":{"median_ms":...,"p99_ms":...,...}} +// +// Magpie's ``Latency`` stage (Magpie/eval/latency.py) picks this up via +// the same ``MAGPIE_LATENCY_JSON:`` marker contract used for Triton +// kernels; HIP and Triton report identical wall-clock stats. + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +#define MAGPIE_BENCH_HIP_CHECK(call) \ + do { \ + hipError_t _err = (call); \ + if (_err != hipSuccess) { \ + throw std::runtime_error( \ + std::string("HIP error: ") + hipGetErrorString(_err)); \ + } \ + } while (0) + +namespace magpie { +namespace bench { + +struct LatencyStats { + double median_ms = 0.0; + double p50_ms = 0.0; + double p99_ms = 0.0; + double min_ms = 0.0; + double max_ms = 0.0; + double std_ms = 0.0; + double estimate_ms = 0.0; + int n_repeat = 0; + int n_retries = 0; + std::vector samples_ms; +}; + +inline hipStream_t& current_stream() { + // Side stream used by do_bench_hipgraph; users issuing inside ``fn`` + // should target this stream so the captured graph is well-defined. + static hipStream_t s = nullptr; + return s; +} + +namespace detail { + +inline double percentile(std::vector sorted_v, double p) { + if (sorted_v.empty()) return 0.0; + if (sorted_v.size() == 1) return sorted_v[0]; + double rank = p * (sorted_v.size() - 1) / 100.0; + int lo = static_cast(rank); + int hi = std::min(lo + 1, static_cast(sorted_v.size()) - 1); + double frac = rank - lo; + return sorted_v[lo] + (sorted_v[hi] - sorted_v[lo]) * frac; +} + +inline double median(std::vector v) { + if (v.empty()) return 0.0; + std::sort(v.begin(), v.end()); + size_t n = v.size(); + if (n % 2 == 1) return v[n / 2]; + return 0.5 * (v[n / 2 - 1] + v[n / 2]); +} + +inline double stddev(const std::vector& v) { + if (v.size() < 2) return 0.0; + double mean = 0.0; + for (double x : v) mean += x; + mean /= v.size(); + double var = 0.0; + for (double x : v) var += (x - mean) * (x - mean); + return std::sqrt(var / (v.size() - 1)); +} + +inline LatencyStats stats_from_samples(const std::vector& samples, + int n_repeat, + int n_retries, + double estimate_ms) { + LatencyStats s; + s.n_repeat = n_repeat; + s.n_retries = n_retries; + s.estimate_ms = estimate_ms; + s.samples_ms = samples; + if (samples.empty()) return s; + + auto sorted = samples; + std::sort(sorted.begin(), sorted.end()); + s.min_ms = sorted.front(); + s.max_ms = sorted.back(); + s.median_ms = median(samples); + s.p50_ms = percentile(sorted, 50.0); + s.p99_ms = percentile(sorted, 99.0); + s.std_ms = stddev(samples); + return s; +} + +} // namespace detail + +// Benchmark ``fn`` via HIP-graph estimate-then-unrolled-replay. Mirrors +// ``magpie.bench.do_bench_cudagraph`` byte-for-byte (same n_repeat +// formula, same median across n_retries). +template +inline LatencyStats do_bench_hipgraph(Fn&& fn, + int rep_ms = 20, + int n_retries = 5, + int estimate_reps = 5) { + hipStream_t& stream = current_stream(); + if (stream == nullptr) { + MAGPIE_BENCH_HIP_CHECK(hipStreamCreate(&stream)); + } + + // Warmup + fn(); + MAGPIE_BENCH_HIP_CHECK(hipStreamSynchronize(stream)); + + // ----- Step 1: capture estimate graph ----------------------------- + hipGraph_t est_graph = nullptr; + hipGraphExec_t est_exec = nullptr; + + MAGPIE_BENCH_HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeThreadLocal)); + for (int i = 0; i < estimate_reps; ++i) fn(); + MAGPIE_BENCH_HIP_CHECK(hipStreamEndCapture(stream, &est_graph)); + MAGPIE_BENCH_HIP_CHECK(hipGraphInstantiate(&est_exec, est_graph, nullptr, nullptr, 0)); + MAGPIE_BENCH_HIP_CHECK(hipStreamSynchronize(stream)); + + // ----- Step 2: estimate per-call cost ----------------------------- + hipEvent_t e0, e1; + MAGPIE_BENCH_HIP_CHECK(hipEventCreate(&e0)); + MAGPIE_BENCH_HIP_CHECK(hipEventCreate(&e1)); + + MAGPIE_BENCH_HIP_CHECK(hipEventRecord(e0, stream)); + MAGPIE_BENCH_HIP_CHECK(hipGraphLaunch(est_exec, stream)); + MAGPIE_BENCH_HIP_CHECK(hipEventRecord(e1, stream)); + MAGPIE_BENCH_HIP_CHECK(hipStreamSynchronize(stream)); + float est_total_ms = 0.0f; + MAGPIE_BENCH_HIP_CHECK(hipEventElapsedTime(&est_total_ms, e0, e1)); + double estimate_ms = static_cast(est_total_ms) / estimate_reps; + + int n_repeat; + if (estimate_ms <= 0.0) { + n_repeat = 1000; + } else { + n_repeat = std::max(1, static_cast(rep_ms / estimate_ms)); + } + + MAGPIE_BENCH_HIP_CHECK(hipGraphExecDestroy(est_exec)); + MAGPIE_BENCH_HIP_CHECK(hipGraphDestroy(est_graph)); + + // ----- Step 3: capture timed graph with n_repeat unrolled calls -- + hipGraph_t timed_graph = nullptr; + hipGraphExec_t timed_exec = nullptr; + + MAGPIE_BENCH_HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeThreadLocal)); + for (int i = 0; i < n_repeat; ++i) fn(); + MAGPIE_BENCH_HIP_CHECK(hipStreamEndCapture(stream, &timed_graph)); + MAGPIE_BENCH_HIP_CHECK(hipGraphInstantiate(&timed_exec, timed_graph, nullptr, nullptr, 0)); + MAGPIE_BENCH_HIP_CHECK(hipStreamSynchronize(stream)); + + // ----- Step 4: measure n_retries replays -------------------------- + std::vector samples; + samples.reserve(n_retries); + + for (int i = 0; i < n_retries; ++i) { + MAGPIE_BENCH_HIP_CHECK(hipEventRecord(e0, stream)); + MAGPIE_BENCH_HIP_CHECK(hipGraphLaunch(timed_exec, stream)); + MAGPIE_BENCH_HIP_CHECK(hipEventRecord(e1, stream)); + MAGPIE_BENCH_HIP_CHECK(hipStreamSynchronize(stream)); + float total_ms = 0.0f; + MAGPIE_BENCH_HIP_CHECK(hipEventElapsedTime(&total_ms, e0, e1)); + samples.push_back(static_cast(total_ms) / n_repeat); + } + + MAGPIE_BENCH_HIP_CHECK(hipEventDestroy(e0)); + MAGPIE_BENCH_HIP_CHECK(hipEventDestroy(e1)); + MAGPIE_BENCH_HIP_CHECK(hipGraphExecDestroy(timed_exec)); + MAGPIE_BENCH_HIP_CHECK(hipGraphDestroy(timed_graph)); + + return detail::stats_from_samples(samples, n_repeat, n_retries, estimate_ms); +} + +// Print the canonical ``MAGPIE_LATENCY_JSON: {...}`` line to stdout. The +// payload schema matches what ``Magpie/bench/_runner.py`` emits so the +// same parser in ``Magpie/eval/latency.py`` ingests both. +inline void print_marker(const LatencyStats& s, + const std::string& kernel_name = "") { + std::printf( + "MAGPIE_LATENCY_JSON: " + "{\"mode\":\"hip_graph\"," + "\"stats\":{\"median_ms\":%.6f,\"p50_ms\":%.6f,\"p99_ms\":%.6f," + "\"min_ms\":%.6f,\"max_ms\":%.6f,\"std_ms\":%.6f," + "\"n_repeat\":%d,\"n_retries\":%d,\"estimate_ms\":%.6f," + "\"samples_ms\":[", + s.median_ms, s.p50_ms, s.p99_ms, + s.min_ms, s.max_ms, s.std_ms, + s.n_repeat, s.n_retries, s.estimate_ms); + for (size_t i = 0; i < s.samples_ms.size(); ++i) { + std::printf("%s%.6f", (i ? "," : ""), s.samples_ms[i]); + } + std::printf("]}"); + if (!kernel_name.empty()) { + std::printf(",\"kernel_name\":\"%s\"", kernel_name.c_str()); + } + std::printf("}\n"); + std::fflush(stdout); +} + +} // namespace bench +} // namespace magpie diff --git a/Magpie/config.yaml b/Magpie/config.yaml index 95999e1..f99a192 100644 --- a/Magpie/config.yaml +++ b/Magpie/config.yaml @@ -157,6 +157,40 @@ performance: timeout_seconds: 600 # Timeout per profiling run +# ============================================================================= +# Latency Harness (0-overhead in-process kernel timing) +# ============================================================================= +# Complements the Performance stage with explicit dispatch-inclusive +# wall-clock latency (cuda_graph) and/or kernel-only latency (kernel_trace +# via rocprofv3 --kernel-trace). See docs/latency.md. +latency: + enabled: true + + # auto -> both for triton/pytorch/cuda; rocprof_timestamps for hip + # cuda_graph | kernel_trace | rocprof_timestamps | both | none + method: auto + + # The headline number used by `compare` mode for ranking + # wall_median_ms (end-to-end) | kernel_median_ms (kernel-only, autotuning) + primary_metric: wall_median_ms + + # do_bench_cudagraph parameters (mirrors the canonical Triton snippet) + rep_ms: 20 + n_retries: 5 + estimate_reps: 5 + warmup_iters: 5 + + # Reproducibility: torch.manual_seed + torch.cuda.manual_seed_all + seed: 42 + + # Optional regex applied to per-dispatch kernel names when aggregating + # rocprof timestamps / rocprofv3 kernel-trace output + kernel_filter: null + + # Per-subprocess timeout (covers the cuda_graph runner + the rocprofv3 wrap) + timeout_seconds: 120 + + compare: # Per-backend weights (rocprof-compute for AMD, ncu for NVIDIA) perf_weights_rocprof: diff --git a/Magpie/config/__init__.py b/Magpie/config/__init__.py index e87e054..b52b730 100644 --- a/Magpie/config/__init__.py +++ b/Magpie/config/__init__.py @@ -36,6 +36,12 @@ METRIX_KEY_METRICS, DEFAULT_ROCPROF_METRIC_BLOCKS, ) +from .latency import ( + LatencyConfig, + BenchTarget, + LATENCY_METHODS, + PRIMARY_METRICS, +) __all__ = [ # Pipeline configuration @@ -60,4 +66,9 @@ "ROCPROF_KEY_METRICS", "METRIX_KEY_METRICS", "DEFAULT_ROCPROF_METRIC_BLOCKS", + # Latency configuration + "LatencyConfig", + "BenchTarget", + "LATENCY_METHODS", + "PRIMARY_METRICS", ] diff --git a/Magpie/config/kernel.py b/Magpie/config/kernel.py index 4eab7d1..76bf9f8 100644 --- a/Magpie/config/kernel.py +++ b/Magpie/config/kernel.py @@ -68,6 +68,11 @@ class KernelEvalConfig: # Performance profiling - custom command(s) to replace built-in profiler prof_command: Optional[List] = None + # Latency benchmarking (Triton/PyTorch/CUDA in-process harness) + # Optional per-kernel BenchTarget spec ({"module", "callable", "get_inputs"}). + # Wins over LatencyConfig.bench_target when both are set. + bench_target: Optional[Dict[str, Any]] = None + # Input generation (for KenrelBench) get_inputs_func: str = "get_inputs" get_init_inputs_func: str = "get_init_inputs" @@ -153,6 +158,7 @@ def to_dict(self) -> Dict[str, Any]: "compiling_command": self.compiling_command, "testcase_command": self.testcase_command, "prof_command": self.prof_command, + "bench_target": self.bench_target, "get_inputs_func": self.get_inputs_func, "get_init_inputs_func": self.get_init_inputs_func, "input_shapes": self.input_shapes, @@ -181,6 +187,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "KernelEvalConfig": compiling_command=data.get("compiling_command"), testcase_command=data.get("testcase_command"), prof_command=data.get("prof_command"), + bench_target=data.get("bench_target"), get_inputs_func=data.get("get_inputs_func", "get_inputs"), get_init_inputs_func=data.get("get_init_inputs_func", "get_init_inputs"), input_shapes=data.get("input_shapes", []), diff --git a/Magpie/config/latency.py b/Magpie/config/latency.py new file mode 100644 index 0000000..878b4cf --- /dev/null +++ b/Magpie/config/latency.py @@ -0,0 +1,238 @@ +############################################################################### +# Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +""" +Latency evaluation configuration. + +Defines a 0-overhead in-process timing harness that complements the +HW-counter-based ``Performance`` stage: + +- ``cuda_graph`` — dispatch-inclusive wall-clock latency via + ``do_bench_cudagraph`` (warmup -> capture -> unrolled + replay -> median across retries). +- ``kernel_trace`` — kernel-only timing free of dispatch noise; the harness + runs in ``--profile`` mode (tight loop, no graph) and + the outer ``rocprofv3 --kernel-trace`` produces HW + per-dispatch durations. +- ``rocprof_timestamps`` — reuse ``pmc_perf.csv`` already produced by the + ``Performance`` stage; no extra subprocess. +- ``both`` — run both wall-clock and kernel-only and emit + ``dispatch_overhead_us = wall - kernel``. + +For kernel-config autotuning (BLOCK sizes, num_warps, num_stages) the dispatch +overhead is roughly constant across configs and dominates wall-clock numbers +when the kernel runs in microseconds — use ``primary_metric=kernel_median_ms`` +to rank by the dispatch-free measurement. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional, TYPE_CHECKING + +if TYPE_CHECKING: + from .pipeline import KernelType + + +# --------------------------------------------------------------------------- +# Sentinel string literals (not Enums to keep YAML/JSON round-tripping trivial) +# --------------------------------------------------------------------------- + +LATENCY_METHODS = ( + "auto", + "cuda_graph", + "kernel_trace", + "rocprof_timestamps", + "both", + "none", +) + +PRIMARY_METRICS = ("wall_median_ms", "kernel_median_ms") + + +@dataclass +class BenchTarget: + """ + Import-based benchmark target. + + The runner imports ``module``, looks up ``callable`` and ``get_inputs``, + materializes inputs by calling ``get_inputs()`` (which must return either + a tuple ``(args, kwargs)`` or a single args tuple/list), then times + ``callable(*args, **kwargs)`` with ``magpie.bench.do_bench_cudagraph``. + + Attributes: + module: Importable module path (e.g. ``my_kernels.scaled_mm``). + callable: Attribute name of the callable inside the module. + get_inputs: Attribute name of the inputs factory inside the module. + Must return ``(args, kwargs)`` or a positional tuple. + """ + + module: str + callable: str + get_inputs: str = "get_inputs" + + def to_dict(self) -> Dict[str, Any]: + return { + "module": self.module, + "callable": self.callable, + "get_inputs": self.get_inputs, + } + + @classmethod + def from_dict(cls, data: Optional[Dict[str, Any]]) -> Optional["BenchTarget"]: + if not data: + return None + if "module" not in data or "callable" not in data: + return None + return cls( + module=str(data["module"]), + callable=str(data["callable"]), + get_inputs=str(data.get("get_inputs", "get_inputs")), + ) + + +@dataclass +class LatencyConfig: + """ + Configuration for the Latency evaluation stage. + + Attributes: + enabled: Master switch. ``False`` skips the stage entirely. + method: One of ``LATENCY_METHODS``. ``auto`` selects ``both`` for + ``TRITON``/``PYTORCH``/``CUDA`` and ``rocprof_timestamps`` for + ``HIP`` (since HIP testcases are native binaries that don't + import torch). + primary_metric: Which median is reported as the headline number and + used by ``compare`` mode for ranking. + rep_ms: Target measurement window in milliseconds. ``do_bench_cudagraph`` + picks ``n_repeat = max(1, int(rep_ms / estimate_ms))``. + n_retries: How many independent measurements to take (median is reported). + estimate_reps: How many ``fn()`` calls to capture for the initial + cost-estimate graph. + warmup_iters: Eager warmup iterations before any graph capture / timing. + seed: Seed passed to ``torch.manual_seed`` / ``torch.cuda.manual_seed_all`` + before inputs are materialized — guarantees reproducible tensor + contents and sizes across runs. + kernel_filter: Optional regex applied to per-dispatch kernel names when + aggregating ``rocprof_timestamps`` / ``kernel_trace`` + results. + kernel_type: Auto-selection input. Set by ``PipelineConfig`` so that + ``method=auto`` can pick the right backend. + gpu_arch: Auto-selection input (``gfx*`` -> AMD, ``sm_*`` -> NVIDIA). + bench_target: Import-based target (sub-mode A). When ``None``, the + Latency stage falls back to running the user's + ``testcase_command`` and parsing a ``MAGPIE_LATENCY_JSON:`` + stdout marker (sub-mode B). + pythonpath: Extra absolute paths prepended to ``PYTHONPATH`` of the + benchmark subprocess so non-installed user packages import + cleanly. + timeout_seconds: Per-subprocess timeout. + output_dir: Where ``kernel_trace`` mode writes its rocprofv3 CSV. + ``None`` -> a sibling ``latency/`` folder under the + workload dir. + """ + + enabled: bool = True + method: str = "auto" + primary_metric: str = "wall_median_ms" + rep_ms: int = 20 + n_retries: int = 5 + estimate_reps: int = 5 + warmup_iters: int = 5 + seed: int = 42 + kernel_filter: Optional[str] = None + + kernel_type: Optional["KernelType"] = None + gpu_arch: Optional[str] = None + + bench_target: Optional[BenchTarget] = None + pythonpath: List[str] = field(default_factory=list) + + timeout_seconds: float = 120.0 + output_dir: Optional[str] = None + + def __post_init__(self) -> None: + if self.method not in LATENCY_METHODS: + raise ValueError( + f"latency.method must be one of {LATENCY_METHODS}, got {self.method!r}" + ) + if self.primary_metric not in PRIMARY_METRICS: + raise ValueError( + f"latency.primary_metric must be one of {PRIMARY_METRICS}, " + f"got {self.primary_metric!r}" + ) + + # ------------------------------------------------------------------ + # Method resolution + # ------------------------------------------------------------------ + + def resolve_method(self) -> str: + """ + Resolve ``method=auto`` into a concrete method based on the configured + kernel type and GPU architecture. + + Selection table: + - HIP -> ``rocprof_timestamps`` + - TRITON / PYTORCH / CUDA -> ``both`` + - unknown / no kernel type -> ``cuda_graph`` (best portable default) + """ + if self.method != "auto": + return self.method + + from .pipeline import KernelType + + if self.kernel_type == KernelType.HIP: + return "rocprof_timestamps" + if self.kernel_type in (KernelType.TRITON, KernelType.PYTORCH, KernelType.CUDA): + return "both" + return "cuda_graph" + + # ------------------------------------------------------------------ + # Serialization + # ------------------------------------------------------------------ + + def to_dict(self) -> Dict[str, Any]: + return { + "enabled": self.enabled, + "method": self.method, + "primary_metric": self.primary_metric, + "rep_ms": self.rep_ms, + "n_retries": self.n_retries, + "estimate_reps": self.estimate_reps, + "warmup_iters": self.warmup_iters, + "seed": self.seed, + "kernel_filter": self.kernel_filter, + "bench_target": self.bench_target.to_dict() if self.bench_target else None, + "pythonpath": list(self.pythonpath), + "timeout_seconds": self.timeout_seconds, + "output_dir": self.output_dir, + } + + @classmethod + def from_dict( + cls, + data: Optional[Dict[str, Any]], + kernel_type: Optional["KernelType"] = None, + gpu_arch: Optional[str] = None, + ) -> "LatencyConfig": + data = dict(data or {}) + bench_target = BenchTarget.from_dict(data.get("bench_target")) + return cls( + enabled=bool(data.get("enabled", True)), + method=str(data.get("method", "auto")), + primary_metric=str(data.get("primary_metric", "wall_median_ms")), + rep_ms=int(data.get("rep_ms", 20)), + n_retries=int(data.get("n_retries", 5)), + estimate_reps=int(data.get("estimate_reps", 5)), + warmup_iters=int(data.get("warmup_iters", 5)), + seed=int(data.get("seed", 42)), + kernel_filter=data.get("kernel_filter"), + kernel_type=kernel_type, + gpu_arch=gpu_arch, + bench_target=bench_target, + pythonpath=list(data.get("pythonpath", []) or []), + timeout_seconds=float(data.get("timeout_seconds", 120.0)), + output_dir=data.get("output_dir"), + ) diff --git a/Magpie/config/pipeline.py b/Magpie/config/pipeline.py index 357e966..2cbb73b 100644 --- a/Magpie/config/pipeline.py +++ b/Magpie/config/pipeline.py @@ -14,6 +14,7 @@ from typing import Optional from .correctness import CorrectnessConfig +from .latency import LatencyConfig from .performance import PerformanceConfig @@ -69,6 +70,7 @@ class PipelineConfig: compiling_config: Optional[CompilingConfig] = None correctness_config: Optional[CorrectnessConfig] = None performance_config: Optional[PerformanceConfig] = None + latency_config: Optional[LatencyConfig] = None output_dir: str = "./results" verbose: bool = False @@ -87,6 +89,18 @@ def __post_init__(self): kernel_type=self.kernel_type, gpu_arch=self.gpu_arch, ) + if self.latency_config is None: + self.latency_config = LatencyConfig( + kernel_type=self.kernel_type, + gpu_arch=self.gpu_arch, + ) + else: + # Backfill kernel_type / gpu_arch when caller built LatencyConfig + # without them so resolve_method() works. + if self.latency_config.kernel_type is None: + self.latency_config.kernel_type = self.kernel_type + if self.latency_config.gpu_arch is None: + self.latency_config.gpu_arch = self.gpu_arch def _detect_gpu_arch(self) -> str: """Auto-detect GPU architecture.""" diff --git a/Magpie/core/executor.py b/Magpie/core/executor.py index cfb457c..973bfe9 100644 --- a/Magpie/core/executor.py +++ b/Magpie/core/executor.py @@ -692,6 +692,7 @@ def _execute_task_worker(task_dict: Dict[str, Any]) -> Dict[str, Any]: ncu_config=mode_cfg.get("ncu_config", {}), metrix_config=mode_cfg.get("metrix_config", {}), correctness_config=mode_cfg.get("correctness_config", {}), + latency_config=mode_cfg.get("latency_config", {}), gpu_arch=mode_cfg.get("gpu_arch", None), ) analyzer = AnalyzeMode(analyze_config) @@ -716,6 +717,7 @@ def _execute_task_worker(task_dict: Dict[str, Any]) -> Dict[str, Any]: ncu_config=mode_cfg.get("ncu_config", {}), metrix_config=mode_cfg.get("metrix_config", {}), correctness_config=mode_cfg.get("correctness_config", {}), + latency_config=mode_cfg.get("latency_config", {}), gpu_arch=mode_cfg.get("gpu_arch", None), winner_strategy=compare_cfg.get("winner_strategy", "perf_score"), perf_weights_rocprof=compare_cfg.get("perf_weights_rocprof", {}), diff --git a/Magpie/core/scheduler.py b/Magpie/core/scheduler.py index 35eb07c..56b895c 100644 --- a/Magpie/core/scheduler.py +++ b/Magpie/core/scheduler.py @@ -167,6 +167,7 @@ def create_task( ncu_config: Optional[Dict[str, Any]] = None, metrix_config: Optional[Dict[str, Any]] = None, correctness_config: Optional[Dict[str, Any]] = None, + latency_config: Optional[Dict[str, Any]] = None, baseline_index: int = 0, compare_config: Optional[Dict[str, Any]] = None, benchmark_config: Optional[Dict[str, Any]] = None, @@ -204,6 +205,7 @@ def create_task( ncu_config=ncu_config or {}, metrix_config=metrix_config or {}, correctness_config=correctness_config or {}, + latency_config=latency_config or {}, baseline_index=baseline_index, compare_config=compare_config or {}, benchmark_config=benchmark_config or {}, @@ -291,6 +293,7 @@ def run_analyze( ncu_config: Optional[Dict[str, Any]] = None, metrix_config: Optional[Dict[str, Any]] = None, correctness_config: Optional[Dict[str, Any]] = None, + latency_config: Optional[Dict[str, Any]] = None, ) -> TaskResult: """ Convenience method to run analyze mode. @@ -321,6 +324,7 @@ def run_analyze( ncu_config=ncu_config, metrix_config=metrix_config, correctness_config=correctness_config, + latency_config=latency_config, ) return self.execute(task) @@ -337,6 +341,7 @@ def run_compare( ncu_config: Optional[Dict[str, Any]] = None, metrix_config: Optional[Dict[str, Any]] = None, correctness_config: Optional[Dict[str, Any]] = None, + latency_config: Optional[Dict[str, Any]] = None, compare_config: Optional[Dict[str, Any]] = None, ) -> TaskResult: """ @@ -370,6 +375,7 @@ def run_compare( ncu_config=ncu_config, metrix_config=metrix_config, correctness_config=correctness_config, + latency_config=latency_config, compare_config=compare_config, ) return self.execute(task) diff --git a/Magpie/core/task.py b/Magpie/core/task.py index 69092b3..06b03ee 100644 --- a/Magpie/core/task.py +++ b/Magpie/core/task.py @@ -63,6 +63,7 @@ class ModeConfig: ncu_config: Dict[str, Any] = field(default_factory=dict) metrix_config: Dict[str, Any] = field(default_factory=dict) correctness_config: Dict[str, Any] = field(default_factory=dict) + latency_config: Dict[str, Any] = field(default_factory=dict) baseline_index: int = 0 # For compare mode compare_config: Dict[str, Any] = field(default_factory=dict) benchmark_config: Dict[str, Any] = field(default_factory=dict) # For benchmark mode @@ -119,6 +120,7 @@ def to_dict(self) -> Dict[str, Any]: "ncu_config": self.mode_config.ncu_config, "metrix_config": self.mode_config.metrix_config, "correctness_config": self.mode_config.correctness_config, + "latency_config": self.mode_config.latency_config, "baseline_index": self.mode_config.baseline_index, "compare_config": self.mode_config.compare_config, "benchmark_config": self.mode_config.benchmark_config, diff --git a/Magpie/eval/__init__.py b/Magpie/eval/__init__.py index 323ce7b..0812a5a 100644 --- a/Magpie/eval/__init__.py +++ b/Magpie/eval/__init__.py @@ -25,6 +25,7 @@ MetricResult, ) from .performance import Performance, PerformanceResult +from .latency import Latency, LatencyResult __all__ = [ # Evaluator @@ -41,4 +42,7 @@ # Performance "Performance", "PerformanceResult", + # Latency + "Latency", + "LatencyResult", ] diff --git a/Magpie/eval/evaluator.py b/Magpie/eval/evaluator.py index deccd7b..01dc66b 100644 --- a/Magpie/eval/evaluator.py +++ b/Magpie/eval/evaluator.py @@ -19,6 +19,7 @@ from ..config import PipelineConfig, KernelEvalConfig from .correctness import Correctness, CorrectnessResult from .compiling import Compiling, CompilingResult +from .latency import Latency, LatencyResult from .performance import Performance, PerformanceResult @@ -40,12 +41,14 @@ class EvaluationState: compiling_state: BaseKind = BaseKind.SUCCESS correctness_state: BaseKind = BaseKind.SUCCESS performance_state: BaseKind = BaseKind.SUCCESS + latency_state: BaseKind = BaseKind.SKIPPED errors: List[str] = field(default_factory=list) # Results of each evaluation step compiling_result: Optional[CompilingResult] = None correctness_result: Optional[CorrectnessResult] = None performance_result: Optional[PerformanceResult] = None + latency_result: Optional[LatencyResult] = None # Overall score (0.0 to 1.0) score: float = 0.0 @@ -59,6 +62,7 @@ def to_dict(self) -> Dict[str, Any]: "compiling_state": self.compiling_state.name, "correctness_state": self.correctness_state.name, "performance_state": self.performance_state.name, + "latency_state": self.latency_state.name, "errors": self.errors, "score": self.score, "compiling_result": { @@ -84,6 +88,9 @@ def to_dict(self) -> Dict[str, Any]: "performance_result": self.performance_result.to_dict() if self.performance_result else None, + "latency_result": self.latency_result.to_dict() + if self.latency_result + else None, "extra": self.extra, } @@ -104,6 +111,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "EvaluationState": state.compiling_state = BaseKind[data.get("compiling_state", "SUCCESS")] state.correctness_state = BaseKind[data.get("correctness_state", "SUCCESS")] state.performance_state = BaseKind[data.get("performance_state", "SUCCESS")] + state.latency_state = BaseKind[data.get("latency_state", "SKIPPED")] # Restore errors and score state.errors = data.get("errors", []) @@ -135,6 +143,31 @@ def from_dict(cls, data: Dict[str, Any]) -> "EvaluationState": workload_dir=perf_data.get("workload_dir"), ) + # Restore latency result (best-effort - dict round-trip preserves + # the headline fields; full dataclass restoration not required for + # downstream consumers since they read .to_dict() directly). + lat_data = data.get("latency_result") + if lat_data: + from .latency import LatencyResult + from ..bench import LatencyStats + + state.latency_result = LatencyResult( + success=lat_data.get("success", False), + method=lat_data.get("method", "none"), + primary_metric=lat_data.get("primary_metric", "wall_median_ms"), + wall_stats=LatencyStats.from_dict(lat_data.get("wall_stats")), + kernel_stats=LatencyStats.from_dict(lat_data.get("kernel_stats")), + dispatch_overhead_us=lat_data.get("dispatch_overhead_us"), + crosscheck_vs_rocprof_ratio=lat_data.get( + "crosscheck_vs_rocprof_ratio" + ), + crosscheck_warning=lat_data.get("crosscheck_warning"), + config_snapshot=lat_data.get("config", {}), + command=lat_data.get("command"), + output_dir=lat_data.get("output_dir"), + errors=lat_data.get("errors"), + ) + # Restore extra state.extra = data.get("extra", {}) @@ -159,6 +192,7 @@ def __init__(self, pipeline_cfg: PipelineConfig) -> None: self.compiling = Compiling(pipeline_cfg) self.correctness = Correctness(pipeline_cfg) self.performance = Performance(pipeline_cfg) + self.latency = Latency(pipeline_cfg) def evaluate(self, kernel_cfg: KernelEvalConfig) -> EvaluationState: """ @@ -187,7 +221,12 @@ def evaluate(self, kernel_cfg: KernelEvalConfig) -> EvaluationState: # 3) Performance (skip if no prof_command and profiling disabled) state = self._check_performance(state, kernel_cfg) - # 4) Calculate score + # 4) Latency (0-overhead wall-clock + kernel-only) — runs even if + # Performance was skipped; needs Performance only for the + # rocprof_timestamps method which reuses pmc_perf.csv. + state = self._check_latency(state, kernel_cfg) + + # 5) Calculate score state = self._calculate_score(state) return state @@ -258,6 +297,28 @@ def _check_performance( return state + def _check_latency( + self, state: EvaluationState, kernel_cfg: KernelEvalConfig + ) -> EvaluationState: + """Run the 0-overhead Latency harness.""" + try: + result = self.latency.run(state, kernel_cfg) + state.latency_result = result + + if result is None: + state.latency_state = BaseKind.SKIPPED + elif result.success: + state.latency_state = BaseKind.SUCCESS + else: + state.latency_state = BaseKind.FAILED + if result.errors: + state.errors.append(f"Latency: {result.errors}") + except Exception as e: + state.latency_state = BaseKind.FAILED + state.errors.append(f"Latency error: {str(e)}") + + return state + def _calculate_score(self, state: EvaluationState) -> EvaluationState: """Calculate overall evaluation score.""" score = 0.0 diff --git a/Magpie/eval/latency.py b/Magpie/eval/latency.py new file mode 100644 index 0000000..b0746c7 --- /dev/null +++ b/Magpie/eval/latency.py @@ -0,0 +1,899 @@ +############################################################################### +# Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +""" +Latency evaluation stage. + +Sits next to ``Performance`` (HW counters) in the pipeline and produces +0-overhead wall-clock and/or kernel-only latency for the kernel under +evaluation. See ``Magpie.config.latency`` for the method semantics. + +This module never imports ``torch`` at module level — all heavyweight work +happens inside short-lived subprocesses spawned by :class:`Latency`. +""" + +from __future__ import annotations + +import csv +import json +import logging +import os +import shutil +import subprocess +import sys +import time +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, List, Optional + +from ..bench import LatencyStats, MAGPIE_LATENCY_JSON_MARKER +from ..config import ( + BenchTarget, + KernelEvalConfig, + LatencyConfig, + PipelineConfig, +) +from ..utils import get_updated_env + +logger = logging.getLogger(__name__) + + +# Filter out HIP / CUDA runtime kernels from per-kernel aggregations. +_RUNTIME_KERNEL_PREFIXES = ("__amd_rocclr_", "__hip_", "cuLaunchKernel") + + +# --------------------------------------------------------------------------- +# Result types +# --------------------------------------------------------------------------- + + +@dataclass +class LatencyResult: + """ + Result of the Latency evaluation stage. + + Fields are populated based on the resolved ``method``: + + - ``cuda_graph`` -> ``wall_stats`` only + - ``kernel_trace`` -> ``kernel_stats`` + per-kernel breakdown + - ``rocprof_timestamps`` -> ``kernel_stats`` + per-kernel breakdown + - ``both`` -> both, plus ``dispatch_overhead_us`` + """ + + success: bool + method: str = "none" + primary_metric: str = "wall_median_ms" + wall_stats: Optional[LatencyStats] = None + kernel_stats: Optional[LatencyStats] = None + per_kernel: Dict[str, LatencyStats] = field(default_factory=dict) + dispatch_overhead_us: Optional[float] = None + crosscheck_vs_rocprof_ratio: Optional[float] = None + crosscheck_warning: Optional[str] = None + config_snapshot: Dict[str, Any] = field(default_factory=dict) + command: Optional[str] = None + output_dir: Optional[str] = None + raw_output: Optional[str] = None + errors: Optional[str] = None + + def get_primary_value(self) -> Optional[float]: + """Return the headline number used by ``compare`` rankings.""" + if self.primary_metric == "kernel_median_ms" and self.kernel_stats: + return self.kernel_stats.median_ms + if self.wall_stats: + return self.wall_stats.median_ms + if self.kernel_stats: + return self.kernel_stats.median_ms + return None + + def to_dict(self) -> Dict[str, Any]: + d: Dict[str, Any] = { + "success": self.success, + "method": self.method, + "primary_metric": self.primary_metric, + "primary_value_ms": self.get_primary_value(), + "config": self.config_snapshot, + "command": self.command, + "output_dir": self.output_dir, + "errors": self.errors, + } + if self.wall_stats is not None: + d["wall_stats"] = self.wall_stats.to_dict() + if self.kernel_stats is not None: + d["kernel_stats"] = self.kernel_stats.to_dict() + if self.per_kernel: + d["per_kernel"] = {k: v.to_dict() for k, v in self.per_kernel.items()} + if self.dispatch_overhead_us is not None: + d["dispatch_overhead_us"] = self.dispatch_overhead_us + if self.crosscheck_vs_rocprof_ratio is not None: + d["crosscheck_vs_rocprof_ratio"] = self.crosscheck_vs_rocprof_ratio + if self.crosscheck_warning: + d["crosscheck_warning"] = self.crosscheck_warning + return d + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _is_runtime_kernel(name: str) -> bool: + return name.startswith(_RUNTIME_KERNEL_PREFIXES) + + +def _aggregate_per_kernel_durations_ns( + per_kernel_ns: Dict[str, List[float]] +) -> Dict[str, LatencyStats]: + """Build per-kernel ``LatencyStats`` from raw ns durations.""" + out: Dict[str, LatencyStats] = {} + for name, samples_ns in per_kernel_ns.items(): + samples_ms = [v / 1e6 for v in samples_ns if v > 0] + if not samples_ms: + continue + out[name] = LatencyStats.from_samples( + samples_ms, n_repeat=1, n_retries=len(samples_ms) + ) + return out + + +def _summary_stats_from_per_kernel( + per_kernel: Dict[str, LatencyStats], +) -> Optional[LatencyStats]: + """ + Build a single ``kernel_stats`` summary from per-kernel breakdown: + + - For each per-kernel timeline, take the median. + - The summary's ``median_ms`` is the SUM of per-kernel medians (i.e. the + median total kernel time per "iteration"), with min/max/p99 derived + from the collated set of all dispatch durations. + """ + if not per_kernel: + return None + + total_median_ms = sum(stats.median_ms for stats in per_kernel.values()) + all_samples: List[float] = [] + for stats in per_kernel.values(): + all_samples.extend(stats.samples_ms) + + if not all_samples: + return None + + base = LatencyStats.from_samples( + all_samples, n_repeat=1, n_retries=len(all_samples) + ) + # Override median with the per-kernel-summed median (truer "per iter" cost) + base.median_ms = total_median_ms + base.p50_ms = total_median_ms + return base + + +def _merge_bench_target( + cfg: LatencyConfig, kernel_cfg: KernelEvalConfig +) -> Optional[BenchTarget]: + """Per-kernel ``bench_target`` wins over LatencyConfig default.""" + if kernel_cfg.bench_target: + return BenchTarget.from_dict(kernel_cfg.bench_target) + return cfg.bench_target + + +def _build_runner_env( + cfg: LatencyConfig, + bench_target: BenchTarget, + kernel_env: Optional[Dict[str, str]], +) -> Dict[str, str]: + """Construct env for the runner subprocess (no leakage of MAGPIE state).""" + extra: Dict[str, str] = dict(kernel_env or {}) + + extra["MAGPIE_BENCH_MODULE"] = bench_target.module + extra["MAGPIE_BENCH_CALLABLE"] = bench_target.callable + extra["MAGPIE_BENCH_INPUTS_FUNC"] = bench_target.get_inputs + extra["MAGPIE_BENCH_REP_MS"] = str(cfg.rep_ms) + extra["MAGPIE_BENCH_N_RETRIES"] = str(cfg.n_retries) + extra["MAGPIE_BENCH_ESTIMATE_REPS"] = str(cfg.estimate_reps) + extra["MAGPIE_BENCH_WARMUP_ITERS"] = str(cfg.warmup_iters) + extra["MAGPIE_BENCH_SEED"] = str(cfg.seed) + + if cfg.pythonpath: + extra["PYTHONPATH"] = ":".join(cfg.pythonpath) + + return get_updated_env(extra) + + +def _runner_module_args() -> List[str]: + """Command-line invocation of the runner as a module.""" + return [sys.executable, "-m", "Magpie.bench._runner"] + + +def _parse_marker_line(stdout: str) -> Optional[Dict[str, Any]]: + """Find and parse the ``MAGPIE_LATENCY_JSON: {...}`` line in *stdout*.""" + if not stdout: + return None + for line in reversed(stdout.splitlines()): + line = line.strip() + if line.startswith(MAGPIE_LATENCY_JSON_MARKER): + payload = line[len(MAGPIE_LATENCY_JSON_MARKER):].strip() + try: + return json.loads(payload) + except json.JSONDecodeError as e: + logger.warning(f"Failed to parse MAGPIE_LATENCY_JSON line: {e}") + return None + return None + + +# --------------------------------------------------------------------------- +# rocprofv3 / pmc_perf parsing +# --------------------------------------------------------------------------- + + +def _parse_rocprofv3_kernel_trace_csv( + csv_path: Path, + kernel_filter_re: Optional[str] = None, +) -> Dict[str, List[float]]: + """ + Parse a rocprofv3 ``--kernel-trace`` CSV and return per-kernel ns durations. + + rocprofv3 emits columns like ``Kernel_Name``, ``Start_Timestamp``, + ``End_Timestamp`` (units = ns). Different rocprofv3 versions also use + ``KernelName`` / ``Start_Time`` / ``End_Time`` — we accept both spellings. + """ + import re + + rx = re.compile(kernel_filter_re) if kernel_filter_re else None + + per_kernel: Dict[str, List[float]] = {} + if not csv_path.exists(): + return per_kernel + + try: + with open(csv_path, "r") as f: + reader = csv.DictReader(f) + for row in reader: + name = ( + row.get("Kernel_Name") + or row.get("KernelName") + or row.get("kernel_name") + or "unknown" + ) + if _is_runtime_kernel(name): + continue + if rx and not rx.search(name): + continue + start = ( + row.get("Start_Timestamp") + or row.get("Start_Time") + or row.get("start_timestamp") + ) + end = ( + row.get("End_Timestamp") + or row.get("End_Time") + or row.get("end_timestamp") + ) + if not start or not end: + continue + try: + duration_ns = float(end) - float(start) + except (TypeError, ValueError): + continue + if duration_ns <= 0: + continue + per_kernel.setdefault(name, []).append(duration_ns) + except Exception as e: + logger.warning(f"Failed to parse rocprofv3 kernel-trace CSV {csv_path}: {e}") + + return per_kernel + + +def _find_rocprofv3_csv(out_dir: Path) -> Optional[Path]: + """Locate the kernel-trace CSV inside *out_dir*. + + rocprofv3 layouts seen in the wild: + - ``/kernel_trace.csv`` + - ``/_kernel_trace.csv`` + - ``//_kernel_trace.csv`` (default in 7.x+) + Walks recursively to find the first matching file. + """ + if not out_dir.exists(): + return None + + direct = out_dir / "kernel_trace.csv" + if direct.exists(): + return direct + + # Recursive glob so ``/_kernel_trace.csv`` works. + for pattern in ("*kernel_trace*.csv", "*_kernel_trace.csv"): + for hit in sorted(out_dir.rglob(pattern)): + # Skip the agent_info / per-process metadata files. + if "agent_info" in hit.name: + continue + return hit + return None + + +def _parse_pmc_perf_csv_for_durations( + csv_path: Path, + kernel_filter_re: Optional[str] = None, +) -> Dict[str, List[float]]: + """Parse rocprof-compute's ``pmc_perf.csv`` for per-dispatch ns durations.""" + import re + + rx = re.compile(kernel_filter_re) if kernel_filter_re else None + + per_kernel: Dict[str, List[float]] = {} + if not csv_path.exists(): + return per_kernel + + try: + with open(csv_path, "r") as f: + reader = csv.DictReader(f) + for row in reader: + name = ( + row.get("Kernel_Name") + or row.get("KernelName") + or "unknown" + ) + if _is_runtime_kernel(name): + continue + if rx and not rx.search(name): + continue + try: + start = float(row.get("Start_Timestamp", 0)) + end = float(row.get("End_Timestamp", 0)) + except (TypeError, ValueError): + continue + if end <= start: + continue + per_kernel.setdefault(name, []).append(end - start) + except Exception as e: + logger.warning(f"Failed to parse pmc_perf.csv {csv_path}: {e}") + + return per_kernel + + +# --------------------------------------------------------------------------- +# Latency stage +# --------------------------------------------------------------------------- + + +class Latency: + """ + Latency evaluation handler. + + Methods: + - ``cuda_graph`` : in-process ``do_bench_cudagraph`` (subprocess) + - ``kernel_trace`` : runner ``--profile`` + rocprofv3 ``--kernel-trace`` + - ``rocprof_timestamps``: reuse pmc_perf.csv from the Performance stage + - ``both`` : run both wall-clock and a kernel-only method + """ + + def __init__(self, pipeline_cfg: PipelineConfig) -> None: + self.pipeline_cfg = pipeline_cfg + self.lat_cfg: LatencyConfig = ( + pipeline_cfg.latency_config + or LatencyConfig( + kernel_type=pipeline_cfg.kernel_type, + gpu_arch=pipeline_cfg.gpu_arch, + ) + ) + + # ------------------------------------------------------------------ + # Public entry point + # ------------------------------------------------------------------ + + def run( + self, + eval_state: Any, + kernel_cfg: KernelEvalConfig, + ) -> Optional[LatencyResult]: + """ + Run the Latency stage. + + Returns: + - ``None`` when the stage is skipped (disabled or method=``none``). + - ``LatencyResult`` otherwise. + """ + if not self.lat_cfg.enabled: + return None + + method = self.lat_cfg.resolve_method() + if method == "none": + return None + + snapshot = self._config_snapshot(method) + + # rocprof_timestamps reuses the Performance stage's pmc_perf.csv + if method == "rocprof_timestamps": + return self._run_rocprof_timestamps(eval_state, snapshot) + + # cuda_graph / kernel_trace / both — need either a bench_target OR a + # testcase_command harness that emits MAGPIE_LATENCY_JSON. + bench_target = _merge_bench_target(self.lat_cfg, kernel_cfg) + has_harness_cmd = kernel_cfg.has_testcase() + + try: + if method == "cuda_graph": + return self._run_cuda_graph( + bench_target, kernel_cfg, has_harness_cmd, snapshot + ) + if method == "kernel_trace": + return self._run_kernel_trace( + bench_target, kernel_cfg, snapshot + ) + if method == "both": + return self._run_both( + bench_target, kernel_cfg, has_harness_cmd, eval_state, snapshot + ) + except Exception as e: # pragma: no cover — defensive + logger.exception(f"Latency stage failed: {e}") + return LatencyResult( + success=False, + method=method, + primary_metric=self.lat_cfg.primary_metric, + config_snapshot=snapshot, + errors=str(e), + ) + + return LatencyResult( + success=False, + method=method, + primary_metric=self.lat_cfg.primary_metric, + config_snapshot=snapshot, + errors=f"Unknown latency method: {method}", + ) + + # ------------------------------------------------------------------ + # Helpers + # ------------------------------------------------------------------ + + def _config_snapshot(self, resolved_method: str) -> Dict[str, Any]: + snap = self.lat_cfg.to_dict() + snap["resolved_method"] = resolved_method + snap["gpu_arch"] = self.lat_cfg.gpu_arch + snap["kernel_type"] = ( + self.lat_cfg.kernel_type.name if self.lat_cfg.kernel_type else None + ) + return snap + + # ----- cuda_graph ------------------------------------------------- + + def _run_cuda_graph( + self, + bench_target: Optional[BenchTarget], + kernel_cfg: KernelEvalConfig, + has_harness_cmd: bool, + snapshot: Dict[str, Any], + ) -> LatencyResult: + if bench_target is None and not has_harness_cmd: + return LatencyResult( + success=False, + method="cuda_graph", + primary_metric=self.lat_cfg.primary_metric, + config_snapshot=snapshot, + errors=( + "cuda_graph requires either a bench_target (import-based) " + "or testcase_command (user harness emitting " + "MAGPIE_LATENCY_JSON: {...})." + ), + ) + + if bench_target is not None: + return self._exec_runner_cuda_graph(bench_target, kernel_cfg, snapshot) + + return self._exec_user_harness(kernel_cfg, snapshot) + + def _exec_runner_cuda_graph( + self, + bench_target: BenchTarget, + kernel_cfg: KernelEvalConfig, + snapshot: Dict[str, Any], + ) -> LatencyResult: + env = _build_runner_env(self.lat_cfg, bench_target, kernel_cfg.env) + cmd = _runner_module_args() + return self._invoke_runner(cmd, env, kernel_cfg.working_dir, snapshot, "cuda_graph") + + def _exec_user_harness( + self, + kernel_cfg: KernelEvalConfig, + snapshot: Dict[str, Any], + ) -> LatencyResult: + cmds = kernel_cfg.get_testcase_commands() + if not cmds: + return LatencyResult( + success=False, + method="cuda_graph", + primary_metric=self.lat_cfg.primary_metric, + config_snapshot=snapshot, + errors="No testcase command available for user harness", + ) + env = get_updated_env(kernel_cfg.env) + # Use the LAST testcase command — earlier commands are usually setup + cmd = cmds[-1] + return self._invoke_runner(cmd, env, kernel_cfg.working_dir, snapshot, "cuda_graph") + + def _invoke_runner( + self, + cmd: List[str], + env: Dict[str, str], + cwd: Optional[str], + snapshot: Dict[str, Any], + method: str, + ) -> LatencyResult: + cmd_str = " ".join(cmd) + logger.info(f"[Latency:{method}] running: {cmd_str}") + try: + proc = subprocess.run( + cmd, + capture_output=True, + text=True, + env=env, + cwd=cwd, + timeout=self.lat_cfg.timeout_seconds, + ) + except subprocess.TimeoutExpired: + return LatencyResult( + success=False, + method=method, + primary_metric=self.lat_cfg.primary_metric, + config_snapshot=snapshot, + command=cmd_str, + errors=f"Latency runner timed out after {self.lat_cfg.timeout_seconds}s", + ) + + stdout = proc.stdout or "" + stderr = proc.stderr or "" + marker = _parse_marker_line(stdout) + + if proc.returncode != 0 or marker is None: + return LatencyResult( + success=False, + method=method, + primary_metric=self.lat_cfg.primary_metric, + config_snapshot=snapshot, + command=cmd_str, + raw_output=stdout, + errors=( + marker.get("error") + if isinstance(marker, dict) + else (stderr.strip() or stdout.strip() or "no MAGPIE_LATENCY_JSON marker") + ), + ) + + if marker.get("error"): + return LatencyResult( + success=False, + method=method, + primary_metric=self.lat_cfg.primary_metric, + config_snapshot=snapshot, + command=cmd_str, + raw_output=stdout, + errors=str(marker.get("error")), + ) + + wall_stats = LatencyStats.from_dict(marker.get("stats")) + + return LatencyResult( + success=True, + method=method, + primary_metric=self.lat_cfg.primary_metric, + wall_stats=wall_stats, + config_snapshot=snapshot, + command=cmd_str, + raw_output=stdout, + ) + + # ----- kernel_trace ---------------------------------------------- + + def _run_kernel_trace( + self, + bench_target: Optional[BenchTarget], + kernel_cfg: KernelEvalConfig, + snapshot: Dict[str, Any], + ) -> LatencyResult: + if shutil.which("rocprofv3") is None: + return LatencyResult( + success=False, + method="kernel_trace", + primary_metric=self.lat_cfg.primary_metric, + config_snapshot=snapshot, + errors=( + "rocprofv3 not found. kernel_trace requires ROCm rocprofv3 " + "on PATH. Use method=cuda_graph for a torch-only fallback." + ), + ) + + if bench_target is None: + # User harness: just wrap the testcase command with rocprofv3. + cmds = kernel_cfg.get_testcase_commands() + if not cmds: + return LatencyResult( + success=False, + method="kernel_trace", + primary_metric=self.lat_cfg.primary_metric, + config_snapshot=snapshot, + errors=( + "kernel_trace requires either a bench_target or a " + "testcase_command to wrap with rocprofv3." + ), + ) + inner = cmds[-1] + env = get_updated_env(kernel_cfg.env) + else: + inner = _runner_module_args() + ["--profile"] + env = _build_runner_env(self.lat_cfg, bench_target, kernel_cfg.env) + + out_dir = self._kernel_trace_output_dir(kernel_cfg) + out_dir.mkdir(parents=True, exist_ok=True) + + cmd = [ + "rocprofv3", + "--kernel-trace", + "--output-format", + "csv", + "-d", + str(out_dir), + "--", + *inner, + ] + cmd_str = " ".join(cmd) + logger.info(f"[Latency:kernel_trace] running: {cmd_str}") + + try: + proc = subprocess.run( + cmd, + capture_output=True, + text=True, + env=env, + cwd=kernel_cfg.working_dir, + timeout=self.lat_cfg.timeout_seconds, + ) + except subprocess.TimeoutExpired: + return LatencyResult( + success=False, + method="kernel_trace", + primary_metric=self.lat_cfg.primary_metric, + config_snapshot=snapshot, + command=cmd_str, + output_dir=str(out_dir), + errors=f"rocprofv3 timed out after {self.lat_cfg.timeout_seconds}s", + ) + + stdout = proc.stdout or "" + stderr = proc.stderr or "" + + if proc.returncode != 0: + return LatencyResult( + success=False, + method="kernel_trace", + primary_metric=self.lat_cfg.primary_metric, + config_snapshot=snapshot, + command=cmd_str, + output_dir=str(out_dir), + raw_output=stdout, + errors=stderr.strip() or stdout.strip() or "rocprofv3 failed", + ) + + csv_path = _find_rocprofv3_csv(out_dir) + if csv_path is None: + return LatencyResult( + success=False, + method="kernel_trace", + primary_metric=self.lat_cfg.primary_metric, + config_snapshot=snapshot, + command=cmd_str, + output_dir=str(out_dir), + raw_output=stdout, + errors=f"No kernel-trace CSV found under {out_dir}", + ) + + per_kernel_ns = _parse_rocprofv3_kernel_trace_csv( + csv_path, self.lat_cfg.kernel_filter + ) + per_kernel = _aggregate_per_kernel_durations_ns(per_kernel_ns) + + # Heuristic: when no kernel_filter is set, drop "outlier" kernels + # that fired only a handful of times (typical for torch setup like + # randn / fill / kaiming) so they don't pollute the sum-of-medians + # summary. Always emit a warning when many distinct kernels were + # captured so users know to set kernel_filter for tighter numbers. + if not self.lat_cfg.kernel_filter and len(per_kernel) > 1: + max_n = max(len(s.samples_ms) for s in per_kernel.values()) + cutoff = max(2, max_n // 10) + dropped = {k: len(s.samples_ms) for k, s in per_kernel.items() + if len(s.samples_ms) < cutoff} + if dropped: + logger.warning( + "[Latency:kernel_trace] dropping %d low-dispatch kernels " + "from kernel_stats summary (set latency.kernel_filter to " + "silence): %s", + len(dropped), + ", ".join(f"{k} ({n} dispatches)" for k, n in dropped.items()), + ) + per_kernel = { + k: s for k, s in per_kernel.items() + if len(s.samples_ms) >= cutoff + } + + kernel_stats = _summary_stats_from_per_kernel(per_kernel) + + if kernel_stats is None: + return LatencyResult( + success=False, + method="kernel_trace", + primary_metric=self.lat_cfg.primary_metric, + config_snapshot=snapshot, + command=cmd_str, + output_dir=str(out_dir), + raw_output=stdout, + errors=( + f"rocprofv3 produced no kernel timings in {csv_path} " + "(check kernel_filter regex)" + ), + ) + + return LatencyResult( + success=True, + method="kernel_trace", + primary_metric=self.lat_cfg.primary_metric, + kernel_stats=kernel_stats, + per_kernel=per_kernel, + config_snapshot=snapshot, + command=cmd_str, + output_dir=str(out_dir), + raw_output=stdout, + ) + + def _kernel_trace_output_dir(self, kernel_cfg: KernelEvalConfig) -> Path: + base = self.lat_cfg.output_dir or os.path.join( + kernel_cfg.working_dir or os.getcwd(), "latency" + ) + # Use a per-kernel subdir + timestamp to avoid clobbering across runs + kid = ( + kernel_cfg.kernel_id + if kernel_cfg.kernel_id + else "kernel" + ) + safe_kid = "".join(c if c.isalnum() or c in "-_" else "_" for c in kid) + ts = int(time.time()) + return Path(base) / f"kernel_trace_{safe_kid}_{ts}" + + # ----- rocprof_timestamps ---------------------------------------- + + def _run_rocprof_timestamps( + self, + eval_state: Any, + snapshot: Dict[str, Any], + ) -> LatencyResult: + """ + Reuse ``pmc_perf.csv`` produced by a prior Performance stage. + + Looks at ``eval_state.performance_result.workload_dir`` to find the + rocprof-compute workload directory. If it's missing (e.g. the + Performance stage was skipped), returns a soft failure suggesting + the user run with profiling enabled. + """ + perf_result = getattr(eval_state, "performance_result", None) + workload_dir = getattr(perf_result, "workload_dir", None) if perf_result else None + + if not workload_dir: + return LatencyResult( + success=False, + method="rocprof_timestamps", + primary_metric=self.lat_cfg.primary_metric, + config_snapshot=snapshot, + errors=( + "rocprof_timestamps requires the Performance stage to have " + "produced a workload_dir (rocprof-compute). Re-run with " + "performance enabled, or use method=kernel_trace." + ), + ) + + pmc_csv = Path(workload_dir) / "pmc_perf.csv" + if not pmc_csv.exists(): + return LatencyResult( + success=False, + method="rocprof_timestamps", + primary_metric=self.lat_cfg.primary_metric, + config_snapshot=snapshot, + output_dir=str(workload_dir), + errors=f"No pmc_perf.csv found in {workload_dir}", + ) + + per_kernel_ns = _parse_pmc_perf_csv_for_durations( + pmc_csv, self.lat_cfg.kernel_filter + ) + per_kernel = _aggregate_per_kernel_durations_ns(per_kernel_ns) + kernel_stats = _summary_stats_from_per_kernel(per_kernel) + + if kernel_stats is None: + return LatencyResult( + success=False, + method="rocprof_timestamps", + primary_metric=self.lat_cfg.primary_metric, + config_snapshot=snapshot, + output_dir=str(workload_dir), + errors=( + f"No kernel dispatch durations parsed from {pmc_csv} " + "(check kernel_filter regex)" + ), + ) + + return LatencyResult( + success=True, + method="rocprof_timestamps", + primary_metric=self.lat_cfg.primary_metric, + kernel_stats=kernel_stats, + per_kernel=per_kernel, + config_snapshot=snapshot, + output_dir=str(workload_dir), + ) + + # ----- both -------------------------------------------------------- + + def _run_both( + self, + bench_target: Optional[BenchTarget], + kernel_cfg: KernelEvalConfig, + has_harness_cmd: bool, + eval_state: Any, + snapshot: Dict[str, Any], + ) -> LatencyResult: + wall = self._run_cuda_graph(bench_target, kernel_cfg, has_harness_cmd, snapshot) + + # Pick the kernel-only method based on environment availability + if shutil.which("rocprofv3") is not None and bench_target is not None: + kern = self._run_kernel_trace(bench_target, kernel_cfg, snapshot) + else: + # Try rocprof_timestamps as a fallback; only works when + # Performance has produced a workload_dir. + kern = self._run_rocprof_timestamps(eval_state, snapshot) + + wall_ok = wall.success and wall.wall_stats is not None + kern_ok = kern.success and kern.kernel_stats is not None + + if not wall_ok and not kern_ok: + errs = [ + e for e in (wall.errors, kern.errors) if e + ] + return LatencyResult( + success=False, + method="both", + primary_metric=self.lat_cfg.primary_metric, + config_snapshot=snapshot, + errors="; ".join(errs) or "both methods failed", + ) + + merged = LatencyResult( + success=True, + method="both", + primary_metric=self.lat_cfg.primary_metric, + wall_stats=wall.wall_stats if wall_ok else None, + kernel_stats=kern.kernel_stats if kern_ok else None, + per_kernel=kern.per_kernel if kern_ok else {}, + config_snapshot=snapshot, + command=" && ".join( + c for c in (wall.command, kern.command) if c + ) or None, + output_dir=kern.output_dir, + raw_output=wall.raw_output, + ) + + if wall_ok and kern_ok and wall.wall_stats and kern.kernel_stats: + wall_us = wall.wall_stats.median_ms * 1000.0 + kern_us = kern.kernel_stats.median_ms * 1000.0 + merged.dispatch_overhead_us = wall_us - kern_us + if kern_us > 0: + merged.crosscheck_vs_rocprof_ratio = wall_us / kern_us + if not (0.5 <= merged.crosscheck_vs_rocprof_ratio <= 2.0): + merged.crosscheck_warning = ( + f"wall/kernel ratio {merged.crosscheck_vs_rocprof_ratio:.2f} " + "is outside [0.5, 2.0]; check warmup, timer pollution, " + "or kernel_filter." + ) + + if not wall_ok: + # Surface kernel-only error so users see why wall failed + merged.errors = wall.errors + elif not kern_ok: + merged.errors = kern.errors + + return merged diff --git a/Magpie/eval/performance.py b/Magpie/eval/performance.py index 95aa17a..e8441b2 100644 --- a/Magpie/eval/performance.py +++ b/Magpie/eval/performance.py @@ -12,6 +12,33 @@ - CUDA kernels: ncu (NVIDIA Nsight Compute) - Triton kernels: auto-selected based on GPU arch (rocprof-compute on AMD, ncu on NVIDIA) since Triton JIT-compiles to native HIP/CUDA dispatches + +0-overhead guarantee +~~~~~~~~~~~~~~~~~~~~ + +For HIP and Triton-on-AMD, kernel duration comes from +``Start_Timestamp`` / ``End_Timestamp`` columns of ``pmc_perf.csv``, +which are HW per-dispatch timestamps written by the GPU's command +processor. **There is no Python or runtime instrumentation between the +kernel launch and the recorded timestamp** — the host runs the kernel +exactly as it would in production, and the wallclock value is sourced +purely from the device. The same applies to ``ncu``'s GPU-side timing +on NVIDIA. + +Anti-pattern (DO NOT DO THIS in your testcase): + + for j in range(n_iter): + start_events[j].record() + mod.triton_scaled_mm(...) # <- includes Python + JIT + dispatch + end_events[j].record() + torch.cuda.synchronize() + +This pattern times the *host*-driven launch path including JIT +specialization, autotuning, and the Python dispatcher and is **not** +0-overhead. Use :class:`Magpie.eval.latency.Latency` (the dedicated +Latency stage) for in-process wall-clock timing — it captures the +kernel inside a CUDA graph so dispatch overhead is amortized across +many replays. """ from __future__ import annotations @@ -40,6 +67,74 @@ logger = logging.getLogger(__name__) +# Anti-pattern signatures that indicate naive in-process CUDA event timing. +# When any pair of these tokens appears in a Python testcase, log a warning +# pointing the user at magpie.bench.do_bench_cudagraph. +_NAIVE_TIMING_PAIRS = [ + ("start_events[", "end_events["), + (".record(", "elapsed_time("), + ("Event(enable_timing=True)", "elapsed_time("), +] + + +def _warn_if_in_process_timing(kernel_cfg: KernelEvalConfig) -> None: + """ + Best-effort scan of the testcase script for naive in-process CUDA-event + timing. Emits a single warning per kernel and silently no-ops when source + is not available. + """ + try: + cmds = kernel_cfg.get_testcase_commands() + except Exception: + return + if not cmds: + return + + # Collect candidate .py files referenced in the testcase command(s) + py_paths: List[Path] = [] + for cmd in cmds: + for tok in cmd: + if isinstance(tok, str) and tok.endswith(".py"): + p = Path(tok) + if not p.is_absolute() and kernel_cfg.working_dir: + p = Path(kernel_cfg.working_dir) / p + if p.exists() and p.is_file(): + py_paths.append(p) + # Also scan declared source files + for src in kernel_cfg.get_source_file_paths(): + p = Path(src) + if p.suffix == ".py" and p.exists() and p.is_file(): + py_paths.append(p) + + seen: set = set() + for p in py_paths: + rp = p.resolve() + if rp in seen: + continue + seen.add(rp) + try: + text = rp.read_text(encoding="utf-8", errors="ignore") + except Exception: + continue + # Skip if the user is already using do_bench_cudagraph + if "do_bench_cudagraph" in text or "magpie.bench" in text or "Magpie.bench" in text: + continue + for a, b in _NAIVE_TIMING_PAIRS: + if a in text and b in text: + logger.warning( + "[%s] testcase '%s' uses naive in-process CUDA event timing " + "('%s' / '%s'). This includes Python + dispatch + JIT overhead " + "and is NOT 0-overhead. For wall-clock latency, use " + "magpie.bench.do_bench_cudagraph (see docs/latency.md). " + "rocprof-based perf metrics from this run remain accurate.", + kernel_cfg.kernel_id, + rp, + a, + b, + ) + return + + @dataclass class MetricResult: """A single performance metric result.""" @@ -210,6 +305,12 @@ def run( if not self.perf_cfg.enabled: return None + # Heuristic: warn the user if their testcase script uses naive + # torch.cuda.Event timing; it would otherwise look like Magpie is + # reporting that overhead-laden number alongside the 0-overhead + # rocprof timestamps. + _warn_if_in_process_timing(kernel_cfg) + # 2. If custom prof_command is provided, use it if kernel_cfg.has_prof_command(): return self._run_custom_profiler(kernel_cfg) diff --git a/Magpie/kernel_config.yaml.example b/Magpie/kernel_config.yaml.example index fc6b8ef..847b0a3 100644 --- a/Magpie/kernel_config.yaml.example +++ b/Magpie/kernel_config.yaml.example @@ -142,3 +142,55 @@ kernel: # kernel_filter: null # regex to filter kernel names # num_replays: 3 # timeout_seconds: 120 + +# ============================================================================= +# Triton Kernel with 0-overhead Latency Harness +# ============================================================================= +# For Triton (or any PyTorch / CUDA Python kernel), Magpie can run an in-process +# CUDA-graph based timer (do_bench_cudagraph) and/or wrap the kernel in a +# rocprofv3 --kernel-trace subprocess to extract pure HW kernel durations. +# +# When `bench_target` is set on the kernel, Magpie spawns a tiny subprocess +# that imports `module.callable` and times it with the user's inputs. + +# kernel: +# id: "triton_scaled_mm" +# type: triton +# source_files: ["./my_kernels/scaled_mm.py"] +# working_dir: "./my_kernels" +# testcase_command: "python -m my_kernels.scaled_mm --check" # for correctness/perf +# # Latency-harness target (per-kernel; wins over latency.bench_target) +# bench_target: +# module: "my_kernels.scaled_mm" +# callable: "triton_scaled_mm" +# get_inputs: "get_inputs" # returns (args, kwargs) or args tuple + +# ============================================================================= +# Latency Harness Settings (Optional, framework-wide) +# ============================================================================= +# Add a top-level `latency:` section to control the 0-overhead timing stage. +# +# Methods: +# auto - both for triton/pytorch/cuda; rocprof_timestamps for hip +# cuda_graph - in-process do_bench_cudagraph (dispatch-inclusive) +# kernel_trace - rocprofv3 --kernel-trace harness (kernel-only timing, +# required for autotuning small kernels) +# rocprof_timestamps - reuse pmc_perf.csv from the Performance stage +# both - run wall-clock + kernel-only and report +# dispatch_overhead_us = wall - kernel +# none - disable + +# latency: +# enabled: true +# method: auto +# primary_metric: kernel_median_ms # use kernel-only timing for compare ranking +# # (set to wall_median_ms for end-to-end ranking) +# rep_ms: 20 # target measurement window (ms) +# n_retries: 5 # number of independent measurements +# estimate_reps: 5 # reps inside the cost-estimate graph +# warmup_iters: 5 # eager warmup iterations +# seed: 42 # torch.manual_seed for reproducibility +# kernel_filter: null # regex applied to per-dispatch kernel names +# pythonpath: # prepended to PYTHONPATH of subprocess +# - "/abs/path/to/my_kernels_repo" # so non-installed user packages import cleanly +# timeout_seconds: 120 diff --git a/Magpie/main.py b/Magpie/main.py index e03f923..f343f78 100644 --- a/Magpie/main.py +++ b/Magpie/main.py @@ -71,19 +71,27 @@ def parse_kernel_type(type_str: str) -> KernelType: def load_kernel_config( kernel_config_path: Path, -) -> tuple[List[KernelEvalConfig], Dict[str, Any], Dict[str, Any], Dict[str, Any]]: +) -> tuple[ + List[KernelEvalConfig], + Dict[str, Any], + Dict[str, Any], + Dict[str, Any], + Dict[str, Any], +]: """ Load kernel configuration from YAML file. The YAML may optionally contain: - ``performance:`` — overrides framework-level profiler settings - ``correctness:`` — overrides framework-level correctness settings - - ``ray_config:`` — Ray cluster settings (implies ``environment: ray``) - - ``scheduler:`` — scheduler-level overrides (environment, workers, …) + - ``latency:`` — overrides framework-level latency-harness settings + - ``ray_config:`` — Ray cluster settings (implies ``environment: ray``) + - ``scheduler:`` — scheduler-level overrides (environment, workers, …) Returns: Tuple of (kernel configs, performance overrides, correctness overrides, - scheduler overrides). Override dicts are empty when absent. + scheduler overrides, latency overrides). Override dicts are empty when + absent. """ data = load_yaml(kernel_config_path) configs = [] @@ -106,12 +114,14 @@ def load_kernel_config( corr_overrides = _expand_env_vars(data.get("correctness", {})) + lat_overrides = _expand_env_vars(data.get("latency", {})) + sched_overrides: Dict[str, Any] = dict(data.get("scheduler", {})) if "ray_config" in data: sched_overrides["ray_config"] = data["ray_config"] sched_overrides.setdefault("environment", "ray") - return configs, perf_overrides, corr_overrides, sched_overrides + return configs, perf_overrides, corr_overrides, sched_overrides, lat_overrides def _parse_command_list(cmd_entry) -> Optional[List]: @@ -199,6 +209,14 @@ def _parse_kernel_entry(entry: Dict[str, Any]) -> Optional[KernelEvalConfig]: # Parse prof command(s) prof_cmd = _parse_command_list(entry.get("prof_command")) + # Parse optional bench_target (per-kernel latency harness target) + bench_target_raw = entry.get("bench_target") + bench_target = ( + _expand_env_vars(bench_target_raw) + if isinstance(bench_target_raw, dict) + else None + ) + return KernelEvalConfig( kernel_id=entry.get("id", "kernel"), kernel_type=kernel_type, @@ -208,6 +226,7 @@ def _parse_kernel_entry(entry: Dict[str, Any]) -> Optional[KernelEvalConfig]: testcase_command=testcase_cmd, compiling_command=compile_cmd, prof_command=prof_cmd, + bench_target=bench_target, get_inputs_func=entry.get("get_inputs_func", "get_inputs"), get_init_inputs_func=entry.get("get_init_inputs_func", "get_init_inputs"), ) @@ -418,6 +437,73 @@ def _get_compare_config(config: Dict[str, Any]) -> Dict[str, Any]: return config.get("compare", {}) +def _get_latency_config(config: Dict[str, Any]) -> Dict[str, Any]: + """ + Get latency-harness configuration from framework config. + + Returns: + Dict that can be passed straight into ``LatencyConfig.from_dict``. + """ + lat_cfg = config.get("latency", {}) or {} + return { + "enabled": lat_cfg.get("enabled", True), + "method": lat_cfg.get("method", "auto"), + "primary_metric": lat_cfg.get("primary_metric", "wall_median_ms"), + "rep_ms": lat_cfg.get("rep_ms", 20), + "n_retries": lat_cfg.get("n_retries", 5), + "estimate_reps": lat_cfg.get("estimate_reps", 5), + "warmup_iters": lat_cfg.get("warmup_iters", 5), + "seed": lat_cfg.get("seed", 42), + "kernel_filter": lat_cfg.get("kernel_filter"), + "bench_target": lat_cfg.get("bench_target"), + "pythonpath": list(lat_cfg.get("pythonpath", []) or []), + "timeout_seconds": lat_cfg.get("timeout_seconds", 120.0), + "output_dir": lat_cfg.get("output_dir"), + } + + +def _apply_latency_overrides( + settings: Dict[str, Any], overrides: Dict[str, Any] +) -> Dict[str, Any]: + """Merge per-kernel-YAML ``latency:`` overrides into framework defaults.""" + merged = dict(settings) + for key in ( + "enabled", + "method", + "primary_metric", + "rep_ms", + "n_retries", + "estimate_reps", + "warmup_iters", + "seed", + "kernel_filter", + "bench_target", + "timeout_seconds", + "output_dir", + ): + if key in overrides: + merged[key] = overrides[key] + if "pythonpath" in overrides: + merged["pythonpath"] = list(overrides["pythonpath"] or []) + return merged + + +def _apply_latency_cli_overrides( + settings: Dict[str, Any], args +) -> Dict[str, Any]: + """Apply CLI flags (--no-latency, --latency-method, --latency-rep-ms).""" + merged = dict(settings) + if getattr(args, "no_latency", False): + merged["enabled"] = False + method = getattr(args, "latency_method", None) + if method: + merged["method"] = method + rep_ms = getattr(args, "latency_rep_ms", None) + if rep_ms: + merged["rep_ms"] = int(rep_ms) + return merged + + def _get_scheduler_config( config: Dict[str, Any], args, @@ -481,12 +567,17 @@ def run_analyze(args, config: Dict[str, Any]) -> int: perf_overrides: Dict[str, Any] = {} corr_overrides: Dict[str, Any] = {} sched_overrides: Dict[str, Any] = {} + lat_overrides: Dict[str, Any] = {} if args.kernel_config: # Load from kernel config file - kernel_configs, perf_overrides, corr_overrides, sched_overrides = ( - load_kernel_config(args.kernel_config) - ) + ( + kernel_configs, + perf_overrides, + corr_overrides, + sched_overrides, + lat_overrides, + ) = load_kernel_config(args.kernel_config) if not kernel_configs: logger.error(f"No kernels found in {args.kernel_config}") return 1 @@ -527,12 +618,18 @@ def run_analyze(args, config: Dict[str, Any]) -> int: compile_settings = _get_compiling_config(config) perf_settings = _get_performance_config(config, kernel_type) corr_settings = _get_correctness_config(config) + lat_settings = _get_latency_config(config) # Apply per-config overrides (from kernel config YAML) if perf_overrides: perf_settings = _apply_perf_overrides(perf_settings, perf_overrides, kernel_type) if corr_overrides: corr_settings = _apply_correctness_overrides(corr_settings, corr_overrides) + if lat_overrides: + lat_settings = _apply_latency_overrides(lat_settings, lat_overrides) + + # CLI flag overrides + lat_settings = _apply_latency_cli_overrides(lat_settings, args) # Create workspace before profiling so profiler writes directly there label = kernel_configs[0].kernel_id if kernel_configs else "" @@ -543,6 +640,10 @@ def run_analyze(args, config: Dict[str, Any]) -> int: perf_settings["rocprof_config"]["output_dir"] = perf_dir perf_settings["metrix_config"]["output_dir"] = perf_dir + lat_dir = str(ws_path / "latency") + Path(lat_dir).mkdir(parents=True, exist_ok=True) + lat_settings["output_dir"] = lat_dir + corr_settings["workspace_path"] = str(ws_path) scheduler_config = _get_scheduler_config(config, args, sched_overrides) @@ -564,6 +665,7 @@ def run_analyze(args, config: Dict[str, Any]) -> int: ncu_config=perf_settings["ncu_config"], metrix_config=perf_settings["metrix_config"], correctness_config=corr_settings, + latency_config=lat_settings, ) # Unwrap Ray result format: {'task_id': ..., 'results': [...]} @@ -610,11 +712,16 @@ def run_compare(args, config: Dict[str, Any]) -> int: perf_overrides: Dict[str, Any] = {} corr_overrides: Dict[str, Any] = {} sched_overrides: Dict[str, Any] = {} + lat_overrides: Dict[str, Any] = {} if args.kernel_config: - kernel_configs, perf_overrides, corr_overrides, sched_overrides = ( - load_kernel_config(args.kernel_config) - ) + ( + kernel_configs, + perf_overrides, + corr_overrides, + sched_overrides, + lat_overrides, + ) = load_kernel_config(args.kernel_config) elif args.kernels: kernel_type = parse_kernel_type(args.type) @@ -644,6 +751,7 @@ def run_compare(args, config: Dict[str, Any]) -> int: compile_settings = _get_compiling_config(config) perf_settings = _get_performance_config(config, kernel_type) corr_settings = _get_correctness_config(config) + lat_settings = _get_latency_config(config) compare_settings = _get_compare_config(config) # Apply per-config overrides (from kernel config YAML) @@ -651,6 +759,10 @@ def run_compare(args, config: Dict[str, Any]) -> int: perf_settings = _apply_perf_overrides(perf_settings, perf_overrides, kernel_type) if corr_overrides: corr_settings = _apply_correctness_overrides(corr_settings, corr_overrides) + if lat_overrides: + lat_settings = _apply_latency_overrides(lat_settings, lat_overrides) + + lat_settings = _apply_latency_cli_overrides(lat_settings, args) # Create workspace before profiling so profiler writes directly there ws_path = _create_workspace(args.output_dir, "compare") @@ -660,6 +772,10 @@ def run_compare(args, config: Dict[str, Any]) -> int: perf_settings["rocprof_config"]["output_dir"] = perf_dir perf_settings["metrix_config"]["output_dir"] = perf_dir + lat_dir = str(ws_path / "latency") + Path(lat_dir).mkdir(parents=True, exist_ok=True) + lat_settings["output_dir"] = lat_dir + corr_settings["workspace_path"] = str(ws_path) scheduler_config = _get_scheduler_config(config, args, sched_overrides) @@ -682,6 +798,7 @@ def run_compare(args, config: Dict[str, Any]) -> int: ncu_config=perf_settings["ncu_config"], metrix_config=perf_settings["metrix_config"], correctness_config=corr_settings, + latency_config=lat_settings, compare_config=compare_settings, ) @@ -724,6 +841,8 @@ def _dict_to_eval_state(state_dict: Dict[str, Any]) -> EvaluationState: state.correctness_state = BaseKind[state_dict["correctness_state"]] if "performance_state" in state_dict: state.performance_state = BaseKind[state_dict["performance_state"]] + if "latency_state" in state_dict: + state.latency_state = BaseKind[state_dict["latency_state"]] if "score" in state_dict: state.score = state_dict["score"] if "errors" in state_dict: @@ -731,6 +850,26 @@ def _dict_to_eval_state(state_dict: Dict[str, Any]) -> EvaluationState: if "extra" in state_dict: state.extra = state_dict["extra"] + lat_data = state_dict.get("latency_result") + if lat_data: + from .eval.latency import LatencyResult + from .bench import LatencyStats + + state.latency_result = LatencyResult( + success=lat_data.get("success", False), + method=lat_data.get("method", "none"), + primary_metric=lat_data.get("primary_metric", "wall_median_ms"), + wall_stats=LatencyStats.from_dict(lat_data.get("wall_stats")), + kernel_stats=LatencyStats.from_dict(lat_data.get("kernel_stats")), + dispatch_overhead_us=lat_data.get("dispatch_overhead_us"), + crosscheck_vs_rocprof_ratio=lat_data.get("crosscheck_vs_rocprof_ratio"), + crosscheck_warning=lat_data.get("crosscheck_warning"), + config_snapshot=lat_data.get("config", {}), + command=lat_data.get("command"), + output_dir=lat_data.get("output_dir"), + errors=lat_data.get("errors"), + ) + return state @@ -744,14 +883,43 @@ def _print_result(kernel_cfg: KernelEvalConfig, result: EvaluationState) -> None print(f"Compiling: {result.get('compiling_state', 'UNKNOWN')}") print(f"Correctness: {result.get('correctness_state', 'UNKNOWN')}") print(f"Performance: {result.get('performance_state', 'UNKNOWN')}") + print(f"Latency: {result.get('latency_state', 'SKIPPED')}") print(f"Score: {result.get('score', 0.0):.2f}") errors = result.get("errors", []) + latency_dict = result.get("latency_result") else: print(f"Compiling: {result.compiling_state.name}") print(f"Correctness: {result.correctness_state.name}") print(f"Performance: {result.performance_state.name}") + print(f"Latency: {result.latency_state.name}") print(f"Score: {result.score:.2f}") errors = result.errors + latency_dict = ( + result.latency_result.to_dict() if result.latency_result else None + ) + + if latency_dict and latency_dict.get("success"): + method = latency_dict.get("method", "?") + wall = latency_dict.get("wall_stats") + kern = latency_dict.get("kernel_stats") + print(f" method: {method}") + if wall and wall.get("median_ms") is not None: + print( + f" wall_median: {wall['median_ms']:.4f} ms " + f"(p99 {wall.get('p99_ms', 0.0):.4f}, std {wall.get('std_ms', 0.0):.4f})" + ) + if kern and kern.get("median_ms") is not None: + print( + f" kernel_median:{kern['median_ms']:.4f} ms " + f"(p99 {kern.get('p99_ms', 0.0):.4f})" + ) + if latency_dict.get("dispatch_overhead_us") is not None: + print( + f" dispatch: {latency_dict['dispatch_overhead_us']:.2f} us " + "(wall - kernel)" + ) + if latency_dict.get("crosscheck_warning"): + print(f" WARN: {latency_dict['crosscheck_warning']}") if errors: print("Errors:") @@ -807,7 +975,35 @@ def _save_config_snapshot( def _save_results(results: List, ws_path: Path, mode: str) -> None: - """Save analysis/compare results into the workspace as a JSON report.""" + """Save analysis/compare results into the workspace as a JSON report. + + Layout:: + + { + "mode": ..., + "timestamp": ..., + "summary": [ # one entry per kernel, headline numbers + { + "kernel_id": ..., + "compiling_state": ..., + "correctness_state": ..., + "performance_state": ..., + "latency_state": ..., + "score": ..., + "latency": { + "method": ..., + "primary_metric": ..., + "primary_value_ms": ..., + "wall_median_ms": ..., + "kernel_median_ms": ..., + "dispatch_overhead_us": ..., + "crosscheck_vs_rocprof_ratio": ... + } + }, ... + ], + "results": [] + } + """ import json from datetime import datetime @@ -815,17 +1011,26 @@ def _save_results(results: List, ws_path: Path, mode: str) -> None: report_file = ws_path / f"{mode}_report.json" serialized_results: List[Any] = [] + summary: List[Dict[str, Any]] = [] for r in results: if isinstance(r, dict): serialized_results.append(r) + summary.append(_build_kernel_summary(r)) elif hasattr(r, "to_dict"): - serialized_results.append(r.to_dict()) + d = r.to_dict() + serialized_results.append(d) + summary.append(_build_kernel_summary(d)) else: serialized_results.append(str(r)) with open(report_file, "w") as f: json.dump( - {"mode": mode, "timestamp": timestamp, "results": serialized_results}, + { + "mode": mode, + "timestamp": timestamp, + "summary": summary, + "results": serialized_results, + }, f, indent=2, ) @@ -833,6 +1038,43 @@ def _save_results(results: List, ws_path: Path, mode: str) -> None: logger.info(f"Results saved to {report_file}") +def _build_kernel_summary(state_dict: Dict[str, Any]) -> Dict[str, Any]: + """Extract per-kernel headline numbers (perf + latency) for the summary block.""" + summary: Dict[str, Any] = { + "kernel_id": (state_dict.get("extra") or {}).get("kernel_id"), + "kernel_type": (state_dict.get("extra") or {}).get("kernel_type"), + "compiling_state": state_dict.get("compiling_state"), + "correctness_state": state_dict.get("correctness_state"), + "performance_state": state_dict.get("performance_state"), + "latency_state": state_dict.get("latency_state"), + "score": state_dict.get("score"), + } + + lat = state_dict.get("latency_result") + if lat: + wall = lat.get("wall_stats") or {} + kern = lat.get("kernel_stats") or {} + summary["latency"] = { + "method": lat.get("method"), + "primary_metric": lat.get("primary_metric"), + "primary_value_ms": lat.get("primary_value_ms"), + "wall_median_ms": wall.get("median_ms"), + "kernel_median_ms": kern.get("median_ms"), + "dispatch_overhead_us": lat.get("dispatch_overhead_us"), + "crosscheck_vs_rocprof_ratio": lat.get("crosscheck_vs_rocprof_ratio"), + "crosscheck_warning": lat.get("crosscheck_warning"), + } + + perf = state_dict.get("performance_result") + if perf and isinstance(perf, dict): + # Bring across just the rolled-up summary metrics if present. + perf_summary = perf.get("summary") or perf.get("summary_metrics") + if perf_summary is not None: + summary["performance_summary"] = perf_summary + + return summary + + def _save_comparison(comparison: Any, ws_path: Path) -> None: """Save comparison results into the workspace as a JSON report.""" import json @@ -848,11 +1090,19 @@ def _save_comparison(comparison: Any, ws_path: Path) -> None: else: comparison_data = {"result": str(comparison)} + # Build per-kernel summary block highlighting latency / perf headlines so + # consumers don't have to walk the full nested kernel_results structure. + kernel_summary: List[Dict[str, Any]] = [] + for r in comparison_data.get("kernel_results", []): + if isinstance(r, dict): + kernel_summary.append(_build_kernel_summary(r)) + with open(report_file, "w") as f: json.dump( { "mode": "compare", "timestamp": timestamp, + "summary": kernel_summary, "results": { "kernel_results": comparison_data.get("kernel_results", []), "comparison_metrics": comparison_data.get("comparison_metrics", {}), @@ -1116,6 +1366,22 @@ def create_parser() -> argparse.ArgumentParser: analyze_parser.add_argument( "--no-perf", action="store_true", help="Skip performance profiling" ) + analyze_parser.add_argument( + "--no-latency", + action="store_true", + help="Skip 0-overhead latency harness (cuda_graph / kernel_trace)", + ) + analyze_parser.add_argument( + "--latency-method", + type=str, + choices=["auto", "cuda_graph", "kernel_trace", "rocprof_timestamps", "both", "none"], + help="Override latency.method (default: auto from kernel type)", + ) + analyze_parser.add_argument( + "--latency-rep-ms", + type=int, + help="Override latency.rep_ms (target measurement window in ms)", + ) analyze_parser.add_argument( "--output-dir", "-o", @@ -1148,6 +1414,22 @@ def create_parser() -> argparse.ArgumentParser: compare_parser.add_argument( "--no-perf", action="store_true", help="Skip performance profiling" ) + compare_parser.add_argument( + "--no-latency", + action="store_true", + help="Skip 0-overhead latency harness (cuda_graph / kernel_trace)", + ) + compare_parser.add_argument( + "--latency-method", + type=str, + choices=["auto", "cuda_graph", "kernel_trace", "rocprof_timestamps", "both", "none"], + help="Override latency.method (default: auto from kernel type)", + ) + compare_parser.add_argument( + "--latency-rep-ms", + type=int, + help="Override latency.rep_ms (target measurement window in ms)", + ) compare_parser.add_argument( "--output-dir", "-o", diff --git a/Magpie/modes/analyze_eval/analyzer.py b/Magpie/modes/analyze_eval/analyzer.py index 5f3df50..8d35ebc 100644 --- a/Magpie/modes/analyze_eval/analyzer.py +++ b/Magpie/modes/analyze_eval/analyzer.py @@ -25,6 +25,7 @@ CompilingConfig, CorrectnessConfig, CorrectnessMode, + LatencyConfig, PerformanceConfig, ) from ...config.performance import RocprofComputeConfig, NcuConfig, MetrixConfig, PerfBackend @@ -59,6 +60,7 @@ class AnalyzeConfig: ncu_config: Dict[str, Any] = field(default_factory=dict) metrix_config: Dict[str, Any] = field(default_factory=dict) correctness_config: Dict[str, Any] = field(default_factory=dict) + latency_config: Dict[str, Any] = field(default_factory=dict) def __post_init__(self): if self.gpu_arch is None: @@ -144,6 +146,13 @@ def analyze(self, kernel_cfg: KernelEvalConfig) -> EvaluationState: # Build correctness config corr_cfg = self._build_correctness_config() + # Build latency config from dict + lat_cfg = LatencyConfig.from_dict( + self.config.latency_config, + kernel_type=kernel_cfg.kernel_type, + gpu_arch=self.config.gpu_arch, + ) + # Build pipeline config for analyze mode pipeline_cfg = PipelineConfig( mode=EvalMode.ANALYZE, @@ -164,6 +173,7 @@ def analyze(self, kernel_cfg: KernelEvalConfig) -> EvaluationState: ncu_config=ncu_cfg, metrix_config=metrix_cfg, ), + latency_config=lat_cfg, ) evaluator = Evaluator(pipeline_cfg) diff --git a/Magpie/modes/compare_eval/comparator.py b/Magpie/modes/compare_eval/comparator.py index 4a3e8a0..fdafdde 100644 --- a/Magpie/modes/compare_eval/comparator.py +++ b/Magpie/modes/compare_eval/comparator.py @@ -25,6 +25,7 @@ CompilingConfig, CorrectnessConfig, CorrectnessMode, + LatencyConfig, PerformanceConfig, ) from ...config.performance import RocprofComputeConfig, NcuConfig, MetrixConfig, PerfBackend @@ -72,6 +73,7 @@ def _detect_gpu_arch(self) -> None: ncu_config: Dict[str, Any] = field(default_factory=dict) metrix_config: Dict[str, Any] = field(default_factory=dict) correctness_config: Dict[str, Any] = field(default_factory=dict) + latency_config: Dict[str, Any] = field(default_factory=dict) # Winner selection strategy: "correctness_first" or "perf_score" winner_strategy: str = "perf_score" # Per-backend scoring weights @@ -198,6 +200,13 @@ def _compare_sequential( # Build correctness config corr_cfg = self._build_correctness_config(corr_mode) + # Build latency config from dict + lat_cfg = LatencyConfig.from_dict( + self.config.latency_config, + kernel_type=cfg.kernel_type, + gpu_arch=self.config.gpu_arch, + ) + # Build pipeline config pipeline_cfg = PipelineConfig( mode=EvalMode.COMPARE, @@ -218,6 +227,7 @@ def _compare_sequential( ncu_config=ncu_cfg, metrix_config=metrix_cfg, ), + latency_config=lat_cfg, ) evaluator = Evaluator(pipeline_cfg) diff --git a/docs/analysis_compare.md b/docs/analysis_compare.md index 988bacb..2590066 100644 --- a/docs/analysis_compare.md +++ b/docs/analysis_compare.md @@ -1,6 +1,6 @@ # Analyze vs Compare -Magpie’s **Analyze** and **Compare** modes both evaluate GPU kernels (HIP, CUDA, PyTorch, Triton) through the same underlying pipeline—compile (optional), correctness, and optional performance profiling—but they differ in how many kernels you evaluate and how a “winner” is chosen. +Magpie’s **Analyze** and **Compare** modes both evaluate GPU kernels (HIP, CUDA, PyTorch, Triton) through the same underlying pipeline—compile (optional), correctness, optional performance profiling, and optional 0-overhead [Latency](latency.md)—but they differ in how many kernels you evaluate and how a “winner” is chosen. ## At a glance diff --git a/docs/benchmark.md b/docs/benchmark.md index 2e9c00f..2471e02 100644 --- a/docs/benchmark.md +++ b/docs/benchmark.md @@ -2,6 +2,8 @@ Benchmark mode enables framework-level performance benchmarking for LLM inference engines (vLLM, SGLang) with integrated trace analysis capabilities. +> Looking for *per-kernel* 0-overhead latency for HIP / CUDA / Triton kernels? See [Latency](latency.md). Benchmark mode targets framework-level (server / client) measurements, not single-kernel microbenchmarks. + **Execution:** Benchmarks use `run_mode`: **`docker`** (default), **`local`** (host / in-pod, via YAML or `--run-mode local`), or **`ray`** (driver submits `RayJobExecutor`; a **GPU worker** runs the same InferenceX → vLLM/SGLang flow—see [Magpie + Ray](ray-magpie.md)). InferenceX is cloned automatically when `inferencex_path` is empty (see `Magpie/config.yaml` `benchmark.inferencex_path`). ## Overview diff --git a/docs/latency.md b/docs/latency.md new file mode 100644 index 0000000..471d542 --- /dev/null +++ b/docs/latency.md @@ -0,0 +1,180 @@ +# 0-Overhead Kernel Latency + +The **Latency** stage runs alongside Magpie's `Performance` stage and produces a 0-overhead measurement of how long the kernel actually takes to run, free of Python, JIT, and dispatch noise. + +It complements (rather than replaces) the HW-counter metrics from `rocprof-compute` / `ncu` / `metrix`, which are already 0-overhead but measure *throughput*-style quantities (FLOPs, bandwidth, occupancy). Latency answers the orthogonal question: *how many milliseconds does each kernel call take?* + +## Two timing semantics — and when to use each + +There are two fundamentally different ways to ask "how long does this kernel take?" Magpie supports both and emits them side-by-side in the report. + +| Method | What it measures | Use when | +| --- | --- | --- | +| **`cuda_graph`** (wall-clock) | End-to-end per-call latency including one CUDA-graph launch's worth of host overhead. Implemented via `do_bench_cudagraph`: warmup → estimate graph → unrolled replay → median across `n_retries`. Dispatch is amortized across `n_repeat` calls inside a single graph capture. | Comparing two *different* kernel implementations end-to-end (e.g. PyTorch vs Triton vs HIP), reporting numbers that match what production sees. | +| **`kernel_trace`** (kernel-only) | Pure HW per-dispatch duration extracted from `rocprofv3 --kernel-trace`. The runner runs in `--profile` mode (tight `for _ in range(N): fn()` loop, no graph capture, no event recording) so the outer profiler captures clean kernel start/end timestamps. | **Kernel-config autotuning** (BLOCK_M, num_warps, num_stages, etc.). Dispatch overhead is roughly constant across configs; for small kernels (single-digit microseconds) it dominates wall-clock noise and obscures the actual config-to-config kernel improvement. | + +The default `method: auto` resolves to: + +- `both` for `triton` / `pytorch` / `cuda` kernels — runs *both* methods and reports `dispatch_overhead_us = wall_median - kernel_median`. +- `rocprof_timestamps` for `hip` kernels — reuses the `pmc_perf.csv` already produced by the `Performance` stage (no extra subprocess), since a HIP testcase is a native binary that doesn't import torch. + +You can pin the method via `latency.method` in YAML or `--latency-method` on the CLI. + +## Architecture + +```mermaid +flowchart LR + KernelCfg["KernelEvalConfig (yaml)"] --> Eval[Evaluator] + Eval --> Compile[Compiling] + Eval --> Correct[Correctness] + Eval --> Perf["Performance: HW counters (rocprof-compute / ncu / metrix)"] + Eval --> Lat["Latency: 0-overhead timing"] + + Lat -->|"Triton/PyTorch/CUDA: wall-clock"| CudaGraph["magpie.bench.do_bench_cudagraph (subprocess)"] + Lat -->|"Triton/PyTorch/CUDA: kernel-only"| KernelTrace["_runner.py --profile + rocprofv3 --kernel-trace"] + Lat -->|"HIP / native binary"| RocprofTS["pmc_perf.csv aggregation"] + + Perf --> Report["unified JSON report"] + CudaGraph --> Report + KernelTrace --> Report + RocprofTS --> Report +``` + +## Wiring up a Triton kernel + +There are two ways to feed your Triton kernel into the Latency harness. + +### A. Import-based (recommended) + +Add a `bench_target` block to your kernel config. Magpie spawns a tiny subprocess that imports `module.callable`, materializes inputs from `module.get_inputs`, and runs `do_bench_cudagraph`. No additional Python harness required. + +```yaml +kernel: + id: triton_scaled_mm + type: triton + source_files: [./my_kernels/scaled_mm.py] + testcase_command: "python -m my_kernels.scaled_mm --check" + bench_target: + module: my_kernels.scaled_mm + callable: triton_scaled_mm + get_inputs: get_inputs + +latency: + enabled: true + method: auto + primary_metric: kernel_median_ms # rank by kernel-only latency for autotuning + rep_ms: 20 + n_retries: 5 + estimate_reps: 5 + warmup_iters: 5 + seed: 42 + pythonpath: + - /abs/path/to/my_kernels_repo # so non-installed packages import cleanly +``` + +`get_inputs` must return either: + +- a 2-tuple `(args, kwargs)` where `args` is a tuple/list and `kwargs` is a dict, or +- a positional tuple/list (treated as `args`), or +- a dict (treated as `kwargs`). + +Whatever shape it returns, the runner calls your kernel as `callable(*args, **kwargs)`. + +### B. User harness (escape hatch) + +If your kernel has multi-step / multi-stream behavior that doesn't fit the import-based mold, write your own harness that uses `magpie.bench` and prints the canonical marker line. Magpie will pick it up from your `testcase_command` stdout. + +```python +# my_kernels/bench_harness.py +import json +from Magpie.bench import do_bench_cudagraph, MAGPIE_LATENCY_JSON_MARKER + +def fn(): + # ...issue your workload onto the current CUDA stream... + pass + +stats = do_bench_cudagraph(fn, rep=20, n_retries=5, estimate_reps=5) +print(f"{MAGPIE_LATENCY_JSON_MARKER} {json.dumps({'stats': stats.to_dict()})}") +``` + +Then point `kernel.testcase_command` at it; do not set `bench_target`. + +## Reproducibility + +The runner sets `torch.manual_seed(seed)` and `torch.cuda.manual_seed_all(seed)` **before** materializing inputs. Combined with a deterministic `get_inputs` (use `torch.randn` with the seeded RNG), this yields stable tensor shapes and contents across runs — a prerequisite for trustworthy autotuning comparisons. + +If your benchmarked function depends on global state (`torch.set_default_dtype`, environment variables, etc.), set those inside `get_inputs` or in a module-level initializer that runs at import time. + +## What the report contains + +`/analyze_report.json`: + +```json +{ + "summary": [ + { + "kernel_id": "triton_scaled_mm", + "latency_state": "SUCCESS", + "latency": { + "method": "both", + "primary_metric": "kernel_median_ms", + "primary_value_ms": 0.118, + "wall_median_ms": 0.142, + "kernel_median_ms": 0.118, + "dispatch_overhead_us": 24.0, + "crosscheck_vs_rocprof_ratio": 1.20 + } + } + ], + "results": [ + { + "latency_result": { + "method": "both", + "wall_stats": { "median_ms": 0.142, "p99_ms": 0.151, "samples_ms": [...] }, + "kernel_stats": { "median_ms": 0.118, "p99_ms": 0.124, "samples_ms": [...] }, + "per_kernel": { "triton_scaled_mm_kernel": { "median_ms": 0.117, ... } }, + "dispatch_overhead_us": 24.0, + "config": { "rep_ms": 20, "n_retries": 5, "warmup_iters": 5, "seed": 42 } + } + } + ] +} +``` + +`dispatch_overhead_us = wall_median_ms*1000 - kernel_median_ms*1000`. For typical small Triton kernels this lands in the single-digit-to-low-double-digit microsecond range. + +`crosscheck_vs_rocprof_ratio = wall / kernel`. Magpie warns when it's outside `[0.5, 2.0]` — that usually indicates warmup pollution, kernel_filter swallowing the wrong dispatches, or another kernel being captured inadvertently. + +## CLI overrides + +```bash +python -m Magpie analyze -k triton_scaled_mm.yaml \ + --latency-method kernel_trace \ + --latency-rep-ms 50 + +# Disable the latency stage entirely +python -m Magpie analyze -k triton_scaled_mm.yaml --no-latency +``` + +## Anti-pattern (do *not* do this in your testcase) + +```python +for j in range(n_iter): + start_events[j].record() + mod.triton_scaled_mm(...) # includes Python + JIT + dispatch + end_events[j].record() + torch.cuda.synchronize() +``` + +This times the host-driven launch path (Python dispatcher, JIT specialization, autotune cache lookup, runtime overhead) along with the kernel itself. Magpie scans your testcase scripts for this pattern and emits a warning pointing here. + +The `Performance` stage's rocprof-based numbers remain accurate even if your testcase contains this anti-pattern (they come from HW timestamps, not your timing loop) — but any latency the *script itself* prints will be inflated. + +## See also + +- [`Magpie/bench/__init__.py`](../Magpie/bench/__init__.py) — `do_bench_cudagraph` and `LatencyStats`. +- [`Magpie/bench/_runner.py`](../Magpie/bench/_runner.py) — subprocess harness contract (env vars, `--profile` flag, marker line). +- [`Magpie/eval/latency.py`](../Magpie/eval/latency.py) — orchestration, rocprofv3/pmc_perf parsers. +- [`Magpie/config/latency.py`](../Magpie/config/latency.py) — `LatencyConfig`, `BenchTarget`, `auto`-method selection table. +- [Performance + Compare](analysis_compare.md) — how the latency block plugs into ranking. +- [Benchmark Mode](benchmark.md) — framework-level vLLM / SGLang benchmarks (separate use case). diff --git a/examples/simple_hip_test/analyze_hipgraph_latency.yaml b/examples/simple_hip_test/analyze_hipgraph_latency.yaml new file mode 100644 index 0000000..61ce774 --- /dev/null +++ b/examples/simple_hip_test/analyze_hipgraph_latency.yaml @@ -0,0 +1,35 @@ +# HIP vector_add — wall-clock latency via hipGraph helper (Phase 2) +# ================================================================== +# This example uses the C++ ``magpie::bench::do_bench_hipgraph`` helper +# (Magpie/bench/include/magpie_bench.hpp) so a HIP testcase reports +# wall-clock latency on byte-for-byte the same algorithm as the Triton +# CUDA-graph harness — useful for cross-stack comparisons (HIP vs Triton). +# +# Magpie's user-harness sub-mode (no `bench_target`) just runs the +# `testcase_command` and parses the `MAGPIE_LATENCY_JSON: {...}` line +# the binary prints. +# +# Prereq: +# hipcc -g -O2 -std=c++17 \ +# -IMagpie/bench/include \ +# -o examples/simple_hip_test/vector_add_bench \ +# examples/simple_hip_test/vector_add_bench.hip +# +# Usage: +# cd /path/to/Magpie +# python -m Magpie analyze -k examples/simple_hip_test/analyze_hipgraph_latency.yaml --no-perf + +kernel: + id: "vector_add_hipgraph" + type: hip + source_files: + - "./vector_add_bench.hip" + testcase_command: "./vector_add_bench" + working_dir: "examples/simple_hip_test" + +latency: + enabled: true + # No bench_target -> user-harness sub-mode. Magpie just runs + # `testcase_command` and parses its MAGPIE_LATENCY_JSON line. + method: cuda_graph + primary_metric: wall_median_ms diff --git a/examples/simple_hip_test/analyze_latency.yaml b/examples/simple_hip_test/analyze_latency.yaml new file mode 100644 index 0000000..8da20ad --- /dev/null +++ b/examples/simple_hip_test/analyze_latency.yaml @@ -0,0 +1,37 @@ +# HIP vector_add — 0-overhead latency via rocprof timestamps +# =========================================================== +# For HIP (and any native binary), the Latency stage's `auto` method +# resolves to `rocprof_timestamps`: it reuses the per-dispatch HW +# timestamps already produced by the Performance stage's pmc_perf.csv, +# so latency adds **no extra subprocess invocations** beyond what perf +# profiling already runs. +# +# Prereq: +# hipcc -g -O2 -o vector_add vector_add.hip +# +# Usage: +# cd /path/to/Magpie +# python -m Magpie analyze -k examples/simple_hip_test/analyze_latency.yaml +# +# To compare against the wall-clock harness for cross-checking, change +# `latency.method: both` and ensure rocprofv3 is on PATH. + +kernel: + id: "vector_add" + type: hip + source_files: + - "./vector_add.hip" + testcase_command: "./vector_add" + working_dir: "examples/simple_hip_test" + +latency: + enabled: true + # method: auto resolves to `rocprof_timestamps` for HIP, which reuses + # the Performance stage's pmc_perf.csv (rocprof-compute output) for + # zero extra cost. Use `kernel_trace` if you only have rocprofv3 (no + # full rocprof-compute install) — it wraps testcase_command with + # `rocprofv3 --kernel-trace` for kernel-only timestamps. + method: kernel_trace + primary_metric: kernel_median_ms + kernel_filter: "vector_add" + timeout_seconds: 60 diff --git a/examples/simple_hip_test/vector_add_bench b/examples/simple_hip_test/vector_add_bench new file mode 100755 index 0000000000000000000000000000000000000000..cbf5d3818c7116bc6c5a8e6ba6d3322cab11a002 GIT binary patch literal 41536 zcmeHw3w%_?+4tlI%PpIPOM<9uaKWIkY(mI|izbA?S=oe0KoKv?vb(vIWH;U2grKNk zg0hAvwYJ4xtd~}4ZIxDQ(V_&fT%=XdzKT^_>UFpALQyfIlJEb_nc19d_E6fs-}ife z@5eySoaaCDJkQKC*K=mh?9Ju&`H2Y$Ou9sN2_tITbtVp#@GCCgmp6=+up%}L*K9VP z4Fa5uzY-B#!c6>5`gMtFfbDoJriO{?60^B48=)Og#3ujQjWu2`>zuejgDZ=|0l(tls78Xn8HrA@`RJg)Ld zf1AjVN$^WPB1vDX(ARpu$|wCpf=~L=o{QfNy2&nQnoc2xRSCW$^T7;vWWK4Ne>bNX zlfJb5<=~@q{;Q7o>F+s_@8?9hOWz~}N?+=m)=hpo2Sr$J4t;2M_7s zn{{w%uJPpmL)?lV@X=`K z@zQ^;4qmK--=u@L>EI9R;Lqydf6~Dd(ZR(_|6JrZon^6g51PvEmjQ$ZySGqUwctb!9 zWkWy>0j>TUiWz|c=x{cIbC47Rk3R%@5}6VCtU^-NemaF&L9d9?32Z&+`!w|T`q4@3 zmx4wV=D%=|kw$Y9Fp?MG^k9|)np|-5UC6$HT`I!A+4DVEXDC}J=rx=k!Ilg9<;PZG zYA}S=%kaaG4Wm&w2%22;IrT#ra@HZ}s!3}Qp2U9IFFcutQ+n?H)^{;94r9L+@}A&y zBHJzElT0*6mM8pk;IBhY60&m=2>bOL)Bo#F)%UgKoVV%RHMUhxGKT}BpUV*lI{iUM zqqE)%q_)xL6^H|*-&0#32zvaE8o#sA(%)%9MtW3eaD+(-#RXjOBdF0N)(T=g2L_xrq*fvAqT z$gH=_*%WN{YtVA9TZ1j|JDcikUf9eVtfv%#yS&BY4T=Oxada?2Ml1YAf3+nSr3o43*#sHb{CEH)Py za$R*!za!{()&~PNS9!G)m-2E*TVfj^F0PJwx#caQ;sjy$)reFb%(FTioN_dHytTnP zH91uaU9yUya5cGULUMvNtf@H|V1c09*BoRuH4P|dP<}{I75}TbD0WmhT)c9XFK*;E zrg}%o%Po#3&sPh3aAWb*krDqLJI%x&PF$Msq^1Ls+f*^%!5^bIz@olT^Ct5XEf;)4L(YO@T)ZVXbrwhgOAbRD>Qhf2CviLaxO-}O&WZxCVs01r}167 zbsBt}1mSPg;PRaZ1#i~i=V{_^(ctH6@cT6Qcn!W)gPSz?HVr;qgKyX1*&4h{gHO=l z`!)Cl8vKw3H*4@C8hoM#KdQkeY4ETHzfgl8)8IK8yhnpi*5FLckE#A&q`?gu{9+BB zropFZ@GK2JRfC%}xJ85KXz*zo+^WH?8oWq@&(Po{8a!8n%Uhaj4|LWt6|FNYO87@FRq3k)Z4-We$vTIL2UV0VsIrwi?4?V} zE6c8OTvolPa;E8q35`CtXTr=0G;-C_gl)o<37D}30~2N<%;T zVbxN_#Tu%yUK_K~*{TV{V5p(Im@7A7jtZ&nDHDjo^vOd?Kw?@~-$-+_KCJ;~V^f1C zz>%xjjjYAv3i|vGr`yfg6hY0LiOC7(CNw?9+&kdGlwo3kQq2@wm5EPCW}4WP39I-- z+Tq0<9oe23n82>Rk!fdfGiR!Ej~I~VK0Zul{mrhR$=lq}VDkEcrWQ;P+?bo>TKf`J zwXmkz>Zq!zXRNn3VGMf*CHYR`u%k)r_N1|2Cb0*TZlnT1*jp*=T?`HH4jtNE$X*_n z^5fa;<6$X3naw(er~GU-`|ewLc@b|HH9GS7Fki9uF^PU3s`N-@a=PKI3IRnS1it(Q~rDJ%z0w2lzYVGVUm350A_I zSw7n}F8koc>@9*%jLQfWvW@3v-kHxHI5)fPVz%d8z^|U0(O$?R=VpF3owc2p{kx0U zPtF6p{k)9c0`~lQnXgZ0@0^$Y$z-dVE>qq`O`esaY6Q%6WK2=0Niy!#=!#i(gm3vdF+D=vLBzwI?b3j+-uHg%VXQj znQ!N^=gryOX4Y-~BaPpv=e7kYpBh+j#0djipHRx!?u4tqG_W5fqR)CG(R9>+=XL{t zZzn&9j_=77x_>rh3p&473@NyO-H-y#qlOAdJL}F`;H(ACTHve&&RXDqhXovtGJ9#| z0>`5H^X-dD=Q-vst(<2scQ~R?5!#1!WLZ9I*7*WKT7-2J&haf(_k3!;d(OvjmLa6lR zn+W2geHL9}6GN9+w3U97&|i`$ZZ!Ic&nh-KST9k-r7vRl*-Y?}m^0BQ^-8E9NbzTFO}4qw4D%i@M4g_|{)o#v>MI zZId61m)OY0erj?0YXfhv^!7(BSU>f90s%*$$?5VuRNLTN>1=q|;;X3%cqn238~zNB zSg^v`>~XBZromsTQ5-9qYid0He(db9EsYrKQ2;L>-RM`Ij9^oiL#s- zLtmP#XU5Q%q`Wu`zZOaIdua13cCl(wlz&F?`w~=eM)CX7qp0!w)x{^JiFmDbsFhZe zrm8dM=}XtLGv?__+ln}O*(THGuXj7#^^Jk2ETSYmwT;;Riv7T?9yc#n_9SnqXp@eB z2*iehTCd?Z18}Mae@rm_%+lcW);2q9JzE!Hhh&-ERQ9VVXBMQj_sE0Jxt&<>26 z24~Q*+UH-@SC#&ljJ8d?AAYN|YvFem>>h2Q9T+l6n)E(m2}C=S)L*GYpsBv0LG&*V zTLNnuSNa;B))8)5<=$>-_SV<<{Ea+e*mNQ*dBWqC<^Z;NxYr6Hg50A!7WE z$3#&|*ki06z8r76SYj97*OidiMf>`0?%K37sW<7?-d^NADG_y<)g`lyH-5u(%d=zH z#+@u-XcwZfb!<=FVAANSZnKprkuK9 zuqiNe+B6V#%_}V~U*ojJPPfPB<&yZHDlviLa z$~V=_C|YSPaJxLYHN~#n8HLtD*NmDXYf+8cYR&WH7P<1AE>}&VXEI~Xl`oq;Y%t!d8~5cOQKqY zd+>48%H~@B;gkdO0|)kUI(z{K5?WJ_?Vr;EepkPwbK+k~Z`6cwZ`9%IAHA`G>K^5P z^u`v)LLPBo{04!?#=lG8VwjAod0a=%n>zRqE?}VEn+Q)dF!?+!6-)yjZ{D&1c)WSb zO2XsJTbKnKUOmi$=^5sD%z|A8mO3XsDH2!0gMMc5``pf;lUeYoOpwQ>$sAx$gWrb& zcf0B1D$B}1fLYYXRtwGVFk7u}Y;FXII!m);Ml;Evb0z#b$W`xDUw3AsnYgOcYn`c) z#oOHIplPM9B#JYXEp>e~&n2h#{zDBN_|A%W zsn3=CvLBZI3c*PIFzJ%*QufdEjm(MOUZ3Qb{jl_N1iuWI`~v2W10-99%YIw>vHY@s zm;QM0()?cX%W*>ba-5L*C=Xh9Aqc`Gf0sz0^h>mnaUV<5w-CfH$1(YQgBOh4b;*RP zq#9Y?!>4Z}h*v(N$o@yZKWh?k_^g53)$!`@OZOHe1@|+EdOD;D5v@5tD zzkEL^-WSHkkEIvKkx6=_e!ADXSpHfB5@yv#B3kG#j$x3b^vOQ5?6e}9sA9quiHJnf z+(Ft;*56iO(ssCwx~;;%w?`56M(UODEx3utpC>*Bqw^KCE{0_Dt85C#OeSqKmM|`9 z*4;oz%n@t^-v-}+MDK^61^*5B%iwutId<*d10(^4V!p>d7^@~|T zYg)p%;e!pE!9zHG_hmv_CsB-fgSiFiLeKI=z-1k#cnaabir}--)U;a@%SH?eKp030 zufyLGgkJ_f8lG6rBbWef#9uqYdHKvsU6+v9oVqb#Uh3uqdn(Du!QU_NcN7Y`;j_7^ zX>(JvN>fd3$#*2(nV9%yY8HnOnbL3se?Nhp_&h_6J2%x-nwnFZYHdr|r1G>9X@VG) z6G8Zb26;#R4X|wp_XG4a_r*3L7^Y_P5I^fu#y-Wph!0(JRb7KNrF6;^=5Sb}O|@G) z_>X{pEmb_&jwgut=cZaqQ;SMdOWFo+8q}F`NAjIXw>4EeKm{Vc)Q%Ez*S9E$mtgb6Hr0g)O!)r-fa^|1YsH4;a2-VGs-M0Ek=kxmO_A ztKnS|Tiq|L1OI@Mu5|ZX5$8TvXNT$|(Vm$w3?uWL} z;p>#pTT0tI$Euc;=XT{DP&((7qD$%>9YBZpX^q7=zMZpC{5^l@b4!)byC+gAu0RhN zOzggJZ^~S{RXS1@f=_AdVZo6Qbw#+hSLvKF4PoJnFh#v~kCdnKFBSa7ds8kG{39j* zaHV6;{3T#a03!q^_62m#`TG6d-f#~rb*)I0()K~C5?X6klu*!IqJ);1i$ZpDveG$^ zpn2vhx&uDXyo~OO=xznwCDUD<(z({0qjUz%SxV;;bDGj=HygrRAz$eTnsYjqn5{~O z-JCVAW1iV$M_|>?6{Ae7H)uBCFDrZp#a#c~Mwn_(jd46uT6$&a6{Sl{msBh3KiCX- zCx#dnMpwv`&>odQ=gn-Py){N($_tP8_HIu`G!P>WgMdH!9@Rg>t28p%s%+>DrYRjyn5it4$*00eh>_cc!lhEYy7ZdTYfG;yb?iYY zN}U-|ov&gR)}6$I;)&XhU{ZP%{!(oP>49il=!6p5VW#;X>^H9(eshLuzb3@r&}H24 zDkkAJtemJ%Dz=V6_6|cu=fe!o6!y^j&r-42LnnDTpe=_(g6&C?-Z9@`>!?J|V5%Xu zjw`a1jtsIa8pkH{I>b;ycbm7u-`8%YoMKkOqK(I-bhew<0V57HlXf1H2rCg3p9mD> z`_zWe*6udnM_}hB^A`Bw>n;Y-k#9yrGEhURqK34L8j?wENKf#1y-#q#;Y%rc+jB_m zoY{D=3%>}O4A5o(9!;QnzaR3+A#NJtaw~)%6%3Dq;deCMw|D-6Yfw5H%%<+4HJf`6 z8ru&cK(%5#^IgVeG{5pMm~NiA>CEkh$Z?Hk4@JV3izq{}_Tr|pg(5H$Syfsqog%|p zk(^%iwjFbZTn@VP30^GWClE-c8%m}_wdQ5jxx-9m%Y7;L09BOxM4SaY&YS~PF!?nC zyPGgH(x&k30(qMwJ9q;LHw$nYUNcZ9(WG?D2oZmS2w2SnRDB3*q_7ntY=;Qjh_oQ= zatg~AVF^PCn=^;7LzC4!R02ckKY{4vsM$f^-D*=so+uzZQV6&Okv}n}snI_%F5iFL zc-@FQ9_{TtZcIygi2K*=I&PfO)z*{9n~bqye~t0VuH(k^P3EJJRAaR4+8)NEjdAWS zRPMD(hnkD+i|1a%$JP|;C?uTyhUQT(-W zLmA9nG5LLEdk=2lZnhQgGqz7d678WLWmiuUCTj^wXqVD<7`YymV0%`L*K4#FANMEO zL&v#g!lXZ8mp{oCdgZdt*`HB*!mlEc812xa+RYT<_*`Rpa#6*G-kVY}&e>^rJ{jJP zNR^#aW~sJ&-yS*@erzJ~U1P4|wyf-&{RuA%Tj&jLcNllWgQuu3qkiWuVTol!$O59@ zxlw>W5g?g5oFl+v0_-C20s+pZYQqO7$T)%!j74joqO#`6KT2`097`(JqMBe9z^4)` zXbQk6xgG85*u4I^9z=~!1g_nK>0BBypF}LwEItO_ZC(fHS?6dI3%`V?dX!Z+3F%yG zHeuGHgm#Co&cbx+Hwly^6U}~JM|kds&(b+sPt&24guM?eG6xBlI=-vXvBa#X3cjf- zxIIR}Nx)hK*T+$CD=Byb=8_6##8Z%_DwrIj;GP%-W8)~8ND6MyDEO1P&q~o8JT``# zXg|WVCZczB+8k_0c&v$@0~R?)qaYAZL8YpoHb%j#fVB#)jHAFs3Z62cGby3cbc2+} zGTygi!d<*N(&=cS>as?Wn8AiRHGJ`eW*i&R3DRPZo>0SazW75pGZ z!LS$w9dQ(Vj}&}3xKF|ScnY#r1v6t5{2Xwfu|5x96h}b;DcGb@@a}{-Ip`Y2P4pVV zw8d?UQE&j^u_pR2V3CV83fki-aHtAeV-$n|YjaQ+M?sJjJU^(I(Ua0*AJp-pGT zD7Y;~!QeOwMv{Vhje_0TaZL0hRl$=8)0${fjDm*{9&4gs1B)0m3Y_s2%vKdFjZyG> zz*-ZP$5C)ODR?-g&qO%Deqc3yJ%gL*Q&ayOxMLK&gYeiK90L}auTk)000Y{=^{Rqz z$0$gOQP37g!JVYw-Q>O;ToO;gSXDt@jDiOMYfUsEjsgoQpw~CDCBKPh)B!ozj^{bl zqQmt3su>wdV-)N{c&v&32rP1eM#0*63a(TYG{q=53Rs(i6>$_akb>PweK{BxPr(U{ ze8_<@M#0T73VP2Un1f-Yz^PI2G$IW!(RWk@k0MNKqVr=E+#5&1&w)i~O-z=-Rq+($ zs|qS&6g&f1Yoa-E6xc|?j}!Y8^x$B@0XcXXPvRyk?|CKG=k?~aEyZ0F$z8(H!ufD;OV|b*3sV}#(*lcS(W}XglX+F zwx9IR!}ko3{(1PP#6z#DXyS>tomQoIpE|R((o3>Q=O7rb&L}7?8pPk94MR{#`-jF2 z%P^Eg+`~>Q=P!KX99PaB1gUZ`v55>my`1mr%eg_yp&c2Koiv-*&sU|soas`|5zHbY zt4}XykiML5n!l5Mwve1@rKEhO8CtokqES~K1G|8EAdN~j2%juAER+F4}P-v&g*LC`G%A}lp zlJmpU%Ne6D=S!M;Q@;LzDMe(->E*nD$U63UT*`TrFI>}>*%hyRFRz&9M?u%z^ zw~z$-QHTK{=TZp$e8hl|p%n7Xcp>z~Y~;!~AsZf|kiX#)D{aFj3i(645I=?dHeSdw z3V9%2$R!lg885_4A+7O3hEj+tUdS;_Mk1B*LSCVe;&>rDD1?*^$ms(VVvHBEkwQ-5 z62sH%k3{02*F=uQLFY&I#X+r+C*z>!M1DFDHMVadP_F7VW8zQibsypxiI&p70~+az zJdENepQhvF(+`!9f2Znq9|X~jH#K{@chO}BU4BcKU(@B6blFOm2kCMzUGAaF-E_H= zF1OQV16|hP5^2RtQu3RGKTYVnQ2678pQ-xx4!5}~LUj~OdXuqx@JOK3xNf4S1!`RM zvqzELvbf?;>MqxpQy}Hkk(}#KFK39poR4T~PU)FQa_BdsPLr=)h^&*ZhoqdhP{@(L z%i@opuQmE|u9I?pOmZHOa{4-zU~{wz;PN2D;GNFumBdz_b+`-l3J|pm~LTGG3IwAHE0tRHt`w ze^2*ch~qW7{E03v(&c%%e**g*aaRzuc^fr&$*)<<;CvT-9qj%6L!x4bB4P!bc z_IN38ewOwtJiTYMi9JP^((Uw?zdZLq*aeoIht`>BaV94+1CJVfzX4<)$mgtzI?x=y2{>Ji~R~0DjQA&gM119O9kE2 z9f-ysbCbGm~8G& zMSZ}Q3R*sK@g(r5SUMlJH#vcupB5l!AHvRiD4mruS;clobCb%Z8bk)Y>ErL#Is35`@btb9pm7aj^<2o6_5)0K{NB~+c( zJ;<1rU`$U`I;v2itMO-0_LX4riY*x>SXK-j;da`yvz?Mv2$dp5n*9!_xwg=!NEKG> z4%8Ua7g_02LKhQV(r_^<`xaD@RHbcSNvOIgv=DVBz04YRV<5M|ausMe27B=_WBYrU zw5Xgy}(`g2Y!` zJjJYFcLUXaEG?y%Wnt+sZGI2knvCuHktbT0yRDQb@Z<2e@kC5%;~BC|K4ENs6j5kl z%!KtvG-j;8g>OP6dnjyAI93t*+_tMX$!2`!RZYqoXw%HcJpSHrsYn-~kHSyT1RTjp zvmO4p^u%6+9ceT*FQMi#ucrszvScLfs+6^N_V#v{#UvbYa@2&Yi>v8L%Dysm1?Xe% zqCS5g@_;^XB87IRsZFrYM#wSxJEZ@k@UwXQvs11Y@*(i!@Q?7k%p1r)>H$%W{vN)b zUJbN;ZD<}$1{>TR;Dr!QQaY918pY_{RbyP4@|`@UOLRU-}n zqecFwxIg7U=j92#`-=}5E3gUby5$@>8A20QiwEQ6n3 zrq+g!!bABZ>&M~a@MDttB0cT@v!wpLQ%&k_?)OjXDPWY;)PFyzcGwO5YI+%p8MXJL z@N(2ZTgPxza@u4QI;2eAPj#5e27OSX@s>wWR@CH_;={pu+`OcOUchFxb}B>E)MaCo zw!P>RUTOY>%#F45l_`@%9!_ndyyP5#xsMxi*OY1;)^ zhnwG5diRsKLl8ImRW9b5(T}O({tR(oScbprux-w=%RlJ2@$#W`}fd$7N0V<{}D#AcU)rcs4-N8-m92=O6foszAz0# z5;~lg%a!e4s9h@-*!LRS$k-%~+8^3wMe$cbbm|{t^_$xB9VjcMW0p(+gq#Y0ACH3e z(8U!ApV>pJ(z>rwI&MHQo>V69#zK4`hu+RjM{`2gNlWMrkXwN_cNLgfB4-r!lydR( z?hMj&9&dJ}se453346jR+vHD!sMl|WXHYJarrSdSFrcrN+u6dik)r6}ft$Ymx(XU? zlfSZuj*8s^>Bdb8nlT0!8d)~HWNfDg9A)3{sdH2&zp2h24u{bJE~H*?6qW5^+|2K3 z$-?dKmOKziDBqFzOWxBR!1@6df6GjP52La-bYHC*-zglSBU>&;81Y`w(><$yI9hMo zu$D0*yt1cz9BxTewE>-zs%k_ea7^8=Nt$h;4=NI{kCOIpQr}R) z_j}kd*&*$#Kd9)uv8S)Sq*e7`V`UG54z2zNHdO?MgC{)`iVWSq+RTU5H#l9!48Mfm6%_E=(5=ek8)MYE~G2Y=rK_hYf z(uZE!wkvXJMZzvy?*ZH4FSz1^WH{rkmm)q-!k&uIep~SuHy@5n17vIaYRHWj&}1@N z5ov?)IwXVT9)L#5ZI9umH<1DwK+~u&D`EA$Jt#M2@)5Gtu9Mioax!V{bxP-Dy$VvG zbj}{C^uDMZ#`LzUdxDU6S?BO_ZVzPUpqdt2XuneY#f=v(>>N(Ev>&8mwjWGud6^<} zV;qk5kGTi78k#Rs+IFLuPEyBqJ~qn?A-0!M$6*_#bGT~2(Q5tQaKPAJh)fFoO6TS1 zrC(JJe+-`CnEH@i#*$q)bLb0XB4PDwkz_#B4mIshomKN3_b7AL!#KuGci~#md6S`Z z2ih$rEnnM1pY>}Gy#8SV`4-7Hhv#6VCdFoZ=ok)sH614Hk5@WIe`g}9?up1HF!vw+z_GRu&^@P29gQAxLsu}d_Yi-0 z!gejhr!W-69+W`ujBkLocVYvs#_=V%Bf1VDEOHcy?VY%iXsVTwL!4etG*#(H7pE5! zO%t-nHcnqgG^Tn?;;HX$Fl)4lL4g`Z&>e z)?<-l=ofn@{+(!=Y(|c9`YodIx(E5^^q+~wYaryG(=QQ?7mLV0r=KT!6w&u_dNCH{yEWD3_<=m{Q%Ke4?_Mq{e7abNQV4#`Yxie+=l#f z`gWqRxQG06`WB+G9*X>P`g)?VgpB-i+D|kV#gYH7DgWOf+C=nGPOl_7o9IKFUQYA` zqPsY~nCJ_L-p1+6h{hlC&*`~DqYRLLPR}HI64C28ok#SAL^pBzBBFDMUcu=JL{BEV ziqq#1eG$EgObC6Slm$qAxFQHOit4DJIPvleGL7*7@B@3nH*IZt)m7) zZeyy6`_w?lZB;$_xizDTLyjsGIjhmceQL1ewg~x2D#^A!ZsK=TVaZ87)IiCF6HVNw z21ZWGPfFm^}7_FfG?Dq7MKW*y324!@vB7q z5)pDOe3CEwcZx@@7Jn3fbBz2OIX)`dpUHlzo#UgU{g}jOqXSMzJcr5tK;j*UA1}Z6 z0q5yiX~LcQeKz?Y&;=)!&}ZLTcZw6k7=6O6b!mJc8ph}|POY0j_D6o@xOTM&%m+^S zy-(yO}k6`q=pPU>Ayh0AW$tOn>19Izt$II9I5SWnIS1w|>VA~O&C60@ZM_dRl zUsD1S$FSJ`a~SNIn3x_N4`lpwj%P&21&N0shw2*>{Uc4_$lVXU8L-1PQ4eWiLXOUb zAUth_iPI*?CHG6h=Whn!EHLY6FB8d{;uooLgq$}4^;2~aLN}g#4!&vSuKK)%>|HGVheV(d1Z_;>2y-_yZ=sDnSEgYVVBU)RBDcSpQ@ zrJy3k!^i62IXZZu4nAK8U#^3Db?}>XaN3L$5ffyRPoKifG3^>_&tFSY=Ywmt8;?rR* zTK8uiIm}Y;4SM`d>bZE9(z&*&K_^bO)1HqfPH0pQ%M*kHXXN#rofkbd&(q)}R(?>P z)6&#{1MuWYdbKW>!_(^WGzA?v2Co5U@X@bz*EoYK>k%LK^PR!D_1Qf%8gQ1Mdfp#m zsq*;IerIxupToJb9yzLIE`M;P2gg8Ka8RDZ?ZXj}NMiKP;^06%PR738LuVJ4D~|b# zODphO=CEkw%rodd!Ax6*CP^XWMB@}&(e<&N_5O8o41Zk`=SXM0_1s_946%NH$i)YsRu z1!ZOZN-%c-k~aV;Kqjc1@dNG64IT&mD1lZ@M>m zB&U$&CRs4L$Ll7OyBqM!={NzOpZSesPvpyKGxyq}ZxRjSemAj#;rIC^_-|z9T8oD@^dRNzb z-HyecKyxEVs)$w1fx3YLOF+80@29j4bCQ1ye0KWu8dON>T%8Uz|OukQB2Z~ z<=)ketkKih=xgay*%#Ws?H9N`L1%qKZV^%$44`8A8XW;=V-t?0MPXD23)G5bbHl3c z^78t6Y5`PZd3~UErS{)yt@=00sVvV{Jsn+4)j}8aO?UA8c!K4Azt4}FLr1DQmRARx z8uW`=Un?51)v+WO;ulpHIjR;`&p?x0j`+FL{rJ%knAG7RMP<3X=KWihY>91vxVSpz z<(9Vy+XYdbR`YD-Ssgea7?h*IU)D z7nC0oRK@>lQz>>-I9$AWmM?DPRYz^$$X9VRdA?e7p4?bGb!5bUM-TPcUN@C?qng;5 z3g)b)YL`pJ1oZ_pK%$lh=t$)TpRdVQi4yg!iDiHdB#bQiQXGcyydgFdDdT`#&qj|RS`388FdgVwG z!=>|M2{3bOs{;dUz(-SyAbxblDk^1EZ2##>dF4DB@&5Y`(WUt-{o!$&yRC^P;j1() zBx#+T?@DY4ZZJiU^2_&p&k2Dh8BzFnT7K;NAk(x@t!-GenGNw+)$X*FV|D(fC+MPJw>Hd8_0X|EAU5YlKgUgXk#jc_N@=a z>X-Jr5^;!sE=7iu>qS!ceMB&G#|6$--{qlWv(*l*DXmEc4vHE`{_y>t-@;&CZN)>QV>(3vP zefHZJe!0Fj>L!(s%3SMW)Bg+tDg9EvT#vH~ez{I3WyJFDiQz91?`KN{zbU5tW7EGM z{8ZItv=#T8KryyrJ@@F1U_TdYzds?As=DNt>yJ%>KQ?|WeH6TucgZi`Q@0BKLu4E{ zS&WijrYnLliX!=y2Th=`X`_vM{2X8PPQ6NgiG73{!X&?ZPt_#&W9zR;G76tQw9)c! z5&9)R-$|k25!s0%^m_u@^vn0+TZDd@PstFQe)X+qW0lzFe ll|#L5zf<)~q#_X*L-O;z +#include +#include +#include + +#include "magpie_bench.hpp" + +__global__ void vector_add(const float* a, const float* b, float* c, int n) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) c[idx] = a[idx] + b[idx]; +} + +int main() { + const int N = 1 << 20; // 1M elements — same as the Triton example + const size_t bytes = N * sizeof(float); + + float *h_a = (float*)malloc(bytes); + float *h_b = (float*)malloc(bytes); + for (int i = 0; i < N; i++) { + h_a[i] = static_cast(i); + h_b[i] = static_cast(i * 2); + } + + float *d_a, *d_b, *d_c; + hipMalloc(&d_a, bytes); + hipMalloc(&d_b, bytes); + hipMalloc(&d_c, bytes); + + hipMemcpy(d_a, h_a, bytes, hipMemcpyHostToDevice); + hipMemcpy(d_b, h_b, bytes, hipMemcpyHostToDevice); + + // Use the Magpie helper's side stream so the captured graph is well-defined. + hipStream_t& stream = magpie::bench::current_stream(); + if (stream == nullptr) hipStreamCreate(&stream); + + const int blockSize = 256; + const int gridSize = (N + blockSize - 1) / blockSize; + + auto stats = magpie::bench::do_bench_hipgraph( + [&]() { + hipLaunchKernelGGL(vector_add, dim3(gridSize), dim3(blockSize), + 0, stream, d_a, d_b, d_c, N); + }, + /*rep_ms=*/20, + /*n_retries=*/5, + /*estimate_reps=*/5); + + magpie::bench::print_marker(stats, "vector_add"); + + hipFree(d_a); + hipFree(d_b); + hipFree(d_c); + free(h_a); + free(h_b); + return 0; +} diff --git a/examples/simple_triton_test/README.md b/examples/simple_triton_test/README.md new file mode 100644 index 0000000..7ba176b --- /dev/null +++ b/examples/simple_triton_test/README.md @@ -0,0 +1,71 @@ +# Triton 0-overhead Latency Examples + +Self-contained Triton vector-add example exercising the new `Latency` +stage end-to-end. See [`docs/latency.md`](../../docs/latency.md) for the +full design. + +## Files + +| File | Purpose | +| --- | --- | +| `triton_vector_add.py` | Triton kernel + 2 BLOCK_SIZE variants + `get_inputs` factory + `--check` / `--bench` CLI. | +| `analyze_triton_latency.yaml` | Single-kernel analyze with import-based `bench_target` and `method: auto`. | +| `compare_triton_blocksize.yaml` | Compare 2 BLOCK_SIZE configs ranking by `kernel_median_ms` (the autotuning recipe). | + +## Setup + +```bash +pip install triton torch +# rocprofv3 needed for the kernel-only half of method=auto / method=both +``` + +## Run + +```bash +cd /path/to/Magpie + +# Single-kernel analyze (wall-clock + kernel-only) +python -m Magpie analyze -k examples/simple_triton_test/analyze_triton_latency.yaml + +# Compare two BLOCK_SIZE configs — primary_metric: kernel_median_ms +python -m Magpie compare -k examples/simple_triton_test/compare_triton_blocksize.yaml +``` + +## What you should see + +`results//analyze_report.json` will contain a top-level summary +block per kernel: + +```json +{ + "summary": [ + { + "kernel_id": "triton_vector_add", + "latency_state": "SUCCESS", + "latency": { + "method": "both", + "primary_metric": "kernel_median_ms", + "primary_value_ms": 0.012, + "wall_median_ms": 0.044, + "kernel_median_ms": 0.012, + "dispatch_overhead_us": 32.0, + "crosscheck_vs_rocprof_ratio": 3.67 + } + } + ] +} +``` + +(`dispatch_overhead_us` and the high `crosscheck_vs_rocprof_ratio` +illustrate exactly why we need `kernel_median_ms` for autotuning small +kernels — wall-clock is dominated by the ~30 µs dispatch path.) + +## Without Magpie (sanity / dev loop) + +```bash +# Correctness only +python examples/simple_triton_test/triton_vector_add.py --check + +# User-harness benchmark (prints MAGPIE_LATENCY_JSON line directly) +python examples/simple_triton_test/triton_vector_add.py --bench +``` diff --git a/examples/simple_triton_test/analyze_triton_latency.yaml b/examples/simple_triton_test/analyze_triton_latency.yaml new file mode 100644 index 0000000..9586db5 --- /dev/null +++ b/examples/simple_triton_test/analyze_triton_latency.yaml @@ -0,0 +1,58 @@ +# Triton vector_add — 0-overhead latency harness (analyze mode) +# =============================================================== +# Demonstrates the import-based ``bench_target`` path: +# * Magpie spawns Magpie/bench/_runner.py in a subprocess +# * The runner imports `triton_vector_add.triton_vector_add` + `get_inputs` +# * Runs do_bench_cudagraph (wall-clock) AND rocprofv3 --kernel-trace +# (kernel-only) when method=both/auto. +# +# The `testcase_command` still drives correctness checking; the latency +# numbers come from the dedicated Latency stage (independent of the +# testcase exit code). +# +# Prereq: +# pip install triton torch +# # rocprofv3 is required for the kernel_trace half of method=both; +# # without it Magpie falls back to wall-clock-only. +# +# Usage: +# cd /path/to/Magpie +# python -m Magpie analyze -k examples/simple_triton_test/analyze_triton_latency.yaml + +kernel: + id: "triton_vector_add" + type: triton + source_files: + - "./triton_vector_add.py" + working_dir: "examples/simple_triton_test" + + # Drives Correctness (and Performance if rocprof-compute / ncu is + # available). Stays unchanged from a normal Triton workflow. + testcase_command: "python -m triton_vector_add --check" + + # Latency-harness target (per-kernel; overrides any latency.bench_target + # in the framework config). Magpie's runner imports `module.callable` + # and times it with inputs from `module.get_inputs`. + bench_target: + module: "triton_vector_add" + callable: "triton_vector_add" + get_inputs: "get_inputs" + +# Pythonpath so the runner can find ./triton_vector_add.py without the +# user having to `pip install -e .` it. +latency: + enabled: true + method: auto # both for triton; rocprof_timestamps for hip + primary_metric: kernel_median_ms # rank by kernel-only timing (autotuning) + rep_ms: 20 + n_retries: 5 + estimate_reps: 5 + warmup_iters: 5 + seed: 42 + # Restrict per-kernel aggregation to the Triton-emitted kernel, so torch's + # seeded `randn` setup dispatches don't pollute the kernel_stats summary. + # (The Triton kernel function name in the .py becomes the rocprofv3 name.) + kernel_filter: "_vector_add_kernel" + pythonpath: + - "examples/simple_triton_test" + timeout_seconds: 120 diff --git a/examples/simple_triton_test/compare_triton_blocksize.yaml b/examples/simple_triton_test/compare_triton_blocksize.yaml new file mode 100644 index 0000000..7a74055 --- /dev/null +++ b/examples/simple_triton_test/compare_triton_blocksize.yaml @@ -0,0 +1,59 @@ +# Triton vector_add — kernel-config autotuning via Latency +# ========================================================= +# Compare two BLOCK_SIZE variants of the SAME Triton kernel. Both go +# through identical Python/PyTorch/Triton dispatch — the only difference +# is BLOCK_SIZE inside the kernel itself. +# +# This is exactly the scenario the in-team note flags: dispatch overhead +# is constant across configs, so wall_median_ms is dominated by dispatch +# noise on small kernels and may rank the two variants tied or backwards. +# Setting `primary_metric: kernel_median_ms` makes Magpie use rocprofv3 +# --kernel-trace HW timestamps for the ranking, which isolates the actual +# kernel-time delta. +# +# Prereq: +# pip install triton torch +# # rocprofv3 needed for kernel_median_ms (otherwise method falls back). +# +# Usage: +# cd /path/to/Magpie +# python -m Magpie compare -k examples/simple_triton_test/compare_triton_blocksize.yaml + +kernels: + - id: "triton_vector_add_block256" + type: triton + source_files: + - "./triton_vector_add.py" + working_dir: "examples/simple_triton_test" + testcase_command: "python -m triton_vector_add --check" + bench_target: + module: "triton_vector_add" + callable: "triton_vector_add_block256" + get_inputs: "get_inputs" + + - id: "triton_vector_add_block1024" + type: triton + source_files: + - "./triton_vector_add.py" + working_dir: "examples/simple_triton_test" + testcase_command: "python -m triton_vector_add --check" + bench_target: + module: "triton_vector_add" + callable: "triton_vector_add_block1024" + get_inputs: "get_inputs" + +latency: + enabled: true + method: both # explicitly capture wall-clock AND kernel-only + primary_metric: kernel_median_ms # rank by kernel-only — see comment above + rep_ms: 30 + n_retries: 7 # extra retries since variants are close in time + estimate_reps: 5 + warmup_iters: 10 # extra warmup so JIT cache is populated for both + seed: 42 + # Both BLOCK_SIZE variants share this Triton kernel name; the filter + # also drops torch's seeded randn setup kernels from the summary. + kernel_filter: "_vector_add_kernel" + pythonpath: + - "examples/simple_triton_test" + timeout_seconds: 180 diff --git a/examples/simple_triton_test/triton_vector_add.py b/examples/simple_triton_test/triton_vector_add.py new file mode 100644 index 0000000..789b012 --- /dev/null +++ b/examples/simple_triton_test/triton_vector_add.py @@ -0,0 +1,155 @@ +""" +Simple Triton vector-add kernel for Magpie's 0-overhead Latency harness. + +Exposes two BLOCK_SIZE variants of the *same* kernel so the +``compare_triton_blocksize.yaml`` example can demonstrate why +``primary_metric: kernel_median_ms`` matters for kernel-config autotuning +(both variants have identical dispatch overhead; only the kernel duration +differs). + +Module contract for ``Magpie/bench/_runner.py``: + - One or more ``callable`` symbols (here: ``triton_vector_add_block256``, + ``triton_vector_add_block1024``). + - ``get_inputs() -> (args, kwargs)`` — the runner calls ``callable(*args, **kwargs)``. + +You can also run this module directly to sanity-check the kernel and print +a ``MAGPIE_LATENCY_JSON: {...}`` line — that doubles as both: + * the user-harness escape hatch (``method: cuda_graph`` without + ``bench_target``), + * a quick local benchmark for development. + +Usage: + pip install triton torch + python -m Magpie analyze -k examples/simple_triton_test/analyze_triton_latency.yaml +""" + +from __future__ import annotations + +import json +import sys +from typing import Tuple + +import torch + +try: + import triton + import triton.language as tl +except ImportError as e: + raise ImportError( + "This example requires Triton. Install with: pip install triton" + ) from e + + +# --------------------------------------------------------------------------- +# Kernel +# --------------------------------------------------------------------------- + + +@triton.jit +def _vector_add_kernel( + a_ptr, b_ptr, c_ptr, n, + BLOCK_SIZE: tl.constexpr, +): + pid = tl.program_id(axis=0) + block_start = pid * BLOCK_SIZE + offsets = block_start + tl.arange(0, BLOCK_SIZE) + mask = offsets < n + a = tl.load(a_ptr + offsets, mask=mask) + b = tl.load(b_ptr + offsets, mask=mask) + tl.store(c_ptr + offsets, a + b, mask=mask) + + +def _launch(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor, block_size: int) -> None: + n = a.numel() + grid = (triton.cdiv(n, block_size),) + _vector_add_kernel[grid](a, b, c, n, BLOCK_SIZE=block_size) + + +# --------------------------------------------------------------------------- +# Magpie bench_target callables +# --------------------------------------------------------------------------- + + +def triton_vector_add_block256( + a: torch.Tensor, b: torch.Tensor, c: torch.Tensor +) -> None: + _launch(a, b, c, block_size=256) + + +def triton_vector_add_block1024( + a: torch.Tensor, b: torch.Tensor, c: torch.Tensor +) -> None: + _launch(a, b, c, block_size=1024) + + +# Default callable used by ``analyze_triton_latency.yaml``. +def triton_vector_add( + a: torch.Tensor, b: torch.Tensor, c: torch.Tensor +) -> None: + _launch(a, b, c, block_size=1024) + + +# --------------------------------------------------------------------------- +# Inputs factory (called once by the runner *after* torch.manual_seed(seed)) +# --------------------------------------------------------------------------- + + +N = 1 << 20 # 1M elements — small enough that dispatch overhead matters, + # which is exactly the scenario where kernel_median_ms beats + # wall_median_ms for autotuning rankings. + + +def get_inputs() -> Tuple[Tuple[torch.Tensor, torch.Tensor, torch.Tensor], dict]: + """Magpie runner contract: returns (args, kwargs).""" + if not torch.cuda.is_available(): + raise RuntimeError("CUDA / HIP is required for the Triton example") + device = "cuda" + a = torch.randn(N, device=device, dtype=torch.float32) + b = torch.randn(N, device=device, dtype=torch.float32) + c = torch.empty_like(a) + return (a, b, c), {} + + +# --------------------------------------------------------------------------- +# CLI: --check (correctness, used by testcase_command) +# --bench (user-harness latency mode, prints MAGPIE_LATENCY_JSON line) +# --------------------------------------------------------------------------- + + +def _check() -> int: + (a, b, c), _ = get_inputs() + triton_vector_add(a, b, c) + torch.cuda.synchronize() + expected = a + b + if torch.allclose(c, expected, atol=1e-5, rtol=1e-5): + print(f"PASSED: vector_add OK on {N} elements") + return 0 + print("FAILED: vector_add mismatch") + return 1 + + +def _bench() -> int: + """User-harness escape hatch — emits the MAGPIE_LATENCY_JSON marker.""" + from Magpie.bench import MAGPIE_LATENCY_JSON_MARKER, do_bench_cudagraph + + (a, b, c), _ = get_inputs() + stats = do_bench_cudagraph( + lambda: triton_vector_add(a, b, c), + rep=20, + n_retries=5, + estimate_reps=5, + ) + print( + f"{MAGPIE_LATENCY_JSON_MARKER} " + + json.dumps({"mode": "cuda_graph", "stats": stats.to_dict()}) + ) + return 0 + + +if __name__ == "__main__": + if "--bench" in sys.argv: + sys.exit(_bench()) + if "--check" in sys.argv: + sys.exit(_check()) + # Default: behave like a smoke test. + sys.exit(_check()) diff --git a/tests/test_latency.py b/tests/test_latency.py new file mode 100644 index 0000000..561b262 --- /dev/null +++ b/tests/test_latency.py @@ -0,0 +1,364 @@ +############################################################################### +# Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +""" +Tests for the Latency evaluation stage. + +Coverage: + * LatencyConfig.method == "auto" selection table per KernelType + * BenchTarget round-trip through to_dict / from_dict + * LatencyStats summary derivation from raw samples + * rocprofv3 --kernel-trace CSV parser + * pmc_perf.csv parser (rocprof_timestamps method) + * Skip-on-no-torch integration test for do_bench_cudagraph +""" + +from __future__ import annotations + +import json +import textwrap +from pathlib import Path + +import pytest + +from Magpie.bench import LatencyStats, MAGPIE_LATENCY_JSON_MARKER +from Magpie.config import ( + BenchTarget, + KernelType, + LatencyConfig, +) +from Magpie.eval.latency import ( + _aggregate_per_kernel_durations_ns, + _find_rocprofv3_csv, + _parse_marker_line, + _parse_pmc_perf_csv_for_durations, + _parse_rocprofv3_kernel_trace_csv, + _summary_stats_from_per_kernel, +) + + +# --------------------------------------------------------------------------- +# LatencyConfig.method == "auto" selection table +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "kernel_type,expected", + [ + (KernelType.HIP, "rocprof_timestamps"), + (KernelType.TRITON, "both"), + (KernelType.PYTORCH, "both"), + (KernelType.CUDA, "both"), + (None, "cuda_graph"), + ], +) +def test_latency_config_auto_method_selection(kernel_type, expected): + cfg = LatencyConfig(method="auto", kernel_type=kernel_type) + assert cfg.resolve_method() == expected + + +def test_latency_config_explicit_method_passthrough(): + cfg = LatencyConfig(method="kernel_trace", kernel_type=KernelType.TRITON) + assert cfg.resolve_method() == "kernel_trace" + + cfg2 = LatencyConfig(method="none", kernel_type=KernelType.HIP) + assert cfg2.resolve_method() == "none" + + +def test_latency_config_validates_method(): + with pytest.raises(ValueError): + LatencyConfig(method="bogus") + + +def test_latency_config_validates_primary_metric(): + with pytest.raises(ValueError): + LatencyConfig(primary_metric="bogus") + + +def test_latency_config_round_trip_from_dict(): + cfg = LatencyConfig.from_dict( + { + "enabled": False, + "method": "kernel_trace", + "primary_metric": "kernel_median_ms", + "rep_ms": 30, + "n_retries": 9, + "seed": 7, + "pythonpath": ["/a", "/b"], + "bench_target": { + "module": "m.foo", + "callable": "f", + "get_inputs": "ginps", + }, + }, + kernel_type=KernelType.TRITON, + gpu_arch="gfx942", + ) + + assert cfg.enabled is False + assert cfg.method == "kernel_trace" + assert cfg.primary_metric == "kernel_median_ms" + assert cfg.rep_ms == 30 + assert cfg.n_retries == 9 + assert cfg.seed == 7 + assert cfg.pythonpath == ["/a", "/b"] + assert cfg.bench_target is not None + assert cfg.bench_target.module == "m.foo" + assert cfg.bench_target.callable == "f" + assert cfg.bench_target.get_inputs == "ginps" + assert cfg.kernel_type is KernelType.TRITON + assert cfg.gpu_arch == "gfx942" + + +# --------------------------------------------------------------------------- +# BenchTarget +# --------------------------------------------------------------------------- + + +def test_bench_target_from_dict_requires_module_and_callable(): + assert BenchTarget.from_dict(None) is None + assert BenchTarget.from_dict({}) is None + assert BenchTarget.from_dict({"module": "m"}) is None + assert BenchTarget.from_dict({"callable": "c"}) is None + + bt = BenchTarget.from_dict({"module": "m", "callable": "c"}) + assert bt is not None + assert bt.module == "m" + assert bt.callable == "c" + assert bt.get_inputs == "get_inputs" + + +# --------------------------------------------------------------------------- +# LatencyStats +# --------------------------------------------------------------------------- + + +def test_latency_stats_from_samples_basic(): + stats = LatencyStats.from_samples( + [1.0, 2.0, 3.0, 4.0, 5.0], + n_repeat=10, + n_retries=5, + estimate_ms=2.5, + ) + assert stats.median_ms == 3.0 + assert stats.min_ms == 1.0 + assert stats.max_ms == 5.0 + assert stats.p50_ms == pytest.approx(3.0) + assert stats.p99_ms == pytest.approx(5.0 - (5.0 - 4.0) * (1 - 0.96)) # interp + assert stats.std_ms > 0 + assert stats.n_repeat == 10 + assert stats.n_retries == 5 + assert stats.estimate_ms == 2.5 + assert len(stats.samples_ms) == 5 + + +def test_latency_stats_from_samples_empty(): + stats = LatencyStats.from_samples([], n_repeat=1, n_retries=0) + assert stats.median_ms == 0.0 + assert stats.samples_ms == [] + + +def test_latency_stats_dict_round_trip(): + stats = LatencyStats.from_samples([1.0, 2.0], n_repeat=4, n_retries=2) + restored = LatencyStats.from_dict(stats.to_dict()) + assert restored is not None + assert restored.median_ms == stats.median_ms + assert restored.n_repeat == 4 + assert restored.n_retries == 2 + + +# --------------------------------------------------------------------------- +# rocprofv3 --kernel-trace CSV parsing +# --------------------------------------------------------------------------- + + +def _write_rocprofv3_csv(path: Path, rows): + header = "Kernel_Name,Start_Timestamp,End_Timestamp\n" + body = "\n".join(",".join(str(c) for c in row) for row in rows) + path.write_text(header + body + "\n") + + +def test_parse_rocprofv3_kernel_trace_csv_basic(tmp_path: Path): + csv = tmp_path / "kernel_trace.csv" + _write_rocprofv3_csv( + csv, + [ + ("triton_scaled_mm_kernel", 1000, 2000), # 1000 ns + ("triton_scaled_mm_kernel", 3000, 4500), # 1500 ns + ("triton_scaled_mm_kernel", 5000, 6000), # 1000 ns + ("__hip_some_runtime_thunk", 7000, 7100), # filtered out + ("other_kernel", 8000, 9000), # 1000 ns + ], + ) + + per_kernel_ns = _parse_rocprofv3_kernel_trace_csv(csv) + assert "triton_scaled_mm_kernel" in per_kernel_ns + assert "__hip_some_runtime_thunk" not in per_kernel_ns + assert per_kernel_ns["triton_scaled_mm_kernel"] == [1000.0, 1500.0, 1000.0] + assert per_kernel_ns["other_kernel"] == [1000.0] + + +def test_parse_rocprofv3_kernel_trace_csv_kernel_filter(tmp_path: Path): + csv = tmp_path / "kernel_trace.csv" + _write_rocprofv3_csv( + csv, + [ + ("triton_scaled_mm_kernel", 1000, 2000), + ("other_kernel", 3000, 4000), + ], + ) + + per_kernel_ns = _parse_rocprofv3_kernel_trace_csv( + csv, kernel_filter_re=r"triton_" + ) + assert "triton_scaled_mm_kernel" in per_kernel_ns + assert "other_kernel" not in per_kernel_ns + + +def test_parse_rocprofv3_kernel_trace_csv_missing_file(tmp_path: Path): + csv = tmp_path / "nope.csv" + assert _parse_rocprofv3_kernel_trace_csv(csv) == {} + + +def test_find_rocprofv3_csv_locates_kernel_trace(tmp_path: Path): + out_dir = tmp_path / "out" + out_dir.mkdir() + target = out_dir / "myrun_kernel_trace.csv" + target.write_text("Kernel_Name,Start_Timestamp,End_Timestamp\n") + found = _find_rocprofv3_csv(out_dir) + assert found == target + + +def test_find_rocprofv3_csv_returns_none_when_empty(tmp_path: Path): + out_dir = tmp_path / "empty" + out_dir.mkdir() + assert _find_rocprofv3_csv(out_dir) is None + + +# --------------------------------------------------------------------------- +# pmc_perf.csv parsing (rocprof_timestamps) +# --------------------------------------------------------------------------- + + +def test_parse_pmc_perf_csv_for_durations(tmp_path: Path): + csv = tmp_path / "pmc_perf.csv" + csv.write_text( + "Kernel_Name,Start_Timestamp,End_Timestamp,Other\n" + "vector_add,1000,5000,x\n" + "vector_add,6000,9000,x\n" + "vector_add,abc,def,x\n" + "__hip_thunk,1,2,x\n" + ) + + per_kernel_ns = _parse_pmc_perf_csv_for_durations(csv) + assert "vector_add" in per_kernel_ns + assert per_kernel_ns["vector_add"] == [4000.0, 3000.0] + assert "__hip_thunk" not in per_kernel_ns + + +# --------------------------------------------------------------------------- +# Aggregation helpers +# --------------------------------------------------------------------------- + + +def test_aggregate_per_kernel_durations_ns_to_stats(): + per_kernel_ns = { + "k1": [1_000_000.0, 2_000_000.0, 3_000_000.0], # 1, 2, 3 ms + "k2": [], # dropped + "k3": [-5.0, 0.0], # dropped (no positive samples) + } + out = _aggregate_per_kernel_durations_ns(per_kernel_ns) + assert "k1" in out + assert "k2" not in out + assert "k3" not in out + assert out["k1"].median_ms == 2.0 + assert out["k1"].min_ms == 1.0 + assert out["k1"].max_ms == 3.0 + + +def test_summary_stats_from_per_kernel(): + per_kernel = { + "a": LatencyStats.from_samples([1.0, 2.0, 3.0], n_repeat=1, n_retries=3), + "b": LatencyStats.from_samples([0.5, 0.5, 0.5], n_repeat=1, n_retries=3), + } + summary = _summary_stats_from_per_kernel(per_kernel) + assert summary is not None + # median is sum-of-medians = 2.0 + 0.5 = 2.5 + assert summary.median_ms == pytest.approx(2.5) + assert summary.p50_ms == pytest.approx(2.5) + # min/max derived from the merged sample set + assert summary.min_ms == 0.5 + assert summary.max_ms == 3.0 + + +def test_summary_stats_from_empty_per_kernel(): + assert _summary_stats_from_per_kernel({}) is None + + +# --------------------------------------------------------------------------- +# MAGPIE_LATENCY_JSON marker parsing +# --------------------------------------------------------------------------- + + +def test_parse_marker_line_picks_up_payload(): + payload = {"stats": {"median_ms": 0.42}, "module": "m"} + output = textwrap.dedent( + f"""\ + garbage line 1 + unrelated output + {MAGPIE_LATENCY_JSON_MARKER} {json.dumps(payload)} + trailing log line + """ + ) + assert _parse_marker_line(output) == payload + + +def test_parse_marker_line_no_marker(): + assert _parse_marker_line("nothing here") is None + assert _parse_marker_line("") is None + + +def test_parse_marker_line_picks_last_marker(): + p1 = {"stats": {"median_ms": 1.0}} + p2 = {"stats": {"median_ms": 2.0}} + output = ( + f"{MAGPIE_LATENCY_JSON_MARKER} {json.dumps(p1)}\n" + f"{MAGPIE_LATENCY_JSON_MARKER} {json.dumps(p2)}\n" + ) + # The last marker wins (closest to "final result") + assert _parse_marker_line(output) == p2 + + +# --------------------------------------------------------------------------- +# Optional torch integration test +# --------------------------------------------------------------------------- + + +def test_do_bench_cudagraph_smoke(): + """ + Smoke test that exercises the real do_bench_cudagraph code path on a + trivial elementwise add. Skipped automatically when torch is missing or + no GPU is present (so the suite stays green on CPU-only CI). + """ + pytest.importorskip("torch") + import torch + + if not torch.cuda.is_available(): + pytest.skip("torch.cuda.is_available() is False") + + from Magpie.bench import do_bench_cudagraph + + a = torch.randn(1024, 1024, device="cuda") + b = torch.randn(1024, 1024, device="cuda") + + def fn(): + torch.add(a, b, out=a) + + stats = do_bench_cudagraph(fn, rep=5, n_retries=3, estimate_reps=3) + + assert stats.median_ms > 0 + assert stats.n_retries == 3 + assert stats.n_repeat >= 1 + assert len(stats.samples_ms) == 3 diff --git a/tests/test_main_and_kernel_config.py b/tests/test_main_and_kernel_config.py index 5410a21..fc3bee8 100644 --- a/tests/test_main_and_kernel_config.py +++ b/tests/test_main_and_kernel_config.py @@ -106,20 +106,82 @@ def test_load_kernel_config_collects_sections_and_expands_env(monkeypatch, tmp_p encoding="utf-8", ) - kernels, perf_overrides, corr_overrides, sched_overrides = load_kernel_config( - config_path - ) + ( + kernels, + perf_overrides, + corr_overrides, + sched_overrides, + lat_overrides, + ) = load_kernel_config(config_path) assert [cfg.kernel_id for cfg in kernels] == ["single", "second"] assert kernels[0].source_file_path == [f"{tmp_path}/workspace/single.hip"] assert kernels[1].compiling_command == [["python", "prepare.py"]] assert perf_overrides == {"backend": f"{tmp_path}/workspace/perf"} assert corr_overrides == {"backend": "testcase"} + assert lat_overrides == {} assert sched_overrides["max_workers"] == 2 assert sched_overrides["ray_config"] == {"cluster_address": "ray://127.0.0.1:10001"} assert sched_overrides["environment"] == "ray" +def test_load_kernel_config_parses_latency_section_and_bench_target( + monkeypatch, tmp_path +): + monkeypatch.setenv("MAGPIE_ROOT", str(tmp_path / "workspace")) + + config_path = tmp_path / "kernel_config.yaml" + config_path.write_text( + yaml.safe_dump( + { + "kernel": { + "id": "triton_scaled_mm", + "type": "triton", + "source_files": ["$MAGPIE_ROOT/k.py"], + "testcase_command": "python -m my_kernels.scaled_mm --check", + "bench_target": { + "module": "my_kernels.scaled_mm", + "callable": "triton_scaled_mm", + "get_inputs": "get_inputs", + }, + }, + "latency": { + "enabled": True, + "method": "both", + "primary_metric": "kernel_median_ms", + "rep_ms": 25, + "n_retries": 7, + "seed": 1234, + "pythonpath": ["$MAGPIE_ROOT/lib"], + }, + } + ), + encoding="utf-8", + ) + + ( + kernels, + _perf, + _corr, + _sched, + lat_overrides, + ) = load_kernel_config(config_path) + + assert len(kernels) == 1 + cfg = kernels[0] + assert cfg.bench_target == { + "module": "my_kernels.scaled_mm", + "callable": "triton_scaled_mm", + "get_inputs": "get_inputs", + } + assert lat_overrides["method"] == "both" + assert lat_overrides["primary_metric"] == "kernel_median_ms" + assert lat_overrides["rep_ms"] == 25 + assert lat_overrides["n_retries"] == 7 + assert lat_overrides["seed"] == 1234 + assert lat_overrides["pythonpath"] == [f"{tmp_path}/workspace/lib"] + + def test_kernel_eval_config_normalizes_single_and_multi_commands(): cfg = KernelEvalConfig( kernel_id="test",