From dc8f2e68a2f7ba98256316bfd1a4f0fd69851cff Mon Sep 17 00:00:00 2001
From: Sina Rafati <Sina.Rafati@amd.com>
Date: Wed, 29 Apr 2026 01:30:30 -0400
Subject: [PATCH] no overhead latency measurement

---
 Magpie/bench/__init__.py                      | 249 +++++
 Magpie/bench/_runner.py                       | 318 +++++++
 Magpie/bench/include/magpie_bench.hpp         | 238 +++++
 Magpie/config.yaml                            |  34 +
 Magpie/config/__init__.py                     |  11 +
 Magpie/config/kernel.py                       |   7 +
 Magpie/config/latency.py                      | 238 +++++
 Magpie/config/pipeline.py                     |  14 +
 Magpie/core/executor.py                       |   2 +
 Magpie/core/scheduler.py                      |   6 +
 Magpie/core/task.py                           |   2 +
 Magpie/eval/__init__.py                       |   4 +
 Magpie/eval/evaluator.py                      |  63 +-
 Magpie/eval/latency.py                        | 899 ++++++++++++++++++
 Magpie/eval/performance.py                    | 101 ++
 Magpie/kernel_config.yaml.example             |  52 +
 Magpie/main.py                                | 310 +++++-
 Magpie/modes/analyze_eval/analyzer.py         |  10 +
 Magpie/modes/compare_eval/comparator.py       |  10 +
 docs/analysis_compare.md                      |   2 +-
 docs/benchmark.md                             |   2 +
 docs/latency.md                               | 180 ++++
 .../analyze_hipgraph_latency.yaml             |  35 +
 examples/simple_hip_test/analyze_latency.yaml |  37 +
 examples/simple_hip_test/vector_add_bench     | Bin 0 -> 41536 bytes
 examples/simple_hip_test/vector_add_bench.hip |  69 ++
 examples/simple_triton_test/README.md         |  71 ++
 .../analyze_triton_latency.yaml               |  58 ++
 .../compare_triton_blocksize.yaml             |  59 ++
 .../simple_triton_test/triton_vector_add.py   | 155 +++
 tests/test_latency.py                         | 364 +++++++
 tests/test_main_and_kernel_config.py          |  68 +-
 32 files changed, 3649 insertions(+), 19 deletions(-)
 create mode 100644 Magpie/bench/__init__.py
 create mode 100644 Magpie/bench/_runner.py
 create mode 100644 Magpie/bench/include/magpie_bench.hpp
 create mode 100644 Magpie/config/latency.py
 create mode 100644 Magpie/eval/latency.py
 create mode 100644 docs/latency.md
 create mode 100644 examples/simple_hip_test/analyze_hipgraph_latency.yaml
 create mode 100644 examples/simple_hip_test/analyze_latency.yaml
 create mode 100755 examples/simple_hip_test/vector_add_bench
 create mode 100644 examples/simple_hip_test/vector_add_bench.hip
 create mode 100644 examples/simple_triton_test/README.md
 create mode 100644 examples/simple_triton_test/analyze_triton_latency.yaml
 create mode 100644 examples/simple_triton_test/compare_triton_blocksize.yaml
 create mode 100644 examples/simple_triton_test/triton_vector_add.py
 create mode 100644 tests/test_latency.py

diff --git a/Magpie/bench/__init__.py b/Magpie/bench/__init__.py
new file mode 100644
index 0000000..7412be5
--- /dev/null
+++ b/Magpie/bench/__init__.py
@@ -0,0 +1,249 @@
+###############################################################################
+# Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""
+``magpie.bench`` — sanctioned 0-overhead kernel timing helpers.
+
+Two entry points:
+
+- ``do_bench_cudagraph(fn, rep=20, n_retries=5, estimate_reps=5)`` —
+  dispatch-inclusive wall-clock latency measured via CUDA graph
+  estimate-then-unrolled-replay. Works on AMD (HIP graphs through
+  ``torch.cuda.CUDAGraph``) and NVIDIA without code changes.
+- ``LatencyStats`` — dataclass returned by ``do_bench_cudagraph`` carrying
+  median / p50 / p99 / min / max / std and the meta-parameters used.
+
+The marker line ``MAGPIE_LATENCY_JSON: { ... }`` printed by ``_runner.py``
+is the contract used by ``Magpie/eval/latency.py`` to ingest results from
+either the bundled runner subprocess or any user-provided harness.
+"""
+
+from __future__ import annotations
+
+import statistics
+from dataclasses import dataclass, field, asdict
+from typing import Any, Callable, Dict, List, Optional, cast
+
+
+__all__ = [
+    "LatencyStats",
+    "do_bench_cudagraph",
+    "MAGPIE_LATENCY_JSON_MARKER",
+]
+
+
+MAGPIE_LATENCY_JSON_MARKER = "MAGPIE_LATENCY_JSON:"
+
+
+@dataclass
+class LatencyStats:
+    """
+    Summary statistics for a latency measurement.
+
+    All time fields are in milliseconds.
+
+    Attributes:
+        median_ms: Median across ``n_retries`` independent measurements.
+        p50_ms / p99_ms: Percentiles across the same set.
+        min_ms / max_ms: Min/max across the same set.
+        std_ms: Sample standard deviation (0.0 if ``n_retries < 2``).
+        samples_ms: Raw per-retry measurements (``len == n_retries``).
+        n_repeat: Number of unrolled ``fn()`` calls inside the timed graph.
+        n_retries: Number of independent graph-replay measurements taken.
+        estimate_ms: Per-call cost estimated from the small estimate graph
+                     (used to size ``n_repeat``).
+    """
+
+    median_ms: float
+    p50_ms: float
+    p99_ms: float
+    min_ms: float
+    max_ms: float
+    std_ms: float
+    samples_ms: List[float] = field(default_factory=list)
+    n_repeat: int = 0
+    n_retries: int = 0
+    estimate_ms: float = 0.0
+
+    def to_dict(self) -> Dict[str, Any]:
+        return asdict(self)
+
+    @classmethod
+    def from_dict(cls, data: Optional[Dict[str, Any]]) -> Optional["LatencyStats"]:
+        if not data:
+            return None
+        return cls(
+            median_ms=float(data.get("median_ms", 0.0)),
+            p50_ms=float(data.get("p50_ms", 0.0)),
+            p99_ms=float(data.get("p99_ms", 0.0)),
+            min_ms=float(data.get("min_ms", 0.0)),
+            max_ms=float(data.get("max_ms", 0.0)),
+            std_ms=float(data.get("std_ms", 0.0)),
+            samples_ms=list(data.get("samples_ms", []) or []),
+            n_repeat=int(data.get("n_repeat", 0)),
+            n_retries=int(data.get("n_retries", 0)),
+            estimate_ms=float(data.get("estimate_ms", 0.0)),
+        )
+
+    @classmethod
+    def from_samples(
+        cls,
+        samples_ms: List[float],
+        *,
+        n_repeat: int,
+        n_retries: int,
+        estimate_ms: float = 0.0,
+    ) -> "LatencyStats":
+        """Compute summary statistics from a list of per-retry latency samples."""
+        if not samples_ms:
+            return cls(
+                median_ms=0.0,
+                p50_ms=0.0,
+                p99_ms=0.0,
+                min_ms=0.0,
+                max_ms=0.0,
+                std_ms=0.0,
+                samples_ms=[],
+                n_repeat=n_repeat,
+                n_retries=n_retries,
+                estimate_ms=estimate_ms,
+            )
+
+        sorted_ms = sorted(samples_ms)
+        n = len(sorted_ms)
+        # Linear-interpolation percentile (matches numpy's default)
+        def _pct(p: float) -> float:
+            if n == 1:
+                return sorted_ms[0]
+            rank = p * (n - 1) / 100.0
+            lo = int(rank)
+            hi = min(lo + 1, n - 1)
+            frac = rank - lo
+            return sorted_ms[lo] + (sorted_ms[hi] - sorted_ms[lo]) * frac
+
+        std_ms = statistics.stdev(samples_ms) if n >= 2 else 0.0
+
+        return cls(
+            median_ms=statistics.median(samples_ms),
+            p50_ms=_pct(50.0),
+            p99_ms=_pct(99.0),
+            min_ms=sorted_ms[0],
+            max_ms=sorted_ms[-1],
+            std_ms=std_ms,
+            samples_ms=list(samples_ms),
+            n_repeat=n_repeat,
+            n_retries=n_retries,
+            estimate_ms=estimate_ms,
+        )
+
+
+def do_bench_cudagraph(
+    fn: Callable[[], Any],
+    rep: int = 20,
+    n_retries: int = 5,
+    estimate_reps: int = 5,
+) -> LatencyStats:
+    """
+    Benchmark ``fn`` via CUDA-graph estimate-then-unrolled-replay.
+
+    Algorithm (mirrors the user-attached snippet byte-for-byte):
+
+      1. Warmup: call ``fn()`` once on a side stream.
+      2. Capture an "estimate" graph containing ``estimate_reps`` calls of
+         ``fn``; replay it once to get ``estimate_ms`` per-call.
+      3. Compute ``n_repeat = max(1, int(rep / estimate_ms))`` so the timed
+         graph runs for roughly ``rep`` milliseconds.
+      4. Capture a fresh graph with ``n_repeat`` unrolled calls.
+      5. Replay the timed graph ``n_retries`` times, each time bracketed by
+         a fresh pair of ``torch.cuda.Event`` records, and divide the
+         elapsed time by ``n_repeat``.
+      6. Report ``statistics.median`` of the ``n_retries`` per-call samples
+         along with min/max/p50/p99/std.
+
+    The dispatch overhead of each ``fn()`` call is amortized across
+    ``n_repeat`` replays inside the captured graph, so per-call latency
+    closely tracks the kernel time *plus* one graph-launch's worth of
+    overhead (typically tens of microseconds).
+
+    Args:
+        fn: Zero-arg callable that issues the workload onto the current
+            CUDA stream. Must be safe to capture inside ``torch.cuda.graph``.
+        rep: Target measurement window in milliseconds.
+        n_retries: Number of independent replay measurements.
+        estimate_reps: Number of ``fn()`` calls inside the small estimate graph.
+
+    Returns:
+        ``LatencyStats`` with per-call median latency in ms.
+
+    Raises:
+        ImportError: If ``torch`` is not installed.
+        RuntimeError: If CUDA / HIP is not available.
+    """
+    try:
+        import torch
+    except ImportError as e:
+        raise ImportError(
+            "magpie.bench.do_bench_cudagraph requires PyTorch. "
+            "Install torch (or torch+rocm) and retry."
+        ) from e
+
+    if not torch.cuda.is_available():
+        raise RuntimeError(
+            "magpie.bench.do_bench_cudagraph requires a CUDA / HIP capable GPU; "
+            "torch.cuda.is_available() is False."
+        )
+
+    stream = cast(torch.cuda.Stream, torch.cuda.Stream())
+    stream.wait_stream(cast(torch.cuda.Stream, torch.cuda.current_stream()))
+    with torch.cuda.stream(stream):
+        torch.cuda.synchronize()
+        # Warmup
+        fn()
+
+        # Step 1: capture initial estimate graph
+        g = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(g):
+            for _ in range(estimate_reps):
+                fn()
+        torch.cuda.synchronize()
+
+        # Step 2: estimate per-call device time
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        start_event.record(stream)
+        g.replay()
+        end_event.record(stream)
+        torch.cuda.synchronize()
+
+        estimate_ms = start_event.elapsed_time(end_event) / estimate_reps
+        if estimate_ms == 0:
+            n_repeat = 1000
+        else:
+            n_repeat = max(1, int(rep / estimate_ms))
+
+        # Step 3: capture timed graph with n_repeat unrolled calls
+        g = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(g):
+            for _ in range(n_repeat):
+                fn()
+        torch.cuda.synchronize()
+
+        # Step 4: measure n_retries replays
+        samples_ms: List[float] = []
+        for _ in range(n_retries):
+            start_event = torch.cuda.Event(enable_timing=True)
+            end_event = torch.cuda.Event(enable_timing=True)
+            start_event.record(stream)
+            g.replay()
+            end_event.record(stream)
+            torch.cuda.synchronize()
+            samples_ms.append(start_event.elapsed_time(end_event) / n_repeat)
+
+    return LatencyStats.from_samples(
+        samples_ms,
+        n_repeat=n_repeat,
+        n_retries=n_retries,
+        estimate_ms=estimate_ms,
+    )
diff --git a/Magpie/bench/_runner.py b/Magpie/bench/_runner.py
new file mode 100644
index 0000000..ad20c2a
--- /dev/null
+++ b/Magpie/bench/_runner.py
@@ -0,0 +1,318 @@
+###############################################################################
+# Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""
+Subprocess harness for Magpie's import-based latency benchmarking.
+
+Spawned by ``Magpie/eval/latency.py``; never imported by the rest of Magpie.
+
+Two modes (selected by ``--profile``):
+
+1. (default) **CUDA-graph timing.** Runs ``magpie.bench.do_bench_cudagraph``
+   on the user's callable and prints one line:
+
+       MAGPIE_LATENCY_JSON: {"stats": {...}, "n_repeat": ..., ...}
+
+2. **--profile** (kernel-trace harness mode). Runs a tight
+   ``for _ in range(N): fn(); torch.cuda.synchronize()`` loop sized to
+   roughly ``rep_ms`` so the outer ``rocprofv3 --kernel-trace`` invocation
+   captures clean per-dispatch HW kernel durations. Still prints the
+   marker line, but ``stats`` is ``null`` (the wrapper parses the rocprofv3
+   CSV instead).
+
+Inputs are read from environment variables (kept on env, not argv, so the
+runner is invoked uniformly under both ``python _runner.py`` and
+``rocprofv3 ... -- python _runner.py --profile``):
+
+  - MAGPIE_BENCH_MODULE:        importable module path (required)
+  - MAGPIE_BENCH_CALLABLE:      attribute of the callable (required)
+  - MAGPIE_BENCH_INPUTS_FUNC:   attribute of the inputs factory
+                                (default: "get_inputs")
+  - MAGPIE_BENCH_REP_MS:        target measurement window in ms (default: 20)
+  - MAGPIE_BENCH_N_RETRIES:     retries (default: 5)
+  - MAGPIE_BENCH_ESTIMATE_REPS: estimate-graph reps (default: 5)
+  - MAGPIE_BENCH_WARMUP_ITERS:  eager warmup iters before timing (default: 5)
+  - MAGPIE_BENCH_SEED:          torch.manual_seed value (default: 42)
+  - MAGPIE_BENCH_PROFILE_REP_MS: only used with --profile (default: rep_ms*5)
+
+Reproducibility note: the seed is set BEFORE inputs are materialized so
+tensor shapes/values are stable across runs.
+"""
+
+from __future__ import annotations
+
+import argparse
+import importlib
+import json
+import os
+import sys
+import time
+import traceback
+from typing import Any, Callable, Tuple
+
+
+def _env_int(name: str, default: int) -> int:
+    raw = os.environ.get(name)
+    if raw is None or raw == "":
+        return default
+    try:
+        return int(raw)
+    except ValueError:
+        return default
+
+
+def _env_str(name: str, default: str) -> str:
+    raw = os.environ.get(name)
+    if raw is None or raw == "":
+        return default
+    return raw
+
+
+def _seed_everything(seed: int) -> None:
+    """Make tensor shapes/values reproducible BEFORE inputs are materialized."""
+    try:
+        import torch
+
+        torch.manual_seed(seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(seed)
+    except Exception:
+        pass
+
+    try:
+        import random
+
+        random.seed(seed)
+    except Exception:
+        pass
+
+    try:
+        import numpy as np  # noqa: F401
+
+        np.random.seed(seed)
+    except Exception:
+        pass
+
+
+def _resolve_target(
+    module_name: str, callable_name: str, inputs_func_name: str
+) -> Tuple[Callable[..., Any], Callable[[], Any]]:
+    """Import the user's module and look up the callable + inputs factory."""
+    module = importlib.import_module(module_name)
+    if not hasattr(module, callable_name):
+        raise AttributeError(
+            f"module {module_name!r} has no attribute {callable_name!r}"
+        )
+    if not hasattr(module, inputs_func_name):
+        raise AttributeError(
+            f"module {module_name!r} has no attribute {inputs_func_name!r} "
+            f"(expected an inputs factory)"
+        )
+    fn = getattr(module, callable_name)
+    inputs_factory = getattr(module, inputs_func_name)
+    if not callable(fn):
+        raise TypeError(f"{module_name}.{callable_name} is not callable")
+    if not callable(inputs_factory):
+        raise TypeError(f"{module_name}.{inputs_func_name} is not callable")
+    return fn, inputs_factory
+
+
+def _normalize_inputs(raw: Any) -> Tuple[tuple, dict]:
+    """Coerce the inputs factory return value into ``(args, kwargs)``."""
+    if raw is None:
+        return tuple(), {}
+    # Accept (args, kwargs)
+    if isinstance(raw, tuple) and len(raw) == 2 and isinstance(raw[1], dict):
+        first = raw[0]
+        if isinstance(first, (list, tuple)):
+            return tuple(first), dict(raw[1])
+    # Accept positional iterable
+    if isinstance(raw, (list, tuple)):
+        return tuple(raw), {}
+    # Accept dict-as-kwargs
+    if isinstance(raw, dict):
+        return tuple(), dict(raw)
+    # Single positional value
+    return (raw,), {}
+
+
+def _emit(payload: dict) -> None:
+    """Print the canonical marker line."""
+    sys.stdout.write(
+        "MAGPIE_LATENCY_JSON: " + json.dumps(payload, default=str) + "\n"
+    )
+    sys.stdout.flush()
+
+
+def main(argv: list) -> int:
+    parser = argparse.ArgumentParser(
+        prog="magpie.bench._runner",
+        description="Magpie 0-overhead latency benchmark runner",
+    )
+    parser.add_argument(
+        "--profile",
+        action="store_true",
+        help="Run in kernel-trace harness mode (tight loop, no graph capture). "
+        "Intended to be wrapped by rocprofv3 --kernel-trace.",
+    )
+    parser.add_argument(
+        "--module", type=str, default=None,
+        help="Override MAGPIE_BENCH_MODULE",
+    )
+    parser.add_argument(
+        "--callable", type=str, default=None,
+        help="Override MAGPIE_BENCH_CALLABLE",
+    )
+    parser.add_argument(
+        "--get-inputs", type=str, default=None,
+        help="Override MAGPIE_BENCH_INPUTS_FUNC",
+    )
+    args = parser.parse_args(argv)
+
+    module_name = args.module or _env_str("MAGPIE_BENCH_MODULE", "")
+    callable_name = args.callable or _env_str("MAGPIE_BENCH_CALLABLE", "")
+    inputs_func_name = (
+        args.get_inputs or _env_str("MAGPIE_BENCH_INPUTS_FUNC", "get_inputs")
+    )
+
+    if not module_name or not callable_name:
+        _emit({
+            "stats": None,
+            "error": "MAGPIE_BENCH_MODULE and MAGPIE_BENCH_CALLABLE are required",
+        })
+        return 2
+
+    rep_ms = _env_int("MAGPIE_BENCH_REP_MS", 20)
+    n_retries = _env_int("MAGPIE_BENCH_N_RETRIES", 5)
+    estimate_reps = _env_int("MAGPIE_BENCH_ESTIMATE_REPS", 5)
+    warmup_iters = _env_int("MAGPIE_BENCH_WARMUP_ITERS", 5)
+    seed = _env_int("MAGPIE_BENCH_SEED", 42)
+    profile_rep_ms = _env_int("MAGPIE_BENCH_PROFILE_REP_MS", max(rep_ms * 5, 50))
+
+    try:
+        _seed_everything(seed)
+        fn_raw, inputs_factory = _resolve_target(
+            module_name, callable_name, inputs_func_name
+        )
+        raw_inputs = inputs_factory()
+        args_tuple, kwargs = _normalize_inputs(raw_inputs)
+
+        def call_fn() -> None:
+            fn_raw(*args_tuple, **kwargs)
+
+        for _ in range(max(0, warmup_iters)):
+            call_fn()
+
+        try:
+            import torch
+
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+        except Exception:
+            pass
+
+        if args.profile:
+            # Kernel-trace harness mode: tight loop, no graph capture.
+            # Pre-size N from a quick wall-clock estimate so the loop runs
+            # roughly profile_rep_ms milliseconds without CUDA event overhead.
+            est_calls = max(estimate_reps, 1)
+            t0 = time.perf_counter()
+            for _ in range(est_calls):
+                call_fn()
+            try:
+                import torch
+
+                if torch.cuda.is_available():
+                    torch.cuda.synchronize()
+            except Exception:
+                pass
+            t1 = time.perf_counter()
+            per_call_ms = ((t1 - t0) * 1000.0) / est_calls
+            if per_call_ms <= 0:
+                n_iter = 1000
+            else:
+                n_iter = max(1, int(profile_rep_ms / per_call_ms))
+
+            # Tight, dispatch-only loop. No CUDA events. The outer rocprofv3
+            # --kernel-trace is what produces timing.
+            t_start = time.perf_counter()
+            for _ in range(n_iter):
+                call_fn()
+            try:
+                import torch
+
+                if torch.cuda.is_available():
+                    torch.cuda.synchronize()
+            except Exception:
+                pass
+            t_end = time.perf_counter()
+
+            _emit(
+                {
+                    "mode": "profile",
+                    "stats": None,
+                    "n_iter": n_iter,
+                    "per_call_estimate_ms": per_call_ms,
+                    "wall_loop_ms": (t_end - t_start) * 1000.0,
+                    "module": module_name,
+                    "callable": callable_name,
+                    "seed": seed,
+                }
+            )
+            return 0
+
+        # Default mode: CUDA-graph based wall-clock timing.
+        from Magpie.bench import do_bench_cudagraph  # type: ignore
+
+        stats = do_bench_cudagraph(
+            call_fn,
+            rep=rep_ms,
+            n_retries=n_retries,
+            estimate_reps=estimate_reps,
+        )
+
+        _emit(
+            {
+                "mode": "cuda_graph",
+                "stats": stats.to_dict(),
+                "module": module_name,
+                "callable": callable_name,
+                "seed": seed,
+            }
+        )
+        return 0
+
+    except SystemExit:
+        raise
+    except BaseException as e:
+        _emit(
+            {
+                "stats": None,
+                "error": f"{type(e).__name__}: {e}",
+                "traceback": traceback.format_exc(),
+                "module": module_name,
+                "callable": callable_name,
+            }
+        )
+        return 1
+
+
+if __name__ == "__main__":
+    # Allow ``python -m Magpie.bench._runner`` and ``python _runner.py``.
+    # The fallback handles being launched directly (no parent package) by
+    # adding the repo root to sys.path so ``import Magpie.bench`` works.
+    try:
+        from Magpie.bench import do_bench_cudagraph  # noqa: F401
+    except ImportError:
+        # When invoked as ``python /path/to/Magpie/bench/_runner.py``, the
+        # parent ``Magpie`` package may not be on sys.path. Walk up two dirs
+        # and prepend so the import succeeds.
+        repo_root = os.path.abspath(
+            os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
+        )
+        if repo_root not in sys.path:
+            sys.path.insert(0, repo_root)
+
+    sys.exit(main(sys.argv[1:]))
diff --git a/Magpie/bench/include/magpie_bench.hpp b/Magpie/bench/include/magpie_bench.hpp
new file mode 100644
index 0000000..87ea352
--- /dev/null
+++ b/Magpie/bench/include/magpie_bench.hpp
@@ -0,0 +1,238 @@
+// Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+// See LICENSE for license information.
+//
+// Header-only HIP-graph latency helper that mirrors the Python
+// ``magpie.bench.do_bench_cudagraph`` algorithm so HIP testcases produce
+// directly comparable wall-clock numbers (same warmup -> estimate ->
+// unrolled replay -> median across retries math).
+//
+// Usage from a .hip / .cpp testcase:
+//
+//   #define MAGPIE_BENCH_IMPLEMENTATION
+//   #include "magpie_bench.hpp"
+//
+//   int main() {
+//     ... setup ...
+//     auto stats = magpie::bench::do_bench_hipgraph(
+//         [&]() { my_kernel<<<grid, block, 0, magpie::bench::current_stream()>>>(...); },
+//         /*rep_ms=*/20, /*n_retries=*/5, /*estimate_reps=*/5);
+//     magpie::bench::print_marker(stats);
+//     return 0;
+//   }
+//
+// Output (one line, parsed by Magpie's user-harness sub-mode):
+//
+//   MAGPIE_LATENCY_JSON: {"stats":{"median_ms":...,"p99_ms":...,...}}
+//
+// Magpie's ``Latency`` stage (Magpie/eval/latency.py) picks this up via
+// the same ``MAGPIE_LATENCY_JSON:`` marker contract used for Triton
+// kernels; HIP and Triton report identical wall-clock stats.
+
+#pragma once
+
+#include <hip/hip_runtime.h>
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <functional>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#define MAGPIE_BENCH_HIP_CHECK(call)                                       \
+    do {                                                                    \
+        hipError_t _err = (call);                                          \
+        if (_err != hipSuccess) {                                          \
+            throw std::runtime_error(                                      \
+                std::string("HIP error: ") + hipGetErrorString(_err));     \
+        }                                                                   \
+    } while (0)
+
+namespace magpie {
+namespace bench {
+
+struct LatencyStats {
+    double median_ms = 0.0;
+    double p50_ms    = 0.0;
+    double p99_ms    = 0.0;
+    double min_ms    = 0.0;
+    double max_ms    = 0.0;
+    double std_ms    = 0.0;
+    double estimate_ms = 0.0;
+    int    n_repeat  = 0;
+    int    n_retries = 0;
+    std::vector<double> samples_ms;
+};
+
+inline hipStream_t& current_stream() {
+    // Side stream used by do_bench_hipgraph; users issuing inside ``fn``
+    // should target this stream so the captured graph is well-defined.
+    static hipStream_t s = nullptr;
+    return s;
+}
+
+namespace detail {
+
+inline double percentile(std::vector<double> sorted_v, double p) {
+    if (sorted_v.empty()) return 0.0;
+    if (sorted_v.size() == 1) return sorted_v[0];
+    double rank = p * (sorted_v.size() - 1) / 100.0;
+    int    lo   = static_cast<int>(rank);
+    int    hi   = std::min(lo + 1, static_cast<int>(sorted_v.size()) - 1);
+    double frac = rank - lo;
+    return sorted_v[lo] + (sorted_v[hi] - sorted_v[lo]) * frac;
+}
+
+inline double median(std::vector<double> v) {
+    if (v.empty()) return 0.0;
+    std::sort(v.begin(), v.end());
+    size_t n = v.size();
+    if (n % 2 == 1) return v[n / 2];
+    return 0.5 * (v[n / 2 - 1] + v[n / 2]);
+}
+
+inline double stddev(const std::vector<double>& v) {
+    if (v.size() < 2) return 0.0;
+    double mean = 0.0;
+    for (double x : v) mean += x;
+    mean /= v.size();
+    double var = 0.0;
+    for (double x : v) var += (x - mean) * (x - mean);
+    return std::sqrt(var / (v.size() - 1));
+}
+
+inline LatencyStats stats_from_samples(const std::vector<double>& samples,
+                                       int n_repeat,
+                                       int n_retries,
+                                       double estimate_ms) {
+    LatencyStats s;
+    s.n_repeat   = n_repeat;
+    s.n_retries  = n_retries;
+    s.estimate_ms = estimate_ms;
+    s.samples_ms  = samples;
+    if (samples.empty()) return s;
+
+    auto sorted = samples;
+    std::sort(sorted.begin(), sorted.end());
+    s.min_ms    = sorted.front();
+    s.max_ms    = sorted.back();
+    s.median_ms = median(samples);
+    s.p50_ms    = percentile(sorted, 50.0);
+    s.p99_ms    = percentile(sorted, 99.0);
+    s.std_ms    = stddev(samples);
+    return s;
+}
+
+} // namespace detail
+
+// Benchmark ``fn`` via HIP-graph estimate-then-unrolled-replay. Mirrors
+// ``magpie.bench.do_bench_cudagraph`` byte-for-byte (same n_repeat
+// formula, same median across n_retries).
+template <typename Fn>
+inline LatencyStats do_bench_hipgraph(Fn&& fn,
+                                       int rep_ms       = 20,
+                                       int n_retries    = 5,
+                                       int estimate_reps = 5) {
+    hipStream_t& stream = current_stream();
+    if (stream == nullptr) {
+        MAGPIE_BENCH_HIP_CHECK(hipStreamCreate(&stream));
+    }
+
+    // Warmup
+    fn();
+    MAGPIE_BENCH_HIP_CHECK(hipStreamSynchronize(stream));
+
+    // ----- Step 1: capture estimate graph -----------------------------
+    hipGraph_t     est_graph = nullptr;
+    hipGraphExec_t est_exec  = nullptr;
+
+    MAGPIE_BENCH_HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeThreadLocal));
+    for (int i = 0; i < estimate_reps; ++i) fn();
+    MAGPIE_BENCH_HIP_CHECK(hipStreamEndCapture(stream, &est_graph));
+    MAGPIE_BENCH_HIP_CHECK(hipGraphInstantiate(&est_exec, est_graph, nullptr, nullptr, 0));
+    MAGPIE_BENCH_HIP_CHECK(hipStreamSynchronize(stream));
+
+    // ----- Step 2: estimate per-call cost -----------------------------
+    hipEvent_t e0, e1;
+    MAGPIE_BENCH_HIP_CHECK(hipEventCreate(&e0));
+    MAGPIE_BENCH_HIP_CHECK(hipEventCreate(&e1));
+
+    MAGPIE_BENCH_HIP_CHECK(hipEventRecord(e0, stream));
+    MAGPIE_BENCH_HIP_CHECK(hipGraphLaunch(est_exec, stream));
+    MAGPIE_BENCH_HIP_CHECK(hipEventRecord(e1, stream));
+    MAGPIE_BENCH_HIP_CHECK(hipStreamSynchronize(stream));
+    float est_total_ms = 0.0f;
+    MAGPIE_BENCH_HIP_CHECK(hipEventElapsedTime(&est_total_ms, e0, e1));
+    double estimate_ms = static_cast<double>(est_total_ms) / estimate_reps;
+
+    int n_repeat;
+    if (estimate_ms <= 0.0) {
+        n_repeat = 1000;
+    } else {
+        n_repeat = std::max(1, static_cast<int>(rep_ms / estimate_ms));
+    }
+
+    MAGPIE_BENCH_HIP_CHECK(hipGraphExecDestroy(est_exec));
+    MAGPIE_BENCH_HIP_CHECK(hipGraphDestroy(est_graph));
+
+    // ----- Step 3: capture timed graph with n_repeat unrolled calls --
+    hipGraph_t     timed_graph = nullptr;
+    hipGraphExec_t timed_exec  = nullptr;
+
+    MAGPIE_BENCH_HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeThreadLocal));
+    for (int i = 0; i < n_repeat; ++i) fn();
+    MAGPIE_BENCH_HIP_CHECK(hipStreamEndCapture(stream, &timed_graph));
+    MAGPIE_BENCH_HIP_CHECK(hipGraphInstantiate(&timed_exec, timed_graph, nullptr, nullptr, 0));
+    MAGPIE_BENCH_HIP_CHECK(hipStreamSynchronize(stream));
+
+    // ----- Step 4: measure n_retries replays --------------------------
+    std::vector<double> samples;
+    samples.reserve(n_retries);
+
+    for (int i = 0; i < n_retries; ++i) {
+        MAGPIE_BENCH_HIP_CHECK(hipEventRecord(e0, stream));
+        MAGPIE_BENCH_HIP_CHECK(hipGraphLaunch(timed_exec, stream));
+        MAGPIE_BENCH_HIP_CHECK(hipEventRecord(e1, stream));
+        MAGPIE_BENCH_HIP_CHECK(hipStreamSynchronize(stream));
+        float total_ms = 0.0f;
+        MAGPIE_BENCH_HIP_CHECK(hipEventElapsedTime(&total_ms, e0, e1));
+        samples.push_back(static_cast<double>(total_ms) / n_repeat);
+    }
+
+    MAGPIE_BENCH_HIP_CHECK(hipEventDestroy(e0));
+    MAGPIE_BENCH_HIP_CHECK(hipEventDestroy(e1));
+    MAGPIE_BENCH_HIP_CHECK(hipGraphExecDestroy(timed_exec));
+    MAGPIE_BENCH_HIP_CHECK(hipGraphDestroy(timed_graph));
+
+    return detail::stats_from_samples(samples, n_repeat, n_retries, estimate_ms);
+}
+
+// Print the canonical ``MAGPIE_LATENCY_JSON: {...}`` line to stdout. The
+// payload schema matches what ``Magpie/bench/_runner.py`` emits so the
+// same parser in ``Magpie/eval/latency.py`` ingests both.
+inline void print_marker(const LatencyStats& s,
+                         const std::string& kernel_name = "") {
+    std::printf(
+        "MAGPIE_LATENCY_JSON: "
+        "{\"mode\":\"hip_graph\","
+        "\"stats\":{\"median_ms\":%.6f,\"p50_ms\":%.6f,\"p99_ms\":%.6f,"
+        "\"min_ms\":%.6f,\"max_ms\":%.6f,\"std_ms\":%.6f,"
+        "\"n_repeat\":%d,\"n_retries\":%d,\"estimate_ms\":%.6f,"
+        "\"samples_ms\":[",
+        s.median_ms, s.p50_ms, s.p99_ms,
+        s.min_ms, s.max_ms, s.std_ms,
+        s.n_repeat, s.n_retries, s.estimate_ms);
+    for (size_t i = 0; i < s.samples_ms.size(); ++i) {
+        std::printf("%s%.6f", (i ? "," : ""), s.samples_ms[i]);
+    }
+    std::printf("]}");
+    if (!kernel_name.empty()) {
+        std::printf(",\"kernel_name\":\"%s\"", kernel_name.c_str());
+    }
+    std::printf("}\n");
+    std::fflush(stdout);
+}
+
+} // namespace bench
+} // namespace magpie
diff --git a/Magpie/config.yaml b/Magpie/config.yaml
index 95999e1..f99a192 100644
--- a/Magpie/config.yaml
+++ b/Magpie/config.yaml
@@ -157,6 +157,40 @@ performance:
     timeout_seconds: 600       # Timeout per profiling run
 
 
+# =============================================================================
+# Latency Harness (0-overhead in-process kernel timing)
+# =============================================================================
+# Complements the Performance stage with explicit dispatch-inclusive
+# wall-clock latency (cuda_graph) and/or kernel-only latency (kernel_trace
+# via rocprofv3 --kernel-trace). See docs/latency.md.
+latency:
+  enabled: true
+
+  # auto -> both for triton/pytorch/cuda; rocprof_timestamps for hip
+  # cuda_graph | kernel_trace | rocprof_timestamps | both | none
+  method: auto
+
+  # The headline number used by `compare` mode for ranking
+  # wall_median_ms (end-to-end) | kernel_median_ms (kernel-only, autotuning)
+  primary_metric: wall_median_ms
+
+  # do_bench_cudagraph parameters (mirrors the canonical Triton snippet)
+  rep_ms: 20
+  n_retries: 5
+  estimate_reps: 5
+  warmup_iters: 5
+
+  # Reproducibility: torch.manual_seed + torch.cuda.manual_seed_all
+  seed: 42
+
+  # Optional regex applied to per-dispatch kernel names when aggregating
+  # rocprof timestamps / rocprofv3 kernel-trace output
+  kernel_filter: null
+
+  # Per-subprocess timeout (covers the cuda_graph runner + the rocprofv3 wrap)
+  timeout_seconds: 120
+
+
 compare:
   # Per-backend weights (rocprof-compute for AMD, ncu for NVIDIA)
   perf_weights_rocprof:
diff --git a/Magpie/config/__init__.py b/Magpie/config/__init__.py
index e87e054..b52b730 100644
--- a/Magpie/config/__init__.py
+++ b/Magpie/config/__init__.py
@@ -36,6 +36,12 @@
     METRIX_KEY_METRICS,
     DEFAULT_ROCPROF_METRIC_BLOCKS,
 )
+from .latency import (
+    LatencyConfig,
+    BenchTarget,
+    LATENCY_METHODS,
+    PRIMARY_METRICS,
+)
 
 __all__ = [
     # Pipeline configuration
@@ -60,4 +66,9 @@
     "ROCPROF_KEY_METRICS",
     "METRIX_KEY_METRICS",
     "DEFAULT_ROCPROF_METRIC_BLOCKS",
+    # Latency configuration
+    "LatencyConfig",
+    "BenchTarget",
+    "LATENCY_METHODS",
+    "PRIMARY_METRICS",
 ]
diff --git a/Magpie/config/kernel.py b/Magpie/config/kernel.py
index 4eab7d1..76bf9f8 100644
--- a/Magpie/config/kernel.py
+++ b/Magpie/config/kernel.py
@@ -68,6 +68,11 @@ class KernelEvalConfig:
     # Performance profiling - custom command(s) to replace built-in profiler
     prof_command: Optional[List] = None
 
+    # Latency benchmarking (Triton/PyTorch/CUDA in-process harness)
+    # Optional per-kernel BenchTarget spec ({"module", "callable", "get_inputs"}).
+    # Wins over LatencyConfig.bench_target when both are set.
+    bench_target: Optional[Dict[str, Any]] = None
+
     # Input generation (for KenrelBench)
     get_inputs_func: str = "get_inputs"
     get_init_inputs_func: str = "get_init_inputs"
@@ -153,6 +158,7 @@ def to_dict(self) -> Dict[str, Any]:
             "compiling_command": self.compiling_command,
             "testcase_command": self.testcase_command,
             "prof_command": self.prof_command,
+            "bench_target": self.bench_target,
             "get_inputs_func": self.get_inputs_func,
             "get_init_inputs_func": self.get_init_inputs_func,
             "input_shapes": self.input_shapes,
@@ -181,6 +187,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "KernelEvalConfig":
             compiling_command=data.get("compiling_command"),
             testcase_command=data.get("testcase_command"),
             prof_command=data.get("prof_command"),
+            bench_target=data.get("bench_target"),
             get_inputs_func=data.get("get_inputs_func", "get_inputs"),
             get_init_inputs_func=data.get("get_init_inputs_func", "get_init_inputs"),
             input_shapes=data.get("input_shapes", []),
diff --git a/Magpie/config/latency.py b/Magpie/config/latency.py
new file mode 100644
index 0000000..878b4cf
--- /dev/null
+++ b/Magpie/config/latency.py
@@ -0,0 +1,238 @@
+###############################################################################
+# Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""
+Latency evaluation configuration.
+
+Defines a 0-overhead in-process timing harness that complements the
+HW-counter-based ``Performance`` stage:
+
+- ``cuda_graph``       — dispatch-inclusive wall-clock latency via
+                          ``do_bench_cudagraph`` (warmup -> capture -> unrolled
+                          replay -> median across retries).
+- ``kernel_trace``     — kernel-only timing free of dispatch noise; the harness
+                          runs in ``--profile`` mode (tight loop, no graph) and
+                          the outer ``rocprofv3 --kernel-trace`` produces HW
+                          per-dispatch durations.
+- ``rocprof_timestamps`` — reuse ``pmc_perf.csv`` already produced by the
+                          ``Performance`` stage; no extra subprocess.
+- ``both``             — run both wall-clock and kernel-only and emit
+                          ``dispatch_overhead_us = wall - kernel``.
+
+For kernel-config autotuning (BLOCK sizes, num_warps, num_stages) the dispatch
+overhead is roughly constant across configs and dominates wall-clock numbers
+when the kernel runs in microseconds — use ``primary_metric=kernel_median_ms``
+to rank by the dispatch-free measurement.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from .pipeline import KernelType
+
+
+# ---------------------------------------------------------------------------
+# Sentinel string literals (not Enums to keep YAML/JSON round-tripping trivial)
+# ---------------------------------------------------------------------------
+
+LATENCY_METHODS = (
+    "auto",
+    "cuda_graph",
+    "kernel_trace",
+    "rocprof_timestamps",
+    "both",
+    "none",
+)
+
+PRIMARY_METRICS = ("wall_median_ms", "kernel_median_ms")
+
+
+@dataclass
+class BenchTarget:
+    """
+    Import-based benchmark target.
+
+    The runner imports ``module``, looks up ``callable`` and ``get_inputs``,
+    materializes inputs by calling ``get_inputs()`` (which must return either
+    a tuple ``(args, kwargs)`` or a single args tuple/list), then times
+    ``callable(*args, **kwargs)`` with ``magpie.bench.do_bench_cudagraph``.
+
+    Attributes:
+        module: Importable module path (e.g. ``my_kernels.scaled_mm``).
+        callable: Attribute name of the callable inside the module.
+        get_inputs: Attribute name of the inputs factory inside the module.
+                    Must return ``(args, kwargs)`` or a positional tuple.
+    """
+
+    module: str
+    callable: str
+    get_inputs: str = "get_inputs"
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "module": self.module,
+            "callable": self.callable,
+            "get_inputs": self.get_inputs,
+        }
+
+    @classmethod
+    def from_dict(cls, data: Optional[Dict[str, Any]]) -> Optional["BenchTarget"]:
+        if not data:
+            return None
+        if "module" not in data or "callable" not in data:
+            return None
+        return cls(
+            module=str(data["module"]),
+            callable=str(data["callable"]),
+            get_inputs=str(data.get("get_inputs", "get_inputs")),
+        )
+
+
+@dataclass
+class LatencyConfig:
+    """
+    Configuration for the Latency evaluation stage.
+
+    Attributes:
+        enabled: Master switch. ``False`` skips the stage entirely.
+        method: One of ``LATENCY_METHODS``. ``auto`` selects ``both`` for
+                ``TRITON``/``PYTORCH``/``CUDA`` and ``rocprof_timestamps`` for
+                ``HIP`` (since HIP testcases are native binaries that don't
+                import torch).
+        primary_metric: Which median is reported as the headline number and
+                        used by ``compare`` mode for ranking.
+        rep_ms: Target measurement window in milliseconds. ``do_bench_cudagraph``
+                picks ``n_repeat = max(1, int(rep_ms / estimate_ms))``.
+        n_retries: How many independent measurements to take (median is reported).
+        estimate_reps: How many ``fn()`` calls to capture for the initial
+                       cost-estimate graph.
+        warmup_iters: Eager warmup iterations before any graph capture / timing.
+        seed: Seed passed to ``torch.manual_seed`` / ``torch.cuda.manual_seed_all``
+              before inputs are materialized — guarantees reproducible tensor
+              contents and sizes across runs.
+        kernel_filter: Optional regex applied to per-dispatch kernel names when
+                       aggregating ``rocprof_timestamps`` / ``kernel_trace``
+                       results.
+        kernel_type: Auto-selection input. Set by ``PipelineConfig`` so that
+                     ``method=auto`` can pick the right backend.
+        gpu_arch: Auto-selection input (``gfx*`` -> AMD, ``sm_*`` -> NVIDIA).
+        bench_target: Import-based target (sub-mode A). When ``None``, the
+                      Latency stage falls back to running the user's
+                      ``testcase_command`` and parsing a ``MAGPIE_LATENCY_JSON:``
+                      stdout marker (sub-mode B).
+        pythonpath: Extra absolute paths prepended to ``PYTHONPATH`` of the
+                    benchmark subprocess so non-installed user packages import
+                    cleanly.
+        timeout_seconds: Per-subprocess timeout.
+        output_dir: Where ``kernel_trace`` mode writes its rocprofv3 CSV.
+                    ``None`` -> a sibling ``latency/`` folder under the
+                    workload dir.
+    """
+
+    enabled: bool = True
+    method: str = "auto"
+    primary_metric: str = "wall_median_ms"
+    rep_ms: int = 20
+    n_retries: int = 5
+    estimate_reps: int = 5
+    warmup_iters: int = 5
+    seed: int = 42
+    kernel_filter: Optional[str] = None
+
+    kernel_type: Optional["KernelType"] = None
+    gpu_arch: Optional[str] = None
+
+    bench_target: Optional[BenchTarget] = None
+    pythonpath: List[str] = field(default_factory=list)
+
+    timeout_seconds: float = 120.0
+    output_dir: Optional[str] = None
+
+    def __post_init__(self) -> None:
+        if self.method not in LATENCY_METHODS:
+            raise ValueError(
+                f"latency.method must be one of {LATENCY_METHODS}, got {self.method!r}"
+            )
+        if self.primary_metric not in PRIMARY_METRICS:
+            raise ValueError(
+                f"latency.primary_metric must be one of {PRIMARY_METRICS}, "
+                f"got {self.primary_metric!r}"
+            )
+
+    # ------------------------------------------------------------------
+    # Method resolution
+    # ------------------------------------------------------------------
+
+    def resolve_method(self) -> str:
+        """
+        Resolve ``method=auto`` into a concrete method based on the configured
+        kernel type and GPU architecture.
+
+        Selection table:
+          - HIP                            -> ``rocprof_timestamps``
+          - TRITON / PYTORCH / CUDA        -> ``both``
+          - unknown / no kernel type       -> ``cuda_graph`` (best portable default)
+        """
+        if self.method != "auto":
+            return self.method
+
+        from .pipeline import KernelType
+
+        if self.kernel_type == KernelType.HIP:
+            return "rocprof_timestamps"
+        if self.kernel_type in (KernelType.TRITON, KernelType.PYTORCH, KernelType.CUDA):
+            return "both"
+        return "cuda_graph"
+
+    # ------------------------------------------------------------------
+    # Serialization
+    # ------------------------------------------------------------------
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "enabled": self.enabled,
+            "method": self.method,
+            "primary_metric": self.primary_metric,
+            "rep_ms": self.rep_ms,
+            "n_retries": self.n_retries,
+            "estimate_reps": self.estimate_reps,
+            "warmup_iters": self.warmup_iters,
+            "seed": self.seed,
+            "kernel_filter": self.kernel_filter,
+            "bench_target": self.bench_target.to_dict() if self.bench_target else None,
+            "pythonpath": list(self.pythonpath),
+            "timeout_seconds": self.timeout_seconds,
+            "output_dir": self.output_dir,
+        }
+
+    @classmethod
+    def from_dict(
+        cls,
+        data: Optional[Dict[str, Any]],
+        kernel_type: Optional["KernelType"] = None,
+        gpu_arch: Optional[str] = None,
+    ) -> "LatencyConfig":
+        data = dict(data or {})
+        bench_target = BenchTarget.from_dict(data.get("bench_target"))
+        return cls(
+            enabled=bool(data.get("enabled", True)),
+            method=str(data.get("method", "auto")),
+            primary_metric=str(data.get("primary_metric", "wall_median_ms")),
+            rep_ms=int(data.get("rep_ms", 20)),
+            n_retries=int(data.get("n_retries", 5)),
+            estimate_reps=int(data.get("estimate_reps", 5)),
+            warmup_iters=int(data.get("warmup_iters", 5)),
+            seed=int(data.get("seed", 42)),
+            kernel_filter=data.get("kernel_filter"),
+            kernel_type=kernel_type,
+            gpu_arch=gpu_arch,
+            bench_target=bench_target,
+            pythonpath=list(data.get("pythonpath", []) or []),
+            timeout_seconds=float(data.get("timeout_seconds", 120.0)),
+            output_dir=data.get("output_dir"),
+        )
diff --git a/Magpie/config/pipeline.py b/Magpie/config/pipeline.py
index 357e966..2cbb73b 100644
--- a/Magpie/config/pipeline.py
+++ b/Magpie/config/pipeline.py
@@ -14,6 +14,7 @@
 from typing import Optional
 
 from .correctness import CorrectnessConfig
+from .latency import LatencyConfig
 from .performance import PerformanceConfig
 
 
@@ -69,6 +70,7 @@ class PipelineConfig:
     compiling_config: Optional[CompilingConfig] = None
     correctness_config: Optional[CorrectnessConfig] = None
     performance_config: Optional[PerformanceConfig] = None
+    latency_config: Optional[LatencyConfig] = None
     output_dir: str = "./results"
     verbose: bool = False
 
@@ -87,6 +89,18 @@ def __post_init__(self):
                 kernel_type=self.kernel_type,
                 gpu_arch=self.gpu_arch,
             )
+        if self.latency_config is None:
+            self.latency_config = LatencyConfig(
+                kernel_type=self.kernel_type,
+                gpu_arch=self.gpu_arch,
+            )
+        else:
+            # Backfill kernel_type / gpu_arch when caller built LatencyConfig
+            # without them so resolve_method() works.
+            if self.latency_config.kernel_type is None:
+                self.latency_config.kernel_type = self.kernel_type
+            if self.latency_config.gpu_arch is None:
+                self.latency_config.gpu_arch = self.gpu_arch
 
     def _detect_gpu_arch(self) -> str:
         """Auto-detect GPU architecture."""
diff --git a/Magpie/core/executor.py b/Magpie/core/executor.py
index cfb457c..973bfe9 100644
--- a/Magpie/core/executor.py
+++ b/Magpie/core/executor.py
@@ -692,6 +692,7 @@ def _execute_task_worker(task_dict: Dict[str, Any]) -> Dict[str, Any]:
                 ncu_config=mode_cfg.get("ncu_config", {}),
                 metrix_config=mode_cfg.get("metrix_config", {}),
                 correctness_config=mode_cfg.get("correctness_config", {}),
+                latency_config=mode_cfg.get("latency_config", {}),
                 gpu_arch=mode_cfg.get("gpu_arch", None),
             )
             analyzer = AnalyzeMode(analyze_config)
@@ -716,6 +717,7 @@ def _execute_task_worker(task_dict: Dict[str, Any]) -> Dict[str, Any]:
                 ncu_config=mode_cfg.get("ncu_config", {}),
                 metrix_config=mode_cfg.get("metrix_config", {}),
                 correctness_config=mode_cfg.get("correctness_config", {}),
+                latency_config=mode_cfg.get("latency_config", {}),
                 gpu_arch=mode_cfg.get("gpu_arch", None),
                 winner_strategy=compare_cfg.get("winner_strategy", "perf_score"),
                 perf_weights_rocprof=compare_cfg.get("perf_weights_rocprof", {}),
diff --git a/Magpie/core/scheduler.py b/Magpie/core/scheduler.py
index 35eb07c..56b895c 100644
--- a/Magpie/core/scheduler.py
+++ b/Magpie/core/scheduler.py
@@ -167,6 +167,7 @@ def create_task(
         ncu_config: Optional[Dict[str, Any]] = None,
         metrix_config: Optional[Dict[str, Any]] = None,
         correctness_config: Optional[Dict[str, Any]] = None,
+        latency_config: Optional[Dict[str, Any]] = None,
         baseline_index: int = 0,
         compare_config: Optional[Dict[str, Any]] = None,
         benchmark_config: Optional[Dict[str, Any]] = None,
@@ -204,6 +205,7 @@ def create_task(
             ncu_config=ncu_config or {},
             metrix_config=metrix_config or {},
             correctness_config=correctness_config or {},
+            latency_config=latency_config or {},
             baseline_index=baseline_index,
             compare_config=compare_config or {},
             benchmark_config=benchmark_config or {},
@@ -291,6 +293,7 @@ def run_analyze(
         ncu_config: Optional[Dict[str, Any]] = None,
         metrix_config: Optional[Dict[str, Any]] = None,
         correctness_config: Optional[Dict[str, Any]] = None,
+        latency_config: Optional[Dict[str, Any]] = None,
     ) -> TaskResult:
         """
         Convenience method to run analyze mode.
@@ -321,6 +324,7 @@ def run_analyze(
             ncu_config=ncu_config,
             metrix_config=metrix_config,
             correctness_config=correctness_config,
+            latency_config=latency_config,
         )
         return self.execute(task)
 
@@ -337,6 +341,7 @@ def run_compare(
         ncu_config: Optional[Dict[str, Any]] = None,
         metrix_config: Optional[Dict[str, Any]] = None,
         correctness_config: Optional[Dict[str, Any]] = None,
+        latency_config: Optional[Dict[str, Any]] = None,
         compare_config: Optional[Dict[str, Any]] = None,
     ) -> TaskResult:
         """
@@ -370,6 +375,7 @@ def run_compare(
             ncu_config=ncu_config,
             metrix_config=metrix_config,
             correctness_config=correctness_config,
+            latency_config=latency_config,
             compare_config=compare_config,
         )
         return self.execute(task)
diff --git a/Magpie/core/task.py b/Magpie/core/task.py
index 69092b3..06b03ee 100644
--- a/Magpie/core/task.py
+++ b/Magpie/core/task.py
@@ -63,6 +63,7 @@ class ModeConfig:
     ncu_config: Dict[str, Any] = field(default_factory=dict)
     metrix_config: Dict[str, Any] = field(default_factory=dict)
     correctness_config: Dict[str, Any] = field(default_factory=dict)
+    latency_config: Dict[str, Any] = field(default_factory=dict)
     baseline_index: int = 0  # For compare mode
     compare_config: Dict[str, Any] = field(default_factory=dict)
     benchmark_config: Dict[str, Any] = field(default_factory=dict)  # For benchmark mode
@@ -119,6 +120,7 @@ def to_dict(self) -> Dict[str, Any]:
                 "ncu_config": self.mode_config.ncu_config,
                 "metrix_config": self.mode_config.metrix_config,
                 "correctness_config": self.mode_config.correctness_config,
+                "latency_config": self.mode_config.latency_config,
                 "baseline_index": self.mode_config.baseline_index,
                 "compare_config": self.mode_config.compare_config,
                 "benchmark_config": self.mode_config.benchmark_config,
diff --git a/Magpie/eval/__init__.py b/Magpie/eval/__init__.py
index 323ce7b..0812a5a 100644
--- a/Magpie/eval/__init__.py
+++ b/Magpie/eval/__init__.py
@@ -25,6 +25,7 @@
     MetricResult,
 )
 from .performance import Performance, PerformanceResult
+from .latency import Latency, LatencyResult
 
 __all__ = [
     # Evaluator
@@ -41,4 +42,7 @@
     # Performance
     "Performance",
     "PerformanceResult",
+    # Latency
+    "Latency",
+    "LatencyResult",
 ]
diff --git a/Magpie/eval/evaluator.py b/Magpie/eval/evaluator.py
index deccd7b..01dc66b 100644
--- a/Magpie/eval/evaluator.py
+++ b/Magpie/eval/evaluator.py
@@ -19,6 +19,7 @@
 from ..config import PipelineConfig, KernelEvalConfig
 from .correctness import Correctness, CorrectnessResult
 from .compiling import Compiling, CompilingResult
+from .latency import Latency, LatencyResult
 from .performance import Performance, PerformanceResult
 
 
@@ -40,12 +41,14 @@ class EvaluationState:
     compiling_state: BaseKind = BaseKind.SUCCESS
     correctness_state: BaseKind = BaseKind.SUCCESS
     performance_state: BaseKind = BaseKind.SUCCESS
+    latency_state: BaseKind = BaseKind.SKIPPED
     errors: List[str] = field(default_factory=list)
 
     # Results of each evaluation step
     compiling_result: Optional[CompilingResult] = None
     correctness_result: Optional[CorrectnessResult] = None
     performance_result: Optional[PerformanceResult] = None
+    latency_result: Optional[LatencyResult] = None
 
     # Overall score (0.0 to 1.0)
     score: float = 0.0
@@ -59,6 +62,7 @@ def to_dict(self) -> Dict[str, Any]:
             "compiling_state": self.compiling_state.name,
             "correctness_state": self.correctness_state.name,
             "performance_state": self.performance_state.name,
+            "latency_state": self.latency_state.name,
             "errors": self.errors,
             "score": self.score,
             "compiling_result": {
@@ -84,6 +88,9 @@ def to_dict(self) -> Dict[str, Any]:
             "performance_result": self.performance_result.to_dict()
             if self.performance_result
             else None,
+            "latency_result": self.latency_result.to_dict()
+            if self.latency_result
+            else None,
             "extra": self.extra,
         }
 
@@ -104,6 +111,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "EvaluationState":
         state.compiling_state = BaseKind[data.get("compiling_state", "SUCCESS")]
         state.correctness_state = BaseKind[data.get("correctness_state", "SUCCESS")]
         state.performance_state = BaseKind[data.get("performance_state", "SUCCESS")]
+        state.latency_state = BaseKind[data.get("latency_state", "SKIPPED")]
 
         # Restore errors and score
         state.errors = data.get("errors", [])
@@ -135,6 +143,31 @@ def from_dict(cls, data: Dict[str, Any]) -> "EvaluationState":
                 workload_dir=perf_data.get("workload_dir"),
             )
 
+        # Restore latency result (best-effort - dict round-trip preserves
+        # the headline fields; full dataclass restoration not required for
+        # downstream consumers since they read .to_dict() directly).
+        lat_data = data.get("latency_result")
+        if lat_data:
+            from .latency import LatencyResult
+            from ..bench import LatencyStats
+
+            state.latency_result = LatencyResult(
+                success=lat_data.get("success", False),
+                method=lat_data.get("method", "none"),
+                primary_metric=lat_data.get("primary_metric", "wall_median_ms"),
+                wall_stats=LatencyStats.from_dict(lat_data.get("wall_stats")),
+                kernel_stats=LatencyStats.from_dict(lat_data.get("kernel_stats")),
+                dispatch_overhead_us=lat_data.get("dispatch_overhead_us"),
+                crosscheck_vs_rocprof_ratio=lat_data.get(
+                    "crosscheck_vs_rocprof_ratio"
+                ),
+                crosscheck_warning=lat_data.get("crosscheck_warning"),
+                config_snapshot=lat_data.get("config", {}),
+                command=lat_data.get("command"),
+                output_dir=lat_data.get("output_dir"),
+                errors=lat_data.get("errors"),
+            )
+
         # Restore extra
         state.extra = data.get("extra", {})
 
@@ -159,6 +192,7 @@ def __init__(self, pipeline_cfg: PipelineConfig) -> None:
         self.compiling = Compiling(pipeline_cfg)
         self.correctness = Correctness(pipeline_cfg)
         self.performance = Performance(pipeline_cfg)
+        self.latency = Latency(pipeline_cfg)
 
     def evaluate(self, kernel_cfg: KernelEvalConfig) -> EvaluationState:
         """
@@ -187,7 +221,12 @@ def evaluate(self, kernel_cfg: KernelEvalConfig) -> EvaluationState:
         # 3) Performance (skip if no prof_command and profiling disabled)
         state = self._check_performance(state, kernel_cfg)
 
-        # 4) Calculate score
+        # 4) Latency (0-overhead wall-clock + kernel-only) — runs even if
+        #    Performance was skipped; needs Performance only for the
+        #    rocprof_timestamps method which reuses pmc_perf.csv.
+        state = self._check_latency(state, kernel_cfg)
+
+        # 5) Calculate score
         state = self._calculate_score(state)
 
         return state
@@ -258,6 +297,28 @@ def _check_performance(
 
         return state
 
+    def _check_latency(
+        self, state: EvaluationState, kernel_cfg: KernelEvalConfig
+    ) -> EvaluationState:
+        """Run the 0-overhead Latency harness."""
+        try:
+            result = self.latency.run(state, kernel_cfg)
+            state.latency_result = result
+
+            if result is None:
+                state.latency_state = BaseKind.SKIPPED
+            elif result.success:
+                state.latency_state = BaseKind.SUCCESS
+            else:
+                state.latency_state = BaseKind.FAILED
+                if result.errors:
+                    state.errors.append(f"Latency: {result.errors}")
+        except Exception as e:
+            state.latency_state = BaseKind.FAILED
+            state.errors.append(f"Latency error: {str(e)}")
+
+        return state
+
     def _calculate_score(self, state: EvaluationState) -> EvaluationState:
         """Calculate overall evaluation score."""
         score = 0.0
diff --git a/Magpie/eval/latency.py b/Magpie/eval/latency.py
new file mode 100644
index 0000000..b0746c7
--- /dev/null
+++ b/Magpie/eval/latency.py
@@ -0,0 +1,899 @@
+###############################################################################
+# Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""
+Latency evaluation stage.
+
+Sits next to ``Performance`` (HW counters) in the pipeline and produces
+0-overhead wall-clock and/or kernel-only latency for the kernel under
+evaluation. See ``Magpie.config.latency`` for the method semantics.
+
+This module never imports ``torch`` at module level — all heavyweight work
+happens inside short-lived subprocesses spawned by :class:`Latency`.
+"""
+
+from __future__ import annotations
+
+import csv
+import json
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from ..bench import LatencyStats, MAGPIE_LATENCY_JSON_MARKER
+from ..config import (
+    BenchTarget,
+    KernelEvalConfig,
+    LatencyConfig,
+    PipelineConfig,
+)
+from ..utils import get_updated_env
+
+logger = logging.getLogger(__name__)
+
+
+# Filter out HIP / CUDA runtime kernels from per-kernel aggregations.
+_RUNTIME_KERNEL_PREFIXES = ("__amd_rocclr_", "__hip_", "cuLaunchKernel")
+
+
+# ---------------------------------------------------------------------------
+# Result types
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class LatencyResult:
+    """
+    Result of the Latency evaluation stage.
+
+    Fields are populated based on the resolved ``method``:
+
+      - ``cuda_graph``         -> ``wall_stats`` only
+      - ``kernel_trace``       -> ``kernel_stats`` + per-kernel breakdown
+      - ``rocprof_timestamps`` -> ``kernel_stats`` + per-kernel breakdown
+      - ``both``               -> both, plus ``dispatch_overhead_us``
+    """
+
+    success: bool
+    method: str = "none"
+    primary_metric: str = "wall_median_ms"
+    wall_stats: Optional[LatencyStats] = None
+    kernel_stats: Optional[LatencyStats] = None
+    per_kernel: Dict[str, LatencyStats] = field(default_factory=dict)
+    dispatch_overhead_us: Optional[float] = None
+    crosscheck_vs_rocprof_ratio: Optional[float] = None
+    crosscheck_warning: Optional[str] = None
+    config_snapshot: Dict[str, Any] = field(default_factory=dict)
+    command: Optional[str] = None
+    output_dir: Optional[str] = None
+    raw_output: Optional[str] = None
+    errors: Optional[str] = None
+
+    def get_primary_value(self) -> Optional[float]:
+        """Return the headline number used by ``compare`` rankings."""
+        if self.primary_metric == "kernel_median_ms" and self.kernel_stats:
+            return self.kernel_stats.median_ms
+        if self.wall_stats:
+            return self.wall_stats.median_ms
+        if self.kernel_stats:
+            return self.kernel_stats.median_ms
+        return None
+
+    def to_dict(self) -> Dict[str, Any]:
+        d: Dict[str, Any] = {
+            "success": self.success,
+            "method": self.method,
+            "primary_metric": self.primary_metric,
+            "primary_value_ms": self.get_primary_value(),
+            "config": self.config_snapshot,
+            "command": self.command,
+            "output_dir": self.output_dir,
+            "errors": self.errors,
+        }
+        if self.wall_stats is not None:
+            d["wall_stats"] = self.wall_stats.to_dict()
+        if self.kernel_stats is not None:
+            d["kernel_stats"] = self.kernel_stats.to_dict()
+        if self.per_kernel:
+            d["per_kernel"] = {k: v.to_dict() for k, v in self.per_kernel.items()}
+        if self.dispatch_overhead_us is not None:
+            d["dispatch_overhead_us"] = self.dispatch_overhead_us
+        if self.crosscheck_vs_rocprof_ratio is not None:
+            d["crosscheck_vs_rocprof_ratio"] = self.crosscheck_vs_rocprof_ratio
+        if self.crosscheck_warning:
+            d["crosscheck_warning"] = self.crosscheck_warning
+        return d
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _is_runtime_kernel(name: str) -> bool:
+    return name.startswith(_RUNTIME_KERNEL_PREFIXES)
+
+
+def _aggregate_per_kernel_durations_ns(
+    per_kernel_ns: Dict[str, List[float]]
+) -> Dict[str, LatencyStats]:
+    """Build per-kernel ``LatencyStats`` from raw ns durations."""
+    out: Dict[str, LatencyStats] = {}
+    for name, samples_ns in per_kernel_ns.items():
+        samples_ms = [v / 1e6 for v in samples_ns if v > 0]
+        if not samples_ms:
+            continue
+        out[name] = LatencyStats.from_samples(
+            samples_ms, n_repeat=1, n_retries=len(samples_ms)
+        )
+    return out
+
+
+def _summary_stats_from_per_kernel(
+    per_kernel: Dict[str, LatencyStats],
+) -> Optional[LatencyStats]:
+    """
+    Build a single ``kernel_stats`` summary from per-kernel breakdown:
+
+    - For each per-kernel timeline, take the median.
+    - The summary's ``median_ms`` is the SUM of per-kernel medians (i.e. the
+      median total kernel time per "iteration"), with min/max/p99 derived
+      from the collated set of all dispatch durations.
+    """
+    if not per_kernel:
+        return None
+
+    total_median_ms = sum(stats.median_ms for stats in per_kernel.values())
+    all_samples: List[float] = []
+    for stats in per_kernel.values():
+        all_samples.extend(stats.samples_ms)
+
+    if not all_samples:
+        return None
+
+    base = LatencyStats.from_samples(
+        all_samples, n_repeat=1, n_retries=len(all_samples)
+    )
+    # Override median with the per-kernel-summed median (truer "per iter" cost)
+    base.median_ms = total_median_ms
+    base.p50_ms = total_median_ms
+    return base
+
+
+def _merge_bench_target(
+    cfg: LatencyConfig, kernel_cfg: KernelEvalConfig
+) -> Optional[BenchTarget]:
+    """Per-kernel ``bench_target`` wins over LatencyConfig default."""
+    if kernel_cfg.bench_target:
+        return BenchTarget.from_dict(kernel_cfg.bench_target)
+    return cfg.bench_target
+
+
+def _build_runner_env(
+    cfg: LatencyConfig,
+    bench_target: BenchTarget,
+    kernel_env: Optional[Dict[str, str]],
+) -> Dict[str, str]:
+    """Construct env for the runner subprocess (no leakage of MAGPIE state)."""
+    extra: Dict[str, str] = dict(kernel_env or {})
+
+    extra["MAGPIE_BENCH_MODULE"] = bench_target.module
+    extra["MAGPIE_BENCH_CALLABLE"] = bench_target.callable
+    extra["MAGPIE_BENCH_INPUTS_FUNC"] = bench_target.get_inputs
+    extra["MAGPIE_BENCH_REP_MS"] = str(cfg.rep_ms)
+    extra["MAGPIE_BENCH_N_RETRIES"] = str(cfg.n_retries)
+    extra["MAGPIE_BENCH_ESTIMATE_REPS"] = str(cfg.estimate_reps)
+    extra["MAGPIE_BENCH_WARMUP_ITERS"] = str(cfg.warmup_iters)
+    extra["MAGPIE_BENCH_SEED"] = str(cfg.seed)
+
+    if cfg.pythonpath:
+        extra["PYTHONPATH"] = ":".join(cfg.pythonpath)
+
+    return get_updated_env(extra)
+
+
+def _runner_module_args() -> List[str]:
+    """Command-line invocation of the runner as a module."""
+    return [sys.executable, "-m", "Magpie.bench._runner"]
+
+
+def _parse_marker_line(stdout: str) -> Optional[Dict[str, Any]]:
+    """Find and parse the ``MAGPIE_LATENCY_JSON: {...}`` line in *stdout*."""
+    if not stdout:
+        return None
+    for line in reversed(stdout.splitlines()):
+        line = line.strip()
+        if line.startswith(MAGPIE_LATENCY_JSON_MARKER):
+            payload = line[len(MAGPIE_LATENCY_JSON_MARKER):].strip()
+            try:
+                return json.loads(payload)
+            except json.JSONDecodeError as e:
+                logger.warning(f"Failed to parse MAGPIE_LATENCY_JSON line: {e}")
+                return None
+    return None
+
+
+# ---------------------------------------------------------------------------
+# rocprofv3 / pmc_perf parsing
+# ---------------------------------------------------------------------------
+
+
+def _parse_rocprofv3_kernel_trace_csv(
+    csv_path: Path,
+    kernel_filter_re: Optional[str] = None,
+) -> Dict[str, List[float]]:
+    """
+    Parse a rocprofv3 ``--kernel-trace`` CSV and return per-kernel ns durations.
+
+    rocprofv3 emits columns like ``Kernel_Name``, ``Start_Timestamp``,
+    ``End_Timestamp`` (units = ns). Different rocprofv3 versions also use
+    ``KernelName`` / ``Start_Time`` / ``End_Time`` — we accept both spellings.
+    """
+    import re
+
+    rx = re.compile(kernel_filter_re) if kernel_filter_re else None
+
+    per_kernel: Dict[str, List[float]] = {}
+    if not csv_path.exists():
+        return per_kernel
+
+    try:
+        with open(csv_path, "r") as f:
+            reader = csv.DictReader(f)
+            for row in reader:
+                name = (
+                    row.get("Kernel_Name")
+                    or row.get("KernelName")
+                    or row.get("kernel_name")
+                    or "unknown"
+                )
+                if _is_runtime_kernel(name):
+                    continue
+                if rx and not rx.search(name):
+                    continue
+                start = (
+                    row.get("Start_Timestamp")
+                    or row.get("Start_Time")
+                    or row.get("start_timestamp")
+                )
+                end = (
+                    row.get("End_Timestamp")
+                    or row.get("End_Time")
+                    or row.get("end_timestamp")
+                )
+                if not start or not end:
+                    continue
+                try:
+                    duration_ns = float(end) - float(start)
+                except (TypeError, ValueError):
+                    continue
+                if duration_ns <= 0:
+                    continue
+                per_kernel.setdefault(name, []).append(duration_ns)
+    except Exception as e:
+        logger.warning(f"Failed to parse rocprofv3 kernel-trace CSV {csv_path}: {e}")
+
+    return per_kernel
+
+
+def _find_rocprofv3_csv(out_dir: Path) -> Optional[Path]:
+    """Locate the kernel-trace CSV inside *out_dir*.
+
+    rocprofv3 layouts seen in the wild:
+      - ``<out_dir>/kernel_trace.csv``
+      - ``<out_dir>/<name>_kernel_trace.csv``
+      - ``<out_dir>/<host>/<pid>_kernel_trace.csv``  (default in 7.x+)
+    Walks recursively to find the first matching file.
+    """
+    if not out_dir.exists():
+        return None
+
+    direct = out_dir / "kernel_trace.csv"
+    if direct.exists():
+        return direct
+
+    # Recursive glob so ``<host>/<pid>_kernel_trace.csv`` works.
+    for pattern in ("*kernel_trace*.csv", "*_kernel_trace.csv"):
+        for hit in sorted(out_dir.rglob(pattern)):
+            # Skip the agent_info / per-process metadata files.
+            if "agent_info" in hit.name:
+                continue
+            return hit
+    return None
+
+
+def _parse_pmc_perf_csv_for_durations(
+    csv_path: Path,
+    kernel_filter_re: Optional[str] = None,
+) -> Dict[str, List[float]]:
+    """Parse rocprof-compute's ``pmc_perf.csv`` for per-dispatch ns durations."""
+    import re
+
+    rx = re.compile(kernel_filter_re) if kernel_filter_re else None
+
+    per_kernel: Dict[str, List[float]] = {}
+    if not csv_path.exists():
+        return per_kernel
+
+    try:
+        with open(csv_path, "r") as f:
+            reader = csv.DictReader(f)
+            for row in reader:
+                name = (
+                    row.get("Kernel_Name")
+                    or row.get("KernelName")
+                    or "unknown"
+                )
+                if _is_runtime_kernel(name):
+                    continue
+                if rx and not rx.search(name):
+                    continue
+                try:
+                    start = float(row.get("Start_Timestamp", 0))
+                    end = float(row.get("End_Timestamp", 0))
+                except (TypeError, ValueError):
+                    continue
+                if end <= start:
+                    continue
+                per_kernel.setdefault(name, []).append(end - start)
+    except Exception as e:
+        logger.warning(f"Failed to parse pmc_perf.csv {csv_path}: {e}")
+
+    return per_kernel
+
+
+# ---------------------------------------------------------------------------
+# Latency stage
+# ---------------------------------------------------------------------------
+
+
+class Latency:
+    """
+    Latency evaluation handler.
+
+    Methods:
+      - ``cuda_graph``        : in-process ``do_bench_cudagraph`` (subprocess)
+      - ``kernel_trace``      : runner ``--profile`` + rocprofv3 ``--kernel-trace``
+      - ``rocprof_timestamps``: reuse pmc_perf.csv from the Performance stage
+      - ``both``              : run both wall-clock and a kernel-only method
+    """
+
+    def __init__(self, pipeline_cfg: PipelineConfig) -> None:
+        self.pipeline_cfg = pipeline_cfg
+        self.lat_cfg: LatencyConfig = (
+            pipeline_cfg.latency_config
+            or LatencyConfig(
+                kernel_type=pipeline_cfg.kernel_type,
+                gpu_arch=pipeline_cfg.gpu_arch,
+            )
+        )
+
+    # ------------------------------------------------------------------
+    # Public entry point
+    # ------------------------------------------------------------------
+
+    def run(
+        self,
+        eval_state: Any,
+        kernel_cfg: KernelEvalConfig,
+    ) -> Optional[LatencyResult]:
+        """
+        Run the Latency stage.
+
+        Returns:
+          - ``None`` when the stage is skipped (disabled or method=``none``).
+          - ``LatencyResult`` otherwise.
+        """
+        if not self.lat_cfg.enabled:
+            return None
+
+        method = self.lat_cfg.resolve_method()
+        if method == "none":
+            return None
+
+        snapshot = self._config_snapshot(method)
+
+        # rocprof_timestamps reuses the Performance stage's pmc_perf.csv
+        if method == "rocprof_timestamps":
+            return self._run_rocprof_timestamps(eval_state, snapshot)
+
+        # cuda_graph / kernel_trace / both — need either a bench_target OR a
+        # testcase_command harness that emits MAGPIE_LATENCY_JSON.
+        bench_target = _merge_bench_target(self.lat_cfg, kernel_cfg)
+        has_harness_cmd = kernel_cfg.has_testcase()
+
+        try:
+            if method == "cuda_graph":
+                return self._run_cuda_graph(
+                    bench_target, kernel_cfg, has_harness_cmd, snapshot
+                )
+            if method == "kernel_trace":
+                return self._run_kernel_trace(
+                    bench_target, kernel_cfg, snapshot
+                )
+            if method == "both":
+                return self._run_both(
+                    bench_target, kernel_cfg, has_harness_cmd, eval_state, snapshot
+                )
+        except Exception as e:  # pragma: no cover — defensive
+            logger.exception(f"Latency stage failed: {e}")
+            return LatencyResult(
+                success=False,
+                method=method,
+                primary_metric=self.lat_cfg.primary_metric,
+                config_snapshot=snapshot,
+                errors=str(e),
+            )
+
+        return LatencyResult(
+            success=False,
+            method=method,
+            primary_metric=self.lat_cfg.primary_metric,
+            config_snapshot=snapshot,
+            errors=f"Unknown latency method: {method}",
+        )
+
+    # ------------------------------------------------------------------
+    # Helpers
+    # ------------------------------------------------------------------
+
+    def _config_snapshot(self, resolved_method: str) -> Dict[str, Any]:
+        snap = self.lat_cfg.to_dict()
+        snap["resolved_method"] = resolved_method
+        snap["gpu_arch"] = self.lat_cfg.gpu_arch
+        snap["kernel_type"] = (
+            self.lat_cfg.kernel_type.name if self.lat_cfg.kernel_type else None
+        )
+        return snap
+
+    # ----- cuda_graph -------------------------------------------------
+
+    def _run_cuda_graph(
+        self,
+        bench_target: Optional[BenchTarget],
+        kernel_cfg: KernelEvalConfig,
+        has_harness_cmd: bool,
+        snapshot: Dict[str, Any],
+    ) -> LatencyResult:
+        if bench_target is None and not has_harness_cmd:
+            return LatencyResult(
+                success=False,
+                method="cuda_graph",
+                primary_metric=self.lat_cfg.primary_metric,
+                config_snapshot=snapshot,
+                errors=(
+                    "cuda_graph requires either a bench_target (import-based) "
+                    "or testcase_command (user harness emitting "
+                    "MAGPIE_LATENCY_JSON: {...})."
+                ),
+            )
+
+        if bench_target is not None:
+            return self._exec_runner_cuda_graph(bench_target, kernel_cfg, snapshot)
+
+        return self._exec_user_harness(kernel_cfg, snapshot)
+
+    def _exec_runner_cuda_graph(
+        self,
+        bench_target: BenchTarget,
+        kernel_cfg: KernelEvalConfig,
+        snapshot: Dict[str, Any],
+    ) -> LatencyResult:
+        env = _build_runner_env(self.lat_cfg, bench_target, kernel_cfg.env)
+        cmd = _runner_module_args()
+        return self._invoke_runner(cmd, env, kernel_cfg.working_dir, snapshot, "cuda_graph")
+
+    def _exec_user_harness(
+        self,
+        kernel_cfg: KernelEvalConfig,
+        snapshot: Dict[str, Any],
+    ) -> LatencyResult:
+        cmds = kernel_cfg.get_testcase_commands()
+        if not cmds:
+            return LatencyResult(
+                success=False,
+                method="cuda_graph",
+                primary_metric=self.lat_cfg.primary_metric,
+                config_snapshot=snapshot,
+                errors="No testcase command available for user harness",
+            )
+        env = get_updated_env(kernel_cfg.env)
+        # Use the LAST testcase command — earlier commands are usually setup
+        cmd = cmds[-1]
+        return self._invoke_runner(cmd, env, kernel_cfg.working_dir, snapshot, "cuda_graph")
+
+    def _invoke_runner(
+        self,
+        cmd: List[str],
+        env: Dict[str, str],
+        cwd: Optional[str],
+        snapshot: Dict[str, Any],
+        method: str,
+    ) -> LatencyResult:
+        cmd_str = " ".join(cmd)
+        logger.info(f"[Latency:{method}] running: {cmd_str}")
+        try:
+            proc = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                env=env,
+                cwd=cwd,
+                timeout=self.lat_cfg.timeout_seconds,
+            )
+        except subprocess.TimeoutExpired:
+            return LatencyResult(
+                success=False,
+                method=method,
+                primary_metric=self.lat_cfg.primary_metric,
+                config_snapshot=snapshot,
+                command=cmd_str,
+                errors=f"Latency runner timed out after {self.lat_cfg.timeout_seconds}s",
+            )
+
+        stdout = proc.stdout or ""
+        stderr = proc.stderr or ""
+        marker = _parse_marker_line(stdout)
+
+        if proc.returncode != 0 or marker is None:
+            return LatencyResult(
+                success=False,
+                method=method,
+                primary_metric=self.lat_cfg.primary_metric,
+                config_snapshot=snapshot,
+                command=cmd_str,
+                raw_output=stdout,
+                errors=(
+                    marker.get("error")
+                    if isinstance(marker, dict)
+                    else (stderr.strip() or stdout.strip() or "no MAGPIE_LATENCY_JSON marker")
+                ),
+            )
+
+        if marker.get("error"):
+            return LatencyResult(
+                success=False,
+                method=method,
+                primary_metric=self.lat_cfg.primary_metric,
+                config_snapshot=snapshot,
+                command=cmd_str,
+                raw_output=stdout,
+                errors=str(marker.get("error")),
+            )
+
+        wall_stats = LatencyStats.from_dict(marker.get("stats"))
+
+        return LatencyResult(
+            success=True,
+            method=method,
+            primary_metric=self.lat_cfg.primary_metric,
+            wall_stats=wall_stats,
+            config_snapshot=snapshot,
+            command=cmd_str,
+            raw_output=stdout,
+        )
+
+    # ----- kernel_trace ----------------------------------------------
+
+    def _run_kernel_trace(
+        self,
+        bench_target: Optional[BenchTarget],
+        kernel_cfg: KernelEvalConfig,
+        snapshot: Dict[str, Any],
+    ) -> LatencyResult:
+        if shutil.which("rocprofv3") is None:
+            return LatencyResult(
+                success=False,
+                method="kernel_trace",
+                primary_metric=self.lat_cfg.primary_metric,
+                config_snapshot=snapshot,
+                errors=(
+                    "rocprofv3 not found. kernel_trace requires ROCm rocprofv3 "
+                    "on PATH. Use method=cuda_graph for a torch-only fallback."
+                ),
+            )
+
+        if bench_target is None:
+            # User harness: just wrap the testcase command with rocprofv3.
+            cmds = kernel_cfg.get_testcase_commands()
+            if not cmds:
+                return LatencyResult(
+                    success=False,
+                    method="kernel_trace",
+                    primary_metric=self.lat_cfg.primary_metric,
+                    config_snapshot=snapshot,
+                    errors=(
+                        "kernel_trace requires either a bench_target or a "
+                        "testcase_command to wrap with rocprofv3."
+                    ),
+                )
+            inner = cmds[-1]
+            env = get_updated_env(kernel_cfg.env)
+        else:
+            inner = _runner_module_args() + ["--profile"]
+            env = _build_runner_env(self.lat_cfg, bench_target, kernel_cfg.env)
+
+        out_dir = self._kernel_trace_output_dir(kernel_cfg)
+        out_dir.mkdir(parents=True, exist_ok=True)
+
+        cmd = [
+            "rocprofv3",
+            "--kernel-trace",
+            "--output-format",
+            "csv",
+            "-d",
+            str(out_dir),
+            "--",
+            *inner,
+        ]
+        cmd_str = " ".join(cmd)
+        logger.info(f"[Latency:kernel_trace] running: {cmd_str}")
+
+        try:
+            proc = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                env=env,
+                cwd=kernel_cfg.working_dir,
+                timeout=self.lat_cfg.timeout_seconds,
+            )
+        except subprocess.TimeoutExpired:
+            return LatencyResult(
+                success=False,
+                method="kernel_trace",
+                primary_metric=self.lat_cfg.primary_metric,
+                config_snapshot=snapshot,
+                command=cmd_str,
+                output_dir=str(out_dir),
+                errors=f"rocprofv3 timed out after {self.lat_cfg.timeout_seconds}s",
+            )
+
+        stdout = proc.stdout or ""
+        stderr = proc.stderr or ""
+
+        if proc.returncode != 0:
+            return LatencyResult(
+                success=False,
+                method="kernel_trace",
+                primary_metric=self.lat_cfg.primary_metric,
+                config_snapshot=snapshot,
+                command=cmd_str,
+                output_dir=str(out_dir),
+                raw_output=stdout,
+                errors=stderr.strip() or stdout.strip() or "rocprofv3 failed",
+            )
+
+        csv_path = _find_rocprofv3_csv(out_dir)
+        if csv_path is None:
+            return LatencyResult(
+                success=False,
+                method="kernel_trace",
+                primary_metric=self.lat_cfg.primary_metric,
+                config_snapshot=snapshot,
+                command=cmd_str,
+                output_dir=str(out_dir),
+                raw_output=stdout,
+                errors=f"No kernel-trace CSV found under {out_dir}",
+            )
+
+        per_kernel_ns = _parse_rocprofv3_kernel_trace_csv(
+            csv_path, self.lat_cfg.kernel_filter
+        )
+        per_kernel = _aggregate_per_kernel_durations_ns(per_kernel_ns)
+
+        # Heuristic: when no kernel_filter is set, drop "outlier" kernels
+        # that fired only a handful of times (typical for torch setup like
+        # randn / fill / kaiming) so they don't pollute the sum-of-medians
+        # summary. Always emit a warning when many distinct kernels were
+        # captured so users know to set kernel_filter for tighter numbers.
+        if not self.lat_cfg.kernel_filter and len(per_kernel) > 1:
+            max_n = max(len(s.samples_ms) for s in per_kernel.values())
+            cutoff = max(2, max_n // 10)
+            dropped = {k: len(s.samples_ms) for k, s in per_kernel.items()
+                       if len(s.samples_ms) < cutoff}
+            if dropped:
+                logger.warning(
+                    "[Latency:kernel_trace] dropping %d low-dispatch kernels "
+                    "from kernel_stats summary (set latency.kernel_filter to "
+                    "silence): %s",
+                    len(dropped),
+                    ", ".join(f"{k} ({n} dispatches)" for k, n in dropped.items()),
+                )
+                per_kernel = {
+                    k: s for k, s in per_kernel.items()
+                    if len(s.samples_ms) >= cutoff
+                }
+
+        kernel_stats = _summary_stats_from_per_kernel(per_kernel)
+
+        if kernel_stats is None:
+            return LatencyResult(
+                success=False,
+                method="kernel_trace",
+                primary_metric=self.lat_cfg.primary_metric,
+                config_snapshot=snapshot,
+                command=cmd_str,
+                output_dir=str(out_dir),
+                raw_output=stdout,
+                errors=(
+                    f"rocprofv3 produced no kernel timings in {csv_path} "
+                    "(check kernel_filter regex)"
+                ),
+            )
+
+        return LatencyResult(
+            success=True,
+            method="kernel_trace",
+            primary_metric=self.lat_cfg.primary_metric,
+            kernel_stats=kernel_stats,
+            per_kernel=per_kernel,
+            config_snapshot=snapshot,
+            command=cmd_str,
+            output_dir=str(out_dir),
+            raw_output=stdout,
+        )
+
+    def _kernel_trace_output_dir(self, kernel_cfg: KernelEvalConfig) -> Path:
+        base = self.lat_cfg.output_dir or os.path.join(
+            kernel_cfg.working_dir or os.getcwd(), "latency"
+        )
+        # Use a per-kernel subdir + timestamp to avoid clobbering across runs
+        kid = (
+            kernel_cfg.kernel_id
+            if kernel_cfg.kernel_id
+            else "kernel"
+        )
+        safe_kid = "".join(c if c.isalnum() or c in "-_" else "_" for c in kid)
+        ts = int(time.time())
+        return Path(base) / f"kernel_trace_{safe_kid}_{ts}"
+
+    # ----- rocprof_timestamps ----------------------------------------
+
+    def _run_rocprof_timestamps(
+        self,
+        eval_state: Any,
+        snapshot: Dict[str, Any],
+    ) -> LatencyResult:
+        """
+        Reuse ``pmc_perf.csv`` produced by a prior Performance stage.
+
+        Looks at ``eval_state.performance_result.workload_dir`` to find the
+        rocprof-compute workload directory. If it's missing (e.g. the
+        Performance stage was skipped), returns a soft failure suggesting
+        the user run with profiling enabled.
+        """
+        perf_result = getattr(eval_state, "performance_result", None)
+        workload_dir = getattr(perf_result, "workload_dir", None) if perf_result else None
+
+        if not workload_dir:
+            return LatencyResult(
+                success=False,
+                method="rocprof_timestamps",
+                primary_metric=self.lat_cfg.primary_metric,
+                config_snapshot=snapshot,
+                errors=(
+                    "rocprof_timestamps requires the Performance stage to have "
+                    "produced a workload_dir (rocprof-compute). Re-run with "
+                    "performance enabled, or use method=kernel_trace."
+                ),
+            )
+
+        pmc_csv = Path(workload_dir) / "pmc_perf.csv"
+        if not pmc_csv.exists():
+            return LatencyResult(
+                success=False,
+                method="rocprof_timestamps",
+                primary_metric=self.lat_cfg.primary_metric,
+                config_snapshot=snapshot,
+                output_dir=str(workload_dir),
+                errors=f"No pmc_perf.csv found in {workload_dir}",
+            )
+
+        per_kernel_ns = _parse_pmc_perf_csv_for_durations(
+            pmc_csv, self.lat_cfg.kernel_filter
+        )
+        per_kernel = _aggregate_per_kernel_durations_ns(per_kernel_ns)
+        kernel_stats = _summary_stats_from_per_kernel(per_kernel)
+
+        if kernel_stats is None:
+            return LatencyResult(
+                success=False,
+                method="rocprof_timestamps",
+                primary_metric=self.lat_cfg.primary_metric,
+                config_snapshot=snapshot,
+                output_dir=str(workload_dir),
+                errors=(
+                    f"No kernel dispatch durations parsed from {pmc_csv} "
+                    "(check kernel_filter regex)"
+                ),
+            )
+
+        return LatencyResult(
+            success=True,
+            method="rocprof_timestamps",
+            primary_metric=self.lat_cfg.primary_metric,
+            kernel_stats=kernel_stats,
+            per_kernel=per_kernel,
+            config_snapshot=snapshot,
+            output_dir=str(workload_dir),
+        )
+
+    # ----- both --------------------------------------------------------
+
+    def _run_both(
+        self,
+        bench_target: Optional[BenchTarget],
+        kernel_cfg: KernelEvalConfig,
+        has_harness_cmd: bool,
+        eval_state: Any,
+        snapshot: Dict[str, Any],
+    ) -> LatencyResult:
+        wall = self._run_cuda_graph(bench_target, kernel_cfg, has_harness_cmd, snapshot)
+
+        # Pick the kernel-only method based on environment availability
+        if shutil.which("rocprofv3") is not None and bench_target is not None:
+            kern = self._run_kernel_trace(bench_target, kernel_cfg, snapshot)
+        else:
+            # Try rocprof_timestamps as a fallback; only works when
+            # Performance has produced a workload_dir.
+            kern = self._run_rocprof_timestamps(eval_state, snapshot)
+
+        wall_ok = wall.success and wall.wall_stats is not None
+        kern_ok = kern.success and kern.kernel_stats is not None
+
+        if not wall_ok and not kern_ok:
+            errs = [
+                e for e in (wall.errors, kern.errors) if e
+            ]
+            return LatencyResult(
+                success=False,
+                method="both",
+                primary_metric=self.lat_cfg.primary_metric,
+                config_snapshot=snapshot,
+                errors="; ".join(errs) or "both methods failed",
+            )
+
+        merged = LatencyResult(
+            success=True,
+            method="both",
+            primary_metric=self.lat_cfg.primary_metric,
+            wall_stats=wall.wall_stats if wall_ok else None,
+            kernel_stats=kern.kernel_stats if kern_ok else None,
+            per_kernel=kern.per_kernel if kern_ok else {},
+            config_snapshot=snapshot,
+            command=" && ".join(
+                c for c in (wall.command, kern.command) if c
+            ) or None,
+            output_dir=kern.output_dir,
+            raw_output=wall.raw_output,
+        )
+
+        if wall_ok and kern_ok and wall.wall_stats and kern.kernel_stats:
+            wall_us = wall.wall_stats.median_ms * 1000.0
+            kern_us = kern.kernel_stats.median_ms * 1000.0
+            merged.dispatch_overhead_us = wall_us - kern_us
+            if kern_us > 0:
+                merged.crosscheck_vs_rocprof_ratio = wall_us / kern_us
+                if not (0.5 <= merged.crosscheck_vs_rocprof_ratio <= 2.0):
+                    merged.crosscheck_warning = (
+                        f"wall/kernel ratio {merged.crosscheck_vs_rocprof_ratio:.2f} "
+                        "is outside [0.5, 2.0]; check warmup, timer pollution, "
+                        "or kernel_filter."
+                    )
+
+        if not wall_ok:
+            # Surface kernel-only error so users see why wall failed
+            merged.errors = wall.errors
+        elif not kern_ok:
+            merged.errors = kern.errors
+
+        return merged
diff --git a/Magpie/eval/performance.py b/Magpie/eval/performance.py
index 95aa17a..e8441b2 100644
--- a/Magpie/eval/performance.py
+++ b/Magpie/eval/performance.py
@@ -12,6 +12,33 @@
 - CUDA kernels: ncu (NVIDIA Nsight Compute)
 - Triton kernels: auto-selected based on GPU arch (rocprof-compute on AMD,
   ncu on NVIDIA) since Triton JIT-compiles to native HIP/CUDA dispatches
+
+0-overhead guarantee
+~~~~~~~~~~~~~~~~~~~~
+
+For HIP and Triton-on-AMD, kernel duration comes from
+``Start_Timestamp`` / ``End_Timestamp`` columns of ``pmc_perf.csv``,
+which are HW per-dispatch timestamps written by the GPU's command
+processor. **There is no Python or runtime instrumentation between the
+kernel launch and the recorded timestamp** — the host runs the kernel
+exactly as it would in production, and the wallclock value is sourced
+purely from the device. The same applies to ``ncu``'s GPU-side timing
+on NVIDIA.
+
+Anti-pattern (DO NOT DO THIS in your testcase):
+
+  for j in range(n_iter):
+      start_events[j].record()
+      mod.triton_scaled_mm(...)        # <- includes Python + JIT + dispatch
+      end_events[j].record()
+      torch.cuda.synchronize()
+
+This pattern times the *host*-driven launch path including JIT
+specialization, autotuning, and the Python dispatcher and is **not**
+0-overhead. Use :class:`Magpie.eval.latency.Latency` (the dedicated
+Latency stage) for in-process wall-clock timing — it captures the
+kernel inside a CUDA graph so dispatch overhead is amortized across
+many replays.
 """
 
 from __future__ import annotations
@@ -40,6 +67,74 @@
 logger = logging.getLogger(__name__)
 
 
+# Anti-pattern signatures that indicate naive in-process CUDA event timing.
+# When any pair of these tokens appears in a Python testcase, log a warning
+# pointing the user at magpie.bench.do_bench_cudagraph.
+_NAIVE_TIMING_PAIRS = [
+    ("start_events[", "end_events["),
+    (".record(", "elapsed_time("),
+    ("Event(enable_timing=True)", "elapsed_time("),
+]
+
+
+def _warn_if_in_process_timing(kernel_cfg: KernelEvalConfig) -> None:
+    """
+    Best-effort scan of the testcase script for naive in-process CUDA-event
+    timing. Emits a single warning per kernel and silently no-ops when source
+    is not available.
+    """
+    try:
+        cmds = kernel_cfg.get_testcase_commands()
+    except Exception:
+        return
+    if not cmds:
+        return
+
+    # Collect candidate .py files referenced in the testcase command(s)
+    py_paths: List[Path] = []
+    for cmd in cmds:
+        for tok in cmd:
+            if isinstance(tok, str) and tok.endswith(".py"):
+                p = Path(tok)
+                if not p.is_absolute() and kernel_cfg.working_dir:
+                    p = Path(kernel_cfg.working_dir) / p
+                if p.exists() and p.is_file():
+                    py_paths.append(p)
+    # Also scan declared source files
+    for src in kernel_cfg.get_source_file_paths():
+        p = Path(src)
+        if p.suffix == ".py" and p.exists() and p.is_file():
+            py_paths.append(p)
+
+    seen: set = set()
+    for p in py_paths:
+        rp = p.resolve()
+        if rp in seen:
+            continue
+        seen.add(rp)
+        try:
+            text = rp.read_text(encoding="utf-8", errors="ignore")
+        except Exception:
+            continue
+        # Skip if the user is already using do_bench_cudagraph
+        if "do_bench_cudagraph" in text or "magpie.bench" in text or "Magpie.bench" in text:
+            continue
+        for a, b in _NAIVE_TIMING_PAIRS:
+            if a in text and b in text:
+                logger.warning(
+                    "[%s] testcase '%s' uses naive in-process CUDA event timing "
+                    "('%s' / '%s'). This includes Python + dispatch + JIT overhead "
+                    "and is NOT 0-overhead. For wall-clock latency, use "
+                    "magpie.bench.do_bench_cudagraph (see docs/latency.md). "
+                    "rocprof-based perf metrics from this run remain accurate.",
+                    kernel_cfg.kernel_id,
+                    rp,
+                    a,
+                    b,
+                )
+                return
+
+
 @dataclass
 class MetricResult:
     """A single performance metric result."""
@@ -210,6 +305,12 @@ def run(
         if not self.perf_cfg.enabled:
             return None
 
+        # Heuristic: warn the user if their testcase script uses naive
+        # torch.cuda.Event timing; it would otherwise look like Magpie is
+        # reporting that overhead-laden number alongside the 0-overhead
+        # rocprof timestamps.
+        _warn_if_in_process_timing(kernel_cfg)
+
         # 2. If custom prof_command is provided, use it
         if kernel_cfg.has_prof_command():
             return self._run_custom_profiler(kernel_cfg)
diff --git a/Magpie/kernel_config.yaml.example b/Magpie/kernel_config.yaml.example
index fc6b8ef..847b0a3 100644
--- a/Magpie/kernel_config.yaml.example
+++ b/Magpie/kernel_config.yaml.example
@@ -142,3 +142,55 @@ kernel:
 #     kernel_filter: null       # regex to filter kernel names
 #     num_replays: 3
 #     timeout_seconds: 120
+
+# =============================================================================
+# Triton Kernel with 0-overhead Latency Harness
+# =============================================================================
+# For Triton (or any PyTorch / CUDA Python kernel), Magpie can run an in-process
+# CUDA-graph based timer (do_bench_cudagraph) and/or wrap the kernel in a
+# rocprofv3 --kernel-trace subprocess to extract pure HW kernel durations.
+#
+# When `bench_target` is set on the kernel, Magpie spawns a tiny subprocess
+# that imports `module.callable` and times it with the user's inputs.
+
+# kernel:
+#   id: "triton_scaled_mm"
+#   type: triton
+#   source_files: ["./my_kernels/scaled_mm.py"]
+#   working_dir: "./my_kernels"
+#   testcase_command: "python -m my_kernels.scaled_mm --check"   # for correctness/perf
+#   # Latency-harness target (per-kernel; wins over latency.bench_target)
+#   bench_target:
+#     module: "my_kernels.scaled_mm"
+#     callable: "triton_scaled_mm"
+#     get_inputs: "get_inputs"          # returns (args, kwargs) or args tuple
+
+# =============================================================================
+# Latency Harness Settings (Optional, framework-wide)
+# =============================================================================
+# Add a top-level `latency:` section to control the 0-overhead timing stage.
+#
+# Methods:
+#   auto                 - both for triton/pytorch/cuda; rocprof_timestamps for hip
+#   cuda_graph           - in-process do_bench_cudagraph (dispatch-inclusive)
+#   kernel_trace         - rocprofv3 --kernel-trace harness (kernel-only timing,
+#                          required for autotuning small kernels)
+#   rocprof_timestamps   - reuse pmc_perf.csv from the Performance stage
+#   both                 - run wall-clock + kernel-only and report
+#                          dispatch_overhead_us = wall - kernel
+#   none                 - disable
+
+# latency:
+#   enabled: true
+#   method: auto
+#   primary_metric: kernel_median_ms   # use kernel-only timing for compare ranking
+#                                      # (set to wall_median_ms for end-to-end ranking)
+#   rep_ms: 20                         # target measurement window (ms)
+#   n_retries: 5                       # number of independent measurements
+#   estimate_reps: 5                   # reps inside the cost-estimate graph
+#   warmup_iters: 5                    # eager warmup iterations
+#   seed: 42                           # torch.manual_seed for reproducibility
+#   kernel_filter: null                # regex applied to per-dispatch kernel names
+#   pythonpath:                        # prepended to PYTHONPATH of subprocess
+#     - "/abs/path/to/my_kernels_repo" # so non-installed user packages import cleanly
+#   timeout_seconds: 120
diff --git a/Magpie/main.py b/Magpie/main.py
index e03f923..f343f78 100644
--- a/Magpie/main.py
+++ b/Magpie/main.py
@@ -71,19 +71,27 @@ def parse_kernel_type(type_str: str) -> KernelType:
 
 def load_kernel_config(
     kernel_config_path: Path,
-) -> tuple[List[KernelEvalConfig], Dict[str, Any], Dict[str, Any], Dict[str, Any]]:
+) -> tuple[
+    List[KernelEvalConfig],
+    Dict[str, Any],
+    Dict[str, Any],
+    Dict[str, Any],
+    Dict[str, Any],
+]:
     """
     Load kernel configuration from YAML file.
 
     The YAML may optionally contain:
     - ``performance:`` — overrides framework-level profiler settings
     - ``correctness:`` — overrides framework-level correctness settings
-    - ``ray_config:`` — Ray cluster settings (implies ``environment: ray``)
-    - ``scheduler:`` — scheduler-level overrides (environment, workers, …)
+    - ``latency:``     — overrides framework-level latency-harness settings
+    - ``ray_config:``  — Ray cluster settings (implies ``environment: ray``)
+    - ``scheduler:``   — scheduler-level overrides (environment, workers, …)
 
     Returns:
         Tuple of (kernel configs, performance overrides, correctness overrides,
-        scheduler overrides). Override dicts are empty when absent.
+        scheduler overrides, latency overrides). Override dicts are empty when
+        absent.
     """
     data = load_yaml(kernel_config_path)
     configs = []
@@ -106,12 +114,14 @@ def load_kernel_config(
 
     corr_overrides = _expand_env_vars(data.get("correctness", {}))
 
+    lat_overrides = _expand_env_vars(data.get("latency", {}))
+
     sched_overrides: Dict[str, Any] = dict(data.get("scheduler", {}))
     if "ray_config" in data:
         sched_overrides["ray_config"] = data["ray_config"]
         sched_overrides.setdefault("environment", "ray")
 
-    return configs, perf_overrides, corr_overrides, sched_overrides
+    return configs, perf_overrides, corr_overrides, sched_overrides, lat_overrides
 
 
 def _parse_command_list(cmd_entry) -> Optional[List]:
@@ -199,6 +209,14 @@ def _parse_kernel_entry(entry: Dict[str, Any]) -> Optional[KernelEvalConfig]:
     # Parse prof command(s)
     prof_cmd = _parse_command_list(entry.get("prof_command"))
 
+    # Parse optional bench_target (per-kernel latency harness target)
+    bench_target_raw = entry.get("bench_target")
+    bench_target = (
+        _expand_env_vars(bench_target_raw)
+        if isinstance(bench_target_raw, dict)
+        else None
+    )
+
     return KernelEvalConfig(
         kernel_id=entry.get("id", "kernel"),
         kernel_type=kernel_type,
@@ -208,6 +226,7 @@ def _parse_kernel_entry(entry: Dict[str, Any]) -> Optional[KernelEvalConfig]:
         testcase_command=testcase_cmd,
         compiling_command=compile_cmd,
         prof_command=prof_cmd,
+        bench_target=bench_target,
         get_inputs_func=entry.get("get_inputs_func", "get_inputs"),
         get_init_inputs_func=entry.get("get_init_inputs_func", "get_init_inputs"),
     )
@@ -418,6 +437,73 @@ def _get_compare_config(config: Dict[str, Any]) -> Dict[str, Any]:
     return config.get("compare", {})
 
 
+def _get_latency_config(config: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Get latency-harness configuration from framework config.
+
+    Returns:
+        Dict that can be passed straight into ``LatencyConfig.from_dict``.
+    """
+    lat_cfg = config.get("latency", {}) or {}
+    return {
+        "enabled": lat_cfg.get("enabled", True),
+        "method": lat_cfg.get("method", "auto"),
+        "primary_metric": lat_cfg.get("primary_metric", "wall_median_ms"),
+        "rep_ms": lat_cfg.get("rep_ms", 20),
+        "n_retries": lat_cfg.get("n_retries", 5),
+        "estimate_reps": lat_cfg.get("estimate_reps", 5),
+        "warmup_iters": lat_cfg.get("warmup_iters", 5),
+        "seed": lat_cfg.get("seed", 42),
+        "kernel_filter": lat_cfg.get("kernel_filter"),
+        "bench_target": lat_cfg.get("bench_target"),
+        "pythonpath": list(lat_cfg.get("pythonpath", []) or []),
+        "timeout_seconds": lat_cfg.get("timeout_seconds", 120.0),
+        "output_dir": lat_cfg.get("output_dir"),
+    }
+
+
+def _apply_latency_overrides(
+    settings: Dict[str, Any], overrides: Dict[str, Any]
+) -> Dict[str, Any]:
+    """Merge per-kernel-YAML ``latency:`` overrides into framework defaults."""
+    merged = dict(settings)
+    for key in (
+        "enabled",
+        "method",
+        "primary_metric",
+        "rep_ms",
+        "n_retries",
+        "estimate_reps",
+        "warmup_iters",
+        "seed",
+        "kernel_filter",
+        "bench_target",
+        "timeout_seconds",
+        "output_dir",
+    ):
+        if key in overrides:
+            merged[key] = overrides[key]
+    if "pythonpath" in overrides:
+        merged["pythonpath"] = list(overrides["pythonpath"] or [])
+    return merged
+
+
+def _apply_latency_cli_overrides(
+    settings: Dict[str, Any], args
+) -> Dict[str, Any]:
+    """Apply CLI flags (--no-latency, --latency-method, --latency-rep-ms)."""
+    merged = dict(settings)
+    if getattr(args, "no_latency", False):
+        merged["enabled"] = False
+    method = getattr(args, "latency_method", None)
+    if method:
+        merged["method"] = method
+    rep_ms = getattr(args, "latency_rep_ms", None)
+    if rep_ms:
+        merged["rep_ms"] = int(rep_ms)
+    return merged
+
+
 def _get_scheduler_config(
     config: Dict[str, Any],
     args,
@@ -481,12 +567,17 @@ def run_analyze(args, config: Dict[str, Any]) -> int:
     perf_overrides: Dict[str, Any] = {}
     corr_overrides: Dict[str, Any] = {}
     sched_overrides: Dict[str, Any] = {}
+    lat_overrides: Dict[str, Any] = {}
 
     if args.kernel_config:
         # Load from kernel config file
-        kernel_configs, perf_overrides, corr_overrides, sched_overrides = (
-            load_kernel_config(args.kernel_config)
-        )
+        (
+            kernel_configs,
+            perf_overrides,
+            corr_overrides,
+            sched_overrides,
+            lat_overrides,
+        ) = load_kernel_config(args.kernel_config)
         if not kernel_configs:
             logger.error(f"No kernels found in {args.kernel_config}")
             return 1
@@ -527,12 +618,18 @@ def run_analyze(args, config: Dict[str, Any]) -> int:
     compile_settings = _get_compiling_config(config)
     perf_settings = _get_performance_config(config, kernel_type)
     corr_settings = _get_correctness_config(config)
+    lat_settings = _get_latency_config(config)
 
     # Apply per-config overrides (from kernel config YAML)
     if perf_overrides:
         perf_settings = _apply_perf_overrides(perf_settings, perf_overrides, kernel_type)
     if corr_overrides:
         corr_settings = _apply_correctness_overrides(corr_settings, corr_overrides)
+    if lat_overrides:
+        lat_settings = _apply_latency_overrides(lat_settings, lat_overrides)
+
+    # CLI flag overrides
+    lat_settings = _apply_latency_cli_overrides(lat_settings, args)
 
     # Create workspace before profiling so profiler writes directly there
     label = kernel_configs[0].kernel_id if kernel_configs else ""
@@ -543,6 +640,10 @@ def run_analyze(args, config: Dict[str, Any]) -> int:
     perf_settings["rocprof_config"]["output_dir"] = perf_dir
     perf_settings["metrix_config"]["output_dir"] = perf_dir
 
+    lat_dir = str(ws_path / "latency")
+    Path(lat_dir).mkdir(parents=True, exist_ok=True)
+    lat_settings["output_dir"] = lat_dir
+
     corr_settings["workspace_path"] = str(ws_path)
 
     scheduler_config = _get_scheduler_config(config, args, sched_overrides)
@@ -564,6 +665,7 @@ def run_analyze(args, config: Dict[str, Any]) -> int:
             ncu_config=perf_settings["ncu_config"],
             metrix_config=perf_settings["metrix_config"],
             correctness_config=corr_settings,
+            latency_config=lat_settings,
         )
 
         # Unwrap Ray result format: {'task_id': ..., 'results': [...]}
@@ -610,11 +712,16 @@ def run_compare(args, config: Dict[str, Any]) -> int:
     perf_overrides: Dict[str, Any] = {}
     corr_overrides: Dict[str, Any] = {}
     sched_overrides: Dict[str, Any] = {}
+    lat_overrides: Dict[str, Any] = {}
 
     if args.kernel_config:
-        kernel_configs, perf_overrides, corr_overrides, sched_overrides = (
-            load_kernel_config(args.kernel_config)
-        )
+        (
+            kernel_configs,
+            perf_overrides,
+            corr_overrides,
+            sched_overrides,
+            lat_overrides,
+        ) = load_kernel_config(args.kernel_config)
     elif args.kernels:
         kernel_type = parse_kernel_type(args.type)
 
@@ -644,6 +751,7 @@ def run_compare(args, config: Dict[str, Any]) -> int:
     compile_settings = _get_compiling_config(config)
     perf_settings = _get_performance_config(config, kernel_type)
     corr_settings = _get_correctness_config(config)
+    lat_settings = _get_latency_config(config)
     compare_settings = _get_compare_config(config)
 
     # Apply per-config overrides (from kernel config YAML)
@@ -651,6 +759,10 @@ def run_compare(args, config: Dict[str, Any]) -> int:
         perf_settings = _apply_perf_overrides(perf_settings, perf_overrides, kernel_type)
     if corr_overrides:
         corr_settings = _apply_correctness_overrides(corr_settings, corr_overrides)
+    if lat_overrides:
+        lat_settings = _apply_latency_overrides(lat_settings, lat_overrides)
+
+    lat_settings = _apply_latency_cli_overrides(lat_settings, args)
 
     # Create workspace before profiling so profiler writes directly there
     ws_path = _create_workspace(args.output_dir, "compare")
@@ -660,6 +772,10 @@ def run_compare(args, config: Dict[str, Any]) -> int:
     perf_settings["rocprof_config"]["output_dir"] = perf_dir
     perf_settings["metrix_config"]["output_dir"] = perf_dir
 
+    lat_dir = str(ws_path / "latency")
+    Path(lat_dir).mkdir(parents=True, exist_ok=True)
+    lat_settings["output_dir"] = lat_dir
+
     corr_settings["workspace_path"] = str(ws_path)
 
     scheduler_config = _get_scheduler_config(config, args, sched_overrides)
@@ -682,6 +798,7 @@ def run_compare(args, config: Dict[str, Any]) -> int:
             ncu_config=perf_settings["ncu_config"],
             metrix_config=perf_settings["metrix_config"],
             correctness_config=corr_settings,
+            latency_config=lat_settings,
             compare_config=compare_settings,
         )
 
@@ -724,6 +841,8 @@ def _dict_to_eval_state(state_dict: Dict[str, Any]) -> EvaluationState:
         state.correctness_state = BaseKind[state_dict["correctness_state"]]
     if "performance_state" in state_dict:
         state.performance_state = BaseKind[state_dict["performance_state"]]
+    if "latency_state" in state_dict:
+        state.latency_state = BaseKind[state_dict["latency_state"]]
     if "score" in state_dict:
         state.score = state_dict["score"]
     if "errors" in state_dict:
@@ -731,6 +850,26 @@ def _dict_to_eval_state(state_dict: Dict[str, Any]) -> EvaluationState:
     if "extra" in state_dict:
         state.extra = state_dict["extra"]
 
+    lat_data = state_dict.get("latency_result")
+    if lat_data:
+        from .eval.latency import LatencyResult
+        from .bench import LatencyStats
+
+        state.latency_result = LatencyResult(
+            success=lat_data.get("success", False),
+            method=lat_data.get("method", "none"),
+            primary_metric=lat_data.get("primary_metric", "wall_median_ms"),
+            wall_stats=LatencyStats.from_dict(lat_data.get("wall_stats")),
+            kernel_stats=LatencyStats.from_dict(lat_data.get("kernel_stats")),
+            dispatch_overhead_us=lat_data.get("dispatch_overhead_us"),
+            crosscheck_vs_rocprof_ratio=lat_data.get("crosscheck_vs_rocprof_ratio"),
+            crosscheck_warning=lat_data.get("crosscheck_warning"),
+            config_snapshot=lat_data.get("config", {}),
+            command=lat_data.get("command"),
+            output_dir=lat_data.get("output_dir"),
+            errors=lat_data.get("errors"),
+        )
+
     return state
 
 
@@ -744,14 +883,43 @@ def _print_result(kernel_cfg: KernelEvalConfig, result: EvaluationState) -> None
         print(f"Compiling: {result.get('compiling_state', 'UNKNOWN')}")
         print(f"Correctness: {result.get('correctness_state', 'UNKNOWN')}")
         print(f"Performance: {result.get('performance_state', 'UNKNOWN')}")
+        print(f"Latency: {result.get('latency_state', 'SKIPPED')}")
         print(f"Score: {result.get('score', 0.0):.2f}")
         errors = result.get("errors", [])
+        latency_dict = result.get("latency_result")
     else:
         print(f"Compiling: {result.compiling_state.name}")
         print(f"Correctness: {result.correctness_state.name}")
         print(f"Performance: {result.performance_state.name}")
+        print(f"Latency: {result.latency_state.name}")
         print(f"Score: {result.score:.2f}")
         errors = result.errors
+        latency_dict = (
+            result.latency_result.to_dict() if result.latency_result else None
+        )
+
+    if latency_dict and latency_dict.get("success"):
+        method = latency_dict.get("method", "?")
+        wall = latency_dict.get("wall_stats")
+        kern = latency_dict.get("kernel_stats")
+        print(f"  method:       {method}")
+        if wall and wall.get("median_ms") is not None:
+            print(
+                f"  wall_median:  {wall['median_ms']:.4f} ms "
+                f"(p99 {wall.get('p99_ms', 0.0):.4f}, std {wall.get('std_ms', 0.0):.4f})"
+            )
+        if kern and kern.get("median_ms") is not None:
+            print(
+                f"  kernel_median:{kern['median_ms']:.4f} ms "
+                f"(p99 {kern.get('p99_ms', 0.0):.4f})"
+            )
+        if latency_dict.get("dispatch_overhead_us") is not None:
+            print(
+                f"  dispatch:     {latency_dict['dispatch_overhead_us']:.2f} us "
+                "(wall - kernel)"
+            )
+        if latency_dict.get("crosscheck_warning"):
+            print(f"  WARN:         {latency_dict['crosscheck_warning']}")
 
     if errors:
         print("Errors:")
@@ -807,7 +975,35 @@ def _save_config_snapshot(
 
 
 def _save_results(results: List, ws_path: Path, mode: str) -> None:
-    """Save analysis/compare results into the workspace as a JSON report."""
+    """Save analysis/compare results into the workspace as a JSON report.
+
+    Layout::
+
+        {
+          "mode": ...,
+          "timestamp": ...,
+          "summary": [               # one entry per kernel, headline numbers
+            {
+              "kernel_id": ...,
+              "compiling_state": ...,
+              "correctness_state": ...,
+              "performance_state": ...,
+              "latency_state": ...,
+              "score": ...,
+              "latency": {
+                "method": ...,
+                "primary_metric": ...,
+                "primary_value_ms": ...,
+                "wall_median_ms": ...,
+                "kernel_median_ms": ...,
+                "dispatch_overhead_us": ...,
+                "crosscheck_vs_rocprof_ratio": ...
+              }
+            }, ...
+          ],
+          "results": [<full EvaluationState dicts>]
+        }
+    """
     import json
     from datetime import datetime
 
@@ -815,17 +1011,26 @@ def _save_results(results: List, ws_path: Path, mode: str) -> None:
     report_file = ws_path / f"{mode}_report.json"
 
     serialized_results: List[Any] = []
+    summary: List[Dict[str, Any]] = []
     for r in results:
         if isinstance(r, dict):
             serialized_results.append(r)
+            summary.append(_build_kernel_summary(r))
         elif hasattr(r, "to_dict"):
-            serialized_results.append(r.to_dict())
+            d = r.to_dict()
+            serialized_results.append(d)
+            summary.append(_build_kernel_summary(d))
         else:
             serialized_results.append(str(r))
 
     with open(report_file, "w") as f:
         json.dump(
-            {"mode": mode, "timestamp": timestamp, "results": serialized_results},
+            {
+                "mode": mode,
+                "timestamp": timestamp,
+                "summary": summary,
+                "results": serialized_results,
+            },
             f,
             indent=2,
         )
@@ -833,6 +1038,43 @@ def _save_results(results: List, ws_path: Path, mode: str) -> None:
     logger.info(f"Results saved to {report_file}")
 
 
+def _build_kernel_summary(state_dict: Dict[str, Any]) -> Dict[str, Any]:
+    """Extract per-kernel headline numbers (perf + latency) for the summary block."""
+    summary: Dict[str, Any] = {
+        "kernel_id": (state_dict.get("extra") or {}).get("kernel_id"),
+        "kernel_type": (state_dict.get("extra") or {}).get("kernel_type"),
+        "compiling_state": state_dict.get("compiling_state"),
+        "correctness_state": state_dict.get("correctness_state"),
+        "performance_state": state_dict.get("performance_state"),
+        "latency_state": state_dict.get("latency_state"),
+        "score": state_dict.get("score"),
+    }
+
+    lat = state_dict.get("latency_result")
+    if lat:
+        wall = lat.get("wall_stats") or {}
+        kern = lat.get("kernel_stats") or {}
+        summary["latency"] = {
+            "method": lat.get("method"),
+            "primary_metric": lat.get("primary_metric"),
+            "primary_value_ms": lat.get("primary_value_ms"),
+            "wall_median_ms": wall.get("median_ms"),
+            "kernel_median_ms": kern.get("median_ms"),
+            "dispatch_overhead_us": lat.get("dispatch_overhead_us"),
+            "crosscheck_vs_rocprof_ratio": lat.get("crosscheck_vs_rocprof_ratio"),
+            "crosscheck_warning": lat.get("crosscheck_warning"),
+        }
+
+    perf = state_dict.get("performance_result")
+    if perf and isinstance(perf, dict):
+        # Bring across just the rolled-up summary metrics if present.
+        perf_summary = perf.get("summary") or perf.get("summary_metrics")
+        if perf_summary is not None:
+            summary["performance_summary"] = perf_summary
+
+    return summary
+
+
 def _save_comparison(comparison: Any, ws_path: Path) -> None:
     """Save comparison results into the workspace as a JSON report."""
     import json
@@ -848,11 +1090,19 @@ def _save_comparison(comparison: Any, ws_path: Path) -> None:
     else:
         comparison_data = {"result": str(comparison)}
 
+    # Build per-kernel summary block highlighting latency / perf headlines so
+    # consumers don't have to walk the full nested kernel_results structure.
+    kernel_summary: List[Dict[str, Any]] = []
+    for r in comparison_data.get("kernel_results", []):
+        if isinstance(r, dict):
+            kernel_summary.append(_build_kernel_summary(r))
+
     with open(report_file, "w") as f:
         json.dump(
             {
                 "mode": "compare",
                 "timestamp": timestamp,
+                "summary": kernel_summary,
                 "results": {
                     "kernel_results": comparison_data.get("kernel_results", []),
                     "comparison_metrics": comparison_data.get("comparison_metrics", {}),
@@ -1116,6 +1366,22 @@ def create_parser() -> argparse.ArgumentParser:
     analyze_parser.add_argument(
         "--no-perf", action="store_true", help="Skip performance profiling"
     )
+    analyze_parser.add_argument(
+        "--no-latency",
+        action="store_true",
+        help="Skip 0-overhead latency harness (cuda_graph / kernel_trace)",
+    )
+    analyze_parser.add_argument(
+        "--latency-method",
+        type=str,
+        choices=["auto", "cuda_graph", "kernel_trace", "rocprof_timestamps", "both", "none"],
+        help="Override latency.method (default: auto from kernel type)",
+    )
+    analyze_parser.add_argument(
+        "--latency-rep-ms",
+        type=int,
+        help="Override latency.rep_ms (target measurement window in ms)",
+    )
     analyze_parser.add_argument(
         "--output-dir",
         "-o",
@@ -1148,6 +1414,22 @@ def create_parser() -> argparse.ArgumentParser:
     compare_parser.add_argument(
         "--no-perf", action="store_true", help="Skip performance profiling"
     )
+    compare_parser.add_argument(
+        "--no-latency",
+        action="store_true",
+        help="Skip 0-overhead latency harness (cuda_graph / kernel_trace)",
+    )
+    compare_parser.add_argument(
+        "--latency-method",
+        type=str,
+        choices=["auto", "cuda_graph", "kernel_trace", "rocprof_timestamps", "both", "none"],
+        help="Override latency.method (default: auto from kernel type)",
+    )
+    compare_parser.add_argument(
+        "--latency-rep-ms",
+        type=int,
+        help="Override latency.rep_ms (target measurement window in ms)",
+    )
     compare_parser.add_argument(
         "--output-dir",
         "-o",
diff --git a/Magpie/modes/analyze_eval/analyzer.py b/Magpie/modes/analyze_eval/analyzer.py
index 5f3df50..8d35ebc 100644
--- a/Magpie/modes/analyze_eval/analyzer.py
+++ b/Magpie/modes/analyze_eval/analyzer.py
@@ -25,6 +25,7 @@
     CompilingConfig,
     CorrectnessConfig,
     CorrectnessMode,
+    LatencyConfig,
     PerformanceConfig,
 )
 from ...config.performance import RocprofComputeConfig, NcuConfig, MetrixConfig, PerfBackend
@@ -59,6 +60,7 @@ class AnalyzeConfig:
     ncu_config: Dict[str, Any] = field(default_factory=dict)
     metrix_config: Dict[str, Any] = field(default_factory=dict)
     correctness_config: Dict[str, Any] = field(default_factory=dict)
+    latency_config: Dict[str, Any] = field(default_factory=dict)
 
     def __post_init__(self):
         if self.gpu_arch is None:
@@ -144,6 +146,13 @@ def analyze(self, kernel_cfg: KernelEvalConfig) -> EvaluationState:
         # Build correctness config
         corr_cfg = self._build_correctness_config()
 
+        # Build latency config from dict
+        lat_cfg = LatencyConfig.from_dict(
+            self.config.latency_config,
+            kernel_type=kernel_cfg.kernel_type,
+            gpu_arch=self.config.gpu_arch,
+        )
+
         # Build pipeline config for analyze mode
         pipeline_cfg = PipelineConfig(
             mode=EvalMode.ANALYZE,
@@ -164,6 +173,7 @@ def analyze(self, kernel_cfg: KernelEvalConfig) -> EvaluationState:
                 ncu_config=ncu_cfg,
                 metrix_config=metrix_cfg,
             ),
+            latency_config=lat_cfg,
         )
 
         evaluator = Evaluator(pipeline_cfg)
diff --git a/Magpie/modes/compare_eval/comparator.py b/Magpie/modes/compare_eval/comparator.py
index 4a3e8a0..fdafdde 100644
--- a/Magpie/modes/compare_eval/comparator.py
+++ b/Magpie/modes/compare_eval/comparator.py
@@ -25,6 +25,7 @@
     CompilingConfig,
     CorrectnessConfig,
     CorrectnessMode,
+    LatencyConfig,
     PerformanceConfig,
 )
 from ...config.performance import RocprofComputeConfig, NcuConfig, MetrixConfig, PerfBackend
@@ -72,6 +73,7 @@ def _detect_gpu_arch(self) -> None:
     ncu_config: Dict[str, Any] = field(default_factory=dict)
     metrix_config: Dict[str, Any] = field(default_factory=dict)
     correctness_config: Dict[str, Any] = field(default_factory=dict)
+    latency_config: Dict[str, Any] = field(default_factory=dict)
     # Winner selection strategy: "correctness_first" or "perf_score"
     winner_strategy: str = "perf_score"
     # Per-backend scoring weights
@@ -198,6 +200,13 @@ def _compare_sequential(
             # Build correctness config
             corr_cfg = self._build_correctness_config(corr_mode)
 
+            # Build latency config from dict
+            lat_cfg = LatencyConfig.from_dict(
+                self.config.latency_config,
+                kernel_type=cfg.kernel_type,
+                gpu_arch=self.config.gpu_arch,
+            )
+
             # Build pipeline config
             pipeline_cfg = PipelineConfig(
                 mode=EvalMode.COMPARE,
@@ -218,6 +227,7 @@ def _compare_sequential(
                     ncu_config=ncu_cfg,
                     metrix_config=metrix_cfg,
                 ),
+                latency_config=lat_cfg,
             )
 
             evaluator = Evaluator(pipeline_cfg)
diff --git a/docs/analysis_compare.md b/docs/analysis_compare.md
index 988bacb..2590066 100644
--- a/docs/analysis_compare.md
+++ b/docs/analysis_compare.md
@@ -1,6 +1,6 @@
 # Analyze vs Compare
 
-Magpie’s **Analyze** and **Compare** modes both evaluate GPU kernels (HIP, CUDA, PyTorch, Triton) through the same underlying pipeline—compile (optional), correctness, and optional performance profiling—but they differ in how many kernels you evaluate and how a “winner” is chosen.
+Magpie’s **Analyze** and **Compare** modes both evaluate GPU kernels (HIP, CUDA, PyTorch, Triton) through the same underlying pipeline—compile (optional), correctness, optional performance profiling, and optional 0-overhead [Latency](latency.md)—but they differ in how many kernels you evaluate and how a “winner” is chosen.
 
 ## At a glance
 
diff --git a/docs/benchmark.md b/docs/benchmark.md
index 2e9c00f..2471e02 100644
--- a/docs/benchmark.md
+++ b/docs/benchmark.md
@@ -2,6 +2,8 @@
 
 Benchmark mode enables framework-level performance benchmarking for LLM inference engines (vLLM, SGLang) with integrated trace analysis capabilities.
 
+> Looking for *per-kernel* 0-overhead latency for HIP / CUDA / Triton kernels? See [Latency](latency.md). Benchmark mode targets framework-level (server / client) measurements, not single-kernel microbenchmarks.
+
 **Execution:** Benchmarks use `run_mode`: **`docker`** (default), **`local`** (host / in-pod, via YAML or `--run-mode local`), or **`ray`** (driver submits `RayJobExecutor`; a **GPU worker** runs the same InferenceX → vLLM/SGLang flow—see [Magpie + Ray](ray-magpie.md)). InferenceX is cloned automatically when `inferencex_path` is empty (see `Magpie/config.yaml` `benchmark.inferencex_path`).
 
 ## Overview
diff --git a/docs/latency.md b/docs/latency.md
new file mode 100644
index 0000000..471d542
--- /dev/null
+++ b/docs/latency.md
@@ -0,0 +1,180 @@
+# 0-Overhead Kernel Latency
+
+The **Latency** stage runs alongside Magpie's `Performance` stage and produces a 0-overhead measurement of how long the kernel actually takes to run, free of Python, JIT, and dispatch noise.
+
+It complements (rather than replaces) the HW-counter metrics from `rocprof-compute` / `ncu` / `metrix`, which are already 0-overhead but measure *throughput*-style quantities (FLOPs, bandwidth, occupancy). Latency answers the orthogonal question: *how many milliseconds does each kernel call take?*
+
+## Two timing semantics — and when to use each
+
+There are two fundamentally different ways to ask "how long does this kernel take?" Magpie supports both and emits them side-by-side in the report.
+
+| Method | What it measures | Use when |
+| --- | --- | --- |
+| **`cuda_graph`** (wall-clock) | End-to-end per-call latency including one CUDA-graph launch's worth of host overhead. Implemented via `do_bench_cudagraph`: warmup → estimate graph → unrolled replay → median across `n_retries`. Dispatch is amortized across `n_repeat` calls inside a single graph capture. | Comparing two *different* kernel implementations end-to-end (e.g. PyTorch vs Triton vs HIP), reporting numbers that match what production sees. |
+| **`kernel_trace`** (kernel-only) | Pure HW per-dispatch duration extracted from `rocprofv3 --kernel-trace`. The runner runs in `--profile` mode (tight `for _ in range(N): fn()` loop, no graph capture, no event recording) so the outer profiler captures clean kernel start/end timestamps. | **Kernel-config autotuning** (BLOCK_M, num_warps, num_stages, etc.). Dispatch overhead is roughly constant across configs; for small kernels (single-digit microseconds) it dominates wall-clock noise and obscures the actual config-to-config kernel improvement. |
+
+The default `method: auto` resolves to:
+
+- `both` for `triton` / `pytorch` / `cuda` kernels — runs *both* methods and reports `dispatch_overhead_us = wall_median - kernel_median`.
+- `rocprof_timestamps` for `hip` kernels — reuses the `pmc_perf.csv` already produced by the `Performance` stage (no extra subprocess), since a HIP testcase is a native binary that doesn't import torch.
+
+You can pin the method via `latency.method` in YAML or `--latency-method` on the CLI.
+
+## Architecture
+
+```mermaid
+flowchart LR
+    KernelCfg["KernelEvalConfig (yaml)"] --> Eval[Evaluator]
+    Eval --> Compile[Compiling]
+    Eval --> Correct[Correctness]
+    Eval --> Perf["Performance: HW counters (rocprof-compute / ncu / metrix)"]
+    Eval --> Lat["Latency: 0-overhead timing"]
+
+    Lat -->|"Triton/PyTorch/CUDA: wall-clock"| CudaGraph["magpie.bench.do_bench_cudagraph (subprocess)"]
+    Lat -->|"Triton/PyTorch/CUDA: kernel-only"| KernelTrace["_runner.py --profile + rocprofv3 --kernel-trace"]
+    Lat -->|"HIP / native binary"| RocprofTS["pmc_perf.csv aggregation"]
+
+    Perf --> Report["unified JSON report"]
+    CudaGraph --> Report
+    KernelTrace --> Report
+    RocprofTS --> Report
+```
+
+## Wiring up a Triton kernel
+
+There are two ways to feed your Triton kernel into the Latency harness.
+
+### A. Import-based (recommended)
+
+Add a `bench_target` block to your kernel config. Magpie spawns a tiny subprocess that imports `module.callable`, materializes inputs from `module.get_inputs`, and runs `do_bench_cudagraph`. No additional Python harness required.
+
+```yaml
+kernel:
+  id: triton_scaled_mm
+  type: triton
+  source_files: [./my_kernels/scaled_mm.py]
+  testcase_command: "python -m my_kernels.scaled_mm --check"
+  bench_target:
+    module: my_kernels.scaled_mm
+    callable: triton_scaled_mm
+    get_inputs: get_inputs
+
+latency:
+  enabled: true
+  method: auto
+  primary_metric: kernel_median_ms   # rank by kernel-only latency for autotuning
+  rep_ms: 20
+  n_retries: 5
+  estimate_reps: 5
+  warmup_iters: 5
+  seed: 42
+  pythonpath:
+    - /abs/path/to/my_kernels_repo   # so non-installed packages import cleanly
+```
+
+`get_inputs` must return either:
+
+- a 2-tuple `(args, kwargs)` where `args` is a tuple/list and `kwargs` is a dict, or
+- a positional tuple/list (treated as `args`), or
+- a dict (treated as `kwargs`).
+
+Whatever shape it returns, the runner calls your kernel as `callable(*args, **kwargs)`.
+
+### B. User harness (escape hatch)
+
+If your kernel has multi-step / multi-stream behavior that doesn't fit the import-based mold, write your own harness that uses `magpie.bench` and prints the canonical marker line. Magpie will pick it up from your `testcase_command` stdout.
+
+```python
+# my_kernels/bench_harness.py
+import json
+from Magpie.bench import do_bench_cudagraph, MAGPIE_LATENCY_JSON_MARKER
+
+def fn():
+    # ...issue your workload onto the current CUDA stream...
+    pass
+
+stats = do_bench_cudagraph(fn, rep=20, n_retries=5, estimate_reps=5)
+print(f"{MAGPIE_LATENCY_JSON_MARKER} {json.dumps({'stats': stats.to_dict()})}")
+```
+
+Then point `kernel.testcase_command` at it; do not set `bench_target`.
+
+## Reproducibility
+
+The runner sets `torch.manual_seed(seed)` and `torch.cuda.manual_seed_all(seed)` **before** materializing inputs. Combined with a deterministic `get_inputs` (use `torch.randn` with the seeded RNG), this yields stable tensor shapes and contents across runs — a prerequisite for trustworthy autotuning comparisons.
+
+If your benchmarked function depends on global state (`torch.set_default_dtype`, environment variables, etc.), set those inside `get_inputs` or in a module-level initializer that runs at import time.
+
+## What the report contains
+
+`<workspace>/analyze_report.json`:
+
+```json
+{
+  "summary": [
+    {
+      "kernel_id": "triton_scaled_mm",
+      "latency_state": "SUCCESS",
+      "latency": {
+        "method": "both",
+        "primary_metric": "kernel_median_ms",
+        "primary_value_ms": 0.118,
+        "wall_median_ms": 0.142,
+        "kernel_median_ms": 0.118,
+        "dispatch_overhead_us": 24.0,
+        "crosscheck_vs_rocprof_ratio": 1.20
+      }
+    }
+  ],
+  "results": [
+    {
+      "latency_result": {
+        "method": "both",
+        "wall_stats":   { "median_ms": 0.142, "p99_ms": 0.151, "samples_ms": [...] },
+        "kernel_stats": { "median_ms": 0.118, "p99_ms": 0.124, "samples_ms": [...] },
+        "per_kernel": { "triton_scaled_mm_kernel": { "median_ms": 0.117, ... } },
+        "dispatch_overhead_us": 24.0,
+        "config": { "rep_ms": 20, "n_retries": 5, "warmup_iters": 5, "seed": 42 }
+      }
+    }
+  ]
+}
+```
+
+`dispatch_overhead_us = wall_median_ms*1000 - kernel_median_ms*1000`. For typical small Triton kernels this lands in the single-digit-to-low-double-digit microsecond range.
+
+`crosscheck_vs_rocprof_ratio = wall / kernel`. Magpie warns when it's outside `[0.5, 2.0]` — that usually indicates warmup pollution, kernel_filter swallowing the wrong dispatches, or another kernel being captured inadvertently.
+
+## CLI overrides
+
+```bash
+python -m Magpie analyze -k triton_scaled_mm.yaml \
+    --latency-method kernel_trace \
+    --latency-rep-ms 50
+
+# Disable the latency stage entirely
+python -m Magpie analyze -k triton_scaled_mm.yaml --no-latency
+```
+
+## Anti-pattern (do *not* do this in your testcase)
+
+```python
+for j in range(n_iter):
+    start_events[j].record()
+    mod.triton_scaled_mm(...)        # includes Python + JIT + dispatch
+    end_events[j].record()
+    torch.cuda.synchronize()
+```
+
+This times the host-driven launch path (Python dispatcher, JIT specialization, autotune cache lookup, runtime overhead) along with the kernel itself. Magpie scans your testcase scripts for this pattern and emits a warning pointing here.
+
+The `Performance` stage's rocprof-based numbers remain accurate even if your testcase contains this anti-pattern (they come from HW timestamps, not your timing loop) — but any latency the *script itself* prints will be inflated.
+
+## See also
+
+- [`Magpie/bench/__init__.py`](../Magpie/bench/__init__.py) — `do_bench_cudagraph` and `LatencyStats`.
+- [`Magpie/bench/_runner.py`](../Magpie/bench/_runner.py) — subprocess harness contract (env vars, `--profile` flag, marker line).
+- [`Magpie/eval/latency.py`](../Magpie/eval/latency.py) — orchestration, rocprofv3/pmc_perf parsers.
+- [`Magpie/config/latency.py`](../Magpie/config/latency.py) — `LatencyConfig`, `BenchTarget`, `auto`-method selection table.
+- [Performance + Compare](analysis_compare.md) — how the latency block plugs into ranking.
+- [Benchmark Mode](benchmark.md) — framework-level vLLM / SGLang benchmarks (separate use case).
diff --git a/examples/simple_hip_test/analyze_hipgraph_latency.yaml b/examples/simple_hip_test/analyze_hipgraph_latency.yaml
new file mode 100644
index 0000000..61ce774
--- /dev/null
+++ b/examples/simple_hip_test/analyze_hipgraph_latency.yaml
@@ -0,0 +1,35 @@
+# HIP vector_add — wall-clock latency via hipGraph helper (Phase 2)
+# ==================================================================
+# This example uses the C++ ``magpie::bench::do_bench_hipgraph`` helper
+# (Magpie/bench/include/magpie_bench.hpp) so a HIP testcase reports
+# wall-clock latency on byte-for-byte the same algorithm as the Triton
+# CUDA-graph harness — useful for cross-stack comparisons (HIP vs Triton).
+#
+# Magpie's user-harness sub-mode (no `bench_target`) just runs the
+# `testcase_command` and parses the `MAGPIE_LATENCY_JSON: {...}` line
+# the binary prints.
+#
+# Prereq:
+#   hipcc -g -O2 -std=c++17 \
+#     -IMagpie/bench/include \
+#     -o examples/simple_hip_test/vector_add_bench \
+#     examples/simple_hip_test/vector_add_bench.hip
+#
+# Usage:
+#   cd /path/to/Magpie
+#   python -m Magpie analyze -k examples/simple_hip_test/analyze_hipgraph_latency.yaml --no-perf
+
+kernel:
+  id: "vector_add_hipgraph"
+  type: hip
+  source_files:
+    - "./vector_add_bench.hip"
+  testcase_command: "./vector_add_bench"
+  working_dir: "examples/simple_hip_test"
+
+latency:
+  enabled: true
+  # No bench_target -> user-harness sub-mode. Magpie just runs
+  # `testcase_command` and parses its MAGPIE_LATENCY_JSON line.
+  method: cuda_graph
+  primary_metric: wall_median_ms
diff --git a/examples/simple_hip_test/analyze_latency.yaml b/examples/simple_hip_test/analyze_latency.yaml
new file mode 100644
index 0000000..8da20ad
--- /dev/null
+++ b/examples/simple_hip_test/analyze_latency.yaml
@@ -0,0 +1,37 @@
+# HIP vector_add — 0-overhead latency via rocprof timestamps
+# ===========================================================
+# For HIP (and any native binary), the Latency stage's `auto` method
+# resolves to `rocprof_timestamps`: it reuses the per-dispatch HW
+# timestamps already produced by the Performance stage's pmc_perf.csv,
+# so latency adds **no extra subprocess invocations** beyond what perf
+# profiling already runs.
+#
+# Prereq:
+#   hipcc -g -O2 -o vector_add vector_add.hip
+#
+# Usage:
+#   cd /path/to/Magpie
+#   python -m Magpie analyze -k examples/simple_hip_test/analyze_latency.yaml
+#
+# To compare against the wall-clock harness for cross-checking, change
+# `latency.method: both` and ensure rocprofv3 is on PATH.
+
+kernel:
+  id: "vector_add"
+  type: hip
+  source_files:
+    - "./vector_add.hip"
+  testcase_command: "./vector_add"
+  working_dir: "examples/simple_hip_test"
+
+latency:
+  enabled: true
+  # method: auto resolves to `rocprof_timestamps` for HIP, which reuses
+  # the Performance stage's pmc_perf.csv (rocprof-compute output) for
+  # zero extra cost. Use `kernel_trace` if you only have rocprofv3 (no
+  # full rocprof-compute install) — it wraps testcase_command with
+  # `rocprofv3 --kernel-trace` for kernel-only timestamps.
+  method: kernel_trace
+  primary_metric: kernel_median_ms
+  kernel_filter: "vector_add"
+  timeout_seconds: 60
diff --git a/examples/simple_hip_test/vector_add_bench b/examples/simple_hip_test/vector_add_bench
new file mode 100755
index 0000000000000000000000000000000000000000..cbf5d3818c7116bc6c5a8e6ba6d3322cab11a002
GIT binary patch
literal 41536
zcmeHw3w%_?+4tlI%PpIPOM<9uaKWIkY(mI|izbA?S=oe0KoKv?vb(vIWH;U2grKNk
zg0hAvwYJ4xtd~}4ZIxDQ(V_&fT%=XdzKT^_>UFpALQyfIlJEb_nc19d_E6fs-}ife
z@5eySoaaCDJkQKC*K=mh?9Ju&`H2Y$Ou9sN2_tITbtVp#@GCCgmp6=+up%}L*K9VP
z4Fa5uzY-B#!c6>5`gMtFfb<O_UJ`ub+bY7hCYpGN^v~rKW73x?B%k>DoJriO{?<yB
zls-#SY2r)4-!>60^B48=)Og#3ujQjWu2`>zuejgDZ=|0l(tls78Xn8HrA@`RJg)Ld
zf1AjVN$^WPB1vDX(ARpu$|wCpf=~L=o{QfNy2&nQnoc2xRSCW$^T7;vWWK4Ne>bNX
zlfJb5<=~@q{;Q7o>F+s_@8?9hOWz~}N?+=m)=<B)Ab(nedul_yx4Cs{Yf-_}f_zKB
zXUXI8sF*2F3o4g#2~@wNX*r;CA)bslVIzT=;@}g2(O;}90uT>hpo2Sr$J4t;2M_7s
zn{{w%uJPpmL<fID2j8KCKc|Bq(7})B;GgK=1{6%Z^q;ST=jq_{b?~J+xLXGg=-{{M
z;6Kp8AJM^orGr1EgCEer-_pT9)xp1sMiS4?H|pR&(80Iq;J?+u_vqly>)?lV@X=`K
z@zQ^;4qmK--=u@L>EI9R;Lqydf6~Dd(ZR(_|6JrZon^6g51P<I817WJOxVG?*(4Cv
z+ri_X%eMVS#pSs0q0n3Mt^V<;cT8YO=sS}zE~dbzYYN65x>vEmjQ$ZySGqUwctb!9
zWkWy>0j>TUiWz|c=x{cIbC47Rk3R%@5}6VCtU^-NemaF&L9d9?32Z&+`!w|T`q4@3
zmx4wV=D%=|kw$Y9Fp?MG^k9|)np|-5UC6$HT`I!A+4DVEXDC}J=rx=k!Ilg9<;PZG
zYA}S=%kaaG4Wm&w2%22;IrT#ra@HZ}s!3}Qp2U9IFFcutQ+n?H)^{;94r9L+@}A&y
zBHJzElT0*6mM8pk;IBhY60&m=2>bOL)Bo#F)%UgKoVV%RHMUhxGKT}BpUV*lI{iUM
zqqE)%q_)xL6^H|*-&0#32zvaE8o#sA<EZ!6_<*`vosOD%ud|{4dJjmqCw{cL`lc#h
zQ<<}&q0Hy4sjqGJJA?H;F9U9OHhW!l3q5|Xr-3yhhR;QCMc<vL(bcquu<D@S<7_O$
zB?x89-3?VOjfBtldpxX$E{>(%)%9MtW3eaD+(-#RXjOBdF0N)(T=g2L_xrq*fvAqT
z$gH=_*%WN{YtVA9TZ1j|JDcikUf9eVtfv%#yS&BY4T=Oxa<Rwd^SddObEx@^MlNq~
zHU&KHCG|+Ij8fj}an17tf_@**HfNKxW4U*ABWv_DHu_rnR7OL2fpF6yMGjA^%hMzc
zy}%PJ_xpW*WV_y53;R`82Xm+Un~{2yr-y>da?2Ml1YAf3+nSr3o43*#sHb{CEH)Py
za$R*!za!{()&~PNS9!G)m-2E*TVfj^F0PJwx#caQ;sjy$)reFb%(FTioN_dHytTnP
zH91uaU9yUya5cGULUMvNtf@H|V1c09*BoRuH4P|dP<}{I75}TbD0WmhT)c9XFK*;E
zrg}%o%Po#3&sPh3aAWb*krDqLJ<g^kkJnue8I3IPO@C0eLXlLn(&=_M{eI^fhu5=O
z*o3Tx8nRfl!}-qOTr@mCQrHhhYvLu&(WPG59<|YHTU#AX9)G|`?KZf^(P9;5MxAM`
z4>I%x&PF$Msq^1Ls+f*^%!5^bIz@olT^C<W)8hiRz;2scwrrVWx+PzGZ^_H;PZl%8
zDxGU{<XWa{ZY{+muu&t+k!Q)X%pkPZh1y3U5LH$N4OT*3{p02ewP2f`&!+jBg46sy
zSL4(|NXeynJ-R%mJZK`1Kl@PMC|#8XM<*V2%QQHKxTss9!8P42tJC1<?4wSOuY^(G
zE1eui3D1`x{I17MoEnU)R_}fduGM=;gKPC3(cs$WsG}O3bVwK0;BsuIuwxn=gH+V@
zXz-y?6gN+pI7PBCh((=2gAb3Q7+*E`2o0X4!R4bb1)DTD2I;8F(cm;jN@vyJ=@Nur
zq`@;Zc!>t5XEf;)4L(YO@T)ZVXbrwhgOAbRD>Qhf2CviLaxO-}O&WZxCVs01r}167
zbsBt}1mSPg;PRaZ1#i~i=V{_^(ctH6@cT6Qcn!W)gPSz?HVr;qgKyX1*&4h{gHO=l
z`!)Cl8vKw3H*4@C8hoM#KdQkeY4ETHzfgl8)8IK8yhnpi*5FLckE#A&q`?gu{9+BB
zropFZ@GK2JRfC%}xJ85KXz*zo+^WH?8oWq@&(Po{8a!8n%Uhaj4|LWt6|<I8-=Y0u
zjKZwtS{~|uNecCE)R$@hK%x^|R>FNYO87@FRq3k)Z4-We$vTIL2UV0VsIrwi?4?V}
zE6c8OTvolPa;E8q35`CtXTr=0G;-C_gl)o<37D}30~2N<%;T<idL4}*CRz$=rc7v>
zVbxN_#Tu%yUK_K~*{TV{V5p(Im@7A7jtZ&nDHDjo^vOd?Kw?@~-$-+_KCJ;~V^f1C
zz>%xjjjYAv3i|vGr`yfg6hY0LiOC7(CNw?9+&kdGlwo3kQq2@wm5EPCW}4WP39I--
z+Tq0<9oe23n82>Rk!fdfGiR!Ej~I~VK0Zul{mrhR$=lq}VDkEcrWQ;P+?bo>TKf`J
zwXmkz>Zq!zXRNn3VGMf*CHYR`u%k)r_N1|2Cb0*TZlnT1*jp*=T?`HH4jtNE$X*_n
z^5fa;<6$X3naw(er~GU-`|<FUM`yEL!wrwmX3vdC`O7SJV8jZ>ewLc@b|HH<HRW|M
zrKY@G$ik^92MgI}sb+||$5_JHYsS%^&te}LGoPBpHl}57naO^ZHj1%l(lU<EVlSp;
zJ~WF((y|W~v)j`F-<zKC;ViZ-J@ebM*z@Vx8;e;t!M*7j;aTik8JX*5u}3npyNcK!
zG5{aV$as4eJD!nwbSB$0GP|dc{d(lk*JiODBQe?>9GS7Fki9uF^PU3s`N-@a<gzWJ
z!1Iey8NWl4M`gAZup^_gU$(L@2<{x6@nj+U@#xGCXRxP6XLnfHpGO1!XmrM}3fa0b
znXk@Z-yf5`XBzw67{D)%$@oPf``ehzT{GC1W3s=rupedu-j<p1a3R~Bnfb^J_C{v*
z;}-Tc!QaZtc%YCyn3egh8SI&??Cz=TFIj+(XJ!1TkhPD^{5+rCJ2v~lsch$1z%P%@
z*iy*e9h>=PKI<Ku{rVJk-#LJvI49%Vg>3IRnS1it(Q~rDJ%z0w2lzYVGVUm350A_I
zSw7n}F8koc>@9*%jLQfWvW@3v-kHxHI5)fPVz%d8z^|U0(O$?R=VpF3owc2p{kx0U
zPtF6p{k)9c0`~lQnXgZ0@0^$Y$z-<Se8BgdpK-E)J#v2LuIcRe=Vw1MnY~Bw$@4Ql
zD`1<(XFfcgJv2W1?HsmmJmA;IXB;bFACAxLoX&1DW&a?DZ8ZVjVan((U@w?5KgnZZ
zQ})XjvQReQA7y8}UBDj8&U`J89mvk^xRCvw;8WQduNSb*6Eb(^v7b-K-ZP0EoB;T*
z6Ea>dVE>qq`O`esaY6Q%6WK2=0Niy!#=!#i(gm3vdF+D=vLBzwI?b3j+-uHg%VXQj
znQ!N^=gryOX4Y-~BaPpv=e7kYpBh+j#0djipHRx!?u4tqG_W5fqR)CG(R9>+=XL{t
zZzn&9j_=77x_>rh3p&473@NyO-H-y#qlOAdJL}F`;H(ACTHve&&RXDqhXovtGJ9#|
z0>`5H^X-dD=Q-vst(<2scQ~R?5!#1!WLZ9I*7*WKT7-2J<WFt(uJZa;d&SCZt+#oq
zT+QwhkJj=@6!ZL+{Hb_*RqOJSM}Ktz=hUgSHLb-nw9h~PJFI&)7iTSS)&gfOaMl86
zEpXNXXDx8n0%t97)&gfOaMl86E$~0X0ynGs|4V3%k6aQv%J-(J6k{dgrb{fsFTy4U
z+W$WU*EE*O_iGTZj2G+o&|)H-^yS_Kf~8I#A@(9SGe}>&haf(_k3!;d(OvjmLa6lR
zn+W2geHL9}6GN9+w3U97&|i`$ZZ!Ic&nh-KST9k-r7vRl*-Y?}m^0BQ^-8<Qef(#d
zPGMg5pYF#m68VnxmvCb<CVg3Ew0?dG!c6c>E9NbzTFO}4qw4D%i@M4g_|{)o#v>MI
zZId61m)OY0erj?0YXfhv^!7(BSU>f90s%*$$?5VuRNLTN>1=q|;;X3%cqn238~zNB
zSg^v`>~XBZromsTQ5-9qYid0He<Hx}p9)C(rvln!Vzbn!$=bwjU5Ra5l=NsUQ=C{&
z)YZG)9<O61R*hGwCUUe!Im~Csu_nrqbA}w(M>(db9EsYrKQ2;L>-RM`Ij9^oiL#s-
zLtmP#XU5Q%q`Wu`zZOaIdua13cCl(wlz&F?`w~=eM)CX7qp0!w)x{^JiFmDbsFhZe
zrm8dM=}XtLGv?__+ln}O*(THGuXj7#^^Jk2ETSYmwT;;Riv7T?9yc#n_9SnqXp@eB
z2*iehTCd?Z18}Mae@rm_%+lcW);2q9JzE!Hhh&-ERQ9VV<Y>XBMQj_sE0Jxt&<>26
z24~Q*+UH-@SC#&ljJ8d?AAYN|YvFem>>h2Q9T+l6n)E(m2}C=S)L*GYpsBv0LG&*V
zTLNnuSNa;B))8)5<=$>-_SV<<{Ea+e*mNQ*dBWqC<^Z;NxYr<gd{eE{wQ4sSMpUEW
zS1O^^_?Tt2v&B<`4az}o(vn|_E)ZL_YdyiIK*fL6ExSr-vjjQrpg)c0sGrYBLa!^v
zxlJMxk?Bc;SeF<;N(7!yIQG2DaY~L8<jX&GUs^JK2r!DUwe7DcNB*=U^33Ct?x%j9
z{wP_r4=A4FGIdO?w<`rjpzeruxSCNxS9%Hy^YdKOr%!hkV?-#9r{;_>6Hg50A!7WE
z$3#&|*ki06z8r76SYj97*OidiMf>`0?%K37sW<7?-d^NADG_y<)g`lyH-5u(%d=zH
z#+@u-XcwZfb!<=<sroTXoWR(fFWmx55ih=acqy4Bz^I0o2Ek|Rs@_OuHxD8oF;IYp
zmtgSZCgY}^gN%r_uIUYycrzre-wCN_|IXUrY#lgT2hO+-&@%?P%`_p0lY6@59%V6o
zXC<P;lY7Cd)adN9Q@uXAJtp|6&nK59J4ls`UNP4<xo8q#5+j!>FVAANSZnKprkuK9
zuqiNe+B6V#%_}V~U*ojJPPfPB<&y<pBfbdnOlxRpX`I^R_k9EVC{6efK>ZHDlviLa
z$~V=_C|YSPaJxLYHN~#n8HLtD*NmDXYf+8cYR&WH7P<1AE>}&VXEI~<dM}<cW|}BP
zcKbY$4pYvw2A{VUQ^Y1`u#RRLj(`szuV5zPZ)yxUT-Z?>Xl`oq;Y%t!d8~5cOQKqY
zd+>48%H~@B;gkdO0|)kUI(z{K5?WJ_?Vr;EepkPwbK+k~Z`6cwZ`9%IAHA`G>K^5P
z^u`v)LLPBo{04!?#=lG8VwjAod0a=%n>zRqE?}VEn+Q)dF!?+!6-)yjZ{D&1c)WSb
zO2XsJTbKnKUOmi$=^5sD%z|A8mO3XsDH2!0gMMc5``pf;lUeYoOpwQ>$sAx$gWrb&
zcf0B1D$B}1fLYYXRtwGVFk7u}Y;FXII!m);Ml;Evb0z#b$W`xDUw3AsnYgOcYn`c)
z#oOHIplPM9B#J<f&8Lg765sM+8h^T2d_LRa!dF`HcL=K!UOr0?xbVrJt4=p2OieRm
zo-b$(qUD@6uHWNo<_`mUvGnkNHa~RCyQ;FZ!iGKT|2>YXEp>e~&n2h#{zDBN_|A%W
zsn3=CvLBZI3c*PIFzJ%*QufdEjm(MOUZ3Qb{jl_N1iuWI`~v2W10-99%YIw>vHY@s
zm;QM0()?cX%W*>ba-5L*C=Xh9Aqc`Gf0sz0^h>mnaUV<5w-CfH$1(YQgBOh4b;*RP
zq#9Y?!>4Z}h*v(N$o@yZKWh?k<v1wqM{{>_^g53)$!`@OZOHe1@|+EdOD;D5v@5tD
zzkEL^-WSHkkEIvKkx6=_e!ADXSpHfB5@yv#B3kG#j$x3b^vOQ5?6e}9sA9quiHJnf
z+(Ft;*56iO(ssCwx~;;%w?`56M(UODEx3utpC>*Bqw^KCE{0_Dt85C#OeSqKmM|`9
z*4;oz%n@t^-v-}+MDK^61^*5B%iw<ne=GcRA*2Wo(^*K&GJKfjhQ9@mqjau85B&Y`
zb7rFcB3{-k#v0&nhrboRG8^Z$pd;uhfiBQJrHF(0t>utId<*d10(^4V!p>d7^@~|T
zYg)p%;e!pE!9zHG_hmv_CsB-fgSiFiLeKI=z-1k#cnaabir}--)U;a@%SH?eKp030
zufyLGgkJ_f8lG6rBbWef#9uqYdHKvsU6+v9oVqb#Uh3uqdn(Du!QU_NcN7Y`;j_7^
zX>(JvN>fd3$#*2(nV9%yY8HnOnbL3se?Nhp_&h_6J2%x-nwnFZYHdr|r1G>9X@VG)
z6G8Zb26;#R4X|wp_XG4a_r*3L7^Y_P5I^fu#y-Wph!0(JRb7KNrF6;^=5Sb}O|@G)
z_>X{pEmb_&jwgut=cZaqQ;SMdOWFo+8q}F`NAjIXw<adJ5(%^NaKs{UrKtubwL}3s
z^$)|#82c&we=aL=cRW#0sFpwK$k77Eu7)T!FV&j3IMqP)BA@C|p*UZrA~kIZN{Dbf
z@RE7`&yS$8Czo)2m7UkH!AW%VLz1j#lx8#jvamDL1peeVMw?=7xFEGFp)G31q`e7b
zmKa-hA-`$po0}l(1FDDE7@FJzSp)64Aaz@!)}Tu*tlq*BZ?!O=CGGOm?TOHrXhWn0
z7S>>4EeKm{Vc)Q%Ez*S9E$mtgb6Hr0g)O!)r-fa^|1YsH4;a2-VGs-M0Ek=kxmO_A
ztKnS<x{cFUa9#*e|3eKDvLt?SZA+ZX6N0EnbG3ynvaqXU(*8r_N()0?)m;9EqW-&*
z^#8LQREwChKqb4{!W#bvT#=H?EG)^IXkq0RrdZfQe#MRYAK<_AA6$v3d#A}AG$x`p
zKq_jyRyJA!xZ$es*N%Q+e?H@9wj>|Tiq|L1OI@Mu5|ZX5$8TvXNT$|(Vm$w3?uWL}
z;p>#pTT0tI$Euc;=XT{DP&((7qD$%>9YBZpX^q7=zMZpC{5^l@b4!)byC+gAu0RhN
zOzggJZ^~S{RXS1@f=_AdVZo6Qbw#+hSLvKF4PoJnFh#v~kCdnKFBSa7ds8kG{39j*
zaHV6;{3T#a03!q^_62m#`TG6d-f#~rb*)I0()K~C5?X6klu*!IqJ);1i$ZpDveG$^
zpn2vhx&uDXyo~OO=xznwCDUD<(z({0qjUz%SxV;;bDGj=HygrRAz$eTnsYjqn5{~O
z-JCVAW1iV$M_|>?6{Ae7H)uBCFDrZp#a#c~Mwn_(jd46uT6$&a6{Sl{msBh3KiCX-
zCx#dnMpwv`&>odQ=gn-Py){N($_tP8_HIu`G!P>WgMdH!9@Rg><pDTaV?6aY<JDo~
z)$bV78`_Q;)6H#w-v<1S@#+uYy@jAsgxz`!SM#kvcH{5QN?VsfnUjnq{O0X;%ILF{
z7<;HUd;+WY_D;9iB#`&R?{K8SY!%4I;g^6Y9pHMJxRlUe=qN1~J}6)lhwY&gD*Ob&
zZFAaDaIAUg&Mm(&vG6v+_9dH%u%nC&_K@40W)C%(vnV+zi}xro5wK8V5Mnkf+vTUY
zR6c1n`{tRm+-?P)fu_%U=b6tzS~%bO@sz;XW_$7b!4XtmX-e<j@IsmF@Z~b4;RWK(
z8lFRUp}&U9;oC!>t28p%s%+>DrYRjyn5it4$*00eh>_cc!lhEYy7ZdTYfG;yb?iYY
zN}U-|ov&gR)}6$I;)&XhU{ZP%{!(oP>49il=!6p5VW#;X>^H9(eshLuzb3@r&}H24
zDkkAJtemJ%Dz=V6_6|cu=fe!o6!y^j&r-42LnnDTpe=_(g6&C?-Z9@`>!?J|V5%Xu
zjw`a1jtsIa8pkH{I>b;ycbm7u-`8%YoMKkOqK(I-bhew<0V57HlXf1H2rCg3p9mD>
z`_zWe*6udnM_}hB^A`Bw>n;Y-k#9yrGEhURqK34L8j?wENKf#1y-#q#;Y%rc+jB_m
zoY{D=3%>}O4A5o(9!;QnzaR3+A#NJtaw~)%6%3Dq;deCMw|D-6Yfw5H%%<+4HJf`6
z8ru&cK(%5#^IgVeG{5pMm~NiA>CEkh$Z?Hk4@JV3izq{}_Tr|pg(5H$Syfsqog%|p
zk(^%iwjFbZTn@VP30^GWClE-c8%m}_wdQ5jxx-9m%Y7;L09BOxM4SaY&YS~PF!?nC
zyPGgH(x&k30(qMwJ9q;LHw$nYUNcZ9(WG?D2oZmS2w2SnRDB3*q_7ntY=;Qjh_oQ=
zatg~AVF^PCn=^;7LzC4!R02ckKY{4vsM$f^-D*=so+uzZQV6&Okv}n}snI_%F5iFL
zc-@FQ9_{TtZcIygi2K*=I&PfO)z*{9n~bqye~t0VuH(k^P3EJJRAaR4+8)NEjdAWS
zRPMD(hnkD<E|Hw|Jc&D0JMR$S9|=SvyC8ua-EUWKDL;*iWECAI7`_HWU+uicZ0cw*
zukAMU{?XX}3i$Nv=g)J<D*sabY&ZXjC*{nWHpSPir5o|mO1GB27E?=CQDmy66VzJj
zssxQ%+64{aRGt^(hReB0RZu~RlI3JAJqZC+lkebh=3D^&@RI_29T*i}jd3y!^%x~q
zfRpWnSQgI3)fRe=vQb5sWwy{hTid=I8j4o%62Az=i*j!_A4B|SsJ~D;uQ4mO(C@p?
zH;!KinjOc8TFz{3zsKO!U|vSWb1gbtS?9k~0Y$AMHQz*~q1r;JN4;J{VGSbe(?m6F
zqZ)QCg_Ve~)jVvDjj%WH@`7@k2n^X#C=LB9JO+{r1>+i|1a%$JP|;C?uTyhUQT(-W
zLmA9nG5LLEdk=2lZnhQgGqz7d678WLWmiuUCTj^wXqVD<7`YymV0%`L*K4#FANMEO
zL&v#g!lXZ8mp{oCdgZdt*`HB*!mlEc812xa+RYT<_*`Rpa#6*G-kVY}&e>^rJ{jJP
zNR^#aW~sJ&-yS*@erzJ~U1P4|wyf-&{RuA%Tj&jLcNllWgQuu3qkiWuVTol!$O59@
zxlw>W5g?g5oFl+v0_-C20s+pZYQqO7$T)%!j74joqO#`6KT2`097`(JqMBe9z^4)`
zXbQk6xgG85*u4I^9z=~!1g_nK>0BBypF}LwEItO_ZC(fHS?6dI3%`V?dX!Z+3F%yG
zHeuGHgm#Co&cbx+Hwly^6U}~JM|kds&(b+sPt&24guM?eG6xBlI=-vXvBa#X3cjf-
zxIIR}Nx)hK*T+$CD=Byb=8_6##8Z%_DwrIj;GP%-W8)~8ND6MyDEO1P&q~o8JT``#
zXg|WVCZczB+8k_0c&v$@0~R?)qaYAZL8YpoHb%j#fVB#)jHAFs3Z62cGby3cbc2+}
zGT<H)cwXX5%vEY>ygi!d<*N(&=cS>as?Wn8AiRHGJ`eW*i&R3DRPZo>0SazW75pGZ
z!LS$w9dQ(Vj}&}3xKF|ScnY#r1v6t5{2Xwfu|5x96h}b;DcGb@@a}{-Ip`Y2P4pVV
zw8d?UQE&j^u_pR2V3CV83fki-aHtAeV-$n|YjaQ+M?sJjJU^(<L>I(Ua0*AJp-pGT
zD7Y;~!QeOwMv{Vhje_0TaZL0hRl$=8)0${fjDm*{9&4gs1B)0m3Y_s2%vKdFjZyG>
zz*-ZP$5C)ODR?-g&qO%Deqc3yJ%gL*Q&ayOxMLK&gYeiK90L}auTk)000Y{=^{Rqz
z$0$gOQP37g!JVYw-Q>O;ToO;gSXDt@jDiOMYfUsEjsgoQpw~CDCBKPh)B!ozj^{bl
zqQmt3su>wdV-)N{c&v&32rP1eM#0*63a(TYG{q=53Rs(i6>$_akb>PweK{BxPr(U{
ze8_<@M#0T73VP2Un1f-Yz^PI2G$IW!(RWk@k0MNKqVr=E+#5&1&w)i~O-z=-Rq+($
zs|qS&6g&f1Yoa-E6xc|?j}!Y8^x$B@0XcXXPv<bv2ME*V;F=f(Zy-Fjki)<tmuM7x
zFP;Lws-PoA!IyxwIk+K?g4;;Jn+bh6DBudzIZSkPPFEFN6r<q#{S@?u&l#YgH#~_H
ztko#^FRZJ`Nm5k7wp4DS0|?Was5nN!QwZ;GqTcZDfknn?6a?cbSfnbbi&1a{aG!~K
z!^`3*aFYVO;)>Ryk?|CKG=k?~aEyZ0F$z8(H!ufD;OV|b*3sV}#(*lcS(W}XglX+F
zwx9IR!}ko3{(1PP#6z#DXyS>tomQoIpE|R((o3>Q=O7rb&L}7?8pPk94MR{#`-jF2
z%P^Eg+`~>Q=P!KX99PaB1gUZ`v55>my`1mr%eg_yp&c2Koiv-*&sU|soas`|5zHbY
zt4}XykiML5n!l5Mwve1@r<e0ABJ0@sK`Cb$$@z*NUG($SqA!O(Pf&VBlbk0`FUO)U
zXQWIIwq3JG!|CO`hZ(v~z7EplozinB$uXZ^&b|6_+N2yi$$0~Xe;WH-p)Y5alrx0n
z@Yh=Uc23oo^9jwfDPMarL5bK;pPs#ltYe=?q?|UAlYDwPH|fi9NjW7XX9o)ZH1;Xb
zmorYv`5Lo|NbvM>KEhO8CtokqES~K1G|8EAdN~j2%juAER+F4}P-v&g*LC`G%A}lp
zlJmpU%Ne6D=S!M;Q@;LzDMe(->E*nD$U63UT*`Tr<P1N(oLlwftdeq8k(}pH_@}YY
zCHivcFi4)S9Fnv4^m0yMa;=lE*J<8O_6cLc63LfxG-E>FI>}>*%hyRFRz&9M?u%z^
zw~z$-QHTK{=TZp$e8hl|p%n7Xcp>z~Y~;!~AsZf|kiX#)D{aFj3i(645I=?dHeSdw
z3V9%2$R!lg885_4A+7O3hEj+tUdS;_Mk1B*LSCVe;&>rDD1?*^$ms(VVvHBEkwQ-5
z62sH%k3{02*F=uQLFY&I#X+r+C*z>!M1DFDHMVadP_F7VW8zQibsypxiI&p70~+az
zJdENepQhvF(+`!9f2Znq9|X~jH#K{@chO}BU4BcKU(@B6blFOm2kCMzUGAaF-E_H=
zF1OQV16|hP5^2RtQu3RGKTYVnQ2678pQ-xx4!5}~LUj~OdXuqx@JOK3xNf4S1!`RM
zvqzELvbf?;>MqxpQy}Hkk(}#KFK39poR4T~PU)FQa_BdsPLr=)h^&*ZhoqdhP{@(L
z%i@opuQmE|u9I?pOmZHOa{4-zU~{wz<soi&4NtEoe0g_a4AeZS`?6SQWcTbCC}@oe
zehu^S?rem2o9Qy1E?8sT(>;PN2D;GNFumBdz_b+`-l3J|pm~LTGG3IwAHE0tRHt`w
ze^2*ch~qW7{E03v(&c%%e**g*a<HB>aRzuc^fr&$*)<<;CvT-9qj%6L!x4bB4P!bc
z_IN38ewOwtJiTYMi9JP^((Uw?zdZLq*aeoIht`>BaV94+1CJVfzX4<)$mg<yqpQM|
zBzIrR<3PwEszJLM1lvMrIlEd3ePRop!u#~W_*-Pa<;pZ%99g*3nsD*ubgVAwxN<gb
zS6g@1AxYZ?hjURNSY$1+70*|K9t5$S%aO!q=iss}A)FzDJ1P}rGS(@+7~;RQbSLT@
zehnqMKLtw<X~{h*<0UF1m4~f&9~UincYTp$D}K@c($4jWf|buF$<%E(;8bIy(lLE!
zql(gR5bm_%KKu}ccg%TlI=%77vVY{K0Qe>tzI?x=y2{>Ji~R~0DjQA&gM119O9kE2
z9f-ysbCb<jUP<$h<1hXHn)_0?orhpCDoN=;V5lq&%O%0{ZEd{?%|lC#PYx>Gm~8G&
zMSZ}Q3R*sK@g(r5SUMlJH#vcupB5l!AHvRiD4mruS<wwTygaa0V+(z*gpj6J!zWR+
z%2QPAaF^OHUSv>;clobCb%Z8bk)Y>ErL#Is35`@btb9pm7aj^<2o6_5)0K{NB~+c(
zJ;<1rU`$U`I;v2itMO-0_LX4riY*x>SXK-j;da`yvz?Mv2$dp5n*9!_xwg=!NEKG>
z4%8Ua7g_02LKhQV(r_^<`xaD@RHbcSNvOIgv=DVBz04YRV<5M|ausMe27B=_WBYrU
zw<u3VSGOkbwH3c$yzPCUG*ILZM<^Eku=^q3Y{S~BL7DuSv3(?}j~zROvg~*=#*!X7
z^S_6)5Vs<9%9ikv5(@LBG2=6b+KykE@RHJY{3b-Q?UZU@vhZs(r%>5Xgy}(`g2Y!`
zJjJYFcLUXaEG?y%Wnt+sZGI2knvCuHktbT0yRDQb@Z<2e@kC5%;~BC|K4ENs6j5kl
z%!KtvG-j;8g>OP6dnjyAI93t*+_tMX$!2`!RZYqoXw%HcJpSHrsYn-~kHSyT1RTjp
zvmO4p^u%6+9ceT*FQMi#ucrszvScLfs+6^N_V#v{#UvbYa@2&Yi>v8L%Dysm1?Xe%
zqCS5g@_;^XB87IRsZFrYM#wSxJEZ@k@UwXQvs11Y@*(i!@Q?7k%p1r)>H$%W{vN)b
zUJbN;ZD<}$1{>TR;Dr!QQaY918pY_{RbyP4@|`<qZ`3s@joi<{iv4IT>@UOLRU-}n
zqecFwxIg7U=j92#`-=}5E3gUby<MLe-^0r2HTxCU0g^xJMP&>5$@>8A20QiwEQ6n3
zrq+g!!bABZ>&M~a@MDttB0cT@v!wpLQ%&k_?)OjXDPWY;)PFyzcGwO5YI+%p8MXJL
z@N(2ZTgPxza@u4QI;2eAPj#5e27OSX@s>wWR@CH_;={pu+`OcOUchFxb}B>E)MaCo
zw!P>RUTOY>%#F45l_`@%9!_ndyyP5#xsM<w{;qqO1Z(<VW7=3)^3Sm3>xi*OY1;)^
zhnwG5diRsKLl8ImRW9b5(T}O({tR(oScbprux-w=%RlJ2@$#<jl<>W<IpqyQ9B-Z{
z!fW}1<PjRAG;6Qn4Tu<C$4h80#pMI4DO`!z-9x)kZhN~k=zSWAp_-Nzo=$=~%Sx1v
zB3k9eF2g_-E%J^7P8~)Wd@-J>`}fd$7N0V<{}D#AcU)rcs4-N8-m92=O6foszAz0#
z5;~lg%a!e4s9h@-*!LRS$k-%~+8^3wMe$cbbm|{t^_$xB9VjcMW0p(+gq#Y0ACH3e
z(8U!ApV>pJ(z>rwI&MHQo>V69#zK4`hu+RjM{`2gNlWMrkXwN_cNLgfB4-r!lydR(
z?hMj&9&dJ}se453346jR+vHD!sMl|WXHYJarrSdSFrcrN+u6dik)r6}ft$Ymx(XU?
zlfSZuj*8s^>Bdb8nlT0!8d)~HWNfDg9A)3{sdH2&zp2h24u{bJE~H*?6qW5^+|2K3
z$-?dKmOKziDBqFzOWxBR!1@6df6GjP52La-bYHC*-zglSBU>&;81Y`w(><$yI9hMo
zu$D0*yt1cz9BxTewE>-zs%<znpsqvFU}Ersy>k_ea7^8=Nt$h;4=NI{kCOIpQr}R)
z_j}kd*&*$#Kd9)uv8S)Sq*e7`V`UG54z2zNHdO?MgC{)`iVWSq+RTU5<Kf?68$+Mv
z(u{Yc@`2}6#fB5c4Mz}&T@gL#A<@a5QVvJZSTGVRhySjGUhE!Uqsd=IXczj|^yGs|
z+s6sUTXrD}NOJO*Hf-W3e)UEa(ihzqYT{RRCRd=4XiG-g@IAyj{2*l5iVxjrUeuX9
zgmC*FL(N>H#l9!4<q*Y3(pWcj3CMrxiRTQ>8Mfm6%_E=(5=ek8)MYE~G2Y=rK_hYf
z(uZE!wkvXJMZzvy?*ZH4FSz1^WH{rkmm)q-!k&uIep~SuHy@5n17vIaYRHWj&}1@N
z5ov?)IwXVT9)L#5ZI9umH<1DwK+~u&D`EA$Jt#M2@)5Gtu9Mioax!V{bxP-Dy$VvG
zbj}{C^uDMZ#`LzUdxDU6S?BO_ZVzPUpqdt2XuneY#f=v(>>N(Ev>&8mwjWGud6^<}
zV;qk5kGTi78k#Rs+IFLuPEyBqJ~qn?A-0!M$6*_#bGT~2(Q5tQaKPAJh)fFoO6TS1
zrC(JJe+-`CnEH@i#*$q)bLb0XB4PDwkz_#B4mIshomKN3_b7AL!#KuGci~#md6S`Z
z2ih$rEnnM1pY>}Gy#8SV`4-7Hhv#6VCdFoZ=ok<KRchLhV_W~#ItU2XVX$4;Ic3uE
zGDMn0m46T=arJgE@wSQZmc5vQ^ws}#5?FFfw1Hi#KkR<UR{R&^-Meg^^U?JmG2Xq~
z@O<!^nBujyeYB4DUm_7`U3(2+3BE!4Dtn~X{YYaAb`KrX)-Osl^eIx`e(3uhCPUrD
zYJJ62**^KWGWk_Z7%?KAP_sQBMGmgMY<Di;No%ixnQ1~7!thT|D$xx87W81sI(A33
z`~eqHew1DW^~>)sH614Hk5@WIe`g}9?up1HF!vw+z_GRu&^@P29gQAxLsu}d_Yi-0
z!gejhr!W-69+W`ujBkLocVYvs#_=V%Bf1VDEOHcy?VY%iXsVTwL!4etG*#(H7pE5!
zO%t-nHcnqgG^T<qav!JX5}iWyW=_u}8e<NNtmAYZ(HI^Wf2q+s@gkzBc12e3@Cig?
zq-BvRPM<?GCPFMy!s!g6F^6LjE2oDNjTtqIm^htCG#&`>n?;;HX$Fl)4lL4g`Z&>e
z)?<-l=ofn@{+(!=Y(|c9`YodIx(E5^^q+~wYaryG(=QQ?7mLV0r=KT!6w&u_dN<K1
z2IQa9PZN!o-pD_ve@!&r`y>CH{yEWD3_<=m{Q%Ke4?_Mq{e7abNQV4#`Yxie+=l#f
z`gWqRxQG06`WB+G9*X>P`g)?VgpB-i+D|kV#gYH7DgWOf+C=nGPOl_7o9IKFUQYA`
zqPsY~nCJ_L-p1+6h{hlC&*`~DqYRLLPR}HI64C28ok#SAL^pBzBBFDMUcu=JL{BEV
ziqq#1eG$<moX#NnVxp~_9!fN}pd<gBP9%CN(P^AMiL};Ph-REVPV_XQkA20<pJ*%5
zM?oLE&V*ZhC$~=Ztq#>EgObC6Slm$qAxFQHOit4DJIPvleGL7*7@B@3nH*IZt)m7)
zZeyy6`_w?lZB;$_xizDTLyjsGIjhmceQL1ewg~x2D#^A!ZsK=TVaZ87)IiCF6HVNw
z21ZWGPf<zslOZO4M-7tP--LX5E=j8hrwUAN#r-Do7*$wu{HYE;RcLZe_nXM$-@uYD
z^GOYw+^9QD+@}Uj&MN3*_y(Wop1~&asA_ZCZBS{HET?6<d8Gdbm-~c$TkluX5v%eF
zk^UOtw+R1M;eS{74+;NC;qMjxE5d(I_$P!v=uTDN7~xM6eu41k3;zn?*9gBw__qrG
zyTX4+_)iLdukc?H{(HhdA^bs`g?`~r5`Ka3=L`P|;nxViMfkT0|GUC}Ncc|*f3NUg
z5&nC^KOy`<-xB(TKS}rn!k;hvD};acTpV$v&;NyUap*TKpA-FvS}NGJf(|k<F^wG+
z{i|%3)XyZdf#=iEfSHKiLiWc+pvjr=mw?_v`3>Fm^}7_FfG?Dq7MKW*y324!@vB7q
z5)pDOe3CEwcZx@@7Jn3fbBz2OIX)`dpUHlzo#UgU{g}jOqXSMzJcr5tK;j*UA1}Z6
z0q5yiX~LcQeKz?Y&;=)!&}ZLTcZw6k7=6O6b!mJc8ph}|POY0j_D6o@xOTM&%m+^S
zy-(y<rt>O}k6`q=pPU>Ayh0AW$tOn>19Izt$II9I5SWnIS1w|>VA~O&C60@ZM_dRl
zUsD1S$FSJ`a~SNIn3x_N4`lpwj%P&21&N0shw2*>{Uc4_$lVXU8L-1PQ4eWiLXOUb
zAUth_iPI*?CHG6<CX7~FM8B@0jJ*Jy&OJIb(!?Q|FWTiMb;-Eoe+)b=j-0Q6o3s(d
zJuT=>h=Whn!EHLY6FB8d{;uooLgq$}4^;2~aLN}g<B*f};c1=t&+Fiab@119@OOb{
zqaM<?n_9iYF>#4!&vSuKK)%>|HGVheV(d1Z_;>2y-_yZ=sDnSEgYVVBU)RBDcSpQ@
zrJy3k!^i62IXZZu4nAK8U#^3Db?}>XaN3L<Fa4Wz@E-!Fax}H6<tXRFf7OZKBjO(u
z<$jkacRJfTp57|pB!BZQs{D|Mf3Hsb4}hnmx1;Z4$jN>$5ffyRPoK<a-8g}3F}lA{
z2QSybm+Rn89lQp3I@-&&nD)}B6F;bf-=Kr<)WP@a;BV{TIe5Sz`yaZ^#7*=mzcN-X
z@r^3Z&vrrB?*v}8NyYg&Ex@OvBZ^nPMLPI3I(R?_->ifG3^>_&tFSY=Ywmt8;?rR*
zTK8uiIm}Y;4SM`d>bZE9(z&*&K_^bO)1HqfPH0pQ%M*kHXXN#rofkbd&(q)}R(?>P
z)6&#{1MuWYdbKW>!_(^WGzA?v2Co5U@X@bz*EoYK>k%LK^PR!D_1<dr1ZigR)H!PW
zbnarE8@ePVPvooj)<bp;Jf^B4z~S`!ooiHB-Xk^Y(S5Z(FyO#GI>Qf%8gQ1Mdfp#m
zsq*;IerIxupToJb9yzLIE`M;P2gg8Ka8RDZ?ZXj}NMiKP;^06%PR738LuVJ4D~|b#
zODphO=C<l34oAP3zLj*Ur3&#paxruVG4P^7gbp4@9vI-J<B+|sI<hH_$Ez%8#2J9~
zo*5j>Ekw%rodd!Ax6*CP^XWMB@}&(e<&N_5O8o41Zk`=SXM0_1s_946%NH$i)YsRu
z1!ZOZN-%c-k~aV;Kqjc1@dNG64IT&m<hej<o0?Jbjyk8;jer3MIIC=G^>D1lZ@M>m
zB&U$&CRs4L$Ll7OyBqM!={NzOpZSes<LqtZn$aPH_!V`^A)_J|v}1vN(cDry{a$5t
z`4Y#H(z*DxO6IUFLO9NLbl}WGDx`+`l`fS7XFEay%0VN;QQPRlS%##53Q#Q!QC@yj
zg#cQXsKoW&8Xve_t<GrqkuaPosv3X@oWvN1XL;TX3Tvr%;Rw=T^GXNKN0+}G-}fW!
z++0<@rZSv2TITcC)YmrSBu1Q~u6IBpL#f!E&8Q~}`ESX?LiDTi1XuLkc^X|!Yak-F
z7UknuMn9bbk3%gz&c^bV^4tKiR&%lp7!KJSQ1|k1v?X{Noqn9STE2K;oT^4(u#gPZ
znwy)Or~OHLo2xpQo9n7``W->PvpyKGxyq}ZxRjSemAj#;rIC^_-|z9T8oD@^dRNzb
z-HyecKyxEVs)$w1fx3Y<il)4J4Ge-)*wIG&vNM-jb(ym%*zDJ!<zBZ2TYwr`XT#Z{
zPH(WDG7pjEEwGTVg(MeyTt2^>LOF+80@29j4bCQ1ye0KWu8dON>T%8Uz|OukQB2Z~
z<=)ketkKih=xgay*%#Ws?H9N`L1%qKZV^%$44`8A8XW;=V-t?0MPXD23)G5bbHl3c
z^78t6Y5`PZd3~UErS{)yt@=00sVvV{Jsn+4)j}8aO?UA8c!K4Azt4}FLr1DQmRARx
z8uW`=Un?51)v+WO;ulpHIjR;`&p?x0j`+FL{rJ%knAG7RMP<3X=KWihY>91vxVSpz
z<(9Vy+XYdbR`YD-Ssgea7?h*I<E;(WsmXyYL`<r)8VmEbQp0f8ae$#u@-+vs->U)D
z7nC0oRK@>lQz>>-I9$AWmM?DPRYz^$$X9VRdA?e7p4?bGb!5bUM-TPcUN@C?qng;5
z3g)b)YL`pJ1oZ_pK%$lh=t$)TpRdVQi4<Xu!TSQ}_u|(UoDGh8bgE8t{kAIifGS&F
z-avf{I!s?vAa{ntMjbG!cYt%1m$xi&R2PeSqLxXX7*C-);PF$3iF2O)!GBBlnW<)h
zZ(=;6Eaw$Ti?4LL)%l9U>yg!iDiHdB#bQiQXGcyydgFdDdT`#&qj|RS`388FdgVwG
z!=>|M2{3bOs{;dUz(-SyAbxblDk^1EZ2##>dF4DB@&5Y`(WUt-{o!$&yRC^P;j1()
zBx#+T?@DY4ZZJiU^2_&p&k2Dh8BzFnT<hm(or2!^N&RxYfp+tgOT+UQInqJ8F`ed4
zK8Ml56}{V+{PO*O4>7<ss9yApQW;PBv`|EA1(KHQ1#Y28eSfCvr=-NDe-<$9`M+|#
zqm>K;NAk(x@t!-GenGNw+)$X*FV|D(fC+MPJw>Hd8_0X|EAU5YlKgUgXk#jc_N@=a
z>X-Jr5^;!sE=7iu>qS<hN^94PXw62ZU-CPFQMu%)(WCFb4$+Dz9KEO0I=rFj=juTb
zzl<i|YrY{Cz?R5_3$9rE-w?ykOeQs^@VX|cWb_)7fZGtD)i2*~cFk8gWIiRo%)g}X
ziQzvo-^7EC2>!ceMB&G#|6$--{qlWv(*l*DXmEc4vHE`{_y>t-@;&CZN)>QV>(3vP
zefHZJe!0Fj>L!(s%3SMW)Bg+tDg9EvT#vH~ez{I3WyJFDiQz91?`KN{zbU5tW7EGM
z{8ZItv=#T8KryyrJ@@F1U_TdYzds?As=DNt>yJ%>KQ?|WeH6TucgZi`Q@0BKLu4E{
zS&WijrYnLliX!=y2Th=`X`_vM{2X8PPQ6NgiG73{!X&?ZPt_#&W9zR;G76tQw9)c!
z5&9)R-$|k25!s0%^m_u@^vn0+TZDd@PstFQe)<GR%P-fXw+jCIC{W{K`LhtNO}|_R
zQZ}eeO&VUhjpaX2=(onill=5qlh#R6`jc=Y`z~#~jkl_lRrKQ~kmun~>X+qW0lzFe
ll|#L5zf<)~q#_X*L-O;z<MH=wRv88f{eqp_THcfYe*j%Giy{C3

literal 0
HcmV?d00001

diff --git a/examples/simple_hip_test/vector_add_bench.hip b/examples/simple_hip_test/vector_add_bench.hip
new file mode 100644
index 0000000..c30e031
--- /dev/null
+++ b/examples/simple_hip_test/vector_add_bench.hip
@@ -0,0 +1,69 @@
+// Simple HIP vector_add wrapped in the Magpie HIP-graph latency helper
+// for strict parity with the Triton wall-clock numbers.
+//
+// Compile:
+//   hipcc -g -O2 -std=c++17 \
+//     -I/path/to/Magpie/Magpie/bench/include \
+//     -o vector_add_bench vector_add_bench.hip
+//
+// Run via Magpie's user-harness sub-mode (latency.bench_target unset,
+// testcase_command points at this binary). The binary prints exactly:
+//   MAGPIE_LATENCY_JSON: {"mode":"hip_graph","stats":{...}}
+// which Magpie/eval/latency.py parses directly.
+
+#include <hip/hip_runtime.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+
+#include "magpie_bench.hpp"
+
+__global__ void vector_add(const float* a, const float* b, float* c, int n) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < n) c[idx] = a[idx] + b[idx];
+}
+
+int main() {
+    const int N = 1 << 20;  // 1M elements — same as the Triton example
+    const size_t bytes = N * sizeof(float);
+
+    float *h_a = (float*)malloc(bytes);
+    float *h_b = (float*)malloc(bytes);
+    for (int i = 0; i < N; i++) {
+        h_a[i] = static_cast<float>(i);
+        h_b[i] = static_cast<float>(i * 2);
+    }
+
+    float *d_a, *d_b, *d_c;
+    hipMalloc(&d_a, bytes);
+    hipMalloc(&d_b, bytes);
+    hipMalloc(&d_c, bytes);
+
+    hipMemcpy(d_a, h_a, bytes, hipMemcpyHostToDevice);
+    hipMemcpy(d_b, h_b, bytes, hipMemcpyHostToDevice);
+
+    // Use the Magpie helper's side stream so the captured graph is well-defined.
+    hipStream_t& stream = magpie::bench::current_stream();
+    if (stream == nullptr) hipStreamCreate(&stream);
+
+    const int blockSize = 256;
+    const int gridSize = (N + blockSize - 1) / blockSize;
+
+    auto stats = magpie::bench::do_bench_hipgraph(
+        [&]() {
+            hipLaunchKernelGGL(vector_add, dim3(gridSize), dim3(blockSize),
+                               0, stream, d_a, d_b, d_c, N);
+        },
+        /*rep_ms=*/20,
+        /*n_retries=*/5,
+        /*estimate_reps=*/5);
+
+    magpie::bench::print_marker(stats, "vector_add");
+
+    hipFree(d_a);
+    hipFree(d_b);
+    hipFree(d_c);
+    free(h_a);
+    free(h_b);
+    return 0;
+}
diff --git a/examples/simple_triton_test/README.md b/examples/simple_triton_test/README.md
new file mode 100644
index 0000000..7ba176b
--- /dev/null
+++ b/examples/simple_triton_test/README.md
@@ -0,0 +1,71 @@
+# Triton 0-overhead Latency Examples
+
+Self-contained Triton vector-add example exercising the new `Latency`
+stage end-to-end. See [`docs/latency.md`](../../docs/latency.md) for the
+full design.
+
+## Files
+
+| File | Purpose |
+| --- | --- |
+| `triton_vector_add.py` | Triton kernel + 2 BLOCK_SIZE variants + `get_inputs` factory + `--check` / `--bench` CLI. |
+| `analyze_triton_latency.yaml` | Single-kernel analyze with import-based `bench_target` and `method: auto`. |
+| `compare_triton_blocksize.yaml` | Compare 2 BLOCK_SIZE configs ranking by `kernel_median_ms` (the autotuning recipe). |
+
+## Setup
+
+```bash
+pip install triton torch
+# rocprofv3 needed for the kernel-only half of method=auto / method=both
+```
+
+## Run
+
+```bash
+cd /path/to/Magpie
+
+# Single-kernel analyze (wall-clock + kernel-only)
+python -m Magpie analyze -k examples/simple_triton_test/analyze_triton_latency.yaml
+
+# Compare two BLOCK_SIZE configs — primary_metric: kernel_median_ms
+python -m Magpie compare -k examples/simple_triton_test/compare_triton_blocksize.yaml
+```
+
+## What you should see
+
+`results/<run>/analyze_report.json` will contain a top-level summary
+block per kernel:
+
+```json
+{
+  "summary": [
+    {
+      "kernel_id": "triton_vector_add",
+      "latency_state": "SUCCESS",
+      "latency": {
+        "method": "both",
+        "primary_metric": "kernel_median_ms",
+        "primary_value_ms": 0.012,
+        "wall_median_ms":   0.044,
+        "kernel_median_ms": 0.012,
+        "dispatch_overhead_us": 32.0,
+        "crosscheck_vs_rocprof_ratio": 3.67
+      }
+    }
+  ]
+}
+```
+
+(`dispatch_overhead_us` and the high `crosscheck_vs_rocprof_ratio`
+illustrate exactly why we need `kernel_median_ms` for autotuning small
+kernels — wall-clock is dominated by the ~30 µs dispatch path.)
+
+## Without Magpie (sanity / dev loop)
+
+```bash
+# Correctness only
+python examples/simple_triton_test/triton_vector_add.py --check
+
+# User-harness benchmark (prints MAGPIE_LATENCY_JSON line directly)
+python examples/simple_triton_test/triton_vector_add.py --bench
+```
diff --git a/examples/simple_triton_test/analyze_triton_latency.yaml b/examples/simple_triton_test/analyze_triton_latency.yaml
new file mode 100644
index 0000000..9586db5
--- /dev/null
+++ b/examples/simple_triton_test/analyze_triton_latency.yaml
@@ -0,0 +1,58 @@
+# Triton vector_add — 0-overhead latency harness (analyze mode)
+# ===============================================================
+# Demonstrates the import-based ``bench_target`` path:
+#   * Magpie spawns Magpie/bench/_runner.py in a subprocess
+#   * The runner imports `triton_vector_add.triton_vector_add` + `get_inputs`
+#   * Runs do_bench_cudagraph (wall-clock) AND rocprofv3 --kernel-trace
+#     (kernel-only) when method=both/auto.
+#
+# The `testcase_command` still drives correctness checking; the latency
+# numbers come from the dedicated Latency stage (independent of the
+# testcase exit code).
+#
+# Prereq:
+#   pip install triton torch
+#   # rocprofv3 is required for the kernel_trace half of method=both;
+#   # without it Magpie falls back to wall-clock-only.
+#
+# Usage:
+#   cd /path/to/Magpie
+#   python -m Magpie analyze -k examples/simple_triton_test/analyze_triton_latency.yaml
+
+kernel:
+  id: "triton_vector_add"
+  type: triton
+  source_files:
+    - "./triton_vector_add.py"
+  working_dir: "examples/simple_triton_test"
+
+  # Drives Correctness (and Performance if rocprof-compute / ncu is
+  # available). Stays unchanged from a normal Triton workflow.
+  testcase_command: "python -m triton_vector_add --check"
+
+  # Latency-harness target (per-kernel; overrides any latency.bench_target
+  # in the framework config). Magpie's runner imports `module.callable`
+  # and times it with inputs from `module.get_inputs`.
+  bench_target:
+    module: "triton_vector_add"
+    callable: "triton_vector_add"
+    get_inputs: "get_inputs"
+
+# Pythonpath so the runner can find ./triton_vector_add.py without the
+# user having to `pip install -e .` it.
+latency:
+  enabled: true
+  method: auto              # both for triton; rocprof_timestamps for hip
+  primary_metric: kernel_median_ms   # rank by kernel-only timing (autotuning)
+  rep_ms: 20
+  n_retries: 5
+  estimate_reps: 5
+  warmup_iters: 5
+  seed: 42
+  # Restrict per-kernel aggregation to the Triton-emitted kernel, so torch's
+  # seeded `randn` setup dispatches don't pollute the kernel_stats summary.
+  # (The Triton kernel function name in the .py becomes the rocprofv3 name.)
+  kernel_filter: "_vector_add_kernel"
+  pythonpath:
+    - "examples/simple_triton_test"
+  timeout_seconds: 120
diff --git a/examples/simple_triton_test/compare_triton_blocksize.yaml b/examples/simple_triton_test/compare_triton_blocksize.yaml
new file mode 100644
index 0000000..7a74055
--- /dev/null
+++ b/examples/simple_triton_test/compare_triton_blocksize.yaml
@@ -0,0 +1,59 @@
+# Triton vector_add — kernel-config autotuning via Latency
+# =========================================================
+# Compare two BLOCK_SIZE variants of the SAME Triton kernel. Both go
+# through identical Python/PyTorch/Triton dispatch — the only difference
+# is BLOCK_SIZE inside the kernel itself.
+#
+# This is exactly the scenario the in-team note flags: dispatch overhead
+# is constant across configs, so wall_median_ms is dominated by dispatch
+# noise on small kernels and may rank the two variants tied or backwards.
+# Setting `primary_metric: kernel_median_ms` makes Magpie use rocprofv3
+# --kernel-trace HW timestamps for the ranking, which isolates the actual
+# kernel-time delta.
+#
+# Prereq:
+#   pip install triton torch
+#   # rocprofv3 needed for kernel_median_ms (otherwise method falls back).
+#
+# Usage:
+#   cd /path/to/Magpie
+#   python -m Magpie compare -k examples/simple_triton_test/compare_triton_blocksize.yaml
+
+kernels:
+  - id: "triton_vector_add_block256"
+    type: triton
+    source_files:
+      - "./triton_vector_add.py"
+    working_dir: "examples/simple_triton_test"
+    testcase_command: "python -m triton_vector_add --check"
+    bench_target:
+      module: "triton_vector_add"
+      callable: "triton_vector_add_block256"
+      get_inputs: "get_inputs"
+
+  - id: "triton_vector_add_block1024"
+    type: triton
+    source_files:
+      - "./triton_vector_add.py"
+    working_dir: "examples/simple_triton_test"
+    testcase_command: "python -m triton_vector_add --check"
+    bench_target:
+      module: "triton_vector_add"
+      callable: "triton_vector_add_block1024"
+      get_inputs: "get_inputs"
+
+latency:
+  enabled: true
+  method: both              # explicitly capture wall-clock AND kernel-only
+  primary_metric: kernel_median_ms   # rank by kernel-only — see comment above
+  rep_ms: 30
+  n_retries: 7              # extra retries since variants are close in time
+  estimate_reps: 5
+  warmup_iters: 10          # extra warmup so JIT cache is populated for both
+  seed: 42
+  # Both BLOCK_SIZE variants share this Triton kernel name; the filter
+  # also drops torch's seeded randn setup kernels from the summary.
+  kernel_filter: "_vector_add_kernel"
+  pythonpath:
+    - "examples/simple_triton_test"
+  timeout_seconds: 180
diff --git a/examples/simple_triton_test/triton_vector_add.py b/examples/simple_triton_test/triton_vector_add.py
new file mode 100644
index 0000000..789b012
--- /dev/null
+++ b/examples/simple_triton_test/triton_vector_add.py
@@ -0,0 +1,155 @@
+"""
+Simple Triton vector-add kernel for Magpie's 0-overhead Latency harness.
+
+Exposes two BLOCK_SIZE variants of the *same* kernel so the
+``compare_triton_blocksize.yaml`` example can demonstrate why
+``primary_metric: kernel_median_ms`` matters for kernel-config autotuning
+(both variants have identical dispatch overhead; only the kernel duration
+differs).
+
+Module contract for ``Magpie/bench/_runner.py``:
+  - One or more ``callable`` symbols (here: ``triton_vector_add_block256``,
+    ``triton_vector_add_block1024``).
+  - ``get_inputs() -> (args, kwargs)`` — the runner calls ``callable(*args, **kwargs)``.
+
+You can also run this module directly to sanity-check the kernel and print
+a ``MAGPIE_LATENCY_JSON: {...}`` line — that doubles as both:
+  * the user-harness escape hatch (``method: cuda_graph`` without
+    ``bench_target``),
+  * a quick local benchmark for development.
+
+Usage:
+  pip install triton torch
+  python -m Magpie analyze -k examples/simple_triton_test/analyze_triton_latency.yaml
+"""
+
+from __future__ import annotations
+
+import json
+import sys
+from typing import Tuple
+
+import torch
+
+try:
+    import triton
+    import triton.language as tl
+except ImportError as e:
+    raise ImportError(
+        "This example requires Triton. Install with: pip install triton"
+    ) from e
+
+
+# ---------------------------------------------------------------------------
+# Kernel
+# ---------------------------------------------------------------------------
+
+
+@triton.jit
+def _vector_add_kernel(
+    a_ptr, b_ptr, c_ptr, n,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    block_start = pid * BLOCK_SIZE
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n
+    a = tl.load(a_ptr + offsets, mask=mask)
+    b = tl.load(b_ptr + offsets, mask=mask)
+    tl.store(c_ptr + offsets, a + b, mask=mask)
+
+
+def _launch(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor, block_size: int) -> None:
+    n = a.numel()
+    grid = (triton.cdiv(n, block_size),)
+    _vector_add_kernel[grid](a, b, c, n, BLOCK_SIZE=block_size)
+
+
+# ---------------------------------------------------------------------------
+# Magpie bench_target callables
+# ---------------------------------------------------------------------------
+
+
+def triton_vector_add_block256(
+    a: torch.Tensor, b: torch.Tensor, c: torch.Tensor
+) -> None:
+    _launch(a, b, c, block_size=256)
+
+
+def triton_vector_add_block1024(
+    a: torch.Tensor, b: torch.Tensor, c: torch.Tensor
+) -> None:
+    _launch(a, b, c, block_size=1024)
+
+
+# Default callable used by ``analyze_triton_latency.yaml``.
+def triton_vector_add(
+    a: torch.Tensor, b: torch.Tensor, c: torch.Tensor
+) -> None:
+    _launch(a, b, c, block_size=1024)
+
+
+# ---------------------------------------------------------------------------
+# Inputs factory (called once by the runner *after* torch.manual_seed(seed))
+# ---------------------------------------------------------------------------
+
+
+N = 1 << 20  # 1M elements — small enough that dispatch overhead matters,
+             # which is exactly the scenario where kernel_median_ms beats
+             # wall_median_ms for autotuning rankings.
+
+
+def get_inputs() -> Tuple[Tuple[torch.Tensor, torch.Tensor, torch.Tensor], dict]:
+    """Magpie runner contract: returns (args, kwargs)."""
+    if not torch.cuda.is_available():
+        raise RuntimeError("CUDA / HIP is required for the Triton example")
+    device = "cuda"
+    a = torch.randn(N, device=device, dtype=torch.float32)
+    b = torch.randn(N, device=device, dtype=torch.float32)
+    c = torch.empty_like(a)
+    return (a, b, c), {}
+
+
+# ---------------------------------------------------------------------------
+# CLI: --check (correctness, used by testcase_command)
+#      --bench (user-harness latency mode, prints MAGPIE_LATENCY_JSON line)
+# ---------------------------------------------------------------------------
+
+
+def _check() -> int:
+    (a, b, c), _ = get_inputs()
+    triton_vector_add(a, b, c)
+    torch.cuda.synchronize()
+    expected = a + b
+    if torch.allclose(c, expected, atol=1e-5, rtol=1e-5):
+        print(f"PASSED: vector_add OK on {N} elements")
+        return 0
+    print("FAILED: vector_add mismatch")
+    return 1
+
+
+def _bench() -> int:
+    """User-harness escape hatch — emits the MAGPIE_LATENCY_JSON marker."""
+    from Magpie.bench import MAGPIE_LATENCY_JSON_MARKER, do_bench_cudagraph
+
+    (a, b, c), _ = get_inputs()
+    stats = do_bench_cudagraph(
+        lambda: triton_vector_add(a, b, c),
+        rep=20,
+        n_retries=5,
+        estimate_reps=5,
+    )
+    print(
+        f"{MAGPIE_LATENCY_JSON_MARKER} "
+        + json.dumps({"mode": "cuda_graph", "stats": stats.to_dict()})
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    if "--bench" in sys.argv:
+        sys.exit(_bench())
+    if "--check" in sys.argv:
+        sys.exit(_check())
+    # Default: behave like a smoke test.
+    sys.exit(_check())
diff --git a/tests/test_latency.py b/tests/test_latency.py
new file mode 100644
index 0000000..561b262
--- /dev/null
+++ b/tests/test_latency.py
@@ -0,0 +1,364 @@
+###############################################################################
+# Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""
+Tests for the Latency evaluation stage.
+
+Coverage:
+  * LatencyConfig.method == "auto" selection table per KernelType
+  * BenchTarget round-trip through to_dict / from_dict
+  * LatencyStats summary derivation from raw samples
+  * rocprofv3 --kernel-trace CSV parser
+  * pmc_perf.csv parser (rocprof_timestamps method)
+  * Skip-on-no-torch integration test for do_bench_cudagraph
+"""
+
+from __future__ import annotations
+
+import json
+import textwrap
+from pathlib import Path
+
+import pytest
+
+from Magpie.bench import LatencyStats, MAGPIE_LATENCY_JSON_MARKER
+from Magpie.config import (
+    BenchTarget,
+    KernelType,
+    LatencyConfig,
+)
+from Magpie.eval.latency import (
+    _aggregate_per_kernel_durations_ns,
+    _find_rocprofv3_csv,
+    _parse_marker_line,
+    _parse_pmc_perf_csv_for_durations,
+    _parse_rocprofv3_kernel_trace_csv,
+    _summary_stats_from_per_kernel,
+)
+
+
+# ---------------------------------------------------------------------------
+# LatencyConfig.method == "auto" selection table
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "kernel_type,expected",
+    [
+        (KernelType.HIP, "rocprof_timestamps"),
+        (KernelType.TRITON, "both"),
+        (KernelType.PYTORCH, "both"),
+        (KernelType.CUDA, "both"),
+        (None, "cuda_graph"),
+    ],
+)
+def test_latency_config_auto_method_selection(kernel_type, expected):
+    cfg = LatencyConfig(method="auto", kernel_type=kernel_type)
+    assert cfg.resolve_method() == expected
+
+
+def test_latency_config_explicit_method_passthrough():
+    cfg = LatencyConfig(method="kernel_trace", kernel_type=KernelType.TRITON)
+    assert cfg.resolve_method() == "kernel_trace"
+
+    cfg2 = LatencyConfig(method="none", kernel_type=KernelType.HIP)
+    assert cfg2.resolve_method() == "none"
+
+
+def test_latency_config_validates_method():
+    with pytest.raises(ValueError):
+        LatencyConfig(method="bogus")
+
+
+def test_latency_config_validates_primary_metric():
+    with pytest.raises(ValueError):
+        LatencyConfig(primary_metric="bogus")
+
+
+def test_latency_config_round_trip_from_dict():
+    cfg = LatencyConfig.from_dict(
+        {
+            "enabled": False,
+            "method": "kernel_trace",
+            "primary_metric": "kernel_median_ms",
+            "rep_ms": 30,
+            "n_retries": 9,
+            "seed": 7,
+            "pythonpath": ["/a", "/b"],
+            "bench_target": {
+                "module": "m.foo",
+                "callable": "f",
+                "get_inputs": "ginps",
+            },
+        },
+        kernel_type=KernelType.TRITON,
+        gpu_arch="gfx942",
+    )
+
+    assert cfg.enabled is False
+    assert cfg.method == "kernel_trace"
+    assert cfg.primary_metric == "kernel_median_ms"
+    assert cfg.rep_ms == 30
+    assert cfg.n_retries == 9
+    assert cfg.seed == 7
+    assert cfg.pythonpath == ["/a", "/b"]
+    assert cfg.bench_target is not None
+    assert cfg.bench_target.module == "m.foo"
+    assert cfg.bench_target.callable == "f"
+    assert cfg.bench_target.get_inputs == "ginps"
+    assert cfg.kernel_type is KernelType.TRITON
+    assert cfg.gpu_arch == "gfx942"
+
+
+# ---------------------------------------------------------------------------
+# BenchTarget
+# ---------------------------------------------------------------------------
+
+
+def test_bench_target_from_dict_requires_module_and_callable():
+    assert BenchTarget.from_dict(None) is None
+    assert BenchTarget.from_dict({}) is None
+    assert BenchTarget.from_dict({"module": "m"}) is None
+    assert BenchTarget.from_dict({"callable": "c"}) is None
+
+    bt = BenchTarget.from_dict({"module": "m", "callable": "c"})
+    assert bt is not None
+    assert bt.module == "m"
+    assert bt.callable == "c"
+    assert bt.get_inputs == "get_inputs"
+
+
+# ---------------------------------------------------------------------------
+# LatencyStats
+# ---------------------------------------------------------------------------
+
+
+def test_latency_stats_from_samples_basic():
+    stats = LatencyStats.from_samples(
+        [1.0, 2.0, 3.0, 4.0, 5.0],
+        n_repeat=10,
+        n_retries=5,
+        estimate_ms=2.5,
+    )
+    assert stats.median_ms == 3.0
+    assert stats.min_ms == 1.0
+    assert stats.max_ms == 5.0
+    assert stats.p50_ms == pytest.approx(3.0)
+    assert stats.p99_ms == pytest.approx(5.0 - (5.0 - 4.0) * (1 - 0.96))  # interp
+    assert stats.std_ms > 0
+    assert stats.n_repeat == 10
+    assert stats.n_retries == 5
+    assert stats.estimate_ms == 2.5
+    assert len(stats.samples_ms) == 5
+
+
+def test_latency_stats_from_samples_empty():
+    stats = LatencyStats.from_samples([], n_repeat=1, n_retries=0)
+    assert stats.median_ms == 0.0
+    assert stats.samples_ms == []
+
+
+def test_latency_stats_dict_round_trip():
+    stats = LatencyStats.from_samples([1.0, 2.0], n_repeat=4, n_retries=2)
+    restored = LatencyStats.from_dict(stats.to_dict())
+    assert restored is not None
+    assert restored.median_ms == stats.median_ms
+    assert restored.n_repeat == 4
+    assert restored.n_retries == 2
+
+
+# ---------------------------------------------------------------------------
+# rocprofv3 --kernel-trace CSV parsing
+# ---------------------------------------------------------------------------
+
+
+def _write_rocprofv3_csv(path: Path, rows):
+    header = "Kernel_Name,Start_Timestamp,End_Timestamp\n"
+    body = "\n".join(",".join(str(c) for c in row) for row in rows)
+    path.write_text(header + body + "\n")
+
+
+def test_parse_rocprofv3_kernel_trace_csv_basic(tmp_path: Path):
+    csv = tmp_path / "kernel_trace.csv"
+    _write_rocprofv3_csv(
+        csv,
+        [
+            ("triton_scaled_mm_kernel", 1000, 2000),  # 1000 ns
+            ("triton_scaled_mm_kernel", 3000, 4500),  # 1500 ns
+            ("triton_scaled_mm_kernel", 5000, 6000),  # 1000 ns
+            ("__hip_some_runtime_thunk", 7000, 7100),  # filtered out
+            ("other_kernel", 8000, 9000),  # 1000 ns
+        ],
+    )
+
+    per_kernel_ns = _parse_rocprofv3_kernel_trace_csv(csv)
+    assert "triton_scaled_mm_kernel" in per_kernel_ns
+    assert "__hip_some_runtime_thunk" not in per_kernel_ns
+    assert per_kernel_ns["triton_scaled_mm_kernel"] == [1000.0, 1500.0, 1000.0]
+    assert per_kernel_ns["other_kernel"] == [1000.0]
+
+
+def test_parse_rocprofv3_kernel_trace_csv_kernel_filter(tmp_path: Path):
+    csv = tmp_path / "kernel_trace.csv"
+    _write_rocprofv3_csv(
+        csv,
+        [
+            ("triton_scaled_mm_kernel", 1000, 2000),
+            ("other_kernel", 3000, 4000),
+        ],
+    )
+
+    per_kernel_ns = _parse_rocprofv3_kernel_trace_csv(
+        csv, kernel_filter_re=r"triton_"
+    )
+    assert "triton_scaled_mm_kernel" in per_kernel_ns
+    assert "other_kernel" not in per_kernel_ns
+
+
+def test_parse_rocprofv3_kernel_trace_csv_missing_file(tmp_path: Path):
+    csv = tmp_path / "nope.csv"
+    assert _parse_rocprofv3_kernel_trace_csv(csv) == {}
+
+
+def test_find_rocprofv3_csv_locates_kernel_trace(tmp_path: Path):
+    out_dir = tmp_path / "out"
+    out_dir.mkdir()
+    target = out_dir / "myrun_kernel_trace.csv"
+    target.write_text("Kernel_Name,Start_Timestamp,End_Timestamp\n")
+    found = _find_rocprofv3_csv(out_dir)
+    assert found == target
+
+
+def test_find_rocprofv3_csv_returns_none_when_empty(tmp_path: Path):
+    out_dir = tmp_path / "empty"
+    out_dir.mkdir()
+    assert _find_rocprofv3_csv(out_dir) is None
+
+
+# ---------------------------------------------------------------------------
+# pmc_perf.csv parsing (rocprof_timestamps)
+# ---------------------------------------------------------------------------
+
+
+def test_parse_pmc_perf_csv_for_durations(tmp_path: Path):
+    csv = tmp_path / "pmc_perf.csv"
+    csv.write_text(
+        "Kernel_Name,Start_Timestamp,End_Timestamp,Other\n"
+        "vector_add,1000,5000,x\n"
+        "vector_add,6000,9000,x\n"
+        "vector_add,abc,def,x\n"
+        "__hip_thunk,1,2,x\n"
+    )
+
+    per_kernel_ns = _parse_pmc_perf_csv_for_durations(csv)
+    assert "vector_add" in per_kernel_ns
+    assert per_kernel_ns["vector_add"] == [4000.0, 3000.0]
+    assert "__hip_thunk" not in per_kernel_ns
+
+
+# ---------------------------------------------------------------------------
+# Aggregation helpers
+# ---------------------------------------------------------------------------
+
+
+def test_aggregate_per_kernel_durations_ns_to_stats():
+    per_kernel_ns = {
+        "k1": [1_000_000.0, 2_000_000.0, 3_000_000.0],  # 1, 2, 3 ms
+        "k2": [],  # dropped
+        "k3": [-5.0, 0.0],  # dropped (no positive samples)
+    }
+    out = _aggregate_per_kernel_durations_ns(per_kernel_ns)
+    assert "k1" in out
+    assert "k2" not in out
+    assert "k3" not in out
+    assert out["k1"].median_ms == 2.0
+    assert out["k1"].min_ms == 1.0
+    assert out["k1"].max_ms == 3.0
+
+
+def test_summary_stats_from_per_kernel():
+    per_kernel = {
+        "a": LatencyStats.from_samples([1.0, 2.0, 3.0], n_repeat=1, n_retries=3),
+        "b": LatencyStats.from_samples([0.5, 0.5, 0.5], n_repeat=1, n_retries=3),
+    }
+    summary = _summary_stats_from_per_kernel(per_kernel)
+    assert summary is not None
+    # median is sum-of-medians = 2.0 + 0.5 = 2.5
+    assert summary.median_ms == pytest.approx(2.5)
+    assert summary.p50_ms == pytest.approx(2.5)
+    # min/max derived from the merged sample set
+    assert summary.min_ms == 0.5
+    assert summary.max_ms == 3.0
+
+
+def test_summary_stats_from_empty_per_kernel():
+    assert _summary_stats_from_per_kernel({}) is None
+
+
+# ---------------------------------------------------------------------------
+# MAGPIE_LATENCY_JSON marker parsing
+# ---------------------------------------------------------------------------
+
+
+def test_parse_marker_line_picks_up_payload():
+    payload = {"stats": {"median_ms": 0.42}, "module": "m"}
+    output = textwrap.dedent(
+        f"""\
+        garbage line 1
+        unrelated output
+        {MAGPIE_LATENCY_JSON_MARKER} {json.dumps(payload)}
+        trailing log line
+        """
+    )
+    assert _parse_marker_line(output) == payload
+
+
+def test_parse_marker_line_no_marker():
+    assert _parse_marker_line("nothing here") is None
+    assert _parse_marker_line("") is None
+
+
+def test_parse_marker_line_picks_last_marker():
+    p1 = {"stats": {"median_ms": 1.0}}
+    p2 = {"stats": {"median_ms": 2.0}}
+    output = (
+        f"{MAGPIE_LATENCY_JSON_MARKER} {json.dumps(p1)}\n"
+        f"{MAGPIE_LATENCY_JSON_MARKER} {json.dumps(p2)}\n"
+    )
+    # The last marker wins (closest to "final result")
+    assert _parse_marker_line(output) == p2
+
+
+# ---------------------------------------------------------------------------
+# Optional torch integration test
+# ---------------------------------------------------------------------------
+
+
+def test_do_bench_cudagraph_smoke():
+    """
+    Smoke test that exercises the real do_bench_cudagraph code path on a
+    trivial elementwise add. Skipped automatically when torch is missing or
+    no GPU is present (so the suite stays green on CPU-only CI).
+    """
+    pytest.importorskip("torch")
+    import torch
+
+    if not torch.cuda.is_available():
+        pytest.skip("torch.cuda.is_available() is False")
+
+    from Magpie.bench import do_bench_cudagraph
+
+    a = torch.randn(1024, 1024, device="cuda")
+    b = torch.randn(1024, 1024, device="cuda")
+
+    def fn():
+        torch.add(a, b, out=a)
+
+    stats = do_bench_cudagraph(fn, rep=5, n_retries=3, estimate_reps=3)
+
+    assert stats.median_ms > 0
+    assert stats.n_retries == 3
+    assert stats.n_repeat >= 1
+    assert len(stats.samples_ms) == 3
diff --git a/tests/test_main_and_kernel_config.py b/tests/test_main_and_kernel_config.py
index 5410a21..fc3bee8 100644
--- a/tests/test_main_and_kernel_config.py
+++ b/tests/test_main_and_kernel_config.py
@@ -106,20 +106,82 @@ def test_load_kernel_config_collects_sections_and_expands_env(monkeypatch, tmp_p
         encoding="utf-8",
     )
 
-    kernels, perf_overrides, corr_overrides, sched_overrides = load_kernel_config(
-        config_path
-    )
+    (
+        kernels,
+        perf_overrides,
+        corr_overrides,
+        sched_overrides,
+        lat_overrides,
+    ) = load_kernel_config(config_path)
 
     assert [cfg.kernel_id for cfg in kernels] == ["single", "second"]
     assert kernels[0].source_file_path == [f"{tmp_path}/workspace/single.hip"]
     assert kernels[1].compiling_command == [["python", "prepare.py"]]
     assert perf_overrides == {"backend": f"{tmp_path}/workspace/perf"}
     assert corr_overrides == {"backend": "testcase"}
+    assert lat_overrides == {}
     assert sched_overrides["max_workers"] == 2
     assert sched_overrides["ray_config"] == {"cluster_address": "ray://127.0.0.1:10001"}
     assert sched_overrides["environment"] == "ray"
 
 
+def test_load_kernel_config_parses_latency_section_and_bench_target(
+    monkeypatch, tmp_path
+):
+    monkeypatch.setenv("MAGPIE_ROOT", str(tmp_path / "workspace"))
+
+    config_path = tmp_path / "kernel_config.yaml"
+    config_path.write_text(
+        yaml.safe_dump(
+            {
+                "kernel": {
+                    "id": "triton_scaled_mm",
+                    "type": "triton",
+                    "source_files": ["$MAGPIE_ROOT/k.py"],
+                    "testcase_command": "python -m my_kernels.scaled_mm --check",
+                    "bench_target": {
+                        "module": "my_kernels.scaled_mm",
+                        "callable": "triton_scaled_mm",
+                        "get_inputs": "get_inputs",
+                    },
+                },
+                "latency": {
+                    "enabled": True,
+                    "method": "both",
+                    "primary_metric": "kernel_median_ms",
+                    "rep_ms": 25,
+                    "n_retries": 7,
+                    "seed": 1234,
+                    "pythonpath": ["$MAGPIE_ROOT/lib"],
+                },
+            }
+        ),
+        encoding="utf-8",
+    )
+
+    (
+        kernels,
+        _perf,
+        _corr,
+        _sched,
+        lat_overrides,
+    ) = load_kernel_config(config_path)
+
+    assert len(kernels) == 1
+    cfg = kernels[0]
+    assert cfg.bench_target == {
+        "module": "my_kernels.scaled_mm",
+        "callable": "triton_scaled_mm",
+        "get_inputs": "get_inputs",
+    }
+    assert lat_overrides["method"] == "both"
+    assert lat_overrides["primary_metric"] == "kernel_median_ms"
+    assert lat_overrides["rep_ms"] == 25
+    assert lat_overrides["n_retries"] == 7
+    assert lat_overrides["seed"] == 1234
+    assert lat_overrides["pythonpath"] == [f"{tmp_path}/workspace/lib"]
+
+
 def test_kernel_eval_config_normalizes_single_and_multi_commands():
     cfg = KernelEvalConfig(
         kernel_id="test",