From 92bb8dfc49eb55b8f6ebcf734e299c89c6bcd994 Mon Sep 17 00:00:00 2001 From: lcy-seso Date: Sat, 9 May 2026 00:59:28 +0800 Subject: [PATCH 1/5] [Refactor][Benchmark] route ada_layer_norm and cumulative benches through manifest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bench_ada_layer_norm.py: replace `_manifest_params(op_name)` (whose call site hid the literal op name behind a parameter) with `_to_params(load_workloads())`, so the manifest validator's AST check ties each `load_workloads(...)` call to its op. bench_cumulative.py: replace the hand-rolled `CumulativeBenchFixture` shape/dtype matrix and the per-file `CumulativeBenchmark.calculate_*` formulas with `workloads_to_params(...)` + `ManifestBenchmark` for both CumsumFwdOp and CumprodFwdOp; the report columns (latency_ms / tflops / bandwidth_tbs and the filtered-locals param keys) are unchanged. Co-Authored-By: Ibuki 🍃 — a wind born from GPTs --- benchmarks/ops/bench_ada_layer_norm.py | 8 +-- benchmarks/ops/bench_cumulative.py | 92 ++++++++++++-------------- 2 files changed, 45 insertions(+), 55 deletions(-) diff --git a/benchmarks/ops/bench_ada_layer_norm.py b/benchmarks/ops/bench_ada_layer_norm.py index 174984e09..8747bfacb 100644 --- a/benchmarks/ops/bench_ada_layer_norm.py +++ b/benchmarks/ops/bench_ada_layer_norm.py @@ -55,9 +55,9 @@ def calculate_memory(self) -> Optional[float]: return self._get_roofline()[1] -def _manifest_params(op_name): +def _to_params(workloads): params = [] - for w in load_workloads(op_name): + for w in workloads: m, n = w["x_shape"] label = w.get("label", f"{m}x{n}") for dtype_str in w["dtypes"]: @@ -67,7 +67,7 @@ def _manifest_params(op_name): return params -@pytest.mark.parametrize("m, n, dtype", _manifest_params(_ADA_OP_NAME)) +@pytest.mark.parametrize("m, n, dtype", _to_params(load_workloads(_ADA_OP_NAME))) def test_ada_layer_norm_bench(m: int, n: int, dtype: torch.dtype) -> None: test = AdaLayerNormTest(m, n, dtype) inputs = test.gen_inputs() @@ -86,7 +86,7 @@ def baseline_fn(x, scale, shift): BenchmarkReport.record(op, locals(), result_bl, tag="torch-ref") -@pytest.mark.parametrize("m, n, dtype", _manifest_params(_ADA_ZERO_OP_NAME)) +@pytest.mark.parametrize("m, n, dtype", _to_params(load_workloads(_ADA_ZERO_OP_NAME))) def test_ada_layer_norm_zero_bench(m: int, n: int, dtype: torch.dtype) -> None: test = AdaLayerNormZeroTest(m, n, dtype) inputs = test.gen_inputs() diff --git a/benchmarks/ops/bench_cumulative.py b/benchmarks/ops/bench_cumulative.py index a746a111c..4ce9885fa 100644 --- a/benchmarks/ops/bench_cumulative.py +++ b/benchmarks/ops/bench_cumulative.py @@ -1,34 +1,26 @@ -"""Benchmarks for cumulative ops (cumsum, cumprod).""" +"""Benchmarks for cumulative ops (cumsum, cumprod). -from typing import Optional +Workload shapes and roofline formulas are loaded from the ops manifest +(``tileops/manifest/scan.yaml``). 
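+Shapes the tuner cannot tile are skipped via ``pytest.skip`` rather than failed.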
+""" import pytest import torch -from benchmarks.benchmark_base import BenchmarkBase, BenchmarkReport -from workloads.workload_base import FixtureBase, WorkloadBase +from benchmarks.benchmark_base import BenchmarkReport, ManifestBenchmark, workloads_to_params +from tileops.ops.reduction.cumprod import CumprodFwdOp +from tileops.ops.reduction.cumsum import CumsumFwdOp +from workloads.workload_base import WorkloadBase +_CUMSUM_OP = "CumsumFwdOp" +_CUMPROD_OP = "CumprodFwdOp" -class CumulativeBenchFixture(FixtureBase): - PARAMS = [ - ( - "m, n, dtype, op_kind", - [ - pytest.param(1024, 4096, torch.float16, "cumsum"), - pytest.param(1024, 4096, torch.bfloat16, "cumsum"), - pytest.param(4096, 4096, torch.float16, "cumsum"), - pytest.param(1024, 4096, torch.float16, "cumprod"), - pytest.param(1024, 4096, torch.bfloat16, "cumprod"), - pytest.param(4096, 4096, torch.float16, "cumprod"), - ], - ), - ] - -class CumulativeBenchTest(WorkloadBase): +class _CumulativeWorkload(WorkloadBase): def __init__(self, m: int, n: int, dtype: torch.dtype, op_kind: str): self.m = m self.n = n + self.shape = (m, n) self.dtype = dtype self.op_kind = op_kind @@ -43,45 +35,43 @@ def ref_program(self, x: torch.Tensor) -> torch.Tensor: x_f32 = x.float() if self.op_kind == "cumsum": return x_f32.cumsum(dim=-1).to(x.dtype) - elif self.op_kind == "cumprod": - return x_f32.cumprod(dim=-1).to(x.dtype) - raise ValueError(f"Unknown op_kind: {self.op_kind}") - + return x_f32.cumprod(dim=-1).to(x.dtype) -class CumulativeBenchmark(BenchmarkBase[CumulativeBenchTest]): - def calculate_flops(self) -> Optional[float]: - t = self.workload - # Approximate: inclusive scan performs N-1 ops per row, rounded up to M*N - return t.m * t.n - - def calculate_memory(self) -> Optional[float]: - t = self.workload - elem_bytes = torch.tensor([], dtype=t.dtype).element_size() - # Read x (M*N) + write output (M*N) - return 2 * t.m * t.n * elem_bytes +@pytest.mark.parametrize("shape, dtype", workloads_to_params(_CUMSUM_OP)) +def test_cumsum_bench(shape: tuple, dtype: torch.dtype) -> None: + m, n = shape + test = _CumulativeWorkload(m, n, dtype, "cumsum") + inputs = test.gen_inputs() -def _make_op(m: int, n: int, dtype: torch.dtype, op_kind: str): - """Create the appropriate Op for the given op_kind.""" - from tileops.ops.reduction.cumprod import CumprodFwdOp - from tileops.ops.reduction.cumsum import CumsumFwdOp + op = CumsumFwdOp(N=n, dtype=dtype) + bm = ManifestBenchmark(_CUMSUM_OP, op, test) + try: + result = bm.profile(op, *inputs) + except ValueError as exc: + if "No configurations to tune" in str(exc): + pytest.skip(f"Kernel does not support this shape: {exc}") + raise + BenchmarkReport.record(op, locals(), result, tag="tileops") - op_map = { - "cumsum": CumsumFwdOp, - "cumprod": CumprodFwdOp, - } - cls = op_map[op_kind] - return cls(N=n, dtype=dtype) + result_bl = bm.profile(test.ref_program, *inputs) + BenchmarkReport.record(op, locals(), result_bl, tag="torch") -@CumulativeBenchFixture -def test_cumulative_bench(m: int, n: int, dtype: torch.dtype, op_kind: str) -> None: - test = CumulativeBenchTest(m, n, dtype, op_kind) - bm = CumulativeBenchmark(test) +@pytest.mark.parametrize("shape, dtype", workloads_to_params(_CUMPROD_OP)) +def test_cumprod_bench(shape: tuple, dtype: torch.dtype) -> None: + m, n = shape + test = _CumulativeWorkload(m, n, dtype, "cumprod") inputs = test.gen_inputs() - op = _make_op(m, n, dtype, op_kind) - result = bm.profile(op, *inputs) + op = CumprodFwdOp(N=n, dtype=dtype) + bm = ManifestBenchmark(_CUMPROD_OP, op, test) 
+ try: + result = bm.profile(op, *inputs) + except ValueError as exc: + if "No configurations to tune" in str(exc): + pytest.skip(f"Kernel does not support this shape: {exc}") + raise BenchmarkReport.record(op, locals(), result, tag="tileops") result_bl = bm.profile(test.ref_program, *inputs) From b88a489417d012373a25a207464b1c71c2d1a0c1 Mon Sep 17 00:00:00 2001 From: lcy-seso Date: Sat, 9 May 2026 01:19:20 +0800 Subject: [PATCH 2/5] [Refactor][Benchmark] preserve cumulative report column schema Pass an explicit params dict (m, n, dtype, op_kind) to BenchmarkReport.record so the cumulative bench output header matches the pre-PR baseline byte-for-byte instead of leaking the new shape-based parametrization keys via locals(). Also narrow tuple[float, float] | None in AdaLayerNorm benchmark roofline accessors via a local rebind so pyright sees a non-None return path. Co-Authored-By: Ibuki - a wind born from GPTs --- benchmarks/ops/bench_ada_layer_norm.py | 16 ++++++++++------ benchmarks/ops/bench_cumulative.py | 18 ++++++++++++------ 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/benchmarks/ops/bench_ada_layer_norm.py b/benchmarks/ops/bench_ada_layer_norm.py index 8747bfacb..756f1ad09 100644 --- a/benchmarks/ops/bench_ada_layer_norm.py +++ b/benchmarks/ops/bench_ada_layer_norm.py @@ -24,9 +24,11 @@ def __init__(self, test, op): self._op = op def _get_roofline(self) -> tuple[float, float]: - if self._roofline_cache is None: - self._roofline_cache = self._op.eval_roofline() - return self._roofline_cache + cache = self._roofline_cache + if cache is None: + cache = self._op.eval_roofline() + self._roofline_cache = cache + return cache def calculate_flops(self) -> Optional[float]: return self._get_roofline()[0] @@ -44,9 +46,11 @@ def __init__(self, test, op): self._op = op def _get_roofline(self) -> tuple[float, float]: - if self._roofline_cache is None: - self._roofline_cache = self._op.eval_roofline() - return self._roofline_cache + cache = self._roofline_cache + if cache is None: + cache = self._op.eval_roofline() + self._roofline_cache = cache + return cache def calculate_flops(self) -> Optional[float]: return self._get_roofline()[0] diff --git a/benchmarks/ops/bench_cumulative.py b/benchmarks/ops/bench_cumulative.py index 4ce9885fa..6af69aa7f 100644 --- a/benchmarks/ops/bench_cumulative.py +++ b/benchmarks/ops/bench_cumulative.py @@ -41,7 +41,8 @@ def ref_program(self, x: torch.Tensor) -> torch.Tensor: @pytest.mark.parametrize("shape, dtype", workloads_to_params(_CUMSUM_OP)) def test_cumsum_bench(shape: tuple, dtype: torch.dtype) -> None: m, n = shape - test = _CumulativeWorkload(m, n, dtype, "cumsum") + op_kind = "cumsum" + test = _CumulativeWorkload(m, n, dtype, op_kind) inputs = test.gen_inputs() op = CumsumFwdOp(N=n, dtype=dtype) @@ -52,16 +53,19 @@ def test_cumsum_bench(shape: tuple, dtype: torch.dtype) -> None: if "No configurations to tune" in str(exc): pytest.skip(f"Kernel does not support this shape: {exc}") raise - BenchmarkReport.record(op, locals(), result, tag="tileops") + # Preserve legacy report column order: m, n, dtype, op_kind. 
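+    # (locals() would leak the new shape-based parametrization keys into the header.)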
+ report_params = {"m": m, "n": n, "dtype": dtype, "op_kind": op_kind} + BenchmarkReport.record(op, report_params, result, tag="tileops") result_bl = bm.profile(test.ref_program, *inputs) - BenchmarkReport.record(op, locals(), result_bl, tag="torch") + BenchmarkReport.record(op, report_params, result_bl, tag="torch") @pytest.mark.parametrize("shape, dtype", workloads_to_params(_CUMPROD_OP)) def test_cumprod_bench(shape: tuple, dtype: torch.dtype) -> None: m, n = shape - test = _CumulativeWorkload(m, n, dtype, "cumprod") + op_kind = "cumprod" + test = _CumulativeWorkload(m, n, dtype, op_kind) inputs = test.gen_inputs() op = CumprodFwdOp(N=n, dtype=dtype) @@ -72,10 +76,12 @@ def test_cumprod_bench(shape: tuple, dtype: torch.dtype) -> None: if "No configurations to tune" in str(exc): pytest.skip(f"Kernel does not support this shape: {exc}") raise - BenchmarkReport.record(op, locals(), result, tag="tileops") + # Preserve legacy report column order: m, n, dtype, op_kind. + report_params = {"m": m, "n": n, "dtype": dtype, "op_kind": op_kind} + BenchmarkReport.record(op, report_params, result, tag="tileops") result_bl = bm.profile(test.ref_program, *inputs) - BenchmarkReport.record(op, locals(), result_bl, tag="torch") + BenchmarkReport.record(op, report_params, result_bl, tag="torch") if __name__ == "__main__": From 26a6e56998885184fe7aafb43f4954087f79e381 Mon Sep 17 00:00:00 2001 From: lcy-seso Date: Sat, 9 May 2026 01:46:10 +0800 Subject: [PATCH 3/5] fix(bench): validate op_kind in cumulative ref_program MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restore explicit elif branch for cumprod and raise ValueError for unknown op_kind values, so typos or future op_kind extensions surface loudly instead of silently producing a cumprod baseline. Co-Authored-By: Ibuki 🍃 — a wind born from GPTs --- benchmarks/ops/bench_cumulative.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/ops/bench_cumulative.py b/benchmarks/ops/bench_cumulative.py index 6af69aa7f..cde5c6113 100644 --- a/benchmarks/ops/bench_cumulative.py +++ b/benchmarks/ops/bench_cumulative.py @@ -35,7 +35,9 @@ def ref_program(self, x: torch.Tensor) -> torch.Tensor: x_f32 = x.float() if self.op_kind == "cumsum": return x_f32.cumsum(dim=-1).to(x.dtype) - return x_f32.cumprod(dim=-1).to(x.dtype) + elif self.op_kind == "cumprod": + return x_f32.cumprod(dim=-1).to(x.dtype) + raise ValueError(f"Unknown op_kind: {self.op_kind!r}") @pytest.mark.parametrize("shape, dtype", workloads_to_params(_CUMSUM_OP)) From bbf0a1fb3a15796546b4b9c006302c604f4b85b5 Mon Sep 17 00:00:00 2001 From: lcy-seso Date: Sat, 9 May 2026 02:04:20 +0800 Subject: [PATCH 4/5] [Refactor][Benchmark] route reduce_multidim bench through ManifestBenchmark Replace the six hand-rolled BenchmarkBase subclasses (Reduce/Argreduce/LogicalReduce/VectorNorm/Cumulative/LogSumExp) with ManifestBenchmark; FLOP and byte counts now come from each op's eval_roofline() rather than per-class calculate_flops/calculate_memory. 3D multi-dim shapes stay declared inline because the manifest workload set for these ops only covers 2D last-axis reductions (a different test scenario from this file's 3D non-last-axis purpose); per the trust model, manifest workloads cannot be edited from a code PR. Output column schema preserved against pre-PR baseline by passing an explicit params dict to BenchmarkReport.record (mirrors the cumulative schema-preservation fix from round 2). 
Co-Authored-By: Ibuki - a wind born from GPTs --- benchmarks/ops/bench_reduce_multidim.py | 187 ++++++++---------------- 1 file changed, 61 insertions(+), 126 deletions(-) diff --git a/benchmarks/ops/bench_reduce_multidim.py b/benchmarks/ops/bench_reduce_multidim.py index 7d7829362..366731ea8 100644 --- a/benchmarks/ops/bench_reduce_multidim.py +++ b/benchmarks/ops/bench_reduce_multidim.py @@ -17,14 +17,17 @@ Shape conventions use LLaMA-family dimensions: - (batch=4, seq=128, hidden=4096): 7B inference context - (batch=2, seq=512, hidden=4096): 7B longer-context inference -""" -from typing import Optional +Roofline metadata (FLOPs, bytes) comes from each op's ``eval_roofline()`` +via ``ManifestBenchmark``; the 3D multi-dim shapes themselves are declared +inline because the manifest workload set for these ops only covers 2D +last-axis reductions, which is a different test scenario. +""" import pytest import torch -from benchmarks.benchmark_base import BenchmarkBase, BenchmarkReport +from benchmarks.benchmark_base import BenchmarkReport, ManifestBenchmark from workloads.workload_base import FixtureBase, WorkloadBase # =================================================================== @@ -102,26 +105,7 @@ def ref_program(self, x: torch.Tensor) -> object: return ops[self.op_kind](x_f32).to(x.dtype) -class ReduceMultidimBenchmark(BenchmarkBase[ReduceMultidimTest]): - def calculate_flops(self) -> Optional[float]: - t = self.workload - total_elems = 1 - for s in t.shape: - total_elems *= s - return total_elems - - def calculate_memory(self) -> Optional[float]: - t = self.workload - elem_bytes = torch.tensor([], dtype=t.dtype).element_size() - total_elems = 1 - for s in t.shape: - total_elems *= s - # Output elements: product of kept dims - out_elems = 1 - for i, s in enumerate(t.shape): - if i not in t.dim: - out_elems *= s - return (total_elems + out_elems) * elem_bytes +_REDUCE_OP_NAMES = {"sum": "SumFwdOp", "mean": "MeanFwdOp", "amax": "AmaxFwdOp"} def _make_reduce_op(dtype, op_kind, dim, keepdim): @@ -141,15 +125,21 @@ def test_reduce_multidim_bench( op_kind: str, ) -> None: test = ReduceMultidimTest(shape, dim, keepdim, dtype, op_kind) - bm = ReduceMultidimBenchmark(test) inputs = test.gen_inputs() op = _make_reduce_op(dtype, op_kind, dim, keepdim) + bm = ManifestBenchmark(_REDUCE_OP_NAMES[op_kind], op, test) + # Preserve legacy report column order: shape, keepdim, dtype, op_kind + # (dim is a list and was already silently dropped by the pre-PR + # serializability filter, so we omit it here too). 
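+    # (tflops / bandwidth_tbs denominators come from op.eval_roofline()
+    # via ManifestBenchmark above, not from per-class calculate_* hooks.)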
+ report_params = { + "shape": shape, "keepdim": keepdim, "dtype": dtype, "op_kind": op_kind, + } result = bm.profile(op, *inputs) - BenchmarkReport.record(op, locals(), result, tag="tileops") + BenchmarkReport.record(op, report_params, result, tag="tileops") result_bl = bm.profile(test.ref_program, *inputs) - BenchmarkReport.record(op, locals(), result_bl, tag="torch") + BenchmarkReport.record(op, report_params, result_bl, tag="torch") # =================================================================== @@ -221,22 +211,7 @@ def ref_program(self, x: torch.Tensor) -> torch.Tensor: return x.argmin(dim=self.dim, keepdim=self.keepdim) -class ArgreduceMultidimBenchmark(BenchmarkBase[ArgreduceMultidimTest]): - def calculate_flops(self) -> Optional[float]: - total_elems = 1 - for s in self.workload.shape: - total_elems *= s - return total_elems - - def calculate_memory(self) -> Optional[float]: - t = self.workload - elem_bytes = torch.tensor([], dtype=t.dtype).element_size() - total_elems = 1 - for s in t.shape: - total_elems *= s - # Output: int64 (8 bytes) for each position - out_elems = total_elems // t.shape[t.dim] - return total_elems * elem_bytes + out_elems * 8 +_ARGREDUCE_OP_NAMES = {"argmax": "ArgmaxFwdOp", "argmin": "ArgminFwdOp"} def _make_argreduce_op(dtype, op_kind, dim, keepdim): @@ -256,15 +231,21 @@ def test_argreduce_multidim_bench( op_kind: str, ) -> None: test = ArgreduceMultidimTest(shape, dim, keepdim, dtype, op_kind) - bm = ArgreduceMultidimBenchmark(test) inputs = test.gen_inputs() op = _make_argreduce_op(dtype, op_kind, dim, keepdim) + bm = ManifestBenchmark(_ARGREDUCE_OP_NAMES[op_kind], op, test) + # Preserve legacy report column order: shape, dim, keepdim, dtype, op_kind + # (dim is int here and was kept by the pre-PR filter). + report_params = { + "shape": shape, "dim": dim, "keepdim": keepdim, + "dtype": dtype, "op_kind": op_kind, + } result = bm.profile(op, *inputs) - BenchmarkReport.record(op, locals(), result, tag="tileops") + BenchmarkReport.record(op, report_params, result, tag="tileops") result_bl = bm.profile(test.ref_program, *inputs) - BenchmarkReport.record(op, locals(), result_bl, tag="torch") + BenchmarkReport.record(op, report_params, result_bl, tag="torch") # =================================================================== @@ -336,26 +317,9 @@ def ref_program(self, x: torch.Tensor) -> torch.Tensor: raise ValueError(f"Unknown op_kind: {self.op_kind}") -class LogicalReduceMultidimBenchmark(BenchmarkBase[LogicalReduceMultidimTest]): - def calculate_flops(self) -> Optional[float]: - total_elems = 1 - for s in self.workload.shape: - total_elems *= s - return total_elems - - def calculate_memory(self) -> Optional[float]: - t = self.workload - elem_bytes = torch.tensor([], dtype=t.dtype).element_size() - total_elems = 1 - for s in t.shape: - total_elems *= s - out_elems = 1 - dims = set(d % len(t.shape) for d in t.dim) - for i, s in enumerate(t.shape): - if i not in dims: - out_elems *= s - out_elem_bytes = 8 if t.op_kind == "count_nonzero" else 1 - return total_elems * elem_bytes + out_elems * out_elem_bytes +_LOGICAL_OP_NAMES = { + "any": "AnyFwdOp", "all": "AllFwdOp", "count_nonzero": "CountNonzeroFwdOp", +} def _make_logical_op(dtype, op_kind, dim, keepdim): @@ -380,15 +344,20 @@ def test_logical_reduce_multidim_bench( op_kind: str, ) -> None: test = LogicalReduceMultidimTest(shape, dim, keepdim, dtype, op_kind) - bm = LogicalReduceMultidimBenchmark(test) inputs = test.gen_inputs() op = _make_logical_op(dtype, op_kind, dim, keepdim) + bm = 
ManifestBenchmark(_LOGICAL_OP_NAMES[op_kind], op, test) + # Preserve legacy report column order: shape, keepdim, dtype, op_kind + # (dim list dropped by pre-PR filter). + report_params = { + "shape": shape, "keepdim": keepdim, "dtype": dtype, "op_kind": op_kind, + } result = bm.profile(op, *inputs) - BenchmarkReport.record(op, locals(), result, tag="tileops") + BenchmarkReport.record(op, report_params, result, tag="tileops") result_bl = bm.profile(test.ref_program, *inputs) - BenchmarkReport.record(op, locals(), result_bl, tag="torch") + BenchmarkReport.record(op, report_params, result_bl, tag="torch") # =================================================================== @@ -455,25 +424,9 @@ def ref_program(self, x: torch.Tensor) -> torch.Tensor: ) -class VectorNormMultidimBenchmark(BenchmarkBase[VectorNormMultidimTest]): - def calculate_flops(self) -> Optional[float]: - total_elems = 1 - for s in self.workload.shape: - total_elems *= s - return total_elems - - def calculate_memory(self) -> Optional[float]: - t = self.workload - elem_bytes = torch.tensor([], dtype=t.dtype).element_size() - total_elems = 1 - for s in t.shape: - total_elems *= s - out_elems = 1 - dims = set(d % len(t.shape) for d in t.dim) - for i, s in enumerate(t.shape): - if i not in dims: - out_elems *= s - return (total_elems + out_elems) * elem_bytes +_VECTOR_NORM_OP_NAMES = { + "l1": "L1NormFwdOp", "l2": "L2NormFwdOp", "inf": "InfNormFwdOp", +} def _make_norm_op(dtype, op_kind, dim, keepdim): @@ -495,15 +448,20 @@ def test_vector_norm_multidim_bench( op_kind: str, ) -> None: test = VectorNormMultidimTest(shape, dim, keepdim, dtype, op_kind) - bm = VectorNormMultidimBenchmark(test) inputs = test.gen_inputs() op = _make_norm_op(dtype, op_kind, dim, keepdim) + bm = ManifestBenchmark(_VECTOR_NORM_OP_NAMES[op_kind], op, test) + # Preserve legacy report column order: shape, keepdim, dtype, op_kind + # (dim list dropped by pre-PR filter). + report_params = { + "shape": shape, "keepdim": keepdim, "dtype": dtype, "op_kind": op_kind, + } result = bm.profile(op, *inputs) - BenchmarkReport.record(op, locals(), result, tag="tileops") + BenchmarkReport.record(op, report_params, result, tag="tileops") result_bl = bm.profile(test.ref_program, *inputs) - BenchmarkReport.record(op, locals(), result_bl, tag="torch") + BenchmarkReport.record(op, report_params, result_bl, tag="torch") # =================================================================== @@ -567,16 +525,7 @@ def ref_program(self, x: torch.Tensor) -> torch.Tensor: raise ValueError(f"Unknown op_kind: {self.op_kind}") -class CumulativeMultidimBenchmark(BenchmarkBase[CumulativeMultidimTest]): - def calculate_flops(self) -> Optional[float]: - t = self.workload - return t.M * t.N - - def calculate_memory(self) -> Optional[float]: - t = self.workload - elem_bytes = torch.tensor([], dtype=t.dtype).element_size() - # Read + write: 2 * M * N - return 2 * t.M * t.N * elem_bytes +_CUMULATIVE_OP_NAMES = {"cumsum": "CumsumFwdOp", "cumprod": "CumprodFwdOp"} def _make_cumulative_op(M, N, dtype, op_kind): @@ -599,15 +548,17 @@ def test_cumulative_multidim_bench( op_kind: str, ) -> None: test = CumulativeMultidimTest(shape, dtype, op_kind) - bm = CumulativeMultidimBenchmark(test) inputs = test.gen_inputs() op = _make_cumulative_op(test.M, test.N, dtype, op_kind) + bm = ManifestBenchmark(_CUMULATIVE_OP_NAMES[op_kind], op, test) + # Preserve legacy report column order: shape, dtype, op_kind. 
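+    # (m and n are not separate params here; the workload derives
+    # test.M / test.N from shape for op construction above.)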
+ report_params = {"shape": shape, "dtype": dtype, "op_kind": op_kind} result = bm.profile(op, *inputs) - BenchmarkReport.record(op, locals(), result, tag="tileops") + BenchmarkReport.record(op, report_params, result, tag="tileops") result_bl = bm.profile(test.ref_program, *inputs) - BenchmarkReport.record(op, locals(), result_bl, tag="torch") + BenchmarkReport.record(op, report_params, result_bl, tag="torch") # =================================================================== @@ -680,26 +631,7 @@ def ref_program(self, x: torch.Tensor) -> torch.Tensor: ) -class LogSumExpMultidimBenchmark(BenchmarkBase[LogSumExpMultidimTest]): - def calculate_flops(self) -> Optional[float]: - t = self.workload - total_elems = 1 - for s in t.shape: - total_elems *= s - return total_elems - - def calculate_memory(self) -> Optional[float]: - t = self.workload - elem_bytes = torch.tensor([], dtype=t.dtype).element_size() - total_elems = 1 - for s in t.shape: - total_elems *= s - out_elems = 1 - dims = set(d % len(t.shape) for d in t.dim) - for i, s in enumerate(t.shape): - if i not in dims: - out_elems *= s - return (total_elems + out_elems) * elem_bytes +_LOGSUMEXP_OP_NAME = "LogSumExpFwdOp" def _make_logsumexp_op(dtype, dim, keepdim): @@ -716,15 +648,18 @@ def test_logsumexp_multidim_bench( dtype: torch.dtype, ) -> None: test = LogSumExpMultidimTest(shape, dim, keepdim, dtype) - bm = LogSumExpMultidimBenchmark(test) inputs = test.gen_inputs() op = _make_logsumexp_op(dtype, dim, keepdim) + bm = ManifestBenchmark(_LOGSUMEXP_OP_NAME, op, test) + # Preserve legacy report column order: shape, keepdim, dtype + # (dim list dropped by pre-PR filter). + report_params = {"shape": shape, "keepdim": keepdim, "dtype": dtype} result = bm.profile(op, *inputs) - BenchmarkReport.record(op, locals(), result, tag="tileops") + BenchmarkReport.record(op, report_params, result, tag="tileops") result_bl = bm.profile(test.ref_program, *inputs) - BenchmarkReport.record(op, locals(), result_bl, tag="torch") + BenchmarkReport.record(op, report_params, result_bl, tag="torch") if __name__ == "__main__": From e3b44153342dd02eac6ae65b5bbbf770bb0825a5 Mon Sep 17 00:00:00 2001 From: lcy-seso Date: Sat, 9 May 2026 02:28:23 +0800 Subject: [PATCH 5/5] [Refactor][Benchmark] defer cumulative bench manifest conversion Revert bench_cumulative.py to the pre-PR (upstream/testbed) state so the benchmark workload rows stay aligned with the legacy table. scan.yaml manifest workloads for CumsumFwdOp/CumprodFwdOp do not match the hand-rolled WORKLOADS list this file uses (base rows (1024,4096) and (4096,4096) vs manifest rows (2048,4096) and (64,32768)), which violates the AC-4 row-set parity constraint. Move bench_cumulative into the deferred bucket; a separate manifest-only PR will align scan.yaml first. Co-Authored-By: Ibuki - a wind born from GPTs --- benchmarks/ops/bench_cumulative.py | 100 +++++++++++++++-------------- 1 file changed, 51 insertions(+), 49 deletions(-) diff --git a/benchmarks/ops/bench_cumulative.py b/benchmarks/ops/bench_cumulative.py index cde5c6113..a746a111c 100644 --- a/benchmarks/ops/bench_cumulative.py +++ b/benchmarks/ops/bench_cumulative.py @@ -1,26 +1,34 @@ -"""Benchmarks for cumulative ops (cumsum, cumprod). +"""Benchmarks for cumulative ops (cumsum, cumprod).""" -Workload shapes and roofline formulas are loaded from the ops manifest -(``tileops/manifest/scan.yaml``). 
-""" +from typing import Optional import pytest import torch -from benchmarks.benchmark_base import BenchmarkReport, ManifestBenchmark, workloads_to_params -from tileops.ops.reduction.cumprod import CumprodFwdOp -from tileops.ops.reduction.cumsum import CumsumFwdOp -from workloads.workload_base import WorkloadBase +from benchmarks.benchmark_base import BenchmarkBase, BenchmarkReport +from workloads.workload_base import FixtureBase, WorkloadBase -_CUMSUM_OP = "CumsumFwdOp" -_CUMPROD_OP = "CumprodFwdOp" +class CumulativeBenchFixture(FixtureBase): + PARAMS = [ + ( + "m, n, dtype, op_kind", + [ + pytest.param(1024, 4096, torch.float16, "cumsum"), + pytest.param(1024, 4096, torch.bfloat16, "cumsum"), + pytest.param(4096, 4096, torch.float16, "cumsum"), + pytest.param(1024, 4096, torch.float16, "cumprod"), + pytest.param(1024, 4096, torch.bfloat16, "cumprod"), + pytest.param(4096, 4096, torch.float16, "cumprod"), + ], + ), + ] -class _CumulativeWorkload(WorkloadBase): + +class CumulativeBenchTest(WorkloadBase): def __init__(self, m: int, n: int, dtype: torch.dtype, op_kind: str): self.m = m self.n = n - self.shape = (m, n) self.dtype = dtype self.op_kind = op_kind @@ -37,53 +45,47 @@ def ref_program(self, x: torch.Tensor) -> torch.Tensor: return x_f32.cumsum(dim=-1).to(x.dtype) elif self.op_kind == "cumprod": return x_f32.cumprod(dim=-1).to(x.dtype) - raise ValueError(f"Unknown op_kind: {self.op_kind!r}") + raise ValueError(f"Unknown op_kind: {self.op_kind}") -@pytest.mark.parametrize("shape, dtype", workloads_to_params(_CUMSUM_OP)) -def test_cumsum_bench(shape: tuple, dtype: torch.dtype) -> None: - m, n = shape - op_kind = "cumsum" - test = _CumulativeWorkload(m, n, dtype, op_kind) - inputs = test.gen_inputs() +class CumulativeBenchmark(BenchmarkBase[CumulativeBenchTest]): + def calculate_flops(self) -> Optional[float]: + t = self.workload + # Approximate: inclusive scan performs N-1 ops per row, rounded up to M*N + return t.m * t.n - op = CumsumFwdOp(N=n, dtype=dtype) - bm = ManifestBenchmark(_CUMSUM_OP, op, test) - try: - result = bm.profile(op, *inputs) - except ValueError as exc: - if "No configurations to tune" in str(exc): - pytest.skip(f"Kernel does not support this shape: {exc}") - raise - # Preserve legacy report column order: m, n, dtype, op_kind. 
- report_params = {"m": m, "n": n, "dtype": dtype, "op_kind": op_kind} - BenchmarkReport.record(op, report_params, result, tag="tileops") + def calculate_memory(self) -> Optional[float]: + t = self.workload + elem_bytes = torch.tensor([], dtype=t.dtype).element_size() + # Read x (M*N) + write output (M*N) + return 2 * t.m * t.n * elem_bytes - result_bl = bm.profile(test.ref_program, *inputs) - BenchmarkReport.record(op, report_params, result_bl, tag="torch") + +def _make_op(m: int, n: int, dtype: torch.dtype, op_kind: str): + """Create the appropriate Op for the given op_kind.""" + from tileops.ops.reduction.cumprod import CumprodFwdOp + from tileops.ops.reduction.cumsum import CumsumFwdOp + + op_map = { + "cumsum": CumsumFwdOp, + "cumprod": CumprodFwdOp, + } + cls = op_map[op_kind] + return cls(N=n, dtype=dtype) -@pytest.mark.parametrize("shape, dtype", workloads_to_params(_CUMPROD_OP)) -def test_cumprod_bench(shape: tuple, dtype: torch.dtype) -> None: - m, n = shape - op_kind = "cumprod" - test = _CumulativeWorkload(m, n, dtype, op_kind) +@CumulativeBenchFixture +def test_cumulative_bench(m: int, n: int, dtype: torch.dtype, op_kind: str) -> None: + test = CumulativeBenchTest(m, n, dtype, op_kind) + bm = CumulativeBenchmark(test) inputs = test.gen_inputs() - op = CumprodFwdOp(N=n, dtype=dtype) - bm = ManifestBenchmark(_CUMPROD_OP, op, test) - try: - result = bm.profile(op, *inputs) - except ValueError as exc: - if "No configurations to tune" in str(exc): - pytest.skip(f"Kernel does not support this shape: {exc}") - raise - # Preserve legacy report column order: m, n, dtype, op_kind. - report_params = {"m": m, "n": n, "dtype": dtype, "op_kind": op_kind} - BenchmarkReport.record(op, report_params, result, tag="tileops") + op = _make_op(m, n, dtype, op_kind) + result = bm.profile(op, *inputs) + BenchmarkReport.record(op, locals(), result, tag="tileops") result_bl = bm.profile(test.ref_program, *inputs) - BenchmarkReport.record(op, report_params, result_bl, tag="torch") + BenchmarkReport.record(op, locals(), result_bl, tag="torch") if __name__ == "__main__":