From 92bb8dfc49eb55b8f6ebcf734e299c89c6bcd994 Mon Sep 17 00:00:00 2001 From: lcy-seso Date: Sat, 9 May 2026 00:59:28 +0800 Subject: [PATCH 1/5] [Refactor][Benchmark] route ada_layer_norm and cumulative benches through manifest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bench_ada_layer_norm.py: replace `_manifest_params(op_name)` (whose call site hid the literal op name behind a parameter) with `_to_params(load_workloads())`, so the manifest validator's AST check ties each `load_workloads(...)` call to its op. bench_cumulative.py: replace the hand-rolled `CumulativeBenchFixture` shape/dtype matrix and the per-file `CumulativeBenchmark.calculate_*` formulas with `workloads_to_params(...)` + `ManifestBenchmark` for both CumsumFwdOp and CumprodFwdOp; the report columns (latency_ms / tflops / bandwidth_tbs and the filtered-locals param keys) are unchanged. Co-Authored-By: Ibuki 🍃 — a wind born from GPTs --- benchmarks/ops/bench_ada_layer_norm.py | 8 +-- benchmarks/ops/bench_cumulative.py | 92 ++++++++++++-------------- 2 files changed, 45 insertions(+), 55 deletions(-) diff --git a/benchmarks/ops/bench_ada_layer_norm.py b/benchmarks/ops/bench_ada_layer_norm.py index 174984e09..8747bfacb 100644 --- a/benchmarks/ops/bench_ada_layer_norm.py +++ b/benchmarks/ops/bench_ada_layer_norm.py @@ -55,9 +55,9 @@ def calculate_memory(self) -> Optional[float]: return self._get_roofline()[1] -def _manifest_params(op_name): +def _to_params(workloads): params = [] - for w in load_workloads(op_name): + for w in workloads: m, n = w["x_shape"] label = w.get("label", f"{m}x{n}") for dtype_str in w["dtypes"]: @@ -67,7 +67,7 @@ def _manifest_params(op_name): return params -@pytest.mark.parametrize("m, n, dtype", _manifest_params(_ADA_OP_NAME)) +@pytest.mark.parametrize("m, n, dtype", _to_params(load_workloads(_ADA_OP_NAME))) def test_ada_layer_norm_bench(m: int, n: int, dtype: torch.dtype) -> None: test = AdaLayerNormTest(m, n, dtype) inputs = test.gen_inputs() @@ -86,7 +86,7 @@ def baseline_fn(x, scale, shift): BenchmarkReport.record(op, locals(), result_bl, tag="torch-ref") -@pytest.mark.parametrize("m, n, dtype", _manifest_params(_ADA_ZERO_OP_NAME)) +@pytest.mark.parametrize("m, n, dtype", _to_params(load_workloads(_ADA_ZERO_OP_NAME))) def test_ada_layer_norm_zero_bench(m: int, n: int, dtype: torch.dtype) -> None: test = AdaLayerNormZeroTest(m, n, dtype) inputs = test.gen_inputs() diff --git a/benchmarks/ops/bench_cumulative.py b/benchmarks/ops/bench_cumulative.py index a746a111c..4ce9885fa 100644 --- a/benchmarks/ops/bench_cumulative.py +++ b/benchmarks/ops/bench_cumulative.py @@ -1,34 +1,26 @@ -"""Benchmarks for cumulative ops (cumsum, cumprod).""" +"""Benchmarks for cumulative ops (cumsum, cumprod). -from typing import Optional +Workload shapes and roofline formulas are loaded from the ops manifest +(``tileops/manifest/scan.yaml``). 
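+Shapes the tuner cannot tile are skipped via ``pytest.skip`` rather than failed.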
+""" import pytest import torch -from benchmarks.benchmark_base import BenchmarkBase, BenchmarkReport -from workloads.workload_base import FixtureBase, WorkloadBase +from benchmarks.benchmark_base import BenchmarkReport, ManifestBenchmark, workloads_to_params +from tileops.ops.reduction.cumprod import CumprodFwdOp +from tileops.ops.reduction.cumsum import CumsumFwdOp +from workloads.workload_base import WorkloadBase +_CUMSUM_OP = "CumsumFwdOp" +_CUMPROD_OP = "CumprodFwdOp" -class CumulativeBenchFixture(FixtureBase): - PARAMS = [ - ( - "m, n, dtype, op_kind", - [ - pytest.param(1024, 4096, torch.float16, "cumsum"), - pytest.param(1024, 4096, torch.bfloat16, "cumsum"), - pytest.param(4096, 4096, torch.float16, "cumsum"), - pytest.param(1024, 4096, torch.float16, "cumprod"), - pytest.param(1024, 4096, torch.bfloat16, "cumprod"), - pytest.param(4096, 4096, torch.float16, "cumprod"), - ], - ), - ] - -class CumulativeBenchTest(WorkloadBase): +class _CumulativeWorkload(WorkloadBase): def __init__(self, m: int, n: int, dtype: torch.dtype, op_kind: str): self.m = m self.n = n + self.shape = (m, n) self.dtype = dtype self.op_kind = op_kind @@ -43,45 +35,43 @@ def ref_program(self, x: torch.Tensor) -> torch.Tensor: x_f32 = x.float() if self.op_kind == "cumsum": return x_f32.cumsum(dim=-1).to(x.dtype) - elif self.op_kind == "cumprod": - return x_f32.cumprod(dim=-1).to(x.dtype) - raise ValueError(f"Unknown op_kind: {self.op_kind}") - + return x_f32.cumprod(dim=-1).to(x.dtype) -class CumulativeBenchmark(BenchmarkBase[CumulativeBenchTest]): - def calculate_flops(self) -> Optional[float]: - t = self.workload - # Approximate: inclusive scan performs N-1 ops per row, rounded up to M*N - return t.m * t.n - - def calculate_memory(self) -> Optional[float]: - t = self.workload - elem_bytes = torch.tensor([], dtype=t.dtype).element_size() - # Read x (M*N) + write output (M*N) - return 2 * t.m * t.n * elem_bytes +@pytest.mark.parametrize("shape, dtype", workloads_to_params(_CUMSUM_OP)) +def test_cumsum_bench(shape: tuple, dtype: torch.dtype) -> None: + m, n = shape + test = _CumulativeWorkload(m, n, dtype, "cumsum") + inputs = test.gen_inputs() -def _make_op(m: int, n: int, dtype: torch.dtype, op_kind: str): - """Create the appropriate Op for the given op_kind.""" - from tileops.ops.reduction.cumprod import CumprodFwdOp - from tileops.ops.reduction.cumsum import CumsumFwdOp + op = CumsumFwdOp(N=n, dtype=dtype) + bm = ManifestBenchmark(_CUMSUM_OP, op, test) + try: + result = bm.profile(op, *inputs) + except ValueError as exc: + if "No configurations to tune" in str(exc): + pytest.skip(f"Kernel does not support this shape: {exc}") + raise + BenchmarkReport.record(op, locals(), result, tag="tileops") - op_map = { - "cumsum": CumsumFwdOp, - "cumprod": CumprodFwdOp, - } - cls = op_map[op_kind] - return cls(N=n, dtype=dtype) + result_bl = bm.profile(test.ref_program, *inputs) + BenchmarkReport.record(op, locals(), result_bl, tag="torch") -@CumulativeBenchFixture -def test_cumulative_bench(m: int, n: int, dtype: torch.dtype, op_kind: str) -> None: - test = CumulativeBenchTest(m, n, dtype, op_kind) - bm = CumulativeBenchmark(test) +@pytest.mark.parametrize("shape, dtype", workloads_to_params(_CUMPROD_OP)) +def test_cumprod_bench(shape: tuple, dtype: torch.dtype) -> None: + m, n = shape + test = _CumulativeWorkload(m, n, dtype, "cumprod") inputs = test.gen_inputs() - op = _make_op(m, n, dtype, op_kind) - result = bm.profile(op, *inputs) + op = CumprodFwdOp(N=n, dtype=dtype) + bm = ManifestBenchmark(_CUMPROD_OP, op, test) 
+ try: + result = bm.profile(op, *inputs) + except ValueError as exc: + if "No configurations to tune" in str(exc): + pytest.skip(f"Kernel does not support this shape: {exc}") + raise BenchmarkReport.record(op, locals(), result, tag="tileops") result_bl = bm.profile(test.ref_program, *inputs) From b88a489417d012373a25a207464b1c71c2d1a0c1 Mon Sep 17 00:00:00 2001 From: lcy-seso Date: Sat, 9 May 2026 01:19:20 +0800 Subject: [PATCH 2/5] [Refactor][Benchmark] preserve cumulative report column schema Pass an explicit params dict (m, n, dtype, op_kind) to BenchmarkReport.record so the cumulative bench output header matches the pre-PR baseline byte-for-byte instead of leaking the new shape-based parametrization keys via locals(). Also narrow tuple[float, float] | None in AdaLayerNorm benchmark roofline accessors via a local rebind so pyright sees a non-None return path. Co-Authored-By: Ibuki - a wind born from GPTs --- benchmarks/ops/bench_ada_layer_norm.py | 16 ++++++++++------ benchmarks/ops/bench_cumulative.py | 18 ++++++++++++------ 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/benchmarks/ops/bench_ada_layer_norm.py b/benchmarks/ops/bench_ada_layer_norm.py index 8747bfacb..756f1ad09 100644 --- a/benchmarks/ops/bench_ada_layer_norm.py +++ b/benchmarks/ops/bench_ada_layer_norm.py @@ -24,9 +24,11 @@ def __init__(self, test, op): self._op = op def _get_roofline(self) -> tuple[float, float]: - if self._roofline_cache is None: - self._roofline_cache = self._op.eval_roofline() - return self._roofline_cache + cache = self._roofline_cache + if cache is None: + cache = self._op.eval_roofline() + self._roofline_cache = cache + return cache def calculate_flops(self) -> Optional[float]: return self._get_roofline()[0] @@ -44,9 +46,11 @@ def __init__(self, test, op): self._op = op def _get_roofline(self) -> tuple[float, float]: - if self._roofline_cache is None: - self._roofline_cache = self._op.eval_roofline() - return self._roofline_cache + cache = self._roofline_cache + if cache is None: + cache = self._op.eval_roofline() + self._roofline_cache = cache + return cache def calculate_flops(self) -> Optional[float]: return self._get_roofline()[0] diff --git a/benchmarks/ops/bench_cumulative.py b/benchmarks/ops/bench_cumulative.py index 4ce9885fa..6af69aa7f 100644 --- a/benchmarks/ops/bench_cumulative.py +++ b/benchmarks/ops/bench_cumulative.py @@ -41,7 +41,8 @@ def ref_program(self, x: torch.Tensor) -> torch.Tensor: @pytest.mark.parametrize("shape, dtype", workloads_to_params(_CUMSUM_OP)) def test_cumsum_bench(shape: tuple, dtype: torch.dtype) -> None: m, n = shape - test = _CumulativeWorkload(m, n, dtype, "cumsum") + op_kind = "cumsum" + test = _CumulativeWorkload(m, n, dtype, op_kind) inputs = test.gen_inputs() op = CumsumFwdOp(N=n, dtype=dtype) @@ -52,16 +53,19 @@ def test_cumsum_bench(shape: tuple, dtype: torch.dtype) -> None: if "No configurations to tune" in str(exc): pytest.skip(f"Kernel does not support this shape: {exc}") raise - BenchmarkReport.record(op, locals(), result, tag="tileops") + # Preserve legacy report column order: m, n, dtype, op_kind. 
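+    # (locals() would leak the new shape-based parametrization keys into the header.)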
+ report_params = {"m": m, "n": n, "dtype": dtype, "op_kind": op_kind} + BenchmarkReport.record(op, report_params, result, tag="tileops") result_bl = bm.profile(test.ref_program, *inputs) - BenchmarkReport.record(op, locals(), result_bl, tag="torch") + BenchmarkReport.record(op, report_params, result_bl, tag="torch") @pytest.mark.parametrize("shape, dtype", workloads_to_params(_CUMPROD_OP)) def test_cumprod_bench(shape: tuple, dtype: torch.dtype) -> None: m, n = shape - test = _CumulativeWorkload(m, n, dtype, "cumprod") + op_kind = "cumprod" + test = _CumulativeWorkload(m, n, dtype, op_kind) inputs = test.gen_inputs() op = CumprodFwdOp(N=n, dtype=dtype) @@ -72,10 +76,12 @@ def test_cumprod_bench(shape: tuple, dtype: torch.dtype) -> None: if "No configurations to tune" in str(exc): pytest.skip(f"Kernel does not support this shape: {exc}") raise - BenchmarkReport.record(op, locals(), result, tag="tileops") + # Preserve legacy report column order: m, n, dtype, op_kind. + report_params = {"m": m, "n": n, "dtype": dtype, "op_kind": op_kind} + BenchmarkReport.record(op, report_params, result, tag="tileops") result_bl = bm.profile(test.ref_program, *inputs) - BenchmarkReport.record(op, locals(), result_bl, tag="torch") + BenchmarkReport.record(op, report_params, result_bl, tag="torch") if __name__ == "__main__": From 26a6e56998885184fe7aafb43f4954087f79e381 Mon Sep 17 00:00:00 2001 From: lcy-seso Date: Sat, 9 May 2026 01:46:10 +0800 Subject: [PATCH 3/5] fix(bench): validate op_kind in cumulative ref_program MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restore explicit elif branch for cumprod and raise ValueError for unknown op_kind values, so typos or future op_kind extensions surface loudly instead of silently producing a cumprod baseline. Co-Authored-By: Ibuki 🍃 — a wind born from GPTs --- benchmarks/ops/bench_cumulative.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/ops/bench_cumulative.py b/benchmarks/ops/bench_cumulative.py index 6af69aa7f..cde5c6113 100644 --- a/benchmarks/ops/bench_cumulative.py +++ b/benchmarks/ops/bench_cumulative.py @@ -35,7 +35,9 @@ def ref_program(self, x: torch.Tensor) -> torch.Tensor: x_f32 = x.float() if self.op_kind == "cumsum": return x_f32.cumsum(dim=-1).to(x.dtype) - return x_f32.cumprod(dim=-1).to(x.dtype) + elif self.op_kind == "cumprod": + return x_f32.cumprod(dim=-1).to(x.dtype) + raise ValueError(f"Unknown op_kind: {self.op_kind!r}") @pytest.mark.parametrize("shape, dtype", workloads_to_params(_CUMSUM_OP)) From bbf0a1fb3a15796546b4b9c006302c604f4b85b5 Mon Sep 17 00:00:00 2001 From: lcy-seso Date: Sat, 9 May 2026 02:04:20 +0800 Subject: [PATCH 4/5] [Refactor][Benchmark] route reduce_multidim bench through ManifestBenchmark Replace the six hand-rolled BenchmarkBase subclasses (Reduce/Argreduce/LogicalReduce/VectorNorm/Cumulative/LogSumExp) with ManifestBenchmark; FLOP and byte counts now come from each op's eval_roofline() rather than per-class calculate_flops/calculate_memory. 3D multi-dim shapes stay declared inline because the manifest workload set for these ops only covers 2D last-axis reductions (a different test scenario from this file's 3D non-last-axis purpose); per the trust model, manifest workloads cannot be edited from a code PR. Output column schema preserved against pre-PR baseline by passing an explicit params dict to BenchmarkReport.record (mirrors the cumulative schema-preservation fix from round 2). 
Co-Authored-By: Ibuki - a wind born from GPTs --- benchmarks/ops/bench_reduce_multidim.py | 187 ++++++++---------------- 1 file changed, 61 insertions(+), 126 deletions(-) diff --git a/benchmarks/ops/bench_reduce_multidim.py b/benchmarks/ops/bench_reduce_multidim.py index 7d7829362..366731ea8 100644 --- a/benchmarks/ops/bench_reduce_multidim.py +++ b/benchmarks/ops/bench_reduce_multidim.py @@ -17,14 +17,17 @@ Shape conventions use LLaMA-family dimensions: - (batch=4, seq=128, hidden=4096): 7B inference context - (batch=2, seq=512, hidden=4096): 7B longer-context inference -""" -from typing import Optional +Roofline metadata (FLOPs, bytes) comes from each op's ``eval_roofline()`` +via ``ManifestBenchmark``; the 3D multi-dim shapes themselves are declared +inline because the manifest workload set for these ops only covers 2D +last-axis reductions, which is a different test scenario. +""" import pytest import torch -from benchmarks.benchmark_base import BenchmarkBase, BenchmarkReport +from benchmarks.benchmark_base import BenchmarkReport, ManifestBenchmark from workloads.workload_base import FixtureBase, WorkloadBase # =================================================================== @@ -102,26 +105,7 @@ def ref_program(self, x: torch.Tensor) -> object: return ops[self.op_kind](x_f32).to(x.dtype) -class ReduceMultidimBenchmark(BenchmarkBase[ReduceMultidimTest]): - def calculate_flops(self) -> Optional[float]: - t = self.workload - total_elems = 1 - for s in t.shape: - total_elems *= s - return total_elems - - def calculate_memory(self) -> Optional[float]: - t = self.workload - elem_bytes = torch.tensor([], dtype=t.dtype).element_size() - total_elems = 1 - for s in t.shape: - total_elems *= s - # Output elements: product of kept dims - out_elems = 1 - for i, s in enumerate(t.shape): - if i not in t.dim: - out_elems *= s - return (total_elems + out_elems) * elem_bytes +_REDUCE_OP_NAMES = {"sum": "SumFwdOp", "mean": "MeanFwdOp", "amax": "AmaxFwdOp"} def _make_reduce_op(dtype, op_kind, dim, keepdim): @@ -141,15 +125,21 @@ def test_reduce_multidim_bench( op_kind: str, ) -> None: test = ReduceMultidimTest(shape, dim, keepdim, dtype, op_kind) - bm = ReduceMultidimBenchmark(test) inputs = test.gen_inputs() op = _make_reduce_op(dtype, op_kind, dim, keepdim) + bm = ManifestBenchmark(_REDUCE_OP_NAMES[op_kind], op, test) + # Preserve legacy report column order: shape, keepdim, dtype, op_kind + # (dim is a list and was already silently dropped by the pre-PR + # serializability filter, so we omit it here too). 
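+    # (tflops / bandwidth_tbs denominators come from op.eval_roofline()
+    # via ManifestBenchmark above, not from per-class calculate_* hooks.)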
+ report_params = { + "shape": shape, "keepdim": keepdim, "dtype": dtype, "op_kind": op_kind, + } result = bm.profile(op, *inputs) - BenchmarkReport.record(op, locals(), result, tag="tileops") + BenchmarkReport.record(op, report_params, result, tag="tileops") result_bl = bm.profile(test.ref_program, *inputs) - BenchmarkReport.record(op, locals(), result_bl, tag="torch") + BenchmarkReport.record(op, report_params, result_bl, tag="torch") # =================================================================== @@ -221,22 +211,7 @@ def ref_program(self, x: torch.Tensor) -> torch.Tensor: return x.argmin(dim=self.dim, keepdim=self.keepdim) -class ArgreduceMultidimBenchmark(BenchmarkBase[ArgreduceMultidimTest]): - def calculate_flops(self) -> Optional[float]: - total_elems = 1 - for s in self.workload.shape: - total_elems *= s - return total_elems - - def calculate_memory(self) -> Optional[float]: - t = self.workload - elem_bytes = torch.tensor([], dtype=t.dtype).element_size() - total_elems = 1 - for s in t.shape: - total_elems *= s - # Output: int64 (8 bytes) for each position - out_elems = total_elems // t.shape[t.dim] - return total_elems * elem_bytes + out_elems * 8 +_ARGREDUCE_OP_NAMES = {"argmax": "ArgmaxFwdOp", "argmin": "ArgminFwdOp"} def _make_argreduce_op(dtype, op_kind, dim, keepdim): @@ -256,15 +231,21 @@ def test_argreduce_multidim_bench( op_kind: str, ) -> None: test = ArgreduceMultidimTest(shape, dim, keepdim, dtype, op_kind) - bm = ArgreduceMultidimBenchmark(test) inputs = test.gen_inputs() op = _make_argreduce_op(dtype, op_kind, dim, keepdim) + bm = ManifestBenchmark(_ARGREDUCE_OP_NAMES[op_kind], op, test) + # Preserve legacy report column order: shape, dim, keepdim, dtype, op_kind + # (dim is int here and was kept by the pre-PR filter). + report_params = { + "shape": shape, "dim": dim, "keepdim": keepdim, + "dtype": dtype, "op_kind": op_kind, + } result = bm.profile(op, *inputs) - BenchmarkReport.record(op, locals(), result, tag="tileops") + BenchmarkReport.record(op, report_params, result, tag="tileops") result_bl = bm.profile(test.ref_program, *inputs) - BenchmarkReport.record(op, locals(), result_bl, tag="torch") + BenchmarkReport.record(op, report_params, result_bl, tag="torch") # =================================================================== @@ -336,26 +317,9 @@ def ref_program(self, x: torch.Tensor) -> torch.Tensor: raise ValueError(f"Unknown op_kind: {self.op_kind}") -class LogicalReduceMultidimBenchmark(BenchmarkBase[LogicalReduceMultidimTest]): - def calculate_flops(self) -> Optional[float]: - total_elems = 1 - for s in self.workload.shape: - total_elems *= s - return total_elems - - def calculate_memory(self) -> Optional[float]: - t = self.workload - elem_bytes = torch.tensor([], dtype=t.dtype).element_size() - total_elems = 1 - for s in t.shape: - total_elems *= s - out_elems = 1 - dims = set(d % len(t.shape) for d in t.dim) - for i, s in enumerate(t.shape): - if i not in dims: - out_elems *= s - out_elem_bytes = 8 if t.op_kind == "count_nonzero" else 1 - return total_elems * elem_bytes + out_elems * out_elem_bytes +_LOGICAL_OP_NAMES = { + "any": "AnyFwdOp", "all": "AllFwdOp", "count_nonzero": "CountNonzeroFwdOp", +} def _make_logical_op(dtype, op_kind, dim, keepdim): @@ -380,15 +344,20 @@ def test_logical_reduce_multidim_bench( op_kind: str, ) -> None: test = LogicalReduceMultidimTest(shape, dim, keepdim, dtype, op_kind) - bm = LogicalReduceMultidimBenchmark(test) inputs = test.gen_inputs() op = _make_logical_op(dtype, op_kind, dim, keepdim) + bm = 
ManifestBenchmark(_LOGICAL_OP_NAMES[op_kind], op, test) + # Preserve legacy report column order: shape, keepdim, dtype, op_kind + # (dim list dropped by pre-PR filter). + report_params = { + "shape": shape, "keepdim": keepdim, "dtype": dtype, "op_kind": op_kind, + } result = bm.profile(op, *inputs) - BenchmarkReport.record(op, locals(), result, tag="tileops") + BenchmarkReport.record(op, report_params, result, tag="tileops") result_bl = bm.profile(test.ref_program, *inputs) - BenchmarkReport.record(op, locals(), result_bl, tag="torch") + BenchmarkReport.record(op, report_params, result_bl, tag="torch") # =================================================================== @@ -455,25 +424,9 @@ def ref_program(self, x: torch.Tensor) -> torch.Tensor: ) -class VectorNormMultidimBenchmark(BenchmarkBase[VectorNormMultidimTest]): - def calculate_flops(self) -> Optional[float]: - total_elems = 1 - for s in self.workload.shape: - total_elems *= s - return total_elems - - def calculate_memory(self) -> Optional[float]: - t = self.workload - elem_bytes = torch.tensor([], dtype=t.dtype).element_size() - total_elems = 1 - for s in t.shape: - total_elems *= s - out_elems = 1 - dims = set(d % len(t.shape) for d in t.dim) - for i, s in enumerate(t.shape): - if i not in dims: - out_elems *= s - return (total_elems + out_elems) * elem_bytes +_VECTOR_NORM_OP_NAMES = { + "l1": "L1NormFwdOp", "l2": "L2NormFwdOp", "inf": "InfNormFwdOp", +} def _make_norm_op(dtype, op_kind, dim, keepdim): @@ -495,15 +448,20 @@ def test_vector_norm_multidim_bench( op_kind: str, ) -> None: test = VectorNormMultidimTest(shape, dim, keepdim, dtype, op_kind) - bm = VectorNormMultidimBenchmark(test) inputs = test.gen_inputs() op = _make_norm_op(dtype, op_kind, dim, keepdim) + bm = ManifestBenchmark(_VECTOR_NORM_OP_NAMES[op_kind], op, test) + # Preserve legacy report column order: shape, keepdim, dtype, op_kind + # (dim list dropped by pre-PR filter). + report_params = { + "shape": shape, "keepdim": keepdim, "dtype": dtype, "op_kind": op_kind, + } result = bm.profile(op, *inputs) - BenchmarkReport.record(op, locals(), result, tag="tileops") + BenchmarkReport.record(op, report_params, result, tag="tileops") result_bl = bm.profile(test.ref_program, *inputs) - BenchmarkReport.record(op, locals(), result_bl, tag="torch") + BenchmarkReport.record(op, report_params, result_bl, tag="torch") # =================================================================== @@ -567,16 +525,7 @@ def ref_program(self, x: torch.Tensor) -> torch.Tensor: raise ValueError(f"Unknown op_kind: {self.op_kind}") -class CumulativeMultidimBenchmark(BenchmarkBase[CumulativeMultidimTest]): - def calculate_flops(self) -> Optional[float]: - t = self.workload - return t.M * t.N - - def calculate_memory(self) -> Optional[float]: - t = self.workload - elem_bytes = torch.tensor([], dtype=t.dtype).element_size() - # Read + write: 2 * M * N - return 2 * t.M * t.N * elem_bytes +_CUMULATIVE_OP_NAMES = {"cumsum": "CumsumFwdOp", "cumprod": "CumprodFwdOp"} def _make_cumulative_op(M, N, dtype, op_kind): @@ -599,15 +548,17 @@ def test_cumulative_multidim_bench( op_kind: str, ) -> None: test = CumulativeMultidimTest(shape, dtype, op_kind) - bm = CumulativeMultidimBenchmark(test) inputs = test.gen_inputs() op = _make_cumulative_op(test.M, test.N, dtype, op_kind) + bm = ManifestBenchmark(_CUMULATIVE_OP_NAMES[op_kind], op, test) + # Preserve legacy report column order: shape, dtype, op_kind. 
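+    # (m and n are not separate params here; the workload derives
+    # test.M / test.N from shape for op construction above.)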
+ report_params = {"shape": shape, "dtype": dtype, "op_kind": op_kind} result = bm.profile(op, *inputs) - BenchmarkReport.record(op, locals(), result, tag="tileops") + BenchmarkReport.record(op, report_params, result, tag="tileops") result_bl = bm.profile(test.ref_program, *inputs) - BenchmarkReport.record(op, locals(), result_bl, tag="torch") + BenchmarkReport.record(op, report_params, result_bl, tag="torch") # =================================================================== @@ -680,26 +631,7 @@ def ref_program(self, x: torch.Tensor) -> torch.Tensor: ) -class LogSumExpMultidimBenchmark(BenchmarkBase[LogSumExpMultidimTest]): - def calculate_flops(self) -> Optional[float]: - t = self.workload - total_elems = 1 - for s in t.shape: - total_elems *= s - return total_elems - - def calculate_memory(self) -> Optional[float]: - t = self.workload - elem_bytes = torch.tensor([], dtype=t.dtype).element_size() - total_elems = 1 - for s in t.shape: - total_elems *= s - out_elems = 1 - dims = set(d % len(t.shape) for d in t.dim) - for i, s in enumerate(t.shape): - if i not in dims: - out_elems *= s - return (total_elems + out_elems) * elem_bytes +_LOGSUMEXP_OP_NAME = "LogSumExpFwdOp" def _make_logsumexp_op(dtype, dim, keepdim): @@ -716,15 +648,18 @@ def test_logsumexp_multidim_bench( dtype: torch.dtype, ) -> None: test = LogSumExpMultidimTest(shape, dim, keepdim, dtype) - bm = LogSumExpMultidimBenchmark(test) inputs = test.gen_inputs() op = _make_logsumexp_op(dtype, dim, keepdim) + bm = ManifestBenchmark(_LOGSUMEXP_OP_NAME, op, test) + # Preserve legacy report column order: shape, keepdim, dtype + # (dim list dropped by pre-PR filter). + report_params = {"shape": shape, "keepdim": keepdim, "dtype": dtype} result = bm.profile(op, *inputs) - BenchmarkReport.record(op, locals(), result, tag="tileops") + BenchmarkReport.record(op, report_params, result, tag="tileops") result_bl = bm.profile(test.ref_program, *inputs) - BenchmarkReport.record(op, locals(), result_bl, tag="torch") + BenchmarkReport.record(op, report_params, result_bl, tag="torch") if __name__ == "__main__": From e3b44153342dd02eac6ae65b5bbbf770bb0825a5 Mon Sep 17 00:00:00 2001 From: lcy-seso Date: Sat, 9 May 2026 02:28:23 +0800 Subject: [PATCH 5/5] [Refactor][Benchmark] defer cumulative bench manifest conversion Revert bench_cumulative.py to the pre-PR (upstream/testbed) state so the benchmark workload rows stay aligned with the legacy table. scan.yaml manifest workloads for CumsumFwdOp/CumprodFwdOp do not match the hand-rolled WORKLOADS list this file uses (base rows (1024,4096) and (4096,4096) vs manifest rows (2048,4096) and (64,32768)), which violates the AC-4 row-set parity constraint. Move bench_cumulative into the deferred bucket; a separate manifest-only PR will align scan.yaml first. Co-Authored-By: Ibuki - a wind born from GPTs --- benchmarks/ops/bench_cumulative.py | 100 +++++++++++++++-------------- 1 file changed, 51 insertions(+), 49 deletions(-) diff --git a/benchmarks/ops/bench_cumulative.py b/benchmarks/ops/bench_cumulative.py index cde5c6113..a746a111c 100644 --- a/benchmarks/ops/bench_cumulative.py +++ b/benchmarks/ops/bench_cumulative.py @@ -1,26 +1,34 @@ -"""Benchmarks for cumulative ops (cumsum, cumprod). +"""Benchmarks for cumulative ops (cumsum, cumprod).""" -Workload shapes and roofline formulas are loaded from the ops manifest -(``tileops/manifest/scan.yaml``). 
-""" +from typing import Optional import pytest import torch -from benchmarks.benchmark_base import BenchmarkReport, ManifestBenchmark, workloads_to_params -from tileops.ops.reduction.cumprod import CumprodFwdOp -from tileops.ops.reduction.cumsum import CumsumFwdOp -from workloads.workload_base import WorkloadBase +from benchmarks.benchmark_base import BenchmarkBase, BenchmarkReport +from workloads.workload_base import FixtureBase, WorkloadBase -_CUMSUM_OP = "CumsumFwdOp" -_CUMPROD_OP = "CumprodFwdOp" +class CumulativeBenchFixture(FixtureBase): + PARAMS = [ + ( + "m, n, dtype, op_kind", + [ + pytest.param(1024, 4096, torch.float16, "cumsum"), + pytest.param(1024, 4096, torch.bfloat16, "cumsum"), + pytest.param(4096, 4096, torch.float16, "cumsum"), + pytest.param(1024, 4096, torch.float16, "cumprod"), + pytest.param(1024, 4096, torch.bfloat16, "cumprod"), + pytest.param(4096, 4096, torch.float16, "cumprod"), + ], + ), + ] -class _CumulativeWorkload(WorkloadBase): + +class CumulativeBenchTest(WorkloadBase): def __init__(self, m: int, n: int, dtype: torch.dtype, op_kind: str): self.m = m self.n = n - self.shape = (m, n) self.dtype = dtype self.op_kind = op_kind @@ -37,53 +45,47 @@ def ref_program(self, x: torch.Tensor) -> torch.Tensor: return x_f32.cumsum(dim=-1).to(x.dtype) elif self.op_kind == "cumprod": return x_f32.cumprod(dim=-1).to(x.dtype) - raise ValueError(f"Unknown op_kind: {self.op_kind!r}") + raise ValueError(f"Unknown op_kind: {self.op_kind}") -@pytest.mark.parametrize("shape, dtype", workloads_to_params(_CUMSUM_OP)) -def test_cumsum_bench(shape: tuple, dtype: torch.dtype) -> None: - m, n = shape - op_kind = "cumsum" - test = _CumulativeWorkload(m, n, dtype, op_kind) - inputs = test.gen_inputs() +class CumulativeBenchmark(BenchmarkBase[CumulativeBenchTest]): + def calculate_flops(self) -> Optional[float]: + t = self.workload + # Approximate: inclusive scan performs N-1 ops per row, rounded up to M*N + return t.m * t.n - op = CumsumFwdOp(N=n, dtype=dtype) - bm = ManifestBenchmark(_CUMSUM_OP, op, test) - try: - result = bm.profile(op, *inputs) - except ValueError as exc: - if "No configurations to tune" in str(exc): - pytest.skip(f"Kernel does not support this shape: {exc}") - raise - # Preserve legacy report column order: m, n, dtype, op_kind. 
- report_params = {"m": m, "n": n, "dtype": dtype, "op_kind": op_kind} - BenchmarkReport.record(op, report_params, result, tag="tileops") + def calculate_memory(self) -> Optional[float]: + t = self.workload + elem_bytes = torch.tensor([], dtype=t.dtype).element_size() + # Read x (M*N) + write output (M*N) + return 2 * t.m * t.n * elem_bytes - result_bl = bm.profile(test.ref_program, *inputs) - BenchmarkReport.record(op, report_params, result_bl, tag="torch") + +def _make_op(m: int, n: int, dtype: torch.dtype, op_kind: str): + """Create the appropriate Op for the given op_kind.""" + from tileops.ops.reduction.cumprod import CumprodFwdOp + from tileops.ops.reduction.cumsum import CumsumFwdOp + + op_map = { + "cumsum": CumsumFwdOp, + "cumprod": CumprodFwdOp, + } + cls = op_map[op_kind] + return cls(N=n, dtype=dtype) -@pytest.mark.parametrize("shape, dtype", workloads_to_params(_CUMPROD_OP)) -def test_cumprod_bench(shape: tuple, dtype: torch.dtype) -> None: - m, n = shape - op_kind = "cumprod" - test = _CumulativeWorkload(m, n, dtype, op_kind) +@CumulativeBenchFixture +def test_cumulative_bench(m: int, n: int, dtype: torch.dtype, op_kind: str) -> None: + test = CumulativeBenchTest(m, n, dtype, op_kind) + bm = CumulativeBenchmark(test) inputs = test.gen_inputs() - op = CumprodFwdOp(N=n, dtype=dtype) - bm = ManifestBenchmark(_CUMPROD_OP, op, test) - try: - result = bm.profile(op, *inputs) - except ValueError as exc: - if "No configurations to tune" in str(exc): - pytest.skip(f"Kernel does not support this shape: {exc}") - raise - # Preserve legacy report column order: m, n, dtype, op_kind. - report_params = {"m": m, "n": n, "dtype": dtype, "op_kind": op_kind} - BenchmarkReport.record(op, report_params, result, tag="tileops") + op = _make_op(m, n, dtype, op_kind) + result = bm.profile(op, *inputs) + BenchmarkReport.record(op, locals(), result, tag="tileops") result_bl = bm.profile(test.ref_program, *inputs) - BenchmarkReport.record(op, report_params, result_bl, tag="torch") + BenchmarkReport.record(op, locals(), result_bl, tag="torch") if __name__ == "__main__":