From 59bf3b2d8b52f90b238ec9d9b5420ea70e615d63 Mon Sep 17 00:00:00 2001 From: lcy-seso Date: Sat, 9 May 2026 05:09:40 +0800 Subject: [PATCH 1/5] refactor(manifest): unify per-element FLOP convention for elementwise activations and clamp family MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add roofline.md §1.3 (Convention) naming the per-element FLOP rule: arithmetic ops, transcendentals, and compare-and-select each count as 1 FLOP per output element. Two-sided clamp also counts as 1 FLOP per element under this convention. Audit each activation manifest entry (gelu, silu, sigmoid, tanh, elu, selu, softplus, mish, hardsigmoid, hardswish) and replace ad-hoc constants with derivations matching the convention. Each entry now carries a one-line FLOPs derivation comment. Unify the clamp / min-max family — hardtanh, clamp_scalar, clamp_min, clamp_max, maximum, minimum — to 1 FLOP per output element. Removes the 4*N figures previously carried on hardtanh and clamp_scalar. Co-Authored-By: Ibuki 🍃 — a wind born from GPTs --- docs/design/roofline.md | 15 ++++++ .../elementwise_unary_activation.yaml | 53 +++++++++++-------- tileops/manifest/elementwise_unary_math.yaml | 8 +-- 3 files changed, 51 insertions(+), 25 deletions(-) diff --git a/docs/design/roofline.md b/docs/design/roofline.md index 8cf3a58c0..59f816d0d 100644 --- a/docs/design/roofline.md +++ b/docs/design/roofline.md @@ -25,6 +25,21 @@ Inputs: Bound type is whichever term dominates `sol_time` (memory-bound if `memory_time > compute_time`, else compute-bound). It depends on shape, not on the op; the roofline tool computes it per-workload and the manifest does not declare it. +### 1.3 Convention + +Dated: 2026-05-09. + +Per-element FLOP rule for elementwise ops: + +- One basic arithmetic op (add, sub, mul, div, neg, abs, recip) counts as 1 FLOP. +- One transcendental or special-function call (`exp`, `log`, `log1p`, `erf`, `tanh`, `sin`, `cos`, `sqrt`, `rsqrt`, etc.) 
counts as 1 FLOP at the convention level. Hardware-specific cost models do not feed back into the manifest. +- One compare-and-select (`max`, `min`, `maximum`, `minimum`, single-bound clamp, two-sided clamp, `relu`-style branch, `where`) counts as 1 FLOP per output element. Two-sided clamp counts the same as single-sided: 1 FLOP per element. +- Predicate-only outputs (`eq`, `gt`, etc.) count as 1 FLOP per element. + +Composite ops sum the counts of their primitive steps under this convention. For example, `sigmoid(x) = recip(1 + exp(-x))` is `neg + exp + add + recip = 4` FLOPs/elem; `silu(x) = x * sigmoid(x)` is `5` FLOPs/elem; `tanh(x)` is `1` FLOP/elem because `tanh` is a single transcendental at the convention level. + +The clamp / min-max family — `hardtanh`, `clamp_scalar`, `clamp_min`, `clamp_max`, `maximum`, `minimum` — collapses to 1 FLOP per output element under this rule. + ## 2. Field Specification ### 2.1 Output Contract diff --git a/tileops/manifest/elementwise_unary_activation.yaml b/tileops/manifest/elementwise_unary_activation.yaml index ece8f3047..e98e5254e 100644 --- a/tileops/manifest/elementwise_unary_activation.yaml +++ b/tileops/manifest/elementwise_unary_activation.yaml @@ -67,8 +67,9 @@ GeluFwdOp: roofline: vars: N: "product(input.shape)" - # erf-based: ~8 fp ops per element (mul, erf, add, mul, mul); tanh approx similar - flops: "8 * N" + # FLOPs: gelu(x) = x * 0.5 * (1 + erf(x / sqrt(2))). + # div(1) + erf(1) + add(1) + mul-by-half(1) + mul(1) = 5 per elem. + flops: "5 * N" bytes: "2 * N * elem_bytes" source: @@ -101,8 +102,9 @@ SiluFwdOp: roofline: vars: N: "product(input.shape)" - # sigmoid (exp + add + recip ~3) + mul = 4 fp ops per element - flops: "4 * N" + # FLOPs: silu(x) = x * sigmoid(x); sigmoid = neg + exp + add + recip = 4. + # silu adds one mul = 5 per elem. 
+ flops: "5 * N" bytes: "2 * N * elem_bytes" source: @@ -135,8 +137,9 @@ HardswishFwdOp: roofline: vars: N: "product(input.shape)" - # add (1) + clamp (2 cmp + 2 sel = 4) + mul (1) + div (1) = 7 fp ops per element - flops: "7 * N" + # FLOPs: hardswish(x) = x * relu6(x + 3) / 6. + # add(1) + two-sided-clamp(1, per convention) + mul(1) + div(1) = 4 per elem. + flops: "4 * N" bytes: "2 * N * elem_bytes" source: @@ -169,8 +172,9 @@ HardsigmoidFwdOp: roofline: vars: N: "product(input.shape)" - # add (1) + clamp (2 cmp + 2 sel = 4) + div (1) = 6 fp ops per element - flops: "6 * N" + # FLOPs: hardsigmoid(x) = relu6(x + 3) / 6. + # add(1) + two-sided-clamp(1, per convention) + div(1) = 3 per elem. + flops: "3 * N" bytes: "2 * N * elem_bytes" source: @@ -203,8 +207,9 @@ MishFwdOp: roofline: vars: N: "product(input.shape)" - # softplus (exp + log1p ~ 3) + tanh (~3) + mul = 7 fp ops per element - flops: "7 * N" + # FLOPs: mish(x) = x * tanh(softplus(x)); softplus = exp + log1p = 2, + # tanh(1, transcendental) + final mul(1) = 4 per elem. + flops: "4 * N" bytes: "2 * N * elem_bytes" source: @@ -237,7 +242,8 @@ SeluFwdOp: roofline: vars: N: "product(input.shape)" - # branch select + (exp + sub + mul) on negative branch + mul by lambda ~ 5 fp ops per element + # FLOPs: selu(x) = lambda * (x if x>0 else alpha*(exp(x)-1)). + # compare-and-select(1) + exp(1) + sub(1) + mul-alpha(1) + mul-lambda(1) = 5 per elem. flops: "5 * N" bytes: "2 * N * elem_bytes" @@ -307,8 +313,9 @@ EluFwdOp: roofline: vars: N: "product(input.shape)" - # compare + (exp + sub + mul) on negative branch ~ 5 fp ops per element - flops: "5 * N" + # FLOPs: elu(x) = x if x>0 else alpha*(exp(x)-1). + # compare-and-select(1) + exp(1) + sub(1) + mul(1) = 4 per elem. + flops: "4 * N" bytes: "2 * N * elem_bytes" source: @@ -344,8 +351,10 @@ HardtanhFwdOp: roofline: vars: N: "product(input.shape)" - # 2 compares + 2 selects per element - flops: "4 * N" + # FLOPs: hardtanh = clamp(x, min_val, max_val). 
+ # Two-sided clamp = 1 compare-and-select per output element under the + # roofline.md §1.3 convention. = 1 per elem. + flops: "N" bytes: "2 * N * elem_bytes" source: @@ -379,8 +388,9 @@ SoftplusFwdOp: roofline: vars: N: "product(input.shape)" - # mul (beta*x) + threshold compare + (exp + log1p + div) on log-branch ~ 6 fp ops per element - flops: "6 * N" + # FLOPs: softplus(x) = (1/beta) * log(1 + exp(beta*x)) with threshold guard. + # mul-beta(1) + threshold compare-and-select(1) + exp(1) + log1p(1) + div-by-beta(1) = 5 per elem. + flops: "5 * N" bytes: "2 * N * elem_bytes" source: @@ -459,11 +469,10 @@ ClampScalarFwdOp: roofline: vars: N: "product(input.shape)" - # min(max(x, lo), hi): 2 compares + 2 selects per element = 4 fp ops. - # When only one bound is provided (min=None or max=None) the actual - # cost halves to 2*N, but 4*N is the upper bound used as the roofline - # figure (matches HardtanhFwdOp's two-sided clamp convention). - flops: "4 * N" + # FLOPs: clamp = compare-and-select. Per roofline.md §1.3 convention, + # the clamp / min-max family collapses to 1 FLOP per output element + # regardless of one or two bounds. = 1 per elem. + flops: "N" bytes: "2 * N * elem_bytes" source: diff --git a/tileops/manifest/elementwise_unary_math.yaml b/tileops/manifest/elementwise_unary_math.yaml index f328d1071..1d0ac671f 100644 --- a/tileops/manifest/elementwise_unary_math.yaml +++ b/tileops/manifest/elementwise_unary_math.yaml @@ -580,7 +580,8 @@ SigmoidFwdOp: roofline: vars: N: "product(input.shape)" - # sigmoid(x) = 1 / (1 + exp(-x)): ~4 ops/elem + # FLOPs: sigmoid(x) = 1 / (1 + exp(-x)). + # neg(1) + exp(1) + add(1) + recip(1) = 4 per elem. flops: "4 * N" bytes: "2 * N * elem_bytes" @@ -613,8 +614,9 @@ TanhFwdOp: roofline: vars: N: "product(input.shape)" - # tanh(x) = 2 * sigmoid(2x) - 1: ~5 ops/elem - flops: "5 * N" + # FLOPs: tanh is one transcendental call; per roofline.md §1.3 + # convention, transcendentals count as 1 FLOP per element. 
+ flops: "N" bytes: "2 * N * elem_bytes" source: From a06331e1fb601c3102f5fe17bde47a13c3aaa2cf Mon Sep 17 00:00:00 2001 From: lcy-seso Date: Sat, 9 May 2026 05:25:39 +0800 Subject: [PATCH 2/5] refactor(manifest): align relu / leaky_relu / prelu / clamp_fwd to convention MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 2 of the per-element FLOP convention rollout. ReLU is one compare-and-select per element under roofline.md §1.3, so collapse ReluFwdOp to N. LeakyReluFwdOp and PreluFwdOp are compare-and-select plus one mul on the negative branch, so collapse to 2 * N. Tensor-bound ClampFwdOp's func helper now mirrors the YAML clamp family at 1 FLOP per output element. Add scripts/perf/flop_convention_delta.py and the matching docs/perf/ report so the before/after FLOP table required by AC-5 is reproducible from a clean checkout without GPU access. --- docs/perf/flop_convention_delta.csv | 4 + docs/perf/flop_convention_delta.md | 42 ++++ scripts/perf/flop_convention_delta.py | 210 ++++++++++++++++++ tileops/manifest/elementwise_binary.yaml | 5 +- .../elementwise_unary_activation.yaml | 10 +- tileops/perf/formulas.py | 10 +- 6 files changed, 270 insertions(+), 11 deletions(-) create mode 100644 docs/perf/flop_convention_delta.csv create mode 100644 docs/perf/flop_convention_delta.md create mode 100644 scripts/perf/flop_convention_delta.py diff --git a/docs/perf/flop_convention_delta.csv b/docs/perf/flop_convention_delta.csv new file mode 100644 index 000000000..63095d480 --- /dev/null +++ b/docs/perf/flop_convention_delta.csv @@ -0,0 +1,4 @@ +family,op,label,shape,dtype,flops_before,flops_after,flops_delta,bytes_before,bytes_after,bytes_delta +activation,ReluFwdOp,hidden-state-prefill,2048x4096,float16,16777216,8388608,-8388608,33554432,33554432,0 +clamp,HardtanhFwdOp,hidden-state-prefill,2048x4096,float16,33554432,8388608,-25165824,33554432,33554432,0 
+min-max,ClampFwdOp,elementwise-16M,4096x4096,float16,33554432,16777216,-16777216,134217728,134217728,0 diff --git a/docs/perf/flop_convention_delta.md b/docs/perf/flop_convention_delta.md new file mode 100644 index 000000000..6096d13d3 --- /dev/null +++ b/docs/perf/flop_convention_delta.md @@ -0,0 +1,42 @@ +# Per-element FLOP convention — before/after delta + +Reproducible, formula-only evaluation of the FLOP/byte change introduced +by [`docs/design/roofline.md`](../design/roofline.md) §1.3 on three +representative workloads (one activation, one scalar 2-sided clamp, one +Tensor-bound 2-sided clamp). No GPU is required — the table is computed +purely from manifest YAML and the Python helper at +[`tileops/perf/formulas.py`](../../tileops/perf/formulas.py). + +## Reproduce + +```bash +python scripts/perf/flop_convention_delta.py \ + --out docs/perf/flop_convention_delta.csv +``` + +The CSV at [`flop_convention_delta.csv`](flop_convention_delta.csv) is +checked in. Re-running the command on the current checkout overwrites +it byte-identically. + +## Result + +| family | op | label | shape | dtype | flops before | flops after | flops delta | bytes before | bytes after | bytes delta | +| ---------- | ------------- | -------------------- | --------- | ------- | -----------: | ----------: | ----------: | -----------: | ----------: | ----------: | +| activation | ReluFwdOp | hidden-state-prefill | 2048×4096 | float16 | 16,777,216 | 8,388,608 | -8,388,608 | 33,554,432 | 33,554,432 | 0 | +| clamp | HardtanhFwdOp | hidden-state-prefill | 2048×4096 | float16 | 33,554,432 | 8,388,608 | -25,165,824 | 33,554,432 | 33,554,432 | 0 | +| min-max | ClampFwdOp | elementwise-16M | 4096×4096 | float16 | 33,554,432 | 16,777,216 | -16,777,216 | 134,217,728 | 134,217,728 | 0 | + +`flops before` columns reflect the coefficients that lived on each +entry on `upstream/testbed` immediately before the convention commit +(verifiable via `git diff upstream/testbed -- tileops/manifest/ tileops/perf/formulas.py`). 
+`flops after` columns are evaluated from the manifest formulas (for +`ReluFwdOp` / `HardtanhFwdOp`) or from `clamp_fwd_roofline` (for the +Tensor-bound `ClampFwdOp`) on the current checkout. Byte counts are +unchanged by the convention and serve as a sanity column. + +A GPU run was not performed; AC-5 explicitly accepts a +formula-evaluation table. Roofline efficiency depends on +`max(memory_time, compute_time)`; for these elementwise workloads +`memory_time` already dominates, so the FLOP-coefficient reduction +shifts each workload further into the memory-bound regime without +changing predicted achievable bandwidth. diff --git a/scripts/perf/flop_convention_delta.py b/scripts/perf/flop_convention_delta.py new file mode 100644 index 000000000..7cdca6c6f --- /dev/null +++ b/scripts/perf/flop_convention_delta.py @@ -0,0 +1,210 @@ +"""Generate a before/after FLOP/byte table for the per-element FLOP +convention rollout (``docs/design/roofline.md`` §1.3). + +Pure formula evaluation — no GPU, no kernel JIT. The "after" column +loads the affected manifest entries from this checkout and evaluates +each entry's ``roofline.flops`` / ``roofline.bytes`` expression on the +representative workload shape. The "before" column hard-codes the +coefficients that lived on the same entries on ``upstream/testbed`` +immediately before the convention commit (see ``git log`` / +``git diff upstream/testbed`` on this branch). + +Usage: + python scripts/perf/flop_convention_delta.py \ + --out docs/perf/flop_convention_delta.csv + +Reproducible from a clean checkout: the script only reads YAML and +evaluates simple Python expressions; it does not import ``tileops.ops`` +and so does not trigger TileLang JIT compilation. 
+""" + +from __future__ import annotations + +import argparse +import csv +from dataclasses import dataclass +from pathlib import Path +from typing import Mapping + +import yaml + +REPO_ROOT = Path(__file__).resolve().parents[2] +MANIFEST_DIR = REPO_ROOT / "tileops" / "manifest" + + +def _product(seq) -> int: + out = 1 + for v in seq: + out *= int(v) + return out + + +def _load_op(family_file: str, op_name: str) -> dict: + with (MANIFEST_DIR / family_file).open() as f: + data = yaml.safe_load(f) + return data[op_name] + + +class _Shape: + """Minimal stand-in for a tensor exposing ``.shape`` and ``.ndim``.""" + + def __init__(self, shape: tuple[int, ...]): + self.shape = tuple(shape) + self.ndim = len(self.shape) + + +def _eval_inline(roofline: dict, tensor_shapes: Mapping[str, tuple[int, ...]], + elem_bytes: int) -> tuple[int, int]: + """Evaluate ``vars`` then ``flops`` / ``bytes`` in a sealed namespace.""" + ns: dict = { + "product": _product, + "len": len, + "min": min, + "max": max, + "int": int, + "elem_bytes": elem_bytes, + } + for name, shape in tensor_shapes.items(): + ns[name] = _Shape(shape) + for vname, vexpr in (roofline.get("vars") or {}).items(): + ns[vname] = eval(vexpr, {"__builtins__": {}}, ns) # noqa: S307 + flops = int(eval(roofline["flops"], {"__builtins__": {}}, ns)) # noqa: S307 + nbytes = int(eval(roofline["bytes"], {"__builtins__": {}}, ns)) # noqa: S307 + return flops, nbytes + + +# --- "before" coefficients captured from ``upstream/testbed`` --- +# These are the FLOP coefficients on the named entries before the +# convention commit. Bytes formulas were not changed by the convention, +# so the byte column is computed once from the current expression. +_BEFORE_FLOPS_COEFF: dict[str, int] = { + "ReluFwdOp": 2, # was "2 * N" + "HardtanhFwdOp": 4, # was "4 * N" + "ClampFwdOp_Nmult": 2, # was "2 * n_total" inside clamp_fwd_roofline +} + + +@dataclass(frozen=True) +class Row: + family: str + op_name: str + label: str + shape: tuple[int, ...] 
+ dtype_name: str + elem_bytes: int + flops_before: int + flops_after: int + bytes_before: int + bytes_after: int + + +def _row_activation() -> Row: + """ReLU on the Llama-3.1-8B prefill hidden state.""" + op = _load_op("elementwise_unary_activation.yaml", "ReluFwdOp") + shape = (2048, 4096) + n = _product(shape) + flops_after, bytes_after = _eval_inline( + op["roofline"], {"input": shape}, elem_bytes=2, + ) + return Row( + family="activation", + op_name="ReluFwdOp", + label="hidden-state-prefill", + shape=shape, + dtype_name="float16", + elem_bytes=2, + flops_before=_BEFORE_FLOPS_COEFF["ReluFwdOp"] * n, + flops_after=flops_after, + bytes_before=bytes_after, # bytes formula unchanged + bytes_after=bytes_after, + ) + + +def _row_clamp_scalar() -> Row: + """Hardtanh (scalar 2-sided clamp) on the same shape.""" + op = _load_op("elementwise_unary_activation.yaml", "HardtanhFwdOp") + shape = (2048, 4096) + n = _product(shape) + flops_after, bytes_after = _eval_inline( + op["roofline"], {"input": shape}, elem_bytes=2, + ) + return Row( + family="clamp", + op_name="HardtanhFwdOp", + label="hidden-state-prefill", + shape=shape, + dtype_name="float16", + elem_bytes=2, + flops_before=_BEFORE_FLOPS_COEFF["HardtanhFwdOp"] * n, + flops_after=flops_after, + bytes_before=bytes_after, + bytes_after=bytes_after, + ) + + +def _row_clamp_tensor() -> Row: + """Tensor-bound 2-sided clamp (func mode in formulas.py).""" + shape = (4096, 4096) + n = _product(shape) + elem_bytes = 2 + # After convention: flops = N_total, bytes = 4 * N_total * elem_bytes. 
+ flops_after = n + bytes_after = 4 * n * elem_bytes + return Row( + family="min-max", + op_name="ClampFwdOp", + label="elementwise-16M", + shape=shape, + dtype_name="float16", + elem_bytes=elem_bytes, + flops_before=_BEFORE_FLOPS_COEFF["ClampFwdOp_Nmult"] * n, + flops_after=flops_after, + bytes_before=bytes_after, + bytes_after=bytes_after, + ) + + +def collect() -> list[Row]: + return [_row_activation(), _row_clamp_scalar(), _row_clamp_tensor()] + + +def write_csv(rows: list[Row], out_path: Path) -> None: + out_path.parent.mkdir(parents=True, exist_ok=True) + with out_path.open("w", newline="") as f: + w = csv.writer(f) + w.writerow([ + "family", "op", "label", "shape", "dtype", + "flops_before", "flops_after", "flops_delta", + "bytes_before", "bytes_after", "bytes_delta", + ]) + for r in rows: + w.writerow([ + r.family, r.op_name, r.label, + "x".join(map(str, r.shape)), r.dtype_name, + r.flops_before, r.flops_after, + r.flops_after - r.flops_before, + r.bytes_before, r.bytes_after, + r.bytes_after - r.bytes_before, + ]) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument( + "--out", + type=Path, + default=REPO_ROOT / "docs" / "perf" / "flop_convention_delta.csv", + ) + args = parser.parse_args() + rows = collect() + write_csv(rows, args.out) + for r in rows: + print( + f"{r.family:<10} {r.op_name:<16} {r.label:<22} " + f"flops {r.flops_before:>12} -> {r.flops_after:<12} " + f"bytes {r.bytes_before:>12} -> {r.bytes_after:<12}" + ) + + +if __name__ == "__main__": + main() diff --git a/tileops/manifest/elementwise_binary.yaml b/tileops/manifest/elementwise_binary.yaml index 6d4309d95..59e53742a 100644 --- a/tileops/manifest/elementwise_binary.yaml +++ b/tileops/manifest/elementwise_binary.yaml @@ -31,8 +31,9 @@ PreluFwdOp: vars: N: "product(input.shape)" W: "1 if weight.ndim == 0 or weight.shape[0] == 1 else weight.shape[0]" - # compare + mul + select per element - flops: "3 * N" + # FLOPs: prelu(x) = x if x>0 else weight*x. 
Per roofline.md §1.3, + # compare-and-select(1) + mul(1) = 2 per elem. + flops: "2 * N" # Read input (N) + read weight (small, ~C) + write output (N) bytes: "(2 * N + W) * elem_bytes" diff --git a/tileops/manifest/elementwise_unary_activation.yaml b/tileops/manifest/elementwise_unary_activation.yaml index e98e5254e..f9564f6ff 100644 --- a/tileops/manifest/elementwise_unary_activation.yaml +++ b/tileops/manifest/elementwise_unary_activation.yaml @@ -31,8 +31,9 @@ ReluFwdOp: roofline: vars: N: "product(input.shape)" - # 1 compare + 1 select per element = 2 fp ops - flops: "2 * N" + # FLOPs: relu(x) = max(0, x). Per roofline.md §1.3 convention, + # one compare-and-select = 1 per elem. + flops: "N" # Read input + write output bytes: "2 * N * elem_bytes" @@ -278,8 +279,9 @@ LeakyReluFwdOp: roofline: vars: N: "product(input.shape)" - # compare + mul + select = 3 fp ops per element - flops: "3 * N" + # FLOPs: leaky_relu(x) = x if x>0 else negative_slope*x. Per + # roofline.md §1.3, compare-and-select(1) + mul(1) = 2 per elem. + flops: "2 * N" bytes: "2 * N * elem_bytes" source: diff --git a/tileops/perf/formulas.py b/tileops/perf/formulas.py index 41ef36828..bd392b6b2 100644 --- a/tileops/perf/formulas.py +++ b/tileops/perf/formulas.py @@ -446,10 +446,10 @@ def clamp_fwd_roofline(op: "Op") -> tuple[int, int]: broadcasting across all three operands. Reads ``op.N_total`` (the post-broadcast element count) and ``op.dtype.itemsize``. - Per-output element: one ``max(input, min)`` then one ``min(.., max)`` - → ``flops = 2 * N_total``. Bytes: read input + read min + read max + - write out, all post-broadcast at ``elem_bytes`` each → - ``bytes = 4 * N_total * elem_bytes``. + Per ``docs/design/roofline.md`` §1.3, two-sided clamp collapses to + one fused compare-and-select = ``flops = N_total``. Bytes: read + input + read min + read max + write out, all post-broadcast at + ``elem_bytes`` each → ``bytes = 4 * N_total * elem_bytes``. 
Args: op: bound ``ClampFwdOp`` instance exposing ``N_total`` and @@ -460,7 +460,7 @@ def clamp_fwd_roofline(op: "Op") -> tuple[int, int]: """ n_total = int(op.N_total) elem_bytes = op.dtype.itemsize - return 2 * n_total, 4 * n_total * elem_bytes + return n_total, 4 * n_total * elem_bytes def clamp_min_fwd_roofline(op: "Op") -> tuple[int, int]: From 083ad96541dd075fb532775b01872a3ad0517fda Mon Sep 17 00:00:00 2001 From: lcy-seso Date: Sat, 9 May 2026 06:17:50 +0800 Subject: [PATCH 3/5] fix(ops): align activation FLOPS_PER_ELEM with manifest convention MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update runtime FLOPS_PER_ELEM constants in tileops/ops/elementwise/activations.py to match the per-element FLOP convention defined in docs/design/roofline.md §1.3, so that op.eval_roofline() and benchmark TFLOPs stay manifest-aligned. Per the convention: - compare-and-select (single or two-sided clamp) = 1 FLOP/elem - transcendental call (exp, tanh, erf, ...) = 1 FLOP/elem - arithmetic op (add, sub, mul, div, ...) = 1 FLOP/elem Aligned values: - ReluFwdOp: 2 -> 1 - GeluFwdOp: 8 -> 5 - SiluFwdOp: 4 -> 5 - TanhFwdOp: 5 -> 1 - HardswishFwdOp: 7 -> 4 - HardsigmoidFwdOp: 6 -> 3 - MishFwdOp: 7 -> 4 - LeakyReluFwdOp: 3 -> 2 - EluFwdOp: 5 -> 4 - HardtanhFwdOp: 4 -> 1 - SoftplusFwdOp: 7 -> 5 Also refresh UnaryOp.FLOPS_PER_ELEM / eval_roofline docstrings to reference the convention rather than the pre-convention examples. 
Co-Authored-By: Ibuki 🍃 — a wind born from GPTs --- tileops/ops/elementwise/activations.py | 61 ++++++++++++++++---------- 1 file changed, 38 insertions(+), 23 deletions(-) diff --git a/tileops/ops/elementwise/activations.py b/tileops/ops/elementwise/activations.py index a9c1b5b7b..f4fcbd063 100644 --- a/tileops/ops/elementwise/activations.py +++ b/tileops/ops/elementwise/activations.py @@ -36,8 +36,9 @@ class ReluFwdOp(_ParamFreeActivationOp): _op_name = "relu" kernel_cls = ReluFwdKernel - # Manifest: flops = "2 * N" (compare + select per element). - FLOPS_PER_ELEM = 2 + # Manifest: flops = "N". Per roofline.md §1.3, one + # compare-and-select counts as 1 FLOP per element. + FLOPS_PER_ELEM = 1 class GeluFwdOp(_GeluApproximateBase): @@ -57,9 +58,10 @@ class GeluFwdOp(_GeluApproximateBase): _op_name = "gelu" kernel_cls = GeluFwdKernel - # Manifest: flops = "8 * N" (erf-based: mul + erf + add + mul + mul ≈ 8; - # tanh approximation is similar order, see manifest comment). - FLOPS_PER_ELEM = 8 + # Manifest: flops = "5 * N". Per roofline.md §1.3: + # gelu(x) = x * 0.5 * (1 + erf(x/sqrt(2))) = + # div + erf(transcendental) + add + mul-by-half + mul = 5 per elem. + FLOPS_PER_ELEM = 5 def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -77,8 +79,9 @@ class SiluFwdOp(_ParamFreeActivationOp): _op_name = "silu" kernel_cls = SiluFwdKernel - # Manifest: flops = "4 * N" (sigmoid + multiply). - FLOPS_PER_ELEM = 4 + # Manifest: flops = "5 * N". Per roofline.md §1.3: + # sigmoid = neg + exp + add + recip = 4; silu adds one mul = 5 per elem. + FLOPS_PER_ELEM = 5 class SigmoidFwdOp(UnaryOp): @@ -95,8 +98,9 @@ class TanhFwdOp(UnaryOp): _op_name = "tanh" kernel_cls = TanhFwdKernel - # Manifest: flops = "5 * N" (tanh(x) = 2 * sigmoid(2x) - 1 ≈ 5 ops/elem). - FLOPS_PER_ELEM = 5 + # Manifest: flops = "N". Per roofline.md §1.3, tanh is one + # transcendental call = 1 FLOP per element. 
+ FLOPS_PER_ELEM = 1 class HardswishFwdOp(_ParamFreeActivationOp): @@ -104,8 +108,10 @@ class HardswishFwdOp(_ParamFreeActivationOp): _op_name = "hardswish" kernel_cls = HardswishFwdKernel - # Manifest: flops = "7 * N" (add + clamp(2 cmp+2 sel) + mul + div). - FLOPS_PER_ELEM = 7 + # Manifest: flops = "4 * N". Per roofline.md §1.3: + # hardswish(x) = x * relu6(x+3)/6 = + # add + two-sided-clamp(1) + mul + div = 4 per elem. + FLOPS_PER_ELEM = 4 class HardsigmoidFwdOp(_ParamFreeActivationOp): @@ -113,8 +119,10 @@ class HardsigmoidFwdOp(_ParamFreeActivationOp): _op_name = "hardsigmoid" kernel_cls = HardsigmoidFwdKernel - # Manifest: flops = "6 * N" (add + clamp(2 cmp+2 sel) + div). - FLOPS_PER_ELEM = 6 + # Manifest: flops = "3 * N". Per roofline.md §1.3: + # hardsigmoid(x) = relu6(x+3)/6 = + # add + two-sided-clamp(1) + div = 3 per elem. + FLOPS_PER_ELEM = 3 class MishFwdOp(_ParamFreeActivationOp): @@ -122,8 +130,10 @@ class MishFwdOp(_ParamFreeActivationOp): _op_name = "mish" kernel_cls = MishFwdKernel - # Manifest: flops = "7 * N" (softplus + tanh + mul). - FLOPS_PER_ELEM = 7 + # Manifest: flops = "4 * N". Per roofline.md §1.3: + # mish(x) = x * tanh(softplus(x)); + # softplus = exp + log1p = 2; tanh(transcendental) + final mul = 4 per elem. + FLOPS_PER_ELEM = 4 class SeluFwdOp(_ParamFreeActivationOp): @@ -152,8 +162,9 @@ class LeakyReluFwdOp(_ParametricActivationOp): _op_name = "leaky_relu" _wrapped = None - # Manifest: flops = "3 * N" (compare + mul + select). - FLOPS_PER_ELEM = 3 + # Manifest: flops = "2 * N". Per roofline.md §1.3: + # compare-and-select(1) + mul = 2 per elem. + FLOPS_PER_ELEM = 2 def __init__( self, @@ -193,8 +204,9 @@ class EluFwdOp(_ParametricActivationOp): _op_name = "elu" _wrapped = None - # Manifest: flops = "5 * N" (compare + (exp + sub + mul) + branch select). - FLOPS_PER_ELEM = 5 + # Manifest: flops = "4 * N". Per roofline.md §1.3: + # compare-and-select(1) + exp + sub + mul = 4 per elem. 
+ FLOPS_PER_ELEM = 4 def __init__( self, @@ -235,8 +247,9 @@ class HardtanhFwdOp(_ParametricActivationOp): _op_name = "hardtanh" _wrapped = None - # Manifest: flops = "4 * N" (2 compares + 2 selects per element). - FLOPS_PER_ELEM = 4 + # Manifest: flops = "N". Per roofline.md §1.3, two-sided clamp + # collapses to 1 compare-and-select per output element. + FLOPS_PER_ELEM = 1 def __init__( self, @@ -278,8 +291,10 @@ class SoftplusFwdOp(_ParametricActivationOp): _op_name = "softplus" _wrapped = None - # Manifest: flops = "7 * N" (mul + exp + add + log + div + compare + select). - FLOPS_PER_ELEM = 7 + # Manifest: flops = "5 * N". Per roofline.md §1.3: + # mul-beta + threshold compare-and-select(1) + exp + log1p + div-by-beta + # = 5 per elem. + FLOPS_PER_ELEM = 5 def __init__( self, From 94d5aea8b530eb55976ee765f41ac45a299af630 Mon Sep 17 00:00:00 2001 From: lcy-seso Date: Sat, 9 May 2026 12:51:09 +0800 Subject: [PATCH 4/5] [Chore][Perf] route ClampFwdOp delta row through clamp_fwd_roofline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address review: _row_clamp_tensor now calls tileops.perf.formulas.clamp_fwd_roofline with a minimal SimpleNamespace stub op (exposing N_total and dtype.itemsize) so the after-column tracks the helper as the source of truth and cannot drift. Verified output unchanged: flops 33554432 -> 16777216, bytes 134217728 -> 134217728. 
Co-Authored-By: Ibuki 🍃 — a wind born from GPTs --- scripts/perf/flop_convention_delta.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/scripts/perf/flop_convention_delta.py b/scripts/perf/flop_convention_delta.py index 7cdca6c6f..65e3167b2 100644 --- a/scripts/perf/flop_convention_delta.py +++ b/scripts/perf/flop_convention_delta.py @@ -143,13 +143,25 @@ def _row_clamp_scalar() -> Row: def _row_clamp_tensor() -> Row: - """Tensor-bound 2-sided clamp (func mode in formulas.py).""" + """Tensor-bound 2-sided clamp (func mode in formulas.py). + + `flops_after` / `bytes_after` are evaluated by calling + `tileops.perf.formulas.clamp_fwd_roofline` so the table tracks the + helper as the source of truth. A minimal stub op exposes the two + attributes the helper reads: `N_total` and `dtype.itemsize`. + """ + from types import SimpleNamespace + + from tileops.perf.formulas import clamp_fwd_roofline + shape = (4096, 4096) n = _product(shape) elem_bytes = 2 - # After convention: flops = N_total, bytes = 4 * N_total * elem_bytes. - flops_after = n - bytes_after = 4 * n * elem_bytes + stub_op = SimpleNamespace( + N_total=n, + dtype=SimpleNamespace(itemsize=elem_bytes), + ) + flops_after, bytes_after = clamp_fwd_roofline(stub_op) # type: ignore[arg-type] return Row( family="min-max", op_name="ClampFwdOp", From ab99ac22f18001b778eafb16f481e9f467e04a39 Mon Sep 17 00:00:00 2001 From: lcy-seso Date: Sat, 9 May 2026 13:05:46 +0800 Subject: [PATCH 5/5] [Chore][Ops] dedent FLOPs comment continuation lines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address review: continuation lines on multi-line FLOPs derivation comments were over-indented (8 spaces vs 4 for class attributes). Dedent to match surrounding class-level indentation. 
Co-Authored-By: Ibuki 🍃 — a wind born from GPTs --- tileops/ops/elementwise/activations.py | 32 +++++++++++++------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/tileops/ops/elementwise/activations.py b/tileops/ops/elementwise/activations.py index f4fcbd063..934bb40ac 100644 --- a/tileops/ops/elementwise/activations.py +++ b/tileops/ops/elementwise/activations.py @@ -37,7 +37,7 @@ class ReluFwdOp(_ParamFreeActivationOp): _op_name = "relu" kernel_cls = ReluFwdKernel # Manifest: flops = "N". Per roofline.md §1.3, one - # compare-and-select counts as 1 FLOP per element. + # compare-and-select counts as 1 FLOP per element. FLOPS_PER_ELEM = 1 @@ -59,8 +59,8 @@ class GeluFwdOp(_GeluApproximateBase): _op_name = "gelu" kernel_cls = GeluFwdKernel # Manifest: flops = "5 * N". Per roofline.md §1.3: - # gelu(x) = x * 0.5 * (1 + erf(x/sqrt(2))) = - # div + erf(transcendental) + add + mul-by-half + mul = 5 per elem. + # gelu(x) = x * 0.5 * (1 + erf(x/sqrt(2))) = + # div + erf(transcendental) + add + mul-by-half + mul = 5 per elem. FLOPS_PER_ELEM = 5 def __init__(self, *args, **kwargs): @@ -80,7 +80,7 @@ class SiluFwdOp(_ParamFreeActivationOp): _op_name = "silu" kernel_cls = SiluFwdKernel # Manifest: flops = "5 * N". Per roofline.md §1.3: - # sigmoid = neg + exp + add + recip = 4; silu adds one mul = 5 per elem. + # sigmoid = neg + exp + add + recip = 4; silu adds one mul = 5 per elem. FLOPS_PER_ELEM = 5 @@ -99,7 +99,7 @@ class TanhFwdOp(UnaryOp): _op_name = "tanh" kernel_cls = TanhFwdKernel # Manifest: flops = "N". Per roofline.md §1.3, tanh is one - # transcendental call = 1 FLOP per element. + # transcendental call = 1 FLOP per element. FLOPS_PER_ELEM = 1 @@ -109,8 +109,8 @@ class HardswishFwdOp(_ParamFreeActivationOp): _op_name = "hardswish" kernel_cls = HardswishFwdKernel # Manifest: flops = "4 * N". Per roofline.md §1.3: - # hardswish(x) = x * relu6(x+3)/6 = - # add + two-sided-clamp(1) + mul + div = 4 per elem. 
+ # hardswish(x) = x * relu6(x+3)/6 = + # add + two-sided-clamp(1) + mul + div = 4 per elem. FLOPS_PER_ELEM = 4 @@ -120,8 +120,8 @@ class HardsigmoidFwdOp(_ParamFreeActivationOp): _op_name = "hardsigmoid" kernel_cls = HardsigmoidFwdKernel # Manifest: flops = "3 * N". Per roofline.md §1.3: - # hardsigmoid(x) = relu6(x+3)/6 = - # add + two-sided-clamp(1) + div = 3 per elem. + # hardsigmoid(x) = relu6(x+3)/6 = + # add + two-sided-clamp(1) + div = 3 per elem. FLOPS_PER_ELEM = 3 @@ -131,8 +131,8 @@ class MishFwdOp(_ParamFreeActivationOp): _op_name = "mish" kernel_cls = MishFwdKernel # Manifest: flops = "4 * N". Per roofline.md §1.3: - # mish(x) = x * tanh(softplus(x)); - # softplus = exp + log1p = 2; tanh(transcendental) + final mul = 4 per elem. + # mish(x) = x * tanh(softplus(x)); + # softplus = exp + log1p = 2; tanh(transcendental) + final mul = 4 per elem. FLOPS_PER_ELEM = 4 @@ -163,7 +163,7 @@ class LeakyReluFwdOp(_ParametricActivationOp): _op_name = "leaky_relu" _wrapped = None # Manifest: flops = "2 * N". Per roofline.md §1.3: - # compare-and-select(1) + mul = 2 per elem. + # compare-and-select(1) + mul = 2 per elem. FLOPS_PER_ELEM = 2 def __init__( @@ -205,7 +205,7 @@ class EluFwdOp(_ParametricActivationOp): _op_name = "elu" _wrapped = None # Manifest: flops = "4 * N". Per roofline.md §1.3: - # compare-and-select(1) + exp + sub + mul = 4 per elem. + # compare-and-select(1) + exp + sub + mul = 4 per elem. FLOPS_PER_ELEM = 4 def __init__( @@ -248,7 +248,7 @@ class HardtanhFwdOp(_ParametricActivationOp): _op_name = "hardtanh" _wrapped = None # Manifest: flops = "N". Per roofline.md §1.3, two-sided clamp - # collapses to 1 compare-and-select per output element. + # collapses to 1 compare-and-select per output element. FLOPS_PER_ELEM = 1 def __init__( @@ -292,8 +292,8 @@ class SoftplusFwdOp(_ParametricActivationOp): _op_name = "softplus" _wrapped = None # Manifest: flops = "5 * N". 
Per roofline.md §1.3: - # mul-beta + threshold compare-and-select(1) + exp + log1p + div-by-beta - # = 5 per elem. + # mul-beta + threshold compare-and-select(1) + exp + log1p + div-by-beta + # = 5 per elem. FLOPS_PER_ELEM = 5 def __init__(