tile-ai · lcy-seso · May 9, 2026 · May 8, 2026 · May 8, 2026 · May 8, 2026
diff --git a/docs/design/roofline.md b/docs/design/roofline.md
@@ -25,6 +25,21 @@ Inputs:
 
 Bound type is whichever term dominates `sol_time` (memory-bound if `memory_time > compute_time`, else compute-bound). It depends on shape, not on the op; the roofline tool computes it per-workload and the manifest does not declare it.
 
+## 1.3 Convention
+
+Dated: 2026-05-09.
+
+Per-element FLOP rule for elementwise ops:
+
+- One basic arithmetic op (add, sub, mul, div, neg, abs, recip) counts as 1 FLOP.
+- One transcendental call (`exp`, `log`, `log1p`, `erf`, `tanh`, `sin`, `cos`, `sqrt`, `rsqrt`, etc.) counts as 1 FLOP at the convention level. Hardware-specific cost models do not feed back into the manifest.
+- One compare-and-select (`max`, `min`, `maximum`, `minimum`, single-bound clamp, two-sided clamp, `relu`-style branch, `where`) counts as 1 FLOP per output element. Two-sided clamp counts the same as single-sided: 1 FLOP per element.
+- Predicate-only outputs (`eq`, `gt`, etc.) count as 1 FLOP per element.
+
+Composite ops sum the counts of their primitive steps under this convention. For example, `sigmoid(x) = recip(1 + exp(-x))` is `neg + exp + add + recip = 4` FLOPs/elem; `silu(x) = x * sigmoid(x)` is `5` FLOPs/elem; `tanh(x)` is `1` FLOP/elem because `tanh` is a single transcendental at the convention level.
+
+The clamp / min-max family — `hardtanh`, `clamp_scalar`, `clamp_min`, `clamp_max`, `maximum`, `minimum` — collapses to 1 FLOP per output element under this rule.
+
 ## 2. Field Specification
 
 ### 2.1 Output Contract

diff --git a/docs/perf/flop_convention_delta.csv b/docs/perf/flop_convention_delta.csv
@@ -0,0 +1,4 @@
+family,op,label,shape,dtype,flops_before,flops_after,flops_delta,bytes_before,bytes_after,bytes_delta
+activation,ReluFwdOp,hidden-state-prefill,2048x4096,float16,16777216,8388608,-8388608,33554432,33554432,0
+clamp,HardtanhFwdOp,hidden-state-prefill,2048x4096,float16,33554432,8388608,-25165824,33554432,33554432,0
+min-max,ClampFwdOp,elementwise-16M,4096x4096,float16,33554432,16777216,-16777216,134217728,134217728,0
diff --git a/docs/perf/flop_convention_delta.md b/docs/perf/flop_convention_delta.md
@@ -0,0 +1,42 @@
+# Per-element FLOP convention — before/after delta
+
+Reproducible, formula-only evaluation of the FLOP/byte change introduced
+by [`docs/design/roofline.md`](../design/roofline.md) §1.3 on three
+representative workloads (one activation, one scalar 2-sided clamp, one
+Tensor-bound 2-sided clamp). No GPU is required — the table is computed
+purely from manifest YAML and the Python helper at
+[`tileops/perf/formulas.py`](../../tileops/perf/formulas.py).
+
+## Reproduce
+
+```bash
+python scripts/perf/flop_convention_delta.py \
+  --out docs/perf/flop_convention_delta.csv
+```
+
+The CSV at [`flop_convention_delta.csv`](flop_convention_delta.csv) is
+checked in. Re-running the command on the current checkout regenerates
+it with byte-identical contents.
+
+## Result
+
+| family     | op            | label                | shape     | dtype   | flops before | flops after | flops delta | bytes before | bytes after | bytes delta |
+| ---------- | ------------- | -------------------- | --------- | ------- | -----------: | ----------: | ----------: | -----------: | ----------: | ----------: |
+| activation | ReluFwdOp     | hidden-state-prefill | 2048×4096 | float16 |   16,777,216 |   8,388,608 |  -8,388,608 |   33,554,432 |  33,554,432 |           0 |
+| clamp      | HardtanhFwdOp | hidden-state-prefill | 2048×4096 | float16 |   33,554,432 |   8,388,608 | -25,165,824 |   33,554,432 |  33,554,432 |           0 |
+| min-max    | ClampFwdOp    | elementwise-16M      | 4096×4096 | float16 |   33,554,432 |  16,777,216 | -16,777,216 |  134,217,728 | 134,217,728 |           0 |
+
+The `flops before` column reflects the coefficient that each entry
+carried on `upstream/testbed` immediately before the convention commit
+(verifiable via
+`git diff upstream/testbed -- tileops/manifest/ tileops/perf/formulas.py`).
+The `flops after` column is evaluated from the manifest formulas (for
+`ReluFwdOp` / `HardtanhFwdOp`) or from `clamp_fwd_roofline` (for the
+Tensor-bound `ClampFwdOp`) on the current checkout. Byte counts are
+unchanged by the convention and serve as a sanity column.
+
+A GPU run was not performed; AC-5 explicitly accepts a
+formula-evaluation table. Roofline efficiency depends on
+`max(memory_time, compute_time)`; for these elementwise workloads
+`memory_time` already dominates, so the FLOP-coefficient reduction
+shifts each workload further into the memory-bound regime without
+changing predicted achievable bandwidth.
diff --git a/scripts/perf/flop_convention_delta.py b/scripts/perf/flop_convention_delta.py
@@ -0,0 +1,222 @@
+"""Generate a before/after FLOP/byte table for the per-element FLOP
+convention rollout (``docs/design/roofline.md`` §1.3).
+
+Pure formula evaluation — no GPU, no kernel JIT. The "after" column
+loads the affected manifest entries from this checkout and evaluates
+each entry's ``roofline.flops`` / ``roofline.bytes`` expression on the
+representative workload shape. The "before" column hard-codes the
+coefficients that lived on the same entries on ``upstream/testbed``
+immediately before the convention commit (see ``git log`` /
+``git diff upstream/testbed`` on this branch).
+
+Usage:
+    python scripts/perf/flop_convention_delta.py \
+        --out docs/perf/flop_convention_delta.csv
+
+Reproducible from a clean checkout: the script reads manifest YAML,
+evaluates simple Python expressions, and calls
+``tileops.perf.formulas.clamp_fwd_roofline`` for the Tensor-bound clamp
+row; it does not import ``tileops.ops`` and so does not trigger TileLang
+JIT compilation.
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Mapping
+
+import yaml
+
+# Repository root: this script lives in ``scripts/perf/``, two directory
+# levels below it, hence ``parents[2]`` of the resolved file path.
+REPO_ROOT = Path(__file__).resolve().parents[2]
+# Directory holding the manifest YAML files opened by ``_load_op``.
+MANIFEST_DIR = REPO_ROOT / "tileops" / "manifest"
+
+
+def _product(seq) -> int:
+    out = 1
+    for v in seq:
+        out *= int(v)
+    return out
+
+
+def _load_op(family_file: str, op_name: str) -> dict:
+    with (MANIFEST_DIR / family_file).open() as f:
+        data = yaml.safe_load(f)
+    return data[op_name]
+
+
+class _Shape:
+    """Minimal stand-in for a tensor exposing ``.shape`` and ``.ndim``."""
+
+    def __init__(self, shape: tuple[int, ...]):
+        self.shape = tuple(shape)
+        self.ndim = len(self.shape)
+
+
+def _eval_inline(roofline: dict, tensor_shapes: Mapping[str, tuple[int, ...]],
+                 elem_bytes: int) -> tuple[int, int]:
+    """Evaluate ``vars`` then ``flops`` / ``bytes`` in a sealed namespace."""
+    ns: dict = {
+        "product": _product,
+        "len": len,
+        "min": min,
+        "max": max,
+        "int": int,
+        "elem_bytes": elem_bytes,
+    }
+    for name, shape in tensor_shapes.items():
+        ns[name] = _Shape(shape)
+    for vname, vexpr in (roofline.get("vars") or {}).items():
+        ns[vname] = eval(vexpr, {"__builtins__": {}}, ns)  # noqa: S307
+    flops = int(eval(roofline["flops"], {"__builtins__": {}}, ns))  # noqa: S307
+    nbytes = int(eval(roofline["bytes"], {"__builtins__": {}}, ns))  # noqa: S307
+    return flops, nbytes
+
+
+# --- "before" coefficients captured from ``upstream/testbed`` ---
+# These are the FLOP coefficients on the named entries before the
+# convention commit. Each ``_row_*`` builder multiplies its coefficient by
+# the workload's element count to obtain ``flops_before``. Bytes formulas
+# were not changed by the convention, so the byte column is computed once
+# from the current expression.
+_BEFORE_FLOPS_COEFF: dict[str, int] = {
+    "ReluFwdOp": 2,        # was "2 * N"
+    "HardtanhFwdOp": 4,    # was "4 * N"
+    "ClampFwdOp_Nmult": 2,  # was "2 * n_total" inside clamp_fwd_roofline
+}
+
+
+@dataclass(frozen=True)
+class Row:
+    """One before/after comparison row of the FLOP-convention delta table."""
+
+    family: str  # written to the ``family`` CSV column
+    op_name: str  # written to the ``op`` CSV column
+    label: str  # workload label, written to the ``label`` CSV column
+    shape: tuple[int, ...]  # workload shape; joined with "x" in the CSV
+    dtype_name: str  # written to the ``dtype`` CSV column
+    elem_bytes: int  # element size used by the row builder; not in the CSV
+    flops_before: int  # pre-convention coefficient times element count
+    flops_after: int  # FLOPs evaluated on the current checkout
+    bytes_before: int  # row builders set this equal to ``bytes_after``
+    bytes_after: int  # bytes evaluated on the current checkout
+
+
+def _row_activation() -> Row:
+    """ReLU on the Llama-3.1-8B prefill hidden state."""
+    op = _load_op("elementwise_unary_activation.yaml", "ReluFwdOp")
+    shape = (2048, 4096)
+    n = _product(shape)
+    flops_after, bytes_after = _eval_inline(
+        op["roofline"], {"input": shape}, elem_bytes=2,
+    )
+    return Row(
+        family="activation",
+        op_name="ReluFwdOp",
+        label="hidden-state-prefill",
+        shape=shape,
+        dtype_name="float16",
+        elem_bytes=2,
+        flops_before=_BEFORE_FLOPS_COEFF["ReluFwdOp"] * n,
+        flops_after=flops_after,
+        bytes_before=bytes_after,  # bytes formula unchanged
+        bytes_after=bytes_after,
+    )
+
+
+def _row_clamp_scalar() -> Row:
+    """Hardtanh (scalar 2-sided clamp) on the same shape."""
+    op = _load_op("elementwise_unary_activation.yaml", "HardtanhFwdOp")
+    shape = (2048, 4096)
+    n = _product(shape)
+    flops_after, bytes_after = _eval_inline(
+        op["roofline"], {"input": shape}, elem_bytes=2,
+    )
+    return Row(
+        family="clamp",
+        op_name="HardtanhFwdOp",
+        label="hidden-state-prefill",
+        shape=shape,
+        dtype_name="float16",
+        elem_bytes=2,
+        flops_before=_BEFORE_FLOPS_COEFF["HardtanhFwdOp"] * n,
+        flops_after=flops_after,
+        bytes_before=bytes_after,
+        bytes_after=bytes_after,
+    )
+
+
+def _row_clamp_tensor() -> Row:
+    """Tensor-bound 2-sided clamp (func mode in formulas.py).
+
+    `flops_after` / `bytes_after` are evaluated by calling
+    `tileops.perf.formulas.clamp_fwd_roofline` so the table tracks the
+    helper as the source of truth. A minimal stub op exposes the two
+    attributes the helper reads: `N_total` and `dtype.itemsize`.
+    """
+    from types import SimpleNamespace
+
+    from tileops.perf.formulas import clamp_fwd_roofline
+
+    shape = (4096, 4096)
+    n = _product(shape)
+    elem_bytes = 2
+    stub_op = SimpleNamespace(
+        N_total=n,
+        dtype=SimpleNamespace(itemsize=elem_bytes),
+    )
+    flops_after, bytes_after = clamp_fwd_roofline(stub_op)  # type: ignore[arg-type]
+    return Row(
+        family="min-max",
+        op_name="ClampFwdOp",
+        label="elementwise-16M",
+        shape=shape,
+        dtype_name="float16",
+        elem_bytes=elem_bytes,
+        flops_before=_BEFORE_FLOPS_COEFF["ClampFwdOp_Nmult"] * n,
+        flops_after=flops_after,
+        bytes_before=bytes_after,
+        bytes_after=bytes_after,
+    )
+
+
+def collect() -> list[Row]:
+    return [_row_activation(), _row_clamp_scalar(), _row_clamp_tensor()]
+
+
+def write_csv(rows: list[Row], out_path: Path) -> None:
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    with out_path.open("w", newline="") as f:
+        w = csv.writer(f)
+        w.writerow([
+            "family", "op", "label", "shape", "dtype",
+            "flops_before", "flops_after", "flops_delta",
+            "bytes_before", "bytes_after", "bytes_delta",
+        ])
+        for r in rows:
+            w.writerow([
+                r.family, r.op_name, r.label,
+                "x".join(map(str, r.shape)), r.dtype_name,
+                r.flops_before, r.flops_after,
+                r.flops_after - r.flops_before,
+                r.bytes_before, r.bytes_after,
+                r.bytes_after - r.bytes_before,
+            ])
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--out",
+        type=Path,
+        default=REPO_ROOT / "docs" / "perf" / "flop_convention_delta.csv",
+    )
+    args = parser.parse_args()
+    rows = collect()
+    write_csv(rows, args.out)
+    for r in rows:
+        print(
+            f"{r.family:<10} {r.op_name:<16} {r.label:<22} "
+            f"flops {r.flops_before:>12} -> {r.flops_after:<12} "
+            f"bytes {r.bytes_before:>12} -> {r.bytes_after:<12}"
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tileops/manifest/elementwise_binary.yaml b/tileops/manifest/elementwise_binary.yaml
@@ -31,8 +31,9 @@ PreluFwdOp:
     vars:
       N: "product(input.shape)"
       W: "1 if weight.ndim == 0 or weight.shape[0] == 1 else weight.shape[0]"
-    # compare + mul + select per element
-    flops: "3 * N"
+    # FLOPs: prelu(x) = x if x>0 else weight*x. Per roofline.md §1.3,
+    # compare-and-select(1) + mul(1) = 2 per elem.
+    flops: "2 * N"
     # Read input (N) + read weight (small, ~C) + write output (N)
     bytes: "(2 * N + W) * elem_bytes"