Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 14 additions & 10 deletions benchmarks/ops/bench_ada_layer_norm.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,11 @@ def __init__(self, test, op):
self._op = op

def _get_roofline(self) -> tuple[float, float]:
if self._roofline_cache is None:
self._roofline_cache = self._op.eval_roofline()
return self._roofline_cache
cache = self._roofline_cache
if cache is None:
cache = self._op.eval_roofline()
self._roofline_cache = cache
return cache
Comment thread
lcy-seso marked this conversation as resolved.

def calculate_flops(self) -> Optional[float]:
return self._get_roofline()[0]
Expand All @@ -44,9 +46,11 @@ def __init__(self, test, op):
self._op = op

def _get_roofline(self) -> tuple[float, float]:
if self._roofline_cache is None:
self._roofline_cache = self._op.eval_roofline()
return self._roofline_cache
cache = self._roofline_cache
if cache is None:
cache = self._op.eval_roofline()
self._roofline_cache = cache
return cache

def calculate_flops(self) -> Optional[float]:
return self._get_roofline()[0]
Expand All @@ -55,9 +59,9 @@ def calculate_memory(self) -> Optional[float]:
return self._get_roofline()[1]


def _manifest_params(op_name):
def _to_params(workloads):
params = []
for w in load_workloads(op_name):
for w in workloads:
Comment thread
lcy-seso marked this conversation as resolved.
m, n = w["x_shape"]
Comment thread
lcy-seso marked this conversation as resolved.
label = w.get("label", f"{m}x{n}")
for dtype_str in w["dtypes"]:
Expand All @@ -67,7 +71,7 @@ def _manifest_params(op_name):
return params


@pytest.mark.parametrize("m, n, dtype", _manifest_params(_ADA_OP_NAME))
@pytest.mark.parametrize("m, n, dtype", _to_params(load_workloads(_ADA_OP_NAME)))
def test_ada_layer_norm_bench(m: int, n: int, dtype: torch.dtype) -> None:
test = AdaLayerNormTest(m, n, dtype)
inputs = test.gen_inputs()
Expand All @@ -86,7 +90,7 @@ def baseline_fn(x, scale, shift):
BenchmarkReport.record(op, locals(), result_bl, tag="torch-ref")


@pytest.mark.parametrize("m, n, dtype", _manifest_params(_ADA_ZERO_OP_NAME))
@pytest.mark.parametrize("m, n, dtype", _to_params(load_workloads(_ADA_ZERO_OP_NAME)))
def test_ada_layer_norm_zero_bench(m: int, n: int, dtype: torch.dtype) -> None:
test = AdaLayerNormZeroTest(m, n, dtype)
inputs = test.gen_inputs()
Expand Down
187 changes: 61 additions & 126 deletions benchmarks/ops/bench_reduce_multidim.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,17 @@
Shape conventions use LLaMA-family dimensions:
- (batch=4, seq=128, hidden=4096): 7B inference context
- (batch=2, seq=512, hidden=4096): 7B longer-context inference
"""

from typing import Optional
Roofline metadata (FLOPs, bytes) comes from each op's ``eval_roofline()``
via ``ManifestBenchmark``; the 3D multi-dim shapes themselves are declared
inline because the manifest workload set for these ops only covers 2D
last-axis reductions, which is a different test scenario.
"""

import pytest
import torch

from benchmarks.benchmark_base import BenchmarkBase, BenchmarkReport
from benchmarks.benchmark_base import BenchmarkReport, ManifestBenchmark
from workloads.workload_base import FixtureBase, WorkloadBase

# ===================================================================
Expand Down Expand Up @@ -102,26 +105,7 @@ def ref_program(self, x: torch.Tensor) -> object:
return ops[self.op_kind](x_f32).to(x.dtype)


class ReduceMultidimBenchmark(BenchmarkBase[ReduceMultidimTest]):
def calculate_flops(self) -> Optional[float]:
t = self.workload
total_elems = 1
for s in t.shape:
total_elems *= s
return total_elems

def calculate_memory(self) -> Optional[float]:
t = self.workload
elem_bytes = torch.tensor([], dtype=t.dtype).element_size()
total_elems = 1
for s in t.shape:
total_elems *= s
# Output elements: product of kept dims
out_elems = 1
for i, s in enumerate(t.shape):
if i not in t.dim:
out_elems *= s
return (total_elems + out_elems) * elem_bytes
_REDUCE_OP_NAMES = {"sum": "SumFwdOp", "mean": "MeanFwdOp", "amax": "AmaxFwdOp"}


def _make_reduce_op(dtype, op_kind, dim, keepdim):
Expand All @@ -141,15 +125,21 @@ def test_reduce_multidim_bench(
op_kind: str,
) -> None:
test = ReduceMultidimTest(shape, dim, keepdim, dtype, op_kind)
bm = ReduceMultidimBenchmark(test)
inputs = test.gen_inputs()

op = _make_reduce_op(dtype, op_kind, dim, keepdim)
bm = ManifestBenchmark(_REDUCE_OP_NAMES[op_kind], op, test)
# Preserve legacy report column order: shape, keepdim, dtype, op_kind
# (dim is a list and was already silently dropped by the pre-PR
# serializability filter, so we omit it here too).
report_params = {
"shape": shape, "keepdim": keepdim, "dtype": dtype, "op_kind": op_kind,
}
result = bm.profile(op, *inputs)
BenchmarkReport.record(op, locals(), result, tag="tileops")
BenchmarkReport.record(op, report_params, result, tag="tileops")

result_bl = bm.profile(test.ref_program, *inputs)
BenchmarkReport.record(op, locals(), result_bl, tag="torch")
BenchmarkReport.record(op, report_params, result_bl, tag="torch")


# ===================================================================
Expand Down Expand Up @@ -221,22 +211,7 @@ def ref_program(self, x: torch.Tensor) -> torch.Tensor:
return x.argmin(dim=self.dim, keepdim=self.keepdim)


class ArgreduceMultidimBenchmark(BenchmarkBase[ArgreduceMultidimTest]):
def calculate_flops(self) -> Optional[float]:
total_elems = 1
for s in self.workload.shape:
total_elems *= s
return total_elems

def calculate_memory(self) -> Optional[float]:
t = self.workload
elem_bytes = torch.tensor([], dtype=t.dtype).element_size()
total_elems = 1
for s in t.shape:
total_elems *= s
# Output: int64 (8 bytes) for each position
out_elems = total_elems // t.shape[t.dim]
return total_elems * elem_bytes + out_elems * 8
_ARGREDUCE_OP_NAMES = {"argmax": "ArgmaxFwdOp", "argmin": "ArgminFwdOp"}


def _make_argreduce_op(dtype, op_kind, dim, keepdim):
Expand All @@ -256,15 +231,21 @@ def test_argreduce_multidim_bench(
op_kind: str,
) -> None:
test = ArgreduceMultidimTest(shape, dim, keepdim, dtype, op_kind)
bm = ArgreduceMultidimBenchmark(test)
inputs = test.gen_inputs()

op = _make_argreduce_op(dtype, op_kind, dim, keepdim)
bm = ManifestBenchmark(_ARGREDUCE_OP_NAMES[op_kind], op, test)
# Preserve legacy report column order: shape, dim, keepdim, dtype, op_kind
# (dim is int here and was kept by the pre-PR filter).
report_params = {
"shape": shape, "dim": dim, "keepdim": keepdim,
"dtype": dtype, "op_kind": op_kind,
}
result = bm.profile(op, *inputs)
BenchmarkReport.record(op, locals(), result, tag="tileops")
BenchmarkReport.record(op, report_params, result, tag="tileops")

result_bl = bm.profile(test.ref_program, *inputs)
BenchmarkReport.record(op, locals(), result_bl, tag="torch")
BenchmarkReport.record(op, report_params, result_bl, tag="torch")


# ===================================================================
Expand Down Expand Up @@ -336,26 +317,9 @@ def ref_program(self, x: torch.Tensor) -> torch.Tensor:
raise ValueError(f"Unknown op_kind: {self.op_kind}")


class LogicalReduceMultidimBenchmark(BenchmarkBase[LogicalReduceMultidimTest]):
def calculate_flops(self) -> Optional[float]:
total_elems = 1
for s in self.workload.shape:
total_elems *= s
return total_elems

def calculate_memory(self) -> Optional[float]:
t = self.workload
elem_bytes = torch.tensor([], dtype=t.dtype).element_size()
total_elems = 1
for s in t.shape:
total_elems *= s
out_elems = 1
dims = set(d % len(t.shape) for d in t.dim)
for i, s in enumerate(t.shape):
if i not in dims:
out_elems *= s
out_elem_bytes = 8 if t.op_kind == "count_nonzero" else 1
return total_elems * elem_bytes + out_elems * out_elem_bytes
_LOGICAL_OP_NAMES = {
"any": "AnyFwdOp", "all": "AllFwdOp", "count_nonzero": "CountNonzeroFwdOp",
}


def _make_logical_op(dtype, op_kind, dim, keepdim):
Expand All @@ -380,15 +344,20 @@ def test_logical_reduce_multidim_bench(
op_kind: str,
) -> None:
test = LogicalReduceMultidimTest(shape, dim, keepdim, dtype, op_kind)
bm = LogicalReduceMultidimBenchmark(test)
inputs = test.gen_inputs()

op = _make_logical_op(dtype, op_kind, dim, keepdim)
bm = ManifestBenchmark(_LOGICAL_OP_NAMES[op_kind], op, test)
# Preserve legacy report column order: shape, keepdim, dtype, op_kind
# (dim list dropped by pre-PR filter).
report_params = {
"shape": shape, "keepdim": keepdim, "dtype": dtype, "op_kind": op_kind,
}
result = bm.profile(op, *inputs)
BenchmarkReport.record(op, locals(), result, tag="tileops")
BenchmarkReport.record(op, report_params, result, tag="tileops")

result_bl = bm.profile(test.ref_program, *inputs)
BenchmarkReport.record(op, locals(), result_bl, tag="torch")
BenchmarkReport.record(op, report_params, result_bl, tag="torch")


# ===================================================================
Expand Down Expand Up @@ -455,25 +424,9 @@ def ref_program(self, x: torch.Tensor) -> torch.Tensor:
)


class VectorNormMultidimBenchmark(BenchmarkBase[VectorNormMultidimTest]):
def calculate_flops(self) -> Optional[float]:
total_elems = 1
for s in self.workload.shape:
total_elems *= s
return total_elems

def calculate_memory(self) -> Optional[float]:
t = self.workload
elem_bytes = torch.tensor([], dtype=t.dtype).element_size()
total_elems = 1
for s in t.shape:
total_elems *= s
out_elems = 1
dims = set(d % len(t.shape) for d in t.dim)
for i, s in enumerate(t.shape):
if i not in dims:
out_elems *= s
return (total_elems + out_elems) * elem_bytes
_VECTOR_NORM_OP_NAMES = {
"l1": "L1NormFwdOp", "l2": "L2NormFwdOp", "inf": "InfNormFwdOp",
}


def _make_norm_op(dtype, op_kind, dim, keepdim):
Expand All @@ -495,15 +448,20 @@ def test_vector_norm_multidim_bench(
op_kind: str,
) -> None:
test = VectorNormMultidimTest(shape, dim, keepdim, dtype, op_kind)
bm = VectorNormMultidimBenchmark(test)
inputs = test.gen_inputs()

op = _make_norm_op(dtype, op_kind, dim, keepdim)
bm = ManifestBenchmark(_VECTOR_NORM_OP_NAMES[op_kind], op, test)
# Preserve legacy report column order: shape, keepdim, dtype, op_kind
# (dim list dropped by pre-PR filter).
report_params = {
"shape": shape, "keepdim": keepdim, "dtype": dtype, "op_kind": op_kind,
}
result = bm.profile(op, *inputs)
BenchmarkReport.record(op, locals(), result, tag="tileops")
BenchmarkReport.record(op, report_params, result, tag="tileops")

result_bl = bm.profile(test.ref_program, *inputs)
BenchmarkReport.record(op, locals(), result_bl, tag="torch")
BenchmarkReport.record(op, report_params, result_bl, tag="torch")


# ===================================================================
Expand Down Expand Up @@ -567,16 +525,7 @@ def ref_program(self, x: torch.Tensor) -> torch.Tensor:
raise ValueError(f"Unknown op_kind: {self.op_kind}")


class CumulativeMultidimBenchmark(BenchmarkBase[CumulativeMultidimTest]):
def calculate_flops(self) -> Optional[float]:
t = self.workload
return t.M * t.N

def calculate_memory(self) -> Optional[float]:
t = self.workload
elem_bytes = torch.tensor([], dtype=t.dtype).element_size()
# Read + write: 2 * M * N
return 2 * t.M * t.N * elem_bytes
_CUMULATIVE_OP_NAMES = {"cumsum": "CumsumFwdOp", "cumprod": "CumprodFwdOp"}


def _make_cumulative_op(M, N, dtype, op_kind):
Expand All @@ -599,15 +548,17 @@ def test_cumulative_multidim_bench(
op_kind: str,
) -> None:
test = CumulativeMultidimTest(shape, dtype, op_kind)
bm = CumulativeMultidimBenchmark(test)
inputs = test.gen_inputs()

op = _make_cumulative_op(test.M, test.N, dtype, op_kind)
bm = ManifestBenchmark(_CUMULATIVE_OP_NAMES[op_kind], op, test)
# Preserve legacy report column order: shape, dtype, op_kind.
report_params = {"shape": shape, "dtype": dtype, "op_kind": op_kind}
result = bm.profile(op, *inputs)
BenchmarkReport.record(op, locals(), result, tag="tileops")
BenchmarkReport.record(op, report_params, result, tag="tileops")

result_bl = bm.profile(test.ref_program, *inputs)
BenchmarkReport.record(op, locals(), result_bl, tag="torch")
BenchmarkReport.record(op, report_params, result_bl, tag="torch")


# ===================================================================
Expand Down Expand Up @@ -680,26 +631,7 @@ def ref_program(self, x: torch.Tensor) -> torch.Tensor:
)


class LogSumExpMultidimBenchmark(BenchmarkBase[LogSumExpMultidimTest]):
def calculate_flops(self) -> Optional[float]:
t = self.workload
total_elems = 1
for s in t.shape:
total_elems *= s
return total_elems

def calculate_memory(self) -> Optional[float]:
t = self.workload
elem_bytes = torch.tensor([], dtype=t.dtype).element_size()
total_elems = 1
for s in t.shape:
total_elems *= s
out_elems = 1
dims = set(d % len(t.shape) for d in t.dim)
for i, s in enumerate(t.shape):
if i not in dims:
out_elems *= s
return (total_elems + out_elems) * elem_bytes
_LOGSUMEXP_OP_NAME = "LogSumExpFwdOp"


def _make_logsumexp_op(dtype, dim, keepdim):
Expand All @@ -716,15 +648,18 @@ def test_logsumexp_multidim_bench(
dtype: torch.dtype,
) -> None:
test = LogSumExpMultidimTest(shape, dim, keepdim, dtype)
bm = LogSumExpMultidimBenchmark(test)
inputs = test.gen_inputs()

op = _make_logsumexp_op(dtype, dim, keepdim)
bm = ManifestBenchmark(_LOGSUMEXP_OP_NAME, op, test)
# Preserve legacy report column order: shape, keepdim, dtype
# (dim list dropped by pre-PR filter).
report_params = {"shape": shape, "keepdim": keepdim, "dtype": dtype}
result = bm.profile(op, *inputs)
BenchmarkReport.record(op, locals(), result, tag="tileops")
BenchmarkReport.record(op, report_params, result, tag="tileops")

result_bl = bm.profile(test.ref_program, *inputs)
BenchmarkReport.record(op, locals(), result_bl, tag="torch")
BenchmarkReport.record(op, report_params, result_bl, tag="torch")


if __name__ == "__main__":
Expand Down
Loading