Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion maint/scripts/regression_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def _parse_table(output: str) -> dict[str, float]:


def _examples_root() -> Path:
return Path(__file__).resolve().parents[2] / "examples" / "maca"
return Path(__file__).resolve().parents[2] / "examples"

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Expanded examples scope can silently drop benchmark results due to name collisions.

By switching discovery to the repo-wide examples tree (Line 60), regression_all() now has a much higher chance of encountering duplicate benchmark names across different drivers. The current merge logic (if k not in merged) silently ignores later entries, which can hide regressions and undercount totals.

A safer approach is to either fail on duplicate names or namespace keys by file path when merging.

Suggested fix (fail fast on duplicate benchmark names)
@@
-        for k, v in parsed.items():
-            if k not in merged:
-                merged[k] = v
-                _RESULTS.append(PerfResult(name=k, latency=v))
+        for k, v in parsed.items():
+            if k in merged:
+                failures.append(
+                    f"{rel_path}\nDuplicate benchmark name detected: {k!r}. "
+                    "Benchmark names must be globally unique across discovered drivers."
+                )
+                continue
+            merged[k] = v
+            _RESULTS.append(PerfResult(name=k, latency=v))
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@maint/scripts/regression_all.py` at line 60, The merge logic using `if k not
in merged` silently skips duplicate benchmark names, which can hide regressions
when the expanded examples scope encounters multiple benchmarks with the same
name across different drivers. Modify the merge operation in the
regression_all() function to detect when a key already exists in the merged
dictionary and raise an exception or error with details about the collision,
rather than silently ignoring the duplicate entry. Alternatively, namespace the
keys by including the source file path along with the benchmark name to ensure
uniqueness while preserving all results.



def _discover_bench_files(examples_root: Path) -> list[Path]:
Expand Down
46 changes: 31 additions & 15 deletions tilelang/quantize/mxfp.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
from typing import Literal

from tvm.target import Target

from tilelang import language as T
from tilelang.backend.target import determine_target
from tilelang.rocm.target import target_is_gfx950

# Implementation asm for fp4 to bf16, using twiddling
# Reference: https://github.com/triton-lang/triton/blob/main/python/triton_kernels/triton_kernels/tensor_details/layout_details/hopper_value.py#L11-L18
Expand Down Expand Up @@ -157,6 +162,26 @@
"""


def _resolve_mxfp_target(target):
if target is not None:
return target
current = Target.current(allow_none=True)
if current is not None:
return current
return determine_target("auto", return_object=True)
Comment thread
ventijing marked this conversation as resolved.


def _target_uses_portable_mxfp_dequant(target) -> bool:
"""Return True for targets that cannot compile CUDA PTX inline asm (e.g. Maca, AMD gfx950)."""
if target is None:
return False
if not isinstance(target, Target):
target = Target(target)
if target.kind.name == "maca":
return True
return target_is_gfx950(target)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated


def get_mxfp_intrin_group(
out_dtype: Literal[T.float16, T.bfloat16] = T.bfloat16,
source_format: Literal[T.int, T.uint] = T.uint,
Expand Down Expand Up @@ -195,33 +220,24 @@ def get_mxfp_intrin_group(
assert source_format in [T.int, T.uint], f"Invalid source_format: {source_format}. Expected 'int' or 'uint'."
assert storage_dtype in [T.int32, T.int8, T.uint8], f"Invalid storage_dtype: {storage_dtype}. Expected 'int32' or 'int8' or 'uint8'."

# Detect AMD gfx950 target to select the HIP C++ dequantization implementation.
# All other targets (NV, RDNA, MI300) use the default CUDA PTX path below.
_is_gfx950 = False
if target is not None:
try:
from tilelang.rocm.target import target_is_gfx950

_is_gfx950 = target_is_gfx950(target)
except (ImportError, ModuleNotFoundError, AttributeError):
# target_is_gfx950 unavailable in this build; assume non-gfx950.
pass
# Maca and AMD gfx950 cannot compile CUDA PTX; use portable C++ below.
# All other targets (NV, RDNA, MI300) use the default CUDA PTX path.
_use_portable = _target_uses_portable_mxfp_dequant(_resolve_mxfp_target(target))

dtype_map = {T.float16: "f16", T.bfloat16: "bf16"}
func_name = f"decode_fp{source_bit}_to_{dtype_map[out_dtype]}"
if use_twiddling:
func_name += "_twiddling"

if _is_gfx950:
# AMD gfx950 path: use portable HIP C++ implementations.
# The function name stays the same so the call site is unchanged.
if _use_portable:
# Portable C++ path (Maca / AMD gfx950). Function name unchanged for call sites.
if use_twiddling and source_bit == 4 and out_dtype == T.bfloat16:
return {"func_name": func_name, "c_source": decode_f4_to_bf16_twiddling_hip}
elif not use_twiddling and source_bit == 4 and out_dtype == T.bfloat16:
return {"func_name": func_name, "c_source": decode_f4_to_bf16_simple_hip}
else:
raise AssertionError(
f"AMD gfx950 MXFP dequant only supports source_bit=4 and out_dtype=bfloat16, "
f"Portable MXFP dequant only supports source_bit=4 and out_dtype=bfloat16, "
f"got source_bit={source_bit}, out_dtype={out_dtype}"
)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add if branch in here like gfx950 to resolve same issue with maca target.


Expand Down
Loading