tile-ai · ventijing · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026
diff --git a/.github/workflows/pr-regression-test-bot.yml b/.github/workflows/pr-regression-test-bot.yml
@@ -207,7 +207,10 @@ jobs:
           fi
           uv pip install -v -r requirements-test.txt
           if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then
-            uv pip install -v --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0+metax3.5.3.9torch2.8 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
+            uv pip uninstall flash-linear-attention flash_linear_attention 2>/dev/null || true  # best-effort cleanup; uv never prompts, so no pip-style -y
+            uv pip install --force-reinstall --no-deps --python-version 3.10.0 \
+              flash_linear_attention==0.4.0 \
+              -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
           fi
           uv pip install -v .
           if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then
@@ -229,7 +232,10 @@ jobs:
           fi
           uv pip install -v -r requirements-test.txt
           if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then
-            uv pip install -v --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0+metax3.5.3.9torch2.8 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
+            uv pip uninstall flash-linear-attention flash_linear_attention 2>/dev/null || true  # best-effort cleanup; uv never prompts, so no pip-style -y
+            uv pip install --force-reinstall --no-deps --python-version 3.10.0 \
+              flash_linear_attention==0.4.0 \
+              -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
           fi
           uv pip install -v .
           if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then
@@ -244,7 +250,10 @@ jobs:
           fi
           uv pip install -v -r requirements-test.txt
           if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then
-            uv pip install -v --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0+metax3.5.3.9torch2.8 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
+            uv pip uninstall flash-linear-attention flash_linear_attention 2>/dev/null || true  # best-effort cleanup; uv never prompts, so no pip-style -y
+            uv pip install --force-reinstall --no-deps --python-version 3.10.0 \
+              flash_linear_attention==0.4.0 \
+              -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
           fi
 
       - name: Clear uv cache for self-hosted runners (if setup failed)

diff --git a/maint/scripts/regression_all.py b/maint/scripts/regression_all.py
@@ -57,7 +57,7 @@ def _parse_table(output: str) -> dict[str, float]:
 
 
 def _examples_root() -> Path:
-    return Path(__file__).resolve().parents[2] / "examples" / "maca"
+    return Path(__file__).resolve().parents[2] / "examples"  # parents[2] of maint/scripts/<this file> is the repo root
 
 
 def _discover_bench_files(examples_root: Path) -> list[Path]:

diff --git a/tilelang/quantize/mxfp.py b/tilelang/quantize/mxfp.py
@@ -1,5 +1,10 @@
 from typing import Literal
+
+from tvm.target import Target
+
 from tilelang import language as T
+from tilelang.backend.target import determine_target
+from tilelang.rocm.target import target_is_gfx950, target_is_hip
 
 # Implementation asm for fp4 to bf16, using twiddling
 # Reference: https://github.com/triton-lang/triton/blob/main/python/triton_kernels/triton_kernels/tensor_details/layout_details/hopper_value.py#L11-L18
@@ -157,6 +162,28 @@
 """
 
 
+def _resolve_mxfp_target(target):  # precedence: explicit target, then Target.current(), then determine_target("auto")
+    if target is not None and target != "auto":
+        return target  # caller supplied a concrete target; returned unchanged (not converted to a Target)
+    current = Target.current(allow_none=True)  # None here is handled below instead of raising
+    if current is not None:
+        return current
+    return determine_target("auto", return_object=True)  # nothing in scope; presumably auto-detects -- confirm in tilelang.backend.target
+
+
+def _target_uses_portable_mxfp_dequant(target) -> bool:
+    """Return True for targets that cannot compile CUDA PTX inline asm (e.g. Maca, AMD gfx950)."""
+    if target is None:
+        return False  # no target given: report "not portable" without resolving anything
+    if not isinstance(target, Target):
+        target = Target(target)  # normalize non-Target input so .kind.name is available below
+    if target.kind.name == "maca":
+        return True
+    if target_is_hip(target):
+        return target_is_gfx950(target)  # among HIP targets, only gfx950 selects the portable path
+    return False
+
+
 def get_mxfp_intrin_group(
     out_dtype: Literal[T.float16, T.bfloat16] = T.bfloat16,
     source_format: Literal[T.int, T.uint] = T.uint,
@@ -195,33 +222,26 @@ def get_mxfp_intrin_group(
     assert source_format in [T.int, T.uint], f"Invalid source_format: {source_format}. Expected 'int' or 'uint'."
     assert storage_dtype in [T.int32, T.int8, T.uint8], f"Invalid storage_dtype: {storage_dtype}. Expected 'int32' or 'int8' or 'uint8'."
 
-    # Detect AMD gfx950 target to select the HIP C++ dequantization implementation.
-    # All other targets (NV, RDNA, MI300) use the default CUDA PTX path below.
-    _is_gfx950 = False
-    if target is not None:
-        try:
-            from tilelang.rocm.target import target_is_gfx950
-
-            _is_gfx950 = target_is_gfx950(target)
-        except (ImportError, ModuleNotFoundError, AttributeError):
-            # target_is_gfx950 unavailable in this build; assume non-gfx950.
-            pass
+    # Maca and AMD gfx950 cannot compile CUDA PTX; use portable C++ below.
+    # All other targets (NV, RDNA, MI300) use the default CUDA PTX path.
+    # target=None keeps the CUDA PTX default; only target="auto" resolves from context.
+    _resolved = _resolve_mxfp_target(target) if target == "auto" else target
+    _use_portable = _target_uses_portable_mxfp_dequant(_resolved)
 
     dtype_map = {T.float16: "f16", T.bfloat16: "bf16"}
     func_name = f"decode_fp{source_bit}_to_{dtype_map[out_dtype]}"
     if use_twiddling:
         func_name += "_twiddling"
 
-    if _is_gfx950:
-        # AMD gfx950 path: use portable HIP C++ implementations.
-        # The function name stays the same so the call site is unchanged.
+    if _use_portable:
+        # Portable C++ path (Maca / AMD gfx950). Function name unchanged for call sites.
         if use_twiddling and source_bit == 4 and out_dtype == T.bfloat16:
             return {"func_name": func_name, "c_source": decode_f4_to_bf16_twiddling_hip}
         elif not use_twiddling and source_bit == 4 and out_dtype == T.bfloat16:
             return {"func_name": func_name, "c_source": decode_f4_to_bf16_simple_hip}
         else:
             raise AssertionError(
-                f"AMD gfx950 MXFP dequant only supports source_bit=4 and out_dtype=bfloat16, "
+                f"Portable MXFP dequant only supports source_bit=4 and out_dtype=bfloat16, "
                 f"got source_bit={source_bit}, out_dtype={out_dtype}"
             )