From 1d335ad7edaf5c874d8a4cd1fb78cd53d70dbab6 Mon Sep 17 00:00:00 2001
From: jcao <jcao@metax-tech.com>
Date: Mon, 22 Jun 2026 05:59:32 +0000
Subject: [PATCH 01/15] [MetaxGPU][quantize] Use portable C++ MXFP4 dequant on
 Maca

---
 tilelang/quantize/mxfp.py | 46 ++++++++++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 15 deletions(-)

diff --git a/tilelang/quantize/mxfp.py b/tilelang/quantize/mxfp.py
index c13539dc..8e54c3c0 100644
--- a/tilelang/quantize/mxfp.py
+++ b/tilelang/quantize/mxfp.py
@@ -1,5 +1,10 @@
 from typing import Literal
+
+from tvm.target import Target
+
 from tilelang import language as T
+from tilelang.backend.target import determine_target
+from tilelang.rocm.target import target_is_gfx950
 
 # Implementation asm for fp4 to bf16, using twiddling
 # Reference: https://github.com/triton-lang/triton/blob/main/python/triton_kernels/triton_kernels/tensor_details/layout_details/hopper_value.py#L11-L18
@@ -157,6 +162,26 @@
 """
 
 
+def _resolve_mxfp_target(target):
+    if target is not None:
+        return target
+    current = Target.current(allow_none=True)
+    if current is not None:
+        return current
+    return determine_target("auto", return_object=True)
+
+
+def _target_uses_portable_mxfp_dequant(target) -> bool:
+    """Return True for targets that cannot compile CUDA PTX inline asm (e.g. Maca, AMD gfx950)."""
+    if target is None:
+        return False
+    if not isinstance(target, Target):
+        target = Target(target)
+    if target.kind.name == "maca":
+        return True
+    return target_is_gfx950(target)
+
+
 def get_mxfp_intrin_group(
     out_dtype: Literal[T.float16, T.bfloat16] = T.bfloat16,
     source_format: Literal[T.int, T.uint] = T.uint,
@@ -195,33 +220,24 @@ def get_mxfp_intrin_group(
     assert source_format in [T.int, T.uint], f"Invalid source_format: {source_format}. Expected 'int' or 'uint'."
     assert storage_dtype in [T.int32, T.int8, T.uint8], f"Invalid storage_dtype: {storage_dtype}. Expected 'int32' or 'int8' or 'uint8'."
 
-    # Detect AMD gfx950 target to select the HIP C++ dequantization implementation.
-    # All other targets (NV, RDNA, MI300) use the default CUDA PTX path below.
-    _is_gfx950 = False
-    if target is not None:
-        try:
-            from tilelang.rocm.target import target_is_gfx950
-
-            _is_gfx950 = target_is_gfx950(target)
-        except (ImportError, ModuleNotFoundError, AttributeError):
-            # target_is_gfx950 unavailable in this build; assume non-gfx950.
-            pass
+    # Maca and AMD gfx950 cannot compile CUDA PTX; use portable C++ below.
+    # All other targets (NV, RDNA, MI300) use the default CUDA PTX path.
+    _use_portable = _target_uses_portable_mxfp_dequant(_resolve_mxfp_target(target))
 
     dtype_map = {T.float16: "f16", T.bfloat16: "bf16"}
     func_name = f"decode_fp{source_bit}_to_{dtype_map[out_dtype]}"
     if use_twiddling:
         func_name += "_twiddling"
 
-    if _is_gfx950:
-        # AMD gfx950 path: use portable HIP C++ implementations.
-        # The function name stays the same so the call site is unchanged.
+    if _use_portable:
+        # Portable C++ path (Maca / AMD gfx950). Function name unchanged for call sites.
         if use_twiddling and source_bit == 4 and out_dtype == T.bfloat16:
             return {"func_name": func_name, "c_source": decode_f4_to_bf16_twiddling_hip}
         elif not use_twiddling and source_bit == 4 and out_dtype == T.bfloat16:
             return {"func_name": func_name, "c_source": decode_f4_to_bf16_simple_hip}
         else:
             raise AssertionError(
-                f"AMD gfx950 MXFP dequant only supports source_bit=4 and out_dtype=bfloat16, "
+                f"Portable MXFP dequant only supports source_bit=4 and out_dtype=bfloat16, "
                 f"got source_bit={source_bit}, out_dtype={out_dtype}"
             )
 

From b3ac85a0d98e177db6a2dba07e3662cb7c486d5e Mon Sep 17 00:00:00 2001
From: jcao <jcao@metax-tech.com>
Date: Mon, 22 Jun 2026 06:58:20 +0000
Subject: [PATCH 02/15] [MetaxGPU][regression] Default regression_all scan root
 to examples

---
 maint/scripts/regression_all.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maint/scripts/regression_all.py b/maint/scripts/regression_all.py
index 331d6b89..4986ce19 100644
--- a/maint/scripts/regression_all.py
+++ b/maint/scripts/regression_all.py
@@ -57,7 +57,7 @@ def _parse_table(output: str) -> dict[str, float]:
 
 
 def _examples_root() -> Path:
-    return Path(__file__).resolve().parents[2] / "examples" / "maca"
+    return Path(__file__).resolve().parents[2] / "examples" 
 
 
 def _discover_bench_files(examples_root: Path) -> list[Path]:

From 1a6a8cb9b9a1b88f897ef13da0e05dc49efe10dd Mon Sep 17 00:00:00 2001
From: jcao <jcao@metax-tech.com>
Date: Mon, 22 Jun 2026 07:09:58 +0000
Subject: [PATCH 03/15] [MetaxGPU][regression] Default regression_all scan root
 to examples

---
 maint/scripts/regression_all.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maint/scripts/regression_all.py b/maint/scripts/regression_all.py
index 4986ce19..3bb76928 100644
--- a/maint/scripts/regression_all.py
+++ b/maint/scripts/regression_all.py
@@ -57,7 +57,7 @@ def _parse_table(output: str) -> dict[str, float]:
 
 
 def _examples_root() -> Path:
-    return Path(__file__).resolve().parents[2] / "examples" 
+    return Path(__file__).resolve().parents[2] / "examples"
 
 
 def _discover_bench_files(examples_root: Path) -> list[Path]:

From 3393e0bb751212d295466352d4731cff7d63662d Mon Sep 17 00:00:00 2001
From: jcao <jcao@metax-tech.com>
Date: Mon, 22 Jun 2026 08:55:04 +0000
Subject: [PATCH 04/15] [CI] Update flash_linear_attention version to
 0.4.0+metax3.7.2.0torch2.8

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 78ed48f5..f23c76e4 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -288,7 +288,7 @@ jobs:
             uv pip install -v -r requirements-test-rocm.txt
           elif [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then
             uv pip install -r requirements-test-maca.txt
-            uv pip install --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
+            uv pip install --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0+metax3.7.2.0torch2.8 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
           elif [[ "${{ matrix.runner.toolkit }}" == *"Metal"* ]]; then
             uv pip install -v -r requirements-test-metal.txt
           else

From a09c835686ca97395e33f2e050c9cf3ac4d7df5f Mon Sep 17 00:00:00 2001
From: jcao <jcao@metax-tech.com>
Date: Mon, 22 Jun 2026 09:21:19 +0000
Subject: [PATCH 05/15] [CI] Update flash_linear_attention version

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f23c76e4..4c7b373f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -287,8 +287,8 @@ jobs:
           elif [[ "${{ matrix.runner.toolkit }}" == *"ROCm"* ]]; then
             uv pip install -v -r requirements-test-rocm.txt
           elif [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then
+          uv pip install --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
             uv pip install -r requirements-test-maca.txt
-            uv pip install --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0+metax3.7.2.0torch2.8 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
           elif [[ "${{ matrix.runner.toolkit }}" == *"Metal"* ]]; then
             uv pip install -v -r requirements-test-metal.txt
           else

From 3ce4d4322de4dc862f660d3c93b5d6767394ca5a Mon Sep 17 00:00:00 2001
From: jcao <jcao@metax-tech.com>
Date: Mon, 22 Jun 2026 09:43:41 +0000
Subject: [PATCH 06/15] [CI] Update flash_linear_attention version

---
 .github/workflows/ci.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4c7b373f..327e16d4 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -287,8 +287,9 @@ jobs:
           elif [[ "${{ matrix.runner.toolkit }}" == *"ROCm"* ]]; then
             uv pip install -v -r requirements-test-rocm.txt
           elif [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then
-          uv pip install --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
             uv pip install -r requirements-test-maca.txt
+            uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true
+            uv pip install --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
           elif [[ "${{ matrix.runner.toolkit }}" == *"Metal"* ]]; then
             uv pip install -v -r requirements-test-metal.txt
           else

From 3eca72dcc5985c418974fce3895e613d293a960e Mon Sep 17 00:00:00 2001
From: jcao <jcao@metax-tech.com>
Date: Mon, 22 Jun 2026 10:54:11 +0000
Subject: [PATCH 07/15] [CI] Update flash_linear_attention version

---
 .github/workflows/ci.yml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 327e16d4..45b283ca 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -288,8 +288,12 @@ jobs:
             uv pip install -v -r requirements-test-rocm.txt
           elif [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then
             uv pip install -r requirements-test-maca.txt
+            SITE=".venv/lib/python${{ matrix.python-version }}/site-packages"
             uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true
-            uv pip install --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
+            rm -rf "${SITE}/fla" "${SITE}"/flash_linear_attention*.dist-info "${SITE}"/flash-linear-attention*.dist-info
+            uv pip install --force-reinstall --no-deps --python-version 3.10.0 \
+              flash_linear_attention==0.4.0+metax3.7.2.0torch2.8 \
+              -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
           elif [[ "${{ matrix.runner.toolkit }}" == *"Metal"* ]]; then
             uv pip install -v -r requirements-test-metal.txt
           else

From a66f8b6faf904456ec10ddb96dd9fb799f1378b5 Mon Sep 17 00:00:00 2001
From: jcao <jcao@metax-tech.com>
Date: Tue, 23 Jun 2026 03:25:43 +0000
Subject: [PATCH 08/15] [MetaxGPU][quantize] Use portable C++ MXFP4 dequant on
 Maca

---
 tilelang/quantize/mxfp.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tilelang/quantize/mxfp.py b/tilelang/quantize/mxfp.py
index 8e54c3c0..93b22110 100644
--- a/tilelang/quantize/mxfp.py
+++ b/tilelang/quantize/mxfp.py
@@ -163,7 +163,7 @@
 
 
 def _resolve_mxfp_target(target):
-    if target is not None:
+    if target is not None and target != "auto":
         return target
     current = Target.current(allow_none=True)
     if current is not None:
@@ -222,7 +222,9 @@ def get_mxfp_intrin_group(
 
     # Maca and AMD gfx950 cannot compile CUDA PTX; use portable C++ below.
     # All other targets (NV, RDNA, MI300) use the default CUDA PTX path.
-    _use_portable = _target_uses_portable_mxfp_dequant(_resolve_mxfp_target(target))
+    # target=None keeps the CUDA PTX default; only target="auto" resolves from context.
+    _resolved = _resolve_mxfp_target(target) if target == "auto" else target
+    _use_portable = _target_uses_portable_mxfp_dequant(_resolved)
 
     dtype_map = {T.float16: "f16", T.bfloat16: "bf16"}
     func_name = f"decode_fp{source_bit}_to_{dtype_map[out_dtype]}"

From 695907124b388191b8b1e9ca8476e091099911d3 Mon Sep 17 00:00:00 2001
From: jcao <jcao@metax-tech.com>
Date: Tue, 23 Jun 2026 03:35:43 +0000
Subject: [PATCH 09/15] [CI] Update flash_linear_attention version

---
 .github/workflows/ci.yml                     |  2 +-
 .github/workflows/pr-regression-test-bot.yml | 24 +++++++++++++++++---
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 45b283ca..d44ead5e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -292,7 +292,7 @@ jobs:
             uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true
             rm -rf "${SITE}/fla" "${SITE}"/flash_linear_attention*.dist-info "${SITE}"/flash-linear-attention*.dist-info
             uv pip install --force-reinstall --no-deps --python-version 3.10.0 \
-              flash_linear_attention==0.4.0+metax3.7.2.0torch2.8 \
+              flash_linear_attention==0.4.0 \
               -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
           elif [[ "${{ matrix.runner.toolkit }}" == *"Metal"* ]]; then
             uv pip install -v -r requirements-test-metal.txt
diff --git a/.github/workflows/pr-regression-test-bot.yml b/.github/workflows/pr-regression-test-bot.yml
index 52d81ee0..21a5ced7 100644
--- a/.github/workflows/pr-regression-test-bot.yml
+++ b/.github/workflows/pr-regression-test-bot.yml
@@ -207,7 +207,13 @@ jobs:
           fi
           uv pip install -v -r requirements-test.txt
           if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then
-            uv pip install -v --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0+metax3.5.3.9torch2.8 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
+            uv pip install -r requirements-test-maca.txt
+            SITE="new/lib/python${{ matrix.python-version }}/site-packages"
+            uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true
+            rm -rf "${SITE}/fla" "${SITE}"/flash_linear_attention*.dist-info "${SITE}"/flash-linear-attention*.dist-info
+            uv pip install --force-reinstall --no-deps --python-version 3.10.0 \
+              flash_linear_attention==0.4.0 \
+              -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
           fi
           uv pip install -v .
           if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then
@@ -229,7 +235,13 @@ jobs:
           fi
           uv pip install -v -r requirements-test.txt
           if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then
-            uv pip install -v --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0+metax3.5.3.9torch2.8 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
+            uv pip install -r requirements-test-maca.txt
+            SITE="old/lib/python${{ matrix.python-version }}/site-packages"
+            uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true
+            rm -rf "${SITE}/fla" "${SITE}"/flash_linear_attention*.dist-info "${SITE}"/flash-linear-attention*.dist-info
+            uv pip install --force-reinstall --no-deps --python-version 3.10.0 \
+              flash_linear_attention==0.4.0 \
+              -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
           fi
           uv pip install -v .
           if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then
@@ -244,7 +256,13 @@ jobs:
           fi
           uv pip install -v -r requirements-test.txt
           if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then
-            uv pip install -v --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0+metax3.5.3.9torch2.8 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
+            uv pip install -r requirements-test-maca.txt
+            SITE="test_regression/lib/python${{ matrix.python-version }}/site-packages"
+            uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true
+            rm -rf "${SITE}/fla" "${SITE}"/flash_linear_attention*.dist-info "${SITE}"/flash-linear-attention*.dist-info
+            uv pip install --force-reinstall --no-deps --python-version 3.10.0 \
+              flash_linear_attention==0.4.0 \
+              -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
           fi
 
       - name: Clear uv cache for self-hosted runners (if setup failed)

From eb238979aa011a6a0e73a55a93bb1725f172f3a6 Mon Sep 17 00:00:00 2001
From: jcao <jcao@metax-tech.com>
Date: Tue, 23 Jun 2026 05:56:35 +0000
Subject: [PATCH 10/15] [MetaxGPU][quantize] Use portable C++ MXFP4 dequant on
 Maca

---
 tilelang/quantize/mxfp.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tilelang/quantize/mxfp.py b/tilelang/quantize/mxfp.py
index 93b22110..1891145c 100644
--- a/tilelang/quantize/mxfp.py
+++ b/tilelang/quantize/mxfp.py
@@ -4,7 +4,7 @@
 
 from tilelang import language as T
 from tilelang.backend.target import determine_target
-from tilelang.rocm.target import target_is_gfx950
+from tilelang.rocm.target import target_is_gfx950, target_is_hip
 
 # Implementation asm for fp4 to bf16, using twiddling
 # Reference: https://github.com/triton-lang/triton/blob/main/python/triton_kernels/triton_kernels/tensor_details/layout_details/hopper_value.py#L11-L18
@@ -179,7 +179,9 @@ def _target_uses_portable_mxfp_dequant(target) -> bool:
         target = Target(target)
     if target.kind.name == "maca":
         return True
-    return target_is_gfx950(target)
+    if target_is_hip(target):
+        return target_is_gfx950(target)
+    return False
 
 
 def get_mxfp_intrin_group(

From e78ebddd00038805809e164dc970be4910084ecb Mon Sep 17 00:00:00 2001
From: jcao <jcao@metax-tech.com>
Date: Tue, 23 Jun 2026 06:21:53 +0000
Subject: [PATCH 11/15] [CI] Update flash_linear_attention version

---
 .github/workflows/pr-regression-test-bot.yml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.github/workflows/pr-regression-test-bot.yml b/.github/workflows/pr-regression-test-bot.yml
index 21a5ced7..d4088f80 100644
--- a/.github/workflows/pr-regression-test-bot.yml
+++ b/.github/workflows/pr-regression-test-bot.yml
@@ -207,7 +207,6 @@ jobs:
           fi
           uv pip install -v -r requirements-test.txt
           if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then
-            uv pip install -r requirements-test-maca.txt
             SITE="new/lib/python${{ matrix.python-version }}/site-packages"
             uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true
             rm -rf "${SITE}/fla" "${SITE}"/flash_linear_attention*.dist-info "${SITE}"/flash-linear-attention*.dist-info
@@ -235,7 +234,6 @@ jobs:
           fi
           uv pip install -v -r requirements-test.txt
           if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then
-            uv pip install -r requirements-test-maca.txt
             SITE="old/lib/python${{ matrix.python-version }}/site-packages"
             uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true
             rm -rf "${SITE}/fla" "${SITE}"/flash_linear_attention*.dist-info "${SITE}"/flash-linear-attention*.dist-info
@@ -256,7 +254,6 @@ jobs:
           fi
           uv pip install -v -r requirements-test.txt
           if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then
-            uv pip install -r requirements-test-maca.txt
             SITE="test_regression/lib/python${{ matrix.python-version }}/site-packages"
             uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true
             rm -rf "${SITE}/fla" "${SITE}"/flash_linear_attention*.dist-info "${SITE}"/flash-linear-attention*.dist-info

From 34040262725e59f80f640d994c550a7cc744e50a Mon Sep 17 00:00:00 2001
From: jcao <jcao@metax-tech.com>
Date: Wed, 24 Jun 2026 02:15:42 +0000
Subject: [PATCH 12/15] [CI] Update flash_linear_attention version

---
 .github/workflows/ci.yml                     | 6 +-----
 .github/workflows/pr-regression-test-bot.yml | 6 ------
 2 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d44ead5e..8451b3a4 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -288,12 +288,8 @@ jobs:
             uv pip install -v -r requirements-test-rocm.txt
           elif [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then
             uv pip install -r requirements-test-maca.txt
-            SITE=".venv/lib/python${{ matrix.python-version }}/site-packages"
             uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true
-            rm -rf "${SITE}/fla" "${SITE}"/flash_linear_attention*.dist-info "${SITE}"/flash-linear-attention*.dist-info
-            uv pip install --force-reinstall --no-deps --python-version 3.10.0 \
-              flash_linear_attention==0.4.0 \
-              -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
+            uv pip install --force-reinstall --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
           elif [[ "${{ matrix.runner.toolkit }}" == *"Metal"* ]]; then
             uv pip install -v -r requirements-test-metal.txt
           else
diff --git a/.github/workflows/pr-regression-test-bot.yml b/.github/workflows/pr-regression-test-bot.yml
index d4088f80..eeec4da9 100644
--- a/.github/workflows/pr-regression-test-bot.yml
+++ b/.github/workflows/pr-regression-test-bot.yml
@@ -207,9 +207,7 @@ jobs:
           fi
           uv pip install -v -r requirements-test.txt
           if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then
-            SITE="new/lib/python${{ matrix.python-version }}/site-packages"
             uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true
-            rm -rf "${SITE}/fla" "${SITE}"/flash_linear_attention*.dist-info "${SITE}"/flash-linear-attention*.dist-info
             uv pip install --force-reinstall --no-deps --python-version 3.10.0 \
               flash_linear_attention==0.4.0 \
               -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
@@ -234,9 +232,7 @@ jobs:
           fi
           uv pip install -v -r requirements-test.txt
           if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then
-            SITE="old/lib/python${{ matrix.python-version }}/site-packages"
             uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true
-            rm -rf "${SITE}/fla" "${SITE}"/flash_linear_attention*.dist-info "${SITE}"/flash-linear-attention*.dist-info
             uv pip install --force-reinstall --no-deps --python-version 3.10.0 \
               flash_linear_attention==0.4.0 \
               -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
@@ -254,9 +250,7 @@ jobs:
           fi
           uv pip install -v -r requirements-test.txt
           if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then
-            SITE="test_regression/lib/python${{ matrix.python-version }}/site-packages"
             uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true
-            rm -rf "${SITE}/fla" "${SITE}"/flash_linear_attention*.dist-info "${SITE}"/flash-linear-attention*.dist-info
             uv pip install --force-reinstall --no-deps --python-version 3.10.0 \
               flash_linear_attention==0.4.0 \
               -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com

From 6ed645f78636a466ea1c9b739f4460f26f620cf1 Mon Sep 17 00:00:00 2001
From: jcao <jcao@metax-tech.com>
Date: Wed, 24 Jun 2026 02:56:47 +0000
Subject: [PATCH 13/15] [CI] Update flash_linear_attention version

---
 .github/workflows/ci.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8451b3a4..78ed48f5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -288,8 +288,7 @@ jobs:
             uv pip install -v -r requirements-test-rocm.txt
           elif [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then
             uv pip install -r requirements-test-maca.txt
-            uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true
-            uv pip install --force-reinstall --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
+            uv pip install --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
           elif [[ "${{ matrix.runner.toolkit }}" == *"Metal"* ]]; then
             uv pip install -v -r requirements-test-metal.txt
           else

From 9fc339e4e4ec716df5cf9501acdddead30e3f754 Mon Sep 17 00:00:00 2001
From: jcao <jcao@metax-tech.com>
Date: Thu, 25 Jun 2026 08:24:05 +0000
Subject: [PATCH 14/15] [MetaxGPU][quantize] Use portable C++ MXFP4 dequant on
 Maca

---
 .github/workflows/pr-regression-test-bot.yml |  3 -
 tilelang/quantize/mxfp.py                    | 67 ++++++++++----------
 2 files changed, 32 insertions(+), 38 deletions(-)

diff --git a/.github/workflows/pr-regression-test-bot.yml b/.github/workflows/pr-regression-test-bot.yml
index eeec4da9..32c662b2 100644
--- a/.github/workflows/pr-regression-test-bot.yml
+++ b/.github/workflows/pr-regression-test-bot.yml
@@ -207,7 +207,6 @@ jobs:
           fi
           uv pip install -v -r requirements-test.txt
           if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then
-            uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true
             uv pip install --force-reinstall --no-deps --python-version 3.10.0 \
               flash_linear_attention==0.4.0 \
               -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
@@ -232,7 +231,6 @@ jobs:
           fi
           uv pip install -v -r requirements-test.txt
           if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then
-            uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true
             uv pip install --force-reinstall --no-deps --python-version 3.10.0 \
               flash_linear_attention==0.4.0 \
               -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
@@ -250,7 +248,6 @@ jobs:
           fi
           uv pip install -v -r requirements-test.txt
           if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then
-            uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true
             uv pip install --force-reinstall --no-deps --python-version 3.10.0 \
               flash_linear_attention==0.4.0 \
               -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com
diff --git a/tilelang/quantize/mxfp.py b/tilelang/quantize/mxfp.py
index 1891145c..903a90c8 100644
--- a/tilelang/quantize/mxfp.py
+++ b/tilelang/quantize/mxfp.py
@@ -1,10 +1,5 @@
 from typing import Literal
-
-from tvm.target import Target
-
 from tilelang import language as T
-from tilelang.backend.target import determine_target
-from tilelang.rocm.target import target_is_gfx950, target_is_hip
 
 # Implementation asm for fp4 to bf16, using twiddling
 # Reference: https://github.com/triton-lang/triton/blob/main/python/triton_kernels/triton_kernels/tensor_details/layout_details/hopper_value.py#L11-L18
@@ -162,28 +157,6 @@
 """
 
 
-def _resolve_mxfp_target(target):
-    if target is not None and target != "auto":
-        return target
-    current = Target.current(allow_none=True)
-    if current is not None:
-        return current
-    return determine_target("auto", return_object=True)
-
-
-def _target_uses_portable_mxfp_dequant(target) -> bool:
-    """Return True for targets that cannot compile CUDA PTX inline asm (e.g. Maca, AMD gfx950)."""
-    if target is None:
-        return False
-    if not isinstance(target, Target):
-        target = Target(target)
-    if target.kind.name == "maca":
-        return True
-    if target_is_hip(target):
-        return target_is_gfx950(target)
-    return False
-
-
 def get_mxfp_intrin_group(
     out_dtype: Literal[T.float16, T.bfloat16] = T.bfloat16,
     source_format: Literal[T.int, T.uint] = T.uint,
@@ -222,26 +195,50 @@ def get_mxfp_intrin_group(
     assert source_format in [T.int, T.uint], f"Invalid source_format: {source_format}. Expected 'int' or 'uint'."
     assert storage_dtype in [T.int32, T.int8, T.uint8], f"Invalid storage_dtype: {storage_dtype}. Expected 'int32' or 'int8' or 'uint8'."
 
-    # Maca and AMD gfx950 cannot compile CUDA PTX; use portable C++ below.
-    # All other targets (NV, RDNA, MI300) use the default CUDA PTX path.
-    # target=None keeps the CUDA PTX default; only target="auto" resolves from context.
-    _resolved = _resolve_mxfp_target(target) if target == "auto" else target
-    _use_portable = _target_uses_portable_mxfp_dequant(_resolved)
+    # Detect AMD gfx950 / Maca targets to select portable C++ dequantization.
+    # All other targets (NV, RDNA, MI300) use the default CUDA PTX path below.
+    _is_gfx950 = False
+    _is_maca = False
+    if target is not None:
+        from tvm.target import Target
+
+        tvm_target = target if isinstance(target, Target) else Target(target)
+        _is_maca = tvm_target.kind.name == "maca"
+        try:
+            from tilelang.rocm.target import target_is_gfx950
+
+            _is_gfx950 = target_is_gfx950(target)
+        except (ImportError, ModuleNotFoundError, AttributeError):
+            # target_is_gfx950 unavailable in this build; assume non-gfx950.
+            pass
 
     dtype_map = {T.float16: "f16", T.bfloat16: "bf16"}
     func_name = f"decode_fp{source_bit}_to_{dtype_map[out_dtype]}"
     if use_twiddling:
         func_name += "_twiddling"
 
-    if _use_portable:
-        # Portable C++ path (Maca / AMD gfx950). Function name unchanged for call sites.
+    if _is_gfx950:
+        # AMD gfx950 path: use portable HIP C++ implementations.
+        # The function name stays the same so the call site is unchanged.
+        if use_twiddling and source_bit == 4 and out_dtype == T.bfloat16:
+            return {"func_name": func_name, "c_source": decode_f4_to_bf16_twiddling_hip}
+        elif not use_twiddling and source_bit == 4 and out_dtype == T.bfloat16:
+            return {"func_name": func_name, "c_source": decode_f4_to_bf16_simple_hip}
+        else:
+            raise AssertionError(
+                f"AMD gfx950 MXFP dequant only supports source_bit=4 and out_dtype=bfloat16, "
+                f"got source_bit={source_bit}, out_dtype={out_dtype}"
+            )
+
+    elif _is_maca:
+        # Maca path: use portable C++ implementations (no CUDA PTX inline asm).
         if use_twiddling and source_bit == 4 and out_dtype == T.bfloat16:
             return {"func_name": func_name, "c_source": decode_f4_to_bf16_twiddling_hip}
         elif not use_twiddling and source_bit == 4 and out_dtype == T.bfloat16:
             return {"func_name": func_name, "c_source": decode_f4_to_bf16_simple_hip}
         else:
             raise AssertionError(
-                f"Portable MXFP dequant only supports source_bit=4 and out_dtype=bfloat16, "
+                f"Maca MXFP dequant only supports source_bit=4 and out_dtype=bfloat16, "
                 f"got source_bit={source_bit}, out_dtype={out_dtype}"
             )
 

From 491d249a6387bbf68f249a35ac5777c34f238639 Mon Sep 17 00:00:00 2001
From: jcao <jcao@metax-tech.com>
Date: Thu, 25 Jun 2026 08:58:11 +0000
Subject: [PATCH 15/15] [MetaxGPU][quantize] Use portable C++ MXFP4 dequant on
 Maca

---
 tilelang/quantize/mxfp.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tilelang/quantize/mxfp.py b/tilelang/quantize/mxfp.py
index 903a90c8..85632a3d 100644
--- a/tilelang/quantize/mxfp.py
+++ b/tilelang/quantize/mxfp.py
@@ -238,8 +238,7 @@ def get_mxfp_intrin_group(
             return {"func_name": func_name, "c_source": decode_f4_to_bf16_simple_hip}
         else:
             raise AssertionError(
-                f"Maca MXFP dequant only supports source_bit=4 and out_dtype=bfloat16, "
-                f"got source_bit={source_bit}, out_dtype={out_dtype}"
+                f"Maca MXFP dequant only supports source_bit=4 and out_dtype=bfloat16, got source_bit={source_bit}, out_dtype={out_dtype}"
             )
 
     # CUDA / default path: use PTX inline assembly implementations.