From 1d335ad7edaf5c874d8a4cd1fb78cd53d70dbab6 Mon Sep 17 00:00:00 2001 From: jcao Date: Mon, 22 Jun 2026 05:59:32 +0000 Subject: [PATCH 01/15] [MetaxGPU][quantize] Use portable C++ MXFP4 dequant on Maca --- tilelang/quantize/mxfp.py | 46 ++++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/tilelang/quantize/mxfp.py b/tilelang/quantize/mxfp.py index c13539dc..8e54c3c0 100644 --- a/tilelang/quantize/mxfp.py +++ b/tilelang/quantize/mxfp.py @@ -1,5 +1,10 @@ from typing import Literal + +from tvm.target import Target + from tilelang import language as T +from tilelang.backend.target import determine_target +from tilelang.rocm.target import target_is_gfx950 # Implementation asm for fp4 to bf16, using twiddling # Reference: https://github.com/triton-lang/triton/blob/main/python/triton_kernels/triton_kernels/tensor_details/layout_details/hopper_value.py#L11-L18 @@ -157,6 +162,26 @@ """ +def _resolve_mxfp_target(target): + if target is not None: + return target + current = Target.current(allow_none=True) + if current is not None: + return current + return determine_target("auto", return_object=True) + + +def _target_uses_portable_mxfp_dequant(target) -> bool: + """Return True for targets that cannot compile CUDA PTX inline asm (e.g. Maca, AMD gfx950).""" + if target is None: + return False + if not isinstance(target, Target): + target = Target(target) + if target.kind.name == "maca": + return True + return target_is_gfx950(target) + + def get_mxfp_intrin_group( out_dtype: Literal[T.float16, T.bfloat16] = T.bfloat16, source_format: Literal[T.int, T.uint] = T.uint, @@ -195,33 +220,24 @@ def get_mxfp_intrin_group( assert source_format in [T.int, T.uint], f"Invalid source_format: {source_format}. Expected 'int' or 'uint'." assert storage_dtype in [T.int32, T.int8, T.uint8], f"Invalid storage_dtype: {storage_dtype}. Expected 'int32' or 'int8' or 'uint8'." - # Detect AMD gfx950 target to select the HIP C++ dequantization implementation. - # All other targets (NV, RDNA, MI300) use the default CUDA PTX path below. - _is_gfx950 = False - if target is not None: - try: - from tilelang.rocm.target import target_is_gfx950 - - _is_gfx950 = target_is_gfx950(target) - except (ImportError, ModuleNotFoundError, AttributeError): - # target_is_gfx950 unavailable in this build; assume non-gfx950. - pass + # Maca and AMD gfx950 cannot compile CUDA PTX; use portable C++ below. + # All other targets (NV, RDNA, MI300) use the default CUDA PTX path. + _use_portable = _target_uses_portable_mxfp_dequant(_resolve_mxfp_target(target)) dtype_map = {T.float16: "f16", T.bfloat16: "bf16"} func_name = f"decode_fp{source_bit}_to_{dtype_map[out_dtype]}" if use_twiddling: func_name += "_twiddling" - if _is_gfx950: - # AMD gfx950 path: use portable HIP C++ implementations. - # The function name stays the same so the call site is unchanged. + if _use_portable: + # Portable C++ path (Maca / AMD gfx950). Function name unchanged for call sites. if use_twiddling and source_bit == 4 and out_dtype == T.bfloat16: return {"func_name": func_name, "c_source": decode_f4_to_bf16_twiddling_hip} elif not use_twiddling and source_bit == 4 and out_dtype == T.bfloat16: return {"func_name": func_name, "c_source": decode_f4_to_bf16_simple_hip} else: raise AssertionError( - f"AMD gfx950 MXFP dequant only supports source_bit=4 and out_dtype=bfloat16, " + f"Portable MXFP dequant only supports source_bit=4 and out_dtype=bfloat16, " f"got source_bit={source_bit}, out_dtype={out_dtype}" ) From b3ac85a0d98e177db6a2dba07e3662cb7c486d5e Mon Sep 17 00:00:00 2001 From: jcao Date: Mon, 22 Jun 2026 06:58:20 +0000 Subject: [PATCH 02/15] [MetaxGPU][regression] Default regression_all scan root to examples --- maint/scripts/regression_all.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/maint/scripts/regression_all.py b/maint/scripts/regression_all.py index 331d6b89..4986ce19 100644 --- a/maint/scripts/regression_all.py +++ b/maint/scripts/regression_all.py @@ -57,7 +57,7 @@ def _parse_table(output: str) -> dict[str, float]: def _examples_root() -> Path: - return Path(__file__).resolve().parents[2] / "examples" / "maca" + return Path(__file__).resolve().parents[2] / "examples" def _discover_bench_files(examples_root: Path) -> list[Path]: From 1a6a8cb9b9a1b88f897ef13da0e05dc49efe10dd Mon Sep 17 00:00:00 2001 From: jcao Date: Mon, 22 Jun 2026 07:09:58 +0000 Subject: [PATCH 03/15] [MetaxGPU][regression] Default regression_all scan root to examples --- maint/scripts/regression_all.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/maint/scripts/regression_all.py b/maint/scripts/regression_all.py index 4986ce19..3bb76928 100644 --- a/maint/scripts/regression_all.py +++ b/maint/scripts/regression_all.py @@ -57,7 +57,7 @@ def _parse_table(output: str) -> dict[str, float]: def _examples_root() -> Path: - return Path(__file__).resolve().parents[2] / "examples" + return Path(__file__).resolve().parents[2] / "examples" def _discover_bench_files(examples_root: Path) -> list[Path]: From 3393e0bb751212d295466352d4731cff7d63662d Mon Sep 17 00:00:00 2001 From: jcao Date: Mon, 22 Jun 2026 08:55:04 +0000 Subject: [PATCH 04/15] [CI] Update flash_linear_attention version to 0.4.0+metax3.7.2.0torch2.8 --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 78ed48f5..f23c76e4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -288,7 +288,7 @@ jobs: uv pip install -v -r requirements-test-rocm.txt elif [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then uv pip install -r requirements-test-maca.txt - uv pip install --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com + uv pip install --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0+metax3.7.2.0torch2.8 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com elif [[ "${{ matrix.runner.toolkit }}" == *"Metal"* ]]; then uv pip install -v -r requirements-test-metal.txt else From a09c835686ca97395e33f2e050c9cf3ac4d7df5f Mon Sep 17 00:00:00 2001 From: jcao Date: Mon, 22 Jun 2026 09:21:19 +0000 Subject: [PATCH 05/15] [CI] Update flash_linear_attention version --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f23c76e4..4c7b373f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -287,8 +287,8 @@ jobs: elif [[ "${{ matrix.runner.toolkit }}" == *"ROCm"* ]]; then uv pip install -v -r requirements-test-rocm.txt elif [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then + uv pip install --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com uv pip install -r requirements-test-maca.txt - uv pip install --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0+metax3.7.2.0torch2.8 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com elif [[ "${{ matrix.runner.toolkit }}" == *"Metal"* ]]; then uv pip install -v -r requirements-test-metal.txt else From 3ce4d4322de4dc862f660d3c93b5d6767394ca5a Mon Sep 17 00:00:00 2001 From: jcao Date: Mon, 22 Jun 2026 09:43:41 +0000 Subject: [PATCH 06/15] [CI] Update flash_linear_attention version --- .github/workflows/ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4c7b373f..327e16d4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -287,8 +287,9 @@ jobs: elif [[ "${{ matrix.runner.toolkit }}" == *"ROCm"* ]]; then uv pip install -v -r requirements-test-rocm.txt elif [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then - uv pip install --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com uv pip install -r requirements-test-maca.txt + uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true + uv pip install --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com elif [[ "${{ matrix.runner.toolkit }}" == *"Metal"* ]]; then uv pip install -v -r requirements-test-metal.txt else From 3eca72dcc5985c418974fce3895e613d293a960e Mon Sep 17 00:00:00 2001 From: jcao Date: Mon, 22 Jun 2026 10:54:11 +0000 Subject: [PATCH 07/15] [CI] Update flash_linear_attention version --- .github/workflows/ci.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 327e16d4..45b283ca 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -288,8 +288,12 @@ jobs: uv pip install -v -r requirements-test-rocm.txt elif [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then uv pip install -r requirements-test-maca.txt + SITE=".venv/lib/python${{ matrix.python-version }}/site-packages" uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true - uv pip install --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com + rm -rf "${SITE}/fla" "${SITE}"/flash_linear_attention*.dist-info "${SITE}"/flash-linear-attention*.dist-info + uv pip install --force-reinstall --no-deps --python-version 3.10.0 \ + flash_linear_attention==0.4.0+metax3.7.2.0torch2.8 \ + -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com elif [[ "${{ matrix.runner.toolkit }}" == *"Metal"* ]]; then uv pip install -v -r requirements-test-metal.txt else From a66f8b6faf904456ec10ddb96dd9fb799f1378b5 Mon Sep 17 00:00:00 2001 From: jcao Date: Tue, 23 Jun 2026 03:25:43 +0000 Subject: [PATCH 08/15] [MetaxGPU][quantize] Use portable C++ MXFP4 dequant on Maca --- tilelang/quantize/mxfp.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tilelang/quantize/mxfp.py b/tilelang/quantize/mxfp.py index 8e54c3c0..93b22110 100644 --- a/tilelang/quantize/mxfp.py +++ b/tilelang/quantize/mxfp.py @@ -163,7 +163,7 @@ def _resolve_mxfp_target(target): - if target is not None: + if target is not None and target != "auto": return target current = Target.current(allow_none=True) if current is not None: @@ -222,7 +222,9 @@ def get_mxfp_intrin_group( # Maca and AMD gfx950 cannot compile CUDA PTX; use portable C++ below. # All other targets (NV, RDNA, MI300) use the default CUDA PTX path. - _use_portable = _target_uses_portable_mxfp_dequant(_resolve_mxfp_target(target)) + # target=None keeps the CUDA PTX default; only target="auto" resolves from context. + _resolved = _resolve_mxfp_target(target) if target == "auto" else target + _use_portable = _target_uses_portable_mxfp_dequant(_resolved) dtype_map = {T.float16: "f16", T.bfloat16: "bf16"} func_name = f"decode_fp{source_bit}_to_{dtype_map[out_dtype]}" From 695907124b388191b8b1e9ca8476e091099911d3 Mon Sep 17 00:00:00 2001 From: jcao Date: Tue, 23 Jun 2026 03:35:43 +0000 Subject: [PATCH 09/15] [CI] Update flash_linear_attention version --- .github/workflows/ci.yml | 2 +- .github/workflows/pr-regression-test-bot.yml | 24 +++++++++++++++++--- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 45b283ca..d44ead5e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -292,7 +292,7 @@ jobs: uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true rm -rf "${SITE}/fla" "${SITE}"/flash_linear_attention*.dist-info "${SITE}"/flash-linear-attention*.dist-info uv pip install --force-reinstall --no-deps --python-version 3.10.0 \ - flash_linear_attention==0.4.0+metax3.7.2.0torch2.8 \ + flash_linear_attention==0.4.0 \ -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com elif [[ "${{ matrix.runner.toolkit }}" == *"Metal"* ]]; then uv pip install -v -r requirements-test-metal.txt diff --git a/.github/workflows/pr-regression-test-bot.yml b/.github/workflows/pr-regression-test-bot.yml index 52d81ee0..21a5ced7 100644 --- a/.github/workflows/pr-regression-test-bot.yml +++ b/.github/workflows/pr-regression-test-bot.yml @@ -207,7 +207,13 @@ jobs: fi uv pip install -v -r requirements-test.txt if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then - uv pip install -v --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0+metax3.5.3.9torch2.8 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com + uv pip install -r requirements-test-maca.txt + SITE="new/lib/python${{ matrix.python-version }}/site-packages" + uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true + rm -rf "${SITE}/fla" "${SITE}"/flash_linear_attention*.dist-info "${SITE}"/flash-linear-attention*.dist-info + uv pip install --force-reinstall --no-deps --python-version 3.10.0 \ + flash_linear_attention==0.4.0 \ + -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com fi uv pip install -v . if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then @@ -229,7 +235,13 @@ jobs: fi uv pip install -v -r requirements-test.txt if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then - uv pip install -v --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0+metax3.5.3.9torch2.8 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com + uv pip install -r requirements-test-maca.txt + SITE="old/lib/python${{ matrix.python-version }}/site-packages" + uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true + rm -rf "${SITE}/fla" "${SITE}"/flash_linear_attention*.dist-info "${SITE}"/flash-linear-attention*.dist-info + uv pip install --force-reinstall --no-deps --python-version 3.10.0 \ + flash_linear_attention==0.4.0 \ + -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com fi uv pip install -v . if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then @@ -244,7 +256,13 @@ jobs: fi uv pip install -v -r requirements-test.txt if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then - uv pip install -v --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0+metax3.5.3.9torch2.8 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com + uv pip install -r requirements-test-maca.txt + SITE="test_regression/lib/python${{ matrix.python-version }}/site-packages" + uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true + rm -rf "${SITE}/fla" "${SITE}"/flash_linear_attention*.dist-info "${SITE}"/flash-linear-attention*.dist-info + uv pip install --force-reinstall --no-deps --python-version 3.10.0 \ + flash_linear_attention==0.4.0 \ + -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com fi - name: Clear uv cache for self-hosted runners (if setup failed) From eb238979aa011a6a0e73a55a93bb1725f172f3a6 Mon Sep 17 00:00:00 2001 From: jcao Date: Tue, 23 Jun 2026 05:56:35 +0000 Subject: [PATCH 10/15] [MetaxGPU][quantize] Use portable C++ MXFP4 dequant on Maca --- tilelang/quantize/mxfp.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tilelang/quantize/mxfp.py b/tilelang/quantize/mxfp.py index 93b22110..1891145c 100644 --- a/tilelang/quantize/mxfp.py +++ b/tilelang/quantize/mxfp.py @@ -4,7 +4,7 @@ from tilelang import language as T from tilelang.backend.target import determine_target -from tilelang.rocm.target import target_is_gfx950 +from tilelang.rocm.target import target_is_gfx950, target_is_hip # Implementation asm for fp4 to bf16, using twiddling # Reference: https://github.com/triton-lang/triton/blob/main/python/triton_kernels/triton_kernels/tensor_details/layout_details/hopper_value.py#L11-L18 @@ -179,7 +179,9 @@ def _target_uses_portable_mxfp_dequant(target) -> bool: target = Target(target) if target.kind.name == "maca": return True - return target_is_gfx950(target) + if target_is_hip(target): + return target_is_gfx950(target) + return False def get_mxfp_intrin_group( From e78ebddd00038805809e164dc970be4910084ecb Mon Sep 17 00:00:00 2001 From: jcao Date: Tue, 23 Jun 2026 06:21:53 +0000 Subject: [PATCH 11/15] [CI] Update flash_linear_attention version --- .github/workflows/pr-regression-test-bot.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/pr-regression-test-bot.yml b/.github/workflows/pr-regression-test-bot.yml index 21a5ced7..d4088f80 100644 --- a/.github/workflows/pr-regression-test-bot.yml +++ b/.github/workflows/pr-regression-test-bot.yml @@ -207,7 +207,6 @@ jobs: fi uv pip install -v -r requirements-test.txt if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then - uv pip install -r requirements-test-maca.txt SITE="new/lib/python${{ matrix.python-version }}/site-packages" uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true rm -rf "${SITE}/fla" "${SITE}"/flash_linear_attention*.dist-info "${SITE}"/flash-linear-attention*.dist-info @@ -235,7 +234,6 @@ jobs: fi uv pip install -v -r requirements-test.txt if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then - uv pip install -r requirements-test-maca.txt SITE="old/lib/python${{ matrix.python-version }}/site-packages" uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true rm -rf "${SITE}/fla" "${SITE}"/flash_linear_attention*.dist-info "${SITE}"/flash-linear-attention*.dist-info @@ -256,7 +254,6 @@ jobs: fi uv pip install -v -r requirements-test.txt if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then - uv pip install -r requirements-test-maca.txt SITE="test_regression/lib/python${{ matrix.python-version }}/site-packages" uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true rm -rf "${SITE}/fla" "${SITE}"/flash_linear_attention*.dist-info "${SITE}"/flash-linear-attention*.dist-info From 34040262725e59f80f640d994c550a7cc744e50a Mon Sep 17 00:00:00 2001 From: jcao Date: Wed, 24 Jun 2026 02:15:42 +0000 Subject: [PATCH 12/15] [CI] Update flash_linear_attention version --- .github/workflows/ci.yml | 6 +----- .github/workflows/pr-regression-test-bot.yml | 6 ------ 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d44ead5e..8451b3a4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -288,12 +288,8 @@ jobs: uv pip install -v -r requirements-test-rocm.txt elif [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then uv pip install -r requirements-test-maca.txt - SITE=".venv/lib/python${{ matrix.python-version }}/site-packages" uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true - rm -rf "${SITE}/fla" "${SITE}"/flash_linear_attention*.dist-info "${SITE}"/flash-linear-attention*.dist-info - uv pip install --force-reinstall --no-deps --python-version 3.10.0 \ - flash_linear_attention==0.4.0 \ - -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com + uv pip install --force-reinstall --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com elif [[ "${{ matrix.runner.toolkit }}" == *"Metal"* ]]; then uv pip install -v -r requirements-test-metal.txt else diff --git a/.github/workflows/pr-regression-test-bot.yml b/.github/workflows/pr-regression-test-bot.yml index d4088f80..eeec4da9 100644 --- a/.github/workflows/pr-regression-test-bot.yml +++ b/.github/workflows/pr-regression-test-bot.yml @@ -207,9 +207,7 @@ jobs: fi uv pip install -v -r requirements-test.txt if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then - SITE="new/lib/python${{ matrix.python-version }}/site-packages" uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true - rm -rf "${SITE}/fla" "${SITE}"/flash_linear_attention*.dist-info "${SITE}"/flash-linear-attention*.dist-info uv pip install --force-reinstall --no-deps --python-version 3.10.0 \ flash_linear_attention==0.4.0 \ -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com @@ -234,9 +232,7 @@ jobs: fi uv pip install -v -r requirements-test.txt if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then - SITE="old/lib/python${{ matrix.python-version }}/site-packages" uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true - rm -rf "${SITE}/fla" "${SITE}"/flash_linear_attention*.dist-info "${SITE}"/flash-linear-attention*.dist-info uv pip install --force-reinstall --no-deps --python-version 3.10.0 \ flash_linear_attention==0.4.0 \ -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com @@ -254,9 +250,7 @@ jobs: fi uv pip install -v -r requirements-test.txt if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then - SITE="test_regression/lib/python${{ matrix.python-version }}/site-packages" uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true - rm -rf "${SITE}/fla" "${SITE}"/flash_linear_attention*.dist-info "${SITE}"/flash-linear-attention*.dist-info uv pip install --force-reinstall --no-deps --python-version 3.10.0 \ flash_linear_attention==0.4.0 \ -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com From 6ed645f78636a466ea1c9b739f4460f26f620cf1 Mon Sep 17 00:00:00 2001 From: jcao Date: Wed, 24 Jun 2026 02:56:47 +0000 Subject: [PATCH 13/15] [CI] Update flash_linear_attention version --- .github/workflows/ci.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8451b3a4..78ed48f5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -288,8 +288,7 @@ jobs: uv pip install -v -r requirements-test-rocm.txt elif [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then uv pip install -r requirements-test-maca.txt - uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true - uv pip install --force-reinstall --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com + uv pip install --no-deps --python-version 3.10.0 flash_linear_attention==0.4.0 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com elif [[ "${{ matrix.runner.toolkit }}" == *"Metal"* ]]; then uv pip install -v -r requirements-test-metal.txt else From 9fc339e4e4ec716df5cf9501acdddead30e3f754 Mon Sep 17 00:00:00 2001 From: jcao Date: Thu, 25 Jun 2026 08:24:05 +0000 Subject: [PATCH 14/15] [MetaxGPU][quantize] Use portable C++ MXFP4 dequant on Maca --- .github/workflows/pr-regression-test-bot.yml | 3 - tilelang/quantize/mxfp.py | 67 ++++++++++---------- 2 files changed, 32 insertions(+), 38 deletions(-) diff --git a/.github/workflows/pr-regression-test-bot.yml b/.github/workflows/pr-regression-test-bot.yml index eeec4da9..32c662b2 100644 --- a/.github/workflows/pr-regression-test-bot.yml +++ b/.github/workflows/pr-regression-test-bot.yml @@ -207,7 +207,6 @@ jobs: fi uv pip install -v -r requirements-test.txt if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then - uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true uv pip install --force-reinstall --no-deps --python-version 3.10.0 \ flash_linear_attention==0.4.0 \ -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com @@ -232,7 +231,6 @@ jobs: fi uv pip install -v -r requirements-test.txt if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then - uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true uv pip install --force-reinstall --no-deps --python-version 3.10.0 \ flash_linear_attention==0.4.0 \ -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com @@ -250,7 +248,6 @@ jobs: fi uv pip install -v -r requirements-test.txt if [[ "${{ matrix.runner.toolkit }}" == *"MACA"* ]]; then - uv pip uninstall -y flash-linear-attention flash_linear_attention 2>/dev/null || true uv pip install --force-reinstall --no-deps --python-version 3.10.0 \ flash_linear_attention==0.4.0 \ -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com diff --git a/tilelang/quantize/mxfp.py b/tilelang/quantize/mxfp.py index 1891145c..903a90c8 100644 --- a/tilelang/quantize/mxfp.py +++ b/tilelang/quantize/mxfp.py @@ -1,10 +1,5 @@ from typing import Literal - -from tvm.target import Target - from tilelang import language as T -from tilelang.backend.target import determine_target -from tilelang.rocm.target import target_is_gfx950, target_is_hip # Implementation asm for fp4 to bf16, using twiddling # Reference: https://github.com/triton-lang/triton/blob/main/python/triton_kernels/triton_kernels/tensor_details/layout_details/hopper_value.py#L11-L18 @@ -162,28 +157,6 @@ """ -def _resolve_mxfp_target(target): - if target is not None and target != "auto": - return target - current = Target.current(allow_none=True) - if current is not None: - return current - return determine_target("auto", return_object=True) - - -def _target_uses_portable_mxfp_dequant(target) -> bool: - """Return True for targets that cannot compile CUDA PTX inline asm (e.g. Maca, AMD gfx950).""" - if target is None: - return False - if not isinstance(target, Target): - target = Target(target) - if target.kind.name == "maca": - return True - if target_is_hip(target): - return target_is_gfx950(target) - return False - - def get_mxfp_intrin_group( out_dtype: Literal[T.float16, T.bfloat16] = T.bfloat16, source_format: Literal[T.int, T.uint] = T.uint, @@ -222,26 +195,50 @@ def get_mxfp_intrin_group( assert source_format in [T.int, T.uint], f"Invalid source_format: {source_format}. Expected 'int' or 'uint'." assert storage_dtype in [T.int32, T.int8, T.uint8], f"Invalid storage_dtype: {storage_dtype}. Expected 'int32' or 'int8' or 'uint8'." - # Maca and AMD gfx950 cannot compile CUDA PTX; use portable C++ below. - # All other targets (NV, RDNA, MI300) use the default CUDA PTX path. - # target=None keeps the CUDA PTX default; only target="auto" resolves from context. - _resolved = _resolve_mxfp_target(target) if target == "auto" else target - _use_portable = _target_uses_portable_mxfp_dequant(_resolved) + # Detect AMD gfx950 / Maca targets to select portable C++ dequantization. + # All other targets (NV, RDNA, MI300) use the default CUDA PTX path below. + _is_gfx950 = False + _is_maca = False + if target is not None: + from tvm.target import Target + + tvm_target = target if isinstance(target, Target) else Target(target) + _is_maca = tvm_target.kind.name == "maca" + try: + from tilelang.rocm.target import target_is_gfx950 + + _is_gfx950 = target_is_gfx950(target) + except (ImportError, ModuleNotFoundError, AttributeError): + # target_is_gfx950 unavailable in this build; assume non-gfx950. + pass dtype_map = {T.float16: "f16", T.bfloat16: "bf16"} func_name = f"decode_fp{source_bit}_to_{dtype_map[out_dtype]}" if use_twiddling: func_name += "_twiddling" - if _use_portable: - # Portable C++ path (Maca / AMD gfx950). Function name unchanged for call sites. + if _is_gfx950: + # AMD gfx950 path: use portable HIP C++ implementations. + # The function name stays the same so the call site is unchanged. + if use_twiddling and source_bit == 4 and out_dtype == T.bfloat16: + return {"func_name": func_name, "c_source": decode_f4_to_bf16_twiddling_hip} + elif not use_twiddling and source_bit == 4 and out_dtype == T.bfloat16: + return {"func_name": func_name, "c_source": decode_f4_to_bf16_simple_hip} + else: + raise AssertionError( + f"AMD gfx950 MXFP dequant only supports source_bit=4 and out_dtype=bfloat16, " + f"got source_bit={source_bit}, out_dtype={out_dtype}" + ) + + elif _is_maca: + # Maca path: use portable C++ implementations (no CUDA PTX inline asm). if use_twiddling and source_bit == 4 and out_dtype == T.bfloat16: return {"func_name": func_name, "c_source": decode_f4_to_bf16_twiddling_hip} elif not use_twiddling and source_bit == 4 and out_dtype == T.bfloat16: return {"func_name": func_name, "c_source": decode_f4_to_bf16_simple_hip} else: raise AssertionError( - f"Portable MXFP dequant only supports source_bit=4 and out_dtype=bfloat16, " + f"Maca MXFP dequant only supports source_bit=4 and out_dtype=bfloat16, " f"got source_bit={source_bit}, out_dtype={out_dtype}" ) From 491d249a6387bbf68f249a35ac5777c34f238639 Mon Sep 17 00:00:00 2001 From: jcao Date: Thu, 25 Jun 2026 08:58:11 +0000 Subject: [PATCH 15/15] [MetaxGPU][quantize] Use portable C++ MXFP4 dequant on Maca --- tilelang/quantize/mxfp.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tilelang/quantize/mxfp.py b/tilelang/quantize/mxfp.py index 903a90c8..85632a3d 100644 --- a/tilelang/quantize/mxfp.py +++ b/tilelang/quantize/mxfp.py @@ -238,8 +238,7 @@ def get_mxfp_intrin_group( return {"func_name": func_name, "c_source": decode_f4_to_bf16_simple_hip} else: raise AssertionError( - f"Maca MXFP dequant only supports source_bit=4 and out_dtype=bfloat16, " - f"got source_bit={source_bit}, out_dtype={out_dtype}" + f"Maca MXFP dequant only supports source_bit=4 and out_dtype=bfloat16, got source_bit={source_bit}, out_dtype={out_dtype}" ) # CUDA / default path: use PTX inline assembly implementations.