
Commit 370207b

Revert "feat: Support fp8 qkv, fp16/bf16 out MHA for trtllm-gen. (flashinfer-ai#1490) (flashinfer-ai#1496)
<!-- .github/pull_request_template.md --> ## 📌 Description This reverts commit 96d142d. (flashinfer-ai#1490) ## 🔍 Related Issues <!-- Link any related issues here --> ## 🚀 Pull Request Checklist Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete. ### ✅ Pre-commit Checks - [ ] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method). - [ ] I have installed the hooks with `pre-commit install`. - [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues. > If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/). ## 🧪 Tests - [ ] Tests have been added or updated as needed. - [ ] All tests are passing (`unittest`, etc.). ## Reviewer Notes <!-- Optional: anything you'd like reviewers to focus on, concerns, etc. -->
1 parent a78ae1a commit 370207b

9 files changed (+45, -64 lines)


flashinfer/artifacts.py

Lines changed: 2 additions & 2 deletions
@@ -48,7 +48,7 @@ def get_available_cubin_files(source, retries=3, delay=5, timeout=10):
 
 
 class ArtifactPath:
-    TRTLLM_GEN_FMHA: str = "85756bb4c98571b902095e944fc27bda1fcec3e4/fmha/trtllm-gen/"
+    TRTLLM_GEN_FMHA: str = "c8e0abb4b0438880a2b0a9b68449e3cf1513aadf/fmha/trtllm-gen/"
     TRTLLM_GEN_BMM: str = (
         "5d347c6234c9f0e7f1ab6519ea933183b48216ed/batched_gemm-32110eb-5262bae/"
     )
@@ -61,7 +61,7 @@ class ArtifactPath:
 
 class MetaInfoHash:
     TRTLLM_GEN_FMHA: str = (
-        "24db1b729f7cb86d3f4dc4fbb7de479c5a854cd03ffbf75dc356bffcde48700c"
+        "0d124e546c8a2e9fa59499625e8a6d140a2465573d4a3944f9d29f29f73292fb"
     )
     TRTLLM_GEN_BMM: str = (
         "aae02e5703ee0ce696c4b3a1f2a32936fcc960dcb69fdef52b6d0f8a7b673000"
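The revert pins TRTLLM_GEN_FMHA back to the earlier artifact directory and restores the matching metainfo checksum. A minimal sketch of how a downloaded metainfo header could be sanity-checked against the pinned value (assumptions: the hash is a SHA-256 hex digest of the file contents, and the local file name below is hypothetical):

    import hashlib

    # Pinned digest restored by this revert (MetaInfoHash.TRTLLM_GEN_FMHA).
    EXPECTED = "0d124e546c8a2e9fa59499625e8a6d140a2465573d4a3944f9d29f29f73292fb"

    def sha256_hex(path: str) -> str:
        # Stream the file so it need not fit in memory at once.
        h = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(8192), b""):
                h.update(chunk)
        return h.hexdigest()

    # "flashInferMetaInfo.h" stands in for a local copy of the artifact.
    assert sha256_hex("flashInferMetaInfo.h") == EXPECTED, "metainfo hash mismatch"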

flashinfer/decode.py

Lines changed: 9 additions & 14 deletions
@@ -49,7 +49,7 @@
     _check_cached_qkv_data_type,
     _check_kv_layout,
     _check_pos_encoding_mode,
-    check_shape_dtype_device,
+    _check_shape_dtype_device,
     _get_cache_alibi_slopes_buf,
     _get_cache_buf,
     _get_range_buf,
@@ -1231,14 +1231,14 @@ def run(
                 (q.size(0), q.size(1)), dtype=torch.float32, device=q.device
             )
         else:
-            check_shape_dtype_device(
+            _check_shape_dtype_device(
                 lse, (q.size(0), q.size(1)), torch.float32, q.device, "lse"
             )
 
         if out is None:
             out = torch.empty_like(q)
         else:
-            check_shape_dtype_device(out, q.shape, q.dtype, q.device, "out")
+            _check_shape_dtype_device(out, q.shape, q.dtype, q.device, "out")
 
         if self.use_tensor_cores:
             run_args = [
@@ -1749,7 +1749,7 @@ def run(
         if out is None:
             out = torch.empty_like(q_nope, device=device)
         else:
-            check_shape_dtype_device(
+            _check_shape_dtype_device(
                 out, q_nope.shape, q_nope.dtype, q_nope.device, "out"
             )
 
@@ -1761,7 +1761,7 @@ def run(
                 device=device,
             )
         else:
-            check_shape_dtype_device(
+            _check_shape_dtype_device(
                 lse,
                 (q_nope.size(0), q_nope.size(1)),
                 q_nope.dtype,
@@ -2113,9 +2113,9 @@ def trtllm_batch_decode_with_kv_cache(
         assert isinstance(out, torch.Tensor)
 
         # Use uint8 as the container dtype to compliant with next fp4 gemm.
-        check_shape_dtype_device(out, fp4_out_shape, torch.uint8, query.device, "out")
+        _check_shape_dtype_device(out, fp4_out_shape, torch.uint8, query.device, "out")
 
-        check_shape_dtype_device(
+        _check_shape_dtype_device(
             out_scale_factor,
             fp4_out_scale_shape,
             torch.float8_e4m3fn,
@@ -2141,12 +2141,7 @@ def trtllm_batch_decode_with_kv_cache(
         o_sf_start_index = 0
         out_dtype = out_dtype or query.dtype
         out = out if out is not None else torch.empty_like(query, dtype=out_dtype)
-        assert out_dtype in (
-            query.dtype,
-            torch.float16,
-            torch.bfloat16,
-        )
-        check_shape_dtype_device(out, query.shape, out_dtype, query.device, "out")
+        _check_shape_dtype_device(out, query.shape, query.dtype, query.device, "out")
     else:
         raise ValueError(f"Invalid out_dtype: {out_dtype}")
 
@@ -2299,7 +2294,7 @@ def trtllm_batch_decode_with_kv_cache_mla(
             out = torch.empty(out_shape, dtype=torch.bfloat16, device=query.device)
         else:
             batch_size, _, num_q_heads, _ = query.shape
-            check_shape_dtype_device(
+            _check_shape_dtype_device(
                 out,
                 [batch_size, num_q_heads, kv_lora_rank],
                 torch.bfloat16,
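The substantive change is in the last non-fp4 hunk of trtllm_batch_decode_with_kv_cache: the assert that let an fp8 query produce a fp16/bf16 output is gone, and out is now validated against query.dtype rather than out_dtype. A minimal sketch of the restored contract (toy shapes; assumes flashinfer is installed so the private helper is importable):

    import torch
    from flashinfer.utils import _check_shape_dtype_device

    query = torch.zeros(4, 8, 128, dtype=torch.bfloat16)

    # An output matching the query's dtype passes validation.
    out = torch.empty_like(query)
    _check_shape_dtype_device(out, query.shape, query.dtype, query.device, "out")

    # A fp16 output for a bf16 query now fails, since the check uses query.dtype:
    # _check_shape_dtype_device(torch.empty_like(query, dtype=torch.float16),
    #                           query.shape, query.dtype, query.device, "out")
    # -> ValueError: Invalid dtype of out: expected torch.bfloat16, got torch.float16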

flashinfer/fused_moe/core.py

Lines changed: 2 additions & 2 deletions
@@ -35,7 +35,7 @@
 from ..jit.cubin_loader import get_cubin
 from ..jit.cutlass_gemm.generate_kernels import generate_gemm_operations
 from ..utils import (
-    check_shape_dtype_device,
+    _check_shape_dtype_device,
     device_support_pdl,
     get_shuffle_matrix_a_row_indices,
     get_shuffle_matrix_sf_a_row_indices,
@@ -784,7 +784,7 @@ def cutlass_fused_moe(
     if output is None:
         output = torch.empty(output_shape, dtype=output_dtype, device=input.device)
     else:
-        check_shape_dtype_device(
+        _check_shape_dtype_device(
             output, output_shape, output_dtype, input.device, "output"
         )
 

flashinfer/mla.py

Lines changed: 4 additions & 4 deletions
@@ -22,7 +22,7 @@
 from .jit import JitSpec
 from .jit import env as jit_env
 from .jit import gen_batch_mla_module, gen_jit_spec, sm100a_nvcc_flags
-from .utils import MaskMode, check_shape_dtype_device, determine_mla_backend
+from .utils import MaskMode, _check_shape_dtype_device, determine_mla_backend
 
 
 def _check_cutlass_shape(q_nope_pe, ckv_kpe_cache, kv_len, page_table):
@@ -394,7 +394,7 @@ def run(
         if out is None:
             out = torch.empty_like(q_nope)
         else:
-            check_shape_dtype_device(
+            _check_shape_dtype_device(
                 out, q_nope.shape, q_nope.dtype, q_nope.device, "out"
             )
         q_nope_pe = torch.cat([q_nope, q_pe], dim=-1)
@@ -426,15 +426,15 @@ def run(
         if out is None:
             out = torch.empty_like(q_nope)
         else:
-            check_shape_dtype_device(
+            _check_shape_dtype_device(
                 out, q_nope.shape, q_nope.dtype, q_nope.device, "out"
             )
 
         if return_lse:
             if lse is None:
                 lse = torch.empty(q_nope.shape[:2], dtype=torch.float32, device=device)
             else:
-                check_shape_dtype_device(
+                _check_shape_dtype_device(
                     lse, q_nope.shape[:2], torch.float32, q_nope.device, "lse"
                 )
         profiler_args = (profiler_buffer,) if self._use_profiler else ()

flashinfer/prefill.py

Lines changed: 8 additions & 13 deletions
@@ -44,7 +44,7 @@
     _check_cached_qkv_data_type,
     _check_kv_layout,
     _check_pos_encoding_mode,
-    check_shape_dtype_device,
+    _check_shape_dtype_device,
     _get_cache_alibi_slopes_buf,
     _get_cache_buf,
     _unpack_paged_kv_cache,
@@ -2034,7 +2034,7 @@ def run(
                 (q.size(0), q.size(1)), dtype=torch.float32, device=q.device
             )
         else:
-            check_shape_dtype_device(
+            _check_shape_dtype_device(
                 lse, (q.size(0), q.size(1)), torch.float32, q.device, "lse"
             )
 
@@ -2043,7 +2043,7 @@ def run(
                 q.shape[:-1] + v_cache.shape[-1:], dtype=q.dtype, device=q.device
             )
         else:
-            check_shape_dtype_device(
+            _check_shape_dtype_device(
                 out, q.shape[:-1] + v_cache.shape[-1:], q.dtype, q.device, "out"
             )
 
@@ -2833,15 +2833,15 @@ def run(
                 (q.size(0), q.size(1)), dtype=torch.float32, device=q.device
             )
         else:
-            check_shape_dtype_device(
+            _check_shape_dtype_device(
                 lse, (q.size(0), q.size(1)), torch.float32, q.device, "lse"
             )
         if out is None:
             out = torch.empty(
                 q.shape[:-1] + v.shape[-1:], dtype=q.dtype, device=q.device
             )
         else:
-            check_shape_dtype_device(
+            _check_shape_dtype_device(
                 out, q.shape[:-1] + v.shape[-1:], q.dtype, q.device, "out"
             )
         if self._backend == "cutlass":
@@ -3244,9 +3244,9 @@ def trtllm_batch_context_with_kv_cache(
         assert isinstance(out, torch.Tensor)
 
         # Use uint8 as the container dtype to compliant with next fp4 gemm.
-        check_shape_dtype_device(out, fp4_out_shape, torch.uint8, query.device, "out")
+        _check_shape_dtype_device(out, fp4_out_shape, torch.uint8, query.device, "out")
 
-        check_shape_dtype_device(
+        _check_shape_dtype_device(
             out_scale_factor,
             fp4_out_scale_shape,
             torch.float8_e4m3fn,
@@ -3271,13 +3271,8 @@ def trtllm_batch_context_with_kv_cache(
         out_scale_factor = None
         o_sf_start_index = 0
         out_dtype = out_dtype or query.dtype
-        assert out_dtype in (
-            query.dtype,
-            torch.float16,
-            torch.bfloat16,
-        )
         out = out if out is not None else torch.empty_like(query, dtype=out_dtype)
-        check_shape_dtype_device(out, query.shape, out_dtype, query.device, "out")
+        _check_shape_dtype_device(out, query.shape, query.dtype, query.device, "out")
     else:
         raise ValueError(f"Invalid out_dtype: {out_dtype}")
 

flashinfer/sparse.py

Lines changed: 5 additions & 5 deletions
@@ -28,7 +28,7 @@
     PosEncodingMode,
     TensorLayout,
     _check_pos_encoding_mode,
-    check_shape_dtype_device,
+    _check_shape_dtype_device,
     _get_cache_alibi_slopes_buf,
     canonicalize_torch_dtype,
     determine_attention_backend,
@@ -577,14 +577,14 @@ def run(
                 (q.size(0), q.size(1)), dtype=torch.float32, device=q.device
             )
         else:
-            check_shape_dtype_device(
+            _check_shape_dtype_device(
                 lse, (q.size(0), q.size(1)), torch.float32, q.device, "lse"
             )
 
         if out is None:
             out = torch.empty_like(q, dtype=self._o_dtype)
         else:
-            check_shape_dtype_device(out, q.shape, self._o_dtype, q.device, "out")
+            _check_shape_dtype_device(out, q.shape, self._o_dtype, q.device, "out")
 
         if is_float8(q):
             assert q.dtype == k.dtype == v.dtype
@@ -1157,14 +1157,14 @@ def run(
                 (q.size(0), q.size(1)), dtype=torch.float32, device=q.device
             )
         else:
-            check_shape_dtype_device(
+            _check_shape_dtype_device(
                 lse, (q.size(0), q.size(1)), torch.float32, q.device, "lse"
             )
 
         if out is None:
             out = torch.empty_like(q, dtype=self._o_dtype)
         else:
-            check_shape_dtype_device(out, q.shape, self._o_dtype, q.device, "out")
+            _check_shape_dtype_device(out, q.shape, self._o_dtype, q.device, "out")
 
         if self._backend == "fa3":
             if (

flashinfer/utils.py

Lines changed: 7 additions & 7 deletions
@@ -431,22 +431,22 @@ def determine_mla_backend(device: torch.device) -> str:
     return "fa3" if is_sm90a_supported(device) else "fa2"
 
 
-def check_shape_dtype_device(
+def _check_shape_dtype_device(
     x: torch.Tensor,
-    expected_shape: Optional[Sequence[int]],
-    expected_dtype: Optional[torch.dtype],
-    expected_device: Optional[torch.device],
+    expected_shape: Sequence[int],
+    expected_dtype: torch.dtype,
+    expected_device: torch.device,
     name: str,
 ) -> None:
-    if expected_shape and x.shape != torch.Size(expected_shape):
+    if x.shape != torch.Size(expected_shape):
         raise ValueError(
             f"Invalid shape of {name}: expected {expected_shape}, got {x.shape}"
         )
-    if expected_dtype and x.dtype != expected_dtype:
+    if x.dtype != expected_dtype:
         raise ValueError(
             f"Invalid dtype of {name}: expected {expected_dtype}, got {x.dtype}"
         )
-    if expected_device and x.device != expected_device:
+    if x.device != expected_device:
         raise ValueError(
             f"Invalid device of {name}: expected {expected_device}, got {x.device}"
         )
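With the revert, the helper is private again and all three expectations are mandatory; callers can no longer pass None to skip a check. A minimal illustration (CPU tensors; assumes flashinfer is installed):

    import torch
    from flashinfer.utils import _check_shape_dtype_device

    x = torch.zeros(2, 4, dtype=torch.float32)

    # Shape, dtype, and device all match: returns None.
    _check_shape_dtype_device(x, (2, 4), torch.float32, x.device, "x")

    # Every expectation is now required, so any mismatch raises:
    # _check_shape_dtype_device(x, (2, 4), torch.float16, x.device, "x")
    # -> ValueError: Invalid dtype of x: expected torch.float16, got torch.float32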

include/flashinfer/trtllm/fmha/fmhaKernels.cuh

Lines changed: 2 additions & 3 deletions
@@ -597,9 +597,8 @@ class TllmFmhaKernelFactory {
     std::lock_guard<std::mutex> lg(s_mutex);
 
     if (!metainfo_loaded) {
-      std::string metainfo_raw =
-          getMetaInfo(tllm_gen_fmha_cubin_path + "/include/flashInferMetaInfo",
-                      tllm_gen_fmha_metainfo_hash, ".h");
+      std::string metainfo_raw = getMetaInfo(tllm_gen_fmha_cubin_path + "flashInferMetaInfo",
+                                             tllm_gen_fmha_metainfo_hash, ".h");
       metainfo = KernelType::KernelMeta::loadFromMetaInfoRaw(metainfo_raw);
       metainfo_loaded = true;
     }
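Note that the reverted loader appends "flashInferMetaInfo" directly to tllm_gen_fmha_cubin_path, dropping the "/include/" segment; this relies on the cubin path ending in a slash, which the restored TRTLLM_GEN_FMHA value in flashinfer/artifacts.py ("…/fmha/trtllm-gen/") provides.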

tests/test_trtllm_gen_attention.py

Lines changed: 6 additions & 14 deletions
@@ -8,7 +8,7 @@
 from flashinfer.utils import FP4Tensor, ceil_div, round_up
 
 DTYPE_MAP = {
-    "fp16": torch.float16,
+    "half": torch.float16,
     "bf16": torch.bfloat16,
     "fp8": torch.float8_e4m3fn,
     "nvfp4": "nvfp4",
@@ -237,10 +237,8 @@ def unpack_compare_nvfp4(
 @pytest.mark.parametrize(
     "q_dtype,kv_dtype,o_dtype",
     [
+        ("half", "half", "half"),
         ("bf16", "bf16", "bf16"),
-        ("fp16", "fp16", "fp16"),
-        ("fp8", "fp8", "bf16"),
-        ("fp8", "fp8", "fp16"),
         ("fp8", "fp8", "fp8"),
         ("fp8", "fp8", "nvfp4"),
     ],
@@ -357,10 +355,8 @@ def test_trtllm_batch_prefill(
         )
         assert o_scale == 1.0
         rtol, atol = 4e-1, 1e0
-    elif q_dtype == "fp8" and o_dtype == "fp8":
+    elif o_dtype == "fp8":
         rtol, atol = 5e-2, 7e-2
-    elif q_dtype == "fp8" and o_dtype in ["bf16", "fp16"]:
-        rtol, atol = 4e-2, 6e-2
     else:
         rtol, atol = 1e-2, 1e-2
 
@@ -403,12 +399,10 @@ def test_trtllm_batch_prefill(
 @pytest.mark.parametrize(
     "q_dtype,kv_dtype,o_dtype",
     [
+        ("half", "half", "half"),
+        ("half", "fp8", "half"),
         ("bf16", "bf16", "bf16"),
-        ("fp16", "fp16", "fp16"),
         ("bf16", "fp8", "bf16"),
-        ("fp16", "fp8", "fp16"),
-        ("fp8", "fp8", "bf16"),
-        ("fp8", "fp8", "fp16"),
         ("fp8", "fp8", "fp8"),
         ("fp8", "fp8", "nvfp4"),
     ],
@@ -518,10 +512,8 @@ def test_trtllm_batch_decode(
         )
         assert o_scale == 1.0
         rtol, atol = 3e-1, 1e0
-    elif q_dtype == "fp8" and o_dtype == "fp8":
+    elif o_dtype == "fp8":
         rtol, atol = 5e-2, 7e-2
-    elif q_dtype == "fp8" and o_dtype in ["bf16", "fp16"]:
-        rtol, atol = 4e-2, 6e-2
     else:
         rtol, atol = 1e-2, 1e-2
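With the fp8-query, fp16/bf16-output cases removed, the tolerance selection collapses back to a single fp8-output branch. A condensed sketch of the restored decode-test logic (the first branch's guard lies outside the hunk and is assumed here to be the nvfp4 output path; the prefill test differs only in using 4e-1 for that branch):

    def pick_tolerances(o_dtype: str) -> tuple:
        # Post-revert: the dedicated tolerances for fp8 queries with
        # fp16/bf16 outputs are gone along with those test cases.
        if o_dtype == "nvfp4":  # assumed guard; only its body is visible in the hunk
            return 3e-1, 1e0
        elif o_dtype == "fp8":
            return 5e-2, 7e-2
        return 1e-2, 1e-2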
