Commit d41b0c5

Fixed and cleaned up

1 parent 87b0e67 commit d41b0c5
File tree: 3 files changed (+27 lines, -15 lines)

flashinfer/cudnn/prefill.py

Lines changed: 23 additions & 5 deletions
@@ -91,6 +91,7 @@ def _sdpa_prefill_key_fn(
     batch_offsets_stats: Optional[torch.Tensor] = None,
     out: Optional[torch.Tensor] = None,
     lse: Optional[torch.Tensor] = None,
+    o_data_type: Optional[torch.dtype] = None,
 ):
     graph_b = actual_seq_lens_q.shape[0]
 
@@ -149,6 +150,7 @@ def _build_prefill_graph(
     batch_offsets_stats: Optional[torch.Tensor] = None,
     out: Optional[torch.Tensor] = None,
     lse: Optional[torch.Tensor] = None,
+    o_data_type: Optional[torch.dtype] = None,
 ):
     handle = _create_cudnn_handle(torch.cuda.current_stream(q.device))
 
@@ -163,6 +165,16 @@ def _build_prefill_graph(
     cudnn_k_data_type = cudnn.datatypes._torch_to_cudnn_data_type(k_cache.dtype)
     cudnn_v_data_type = cudnn.datatypes._torch_to_cudnn_data_type(v_cache.dtype)
 
+    cudnn_o_data_type = cudnn.datatypes._torch_to_cudnn_data_type(o_data_type)
+
+    if (
+        cudnn_q_data_type == cudnn.data_type.FP8_E4M3
+        or cudnn_q_data_type == cudnn.data_type.FP8_E5M2
+    ) and cudnn.backend_version() < 91800:
+        raise RuntimeError(
+            f"FP8 is not supported in cuDNN backend version < 9.18.0, current version is {cudnn.backend_version()}"
+        )
+
     with cudnn.graph(handle) as (g, _):
         # Create tensors from the input tensors
         if q.dim() == 3:
@@ -318,7 +330,10 @@ def _build_prefill_graph(
             actual_seq_lens_q is not None and actual_seq_lens_kv is not None
         )
 
-        if cudnn_q_data_type == cudnn.data_type.BFLOAT16:
+        if (
+            cudnn_q_data_type == cudnn.data_type.BFLOAT16
+            or cudnn_q_data_type == cudnn.data_type.HALF
+        ):
             O, Stats = g.sdpa(
                 name="sdpa",
                 q=cudnn_q,
@@ -410,7 +425,7 @@ def _build_prefill_graph(
                 [graph_b, h_qo, graph_s_qo, d_vo]
             ).set_stride(
                 [graph_s_qo * d_vo * h_qo, d_vo, d_vo * h_qo, 1]
-            ).set_data_type(cudnn.data_type.BFLOAT16)
+            ).set_data_type(cudnn_o_data_type)
 
             if return_lse:
                 Stats.set_uid(UIDs.STATS_UID.value).set_output(
@@ -455,6 +470,7 @@ def _batch_prefill_with_kv_cache(
     batch_offsets_stats: Optional[torch.Tensor] = None,
     out: Optional[torch.Tensor] = None,
     lse: Optional[torch.Tensor] = None,
+    o_data_type: Optional[torch.dtype] = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     graph, tensors = _build_prefill_graph(
         q=q,
@@ -475,6 +491,7 @@ def _batch_prefill_with_kv_cache(
         batch_offsets_stats=batch_offsets_stats,
         out=out,
         lse=lse,
+        o_data_type=o_data_type,
     )
 
     var_map = {
@@ -555,6 +572,7 @@ def cudnn_batch_prefill_with_kv_cache(
     lse: Optional[torch.Tensor] = None,
     is_cuda_graph_compatible: bool = False,
     backend: Optional[str] = None,
+    o_data_type: Optional[torch.dtype] = None,
 ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
     """Performs batched prefill attention with paged KV cache using cuDNN.
 
@@ -581,7 +599,7 @@ def cudnn_batch_prefill_with_kv_cache(
         batch_offsets_o: Optional batch offsets for output tensor of shape (batch_size,) on GPU
         batch_offsets_k: Optional batch offsets for key tensor of shape (batch_size,) on GPU
         batch_offsets_v: Optional batch offsets for value tensor of shape (batch_size,) on GPU
-
+        o_data_type: Optional data type for output tensor
     Returns:
         Output tensor of shape (batch_size * seq_len_q, num_heads_qo, head_dim)
         If return_lse is True, also returns log-sum-exp tensor of shape (batch_size, seq_len_q, num_heads_qo)
@@ -624,8 +642,7 @@ def cudnn_batch_prefill_with_kv_cache(
 
     if out is None:
         out_shape = (num_tokens, h_qo, d_vo)
-        out = torch.empty(out_shape, device=q.device, dtype=torch.float16)
-        # out = torch.empty(out_shape, device=q.device, dtype=q.dtype)
+        out = torch.empty(out_shape, device=q.device, dtype=o_data_type)
 
     if CUDNN_AVAILABLE and backend != "cubin":
         return _batch_prefill_with_kv_cache(
@@ -651,6 +668,7 @@ def cudnn_batch_prefill_with_kv_cache(
             batch_offsets_stats=batch_offsets_stats,
             out=out,
             lse=lse,
+            o_data_type=o_data_type,
         )
     else:
         assert return_lse, "Currently only supports return_lse = True"
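For reference, the new FP8 guard can be exercised on its own. The sketch below is not library code; the helper name `require_fp8_support` is hypothetical, and it assumes the cuDNN frontend Python package (`import cudnn`) is installed. It uses only calls that appear in the diff above: `cudnn.datatypes._torch_to_cudnn_data_type`, `cudnn.data_type.FP8_E4M3`/`FP8_E5M2`, and `cudnn.backend_version()`, which encodes version 9.18.0 as the integer 91800.

```python
import cudnn
import torch

def require_fp8_support(q_dtype: torch.dtype) -> None:
    # Hypothetical standalone mirror of the guard added in _build_prefill_graph:
    # FP8 attention needs cuDNN backend >= 9.18.0 (encoded as 91800).
    cudnn_q_data_type = cudnn.datatypes._torch_to_cudnn_data_type(q_dtype)
    if cudnn_q_data_type in (
        cudnn.data_type.FP8_E4M3,
        cudnn.data_type.FP8_E5M2,
    ) and cudnn.backend_version() < 91800:
        raise RuntimeError(
            f"FP8 is not supported in cuDNN backend version < 9.18.0, "
            f"current version is {cudnn.backend_version()}"
        )

# Example: torch.float8_e4m3fn maps to the FP8_E4M3 cuDNN data type.
require_fp8_support(torch.float8_e4m3fn)
```

With this change, a caller on an FP8 input path can request a higher-precision output by passing e.g. `o_data_type=torch.bfloat16`; the graph's output tensor now takes that dtype instead of the previously hard-coded BFLOAT16 (and the eager allocation path no longer hard-codes `torch.float16`).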

flashinfer/prefill.py

Lines changed: 1 addition & 0 deletions
@@ -2200,6 +2200,7 @@ def run(
                 batch_offsets_o=self._qo_indptr_buf,
                 out=out,
                 lse=lse,
+                o_data_type=out_dtype,
             )
         else:
             if self._backend != "trtllm-gen":

tests/attention/test_cudnn_prefill.py

Lines changed: 3 additions & 10 deletions
@@ -221,11 +221,8 @@ def test_cudnn_prefill_fp8(
         s_qo, s_kv + 1, (batch_size, 1, 1, 1), dtype=torch.int32, device=device
     )
 
-    print("actual_seq_lens_q ", actual_seq_lens_q)
-    print("actual_seq_lens_kv ", actual_seq_lens_kv)
-
     cumsum_s_qo = torch.sum(actual_seq_lens_q)
-    q = torch.ones(
+    q = torch.randn(
         cumsum_s_qo, num_qo_heads, head_dim, device=device, dtype=torch.bfloat16
     )
 
@@ -246,7 +243,7 @@ def test_cudnn_prefill_fp8(
     total_num_pages = num_pages_per_seq * batch_size
 
     kv_cache_shape = (total_num_pages, 2, num_kv_heads, page_size, head_dim)
-    kv_cache = torch.ones(size=kv_cache_shape, dtype=torch.bfloat16).to(device)
+    kv_cache = torch.randn(size=kv_cache_shape, dtype=torch.bfloat16).to(device) * 0.05
     kv_cache = kv_cache.as_strided(
         kv_cache.shape,
         (
@@ -383,8 +380,4 @@ def test_cudnn_prefill_fp8(
 
     output_ref = wrapper.run(q, kv_cache)
 
-    print("output ", output)
-    print("output_ref ", output_ref)
-    print("block_tables ", block_tables)
-
-    torch.testing.assert_close(output, output_ref, atol=1e-1, rtol=1e-1)
+    torch.testing.assert_close(output, output_ref, atol=1e-2, rtol=1e-2)
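A plausible reading of the test changes: switching from `torch.ones` to scaled `torch.randn` makes the comparison meaningful (with all-ones inputs, attention degenerates to a constant average), and the 0.05 scale keeps FP8 quantization error small enough to tighten the tolerances from 1e-1 to 1e-2. A rough, self-contained illustration of that error budget (illustrative numbers, not part of the test):

```python
import torch

# Round-trip a small-magnitude tensor through FP8 E4M3 and inspect the error.
x = torch.randn(1024) * 0.05
x_fp8 = x.to(torch.float8_e4m3fn).to(torch.float32)

# E4M3 keeps ~3 mantissa bits, so relative rounding error is a few percent;
# with values mostly in [-0.15, 0.15], the absolute error lands around 1e-2
# or below, which is roughly what the tightened atol/rtol of 1e-2 budgets for.
print((x - x_fp8).abs().max())
```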
