Skip to content

Commit 26f24c9

Browse files
ColinPeppler authored and facebook-github-bot committed
dequantize_fp8_cache_kernel: Move D=128 device-side-assertion check to host
Summary: ## What Move the device-side assertions to the host since all the kernels share the same assertion. ## Why When running evals with symmetric quantization, I ran into the following error. > CUDA error: too many resources requested for launch It failed with this launch configuration: blockDim = (32, 32) = 1024 threads per block. - `$ cuobjdump --dump-resource-usage kv_cache.cu.pic.o.sm_90.cubin | c++filt | grep -A 1 'dequantize_fp8_cache_kernel'` gives me - `void fbgemm_gpu::dequantize_fp8_cache_kernel<true, true>... REG:66` - P1908720668 - That means one threadblock has 66 * 1024 = 67584 registers which exceeds the limit of 65,536. Differential Revision: D82320518
1 parent 016a79a commit 26f24c9

File tree

2 files changed

+47
-5
lines changed

2 files changed

+47
-5
lines changed

fbgemm_gpu/experimental/gen_ai/src/kv_cache/kv_cache_dequantize.cu

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -188,8 +188,6 @@ __global__ void dequantize_fp8_cache_kernel(
188188
auto MAX_T = cache_K.size(1);
189189
auto D_H = cache_K_dq.size(3);
190190
auto D_H_q = cache_K.size(3);
191-
// TODO: support D_H < 128 for small model used in testing.
192-
CUDA_KERNEL_ASSERT(D_H == 128);
193191
const uint8_t offset_bytes = (ExternalQParam) ? 0 : 4;
194192
CUDA_KERNEL_ASSERT(D_H_q - D_H == offset_bytes);
195193

@@ -301,8 +299,6 @@ __global__ void dequantize_fp8_cache_kernel(
301299
auto MAX_T = cache_K.size(1);
302300
auto D_H = cache_K_dq.size(3);
303301
auto D_H_q = cache_K.size(3);
304-
// TODO: support D_H < 128 for small model used in testing.
305-
CUDA_KERNEL_ASSERT(D_H == 128);
306302
const uint8_t offset_bytes = (ExternalQParam) ? 0 : 4;
307303
CUDA_KERNEL_ASSERT(D_H_q - D_H == offset_bytes);
308304

@@ -401,7 +397,6 @@ __global__ void dequantize_fp8_cache_kernel_paged(
401397
auto N_KVH = cache_K.size(2);
402398
auto D_H = cache_K_dq.size(3);
403399
auto D_H_q = cache_K.size(3);
404-
CUDA_KERNEL_ASSERT(D_H == 128);
405400

406401
auto b = blockIdx.x;
407402
// only need to dequantize this far.
@@ -518,6 +513,9 @@ std::tuple<at::Tensor, at::Tensor> dequantize_fp8_cache(
518513
}
519514
auto D_H = (D_HQ - fp8_qparam_offset);
520515

516+
// TODO: support D_H < 128 for small model used in testing.
517+
TORCH_CHECK(D_H == 128, "D_H must be 128, got ", D_H);
518+
521519
// TODO:
522520
// The below allocates Tensors that have the same shape as cache_K and
523521
// cache_V to store their dequantize results. For paged KV cache, this can

fbgemm_gpu/experimental/gen_ai/test/kv_cache/kv_cache_test.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,50 @@ def test_fp8_kv_cache(self, MAX_T: int, N_KVH_L: int) -> None:
357357
cache_v[:, :T], cache_v_bf16[:, :T], atol=1.0e-2, rtol=5.0e-2
358358
)
359359

360+
@settings(deadline=None)
361+
@unittest.skipIf(
362+
not torch.cuda.is_available()
363+
or (
364+
torch.version.cuda
365+
and torch.cuda.get_device_properties(torch.cuda.current_device()).major < 9
366+
)
367+
or (torch.version.hip and torch.version.hip < "6.2")
368+
or not HAS_XFORMERS,
369+
"Skip when H100 is not available or MI300 is not available",
370+
)
371+
def test_dequantize_fp8_cache_too_many_resources_for_launch(self) -> None:
372+
# With heavy register usage, dequantize_fp8_cache can fail with
373+
# CUDA error: too many resources requested for launch
374+
device = "cuda"
375+
376+
# Shapes/dtypes
377+
B, MAX_T, N_KVH, D = 1, 139_264, 1, 128
378+
379+
cache_k = torch.randint(
380+
low=0, high=256, size=(B, MAX_T, N_KVH, D), dtype=torch.uint8, device=device
381+
)
382+
cache_v = torch.randint(
383+
low=0, high=256, size=(B, MAX_T, N_KVH, D), dtype=torch.uint8, device=device
384+
)
385+
386+
# Per-token qparams (symmetric=True implies zp=0)
387+
qparam_k = torch.zeros((B, MAX_T, N_KVH, 1), dtype=torch.int32, device=device)
388+
qparam_v = torch.zeros((B, MAX_T, N_KVH, 1), dtype=torch.int32, device=device)
389+
390+
# Sequence length (single int32)
391+
seq_len = torch.tensor([MAX_T], dtype=torch.int32, device=device)
392+
393+
torch.ops.fbgemm.dequantize_fp8_cache( # type: ignore[reportCallIssue]
394+
cache_k,
395+
cache_v,
396+
seq_len,
397+
qparam_k=qparam_k,
398+
qparam_v=qparam_v,
399+
block_tables=None,
400+
page_size=0,
401+
symmetric=True,
402+
)
403+
360404
@settings(deadline=None)
361405
@given(
362406
MAX_T=st.sampled_from([8000, 16384]),

0 commit comments

Comments
 (0)