[PyTorch] Update cuBLASLt grouped gemm filter (#3119)

yaox12 · pre-commit-ci[bot] · vthumbe1503 · web-flow · commit 4130d73ef47a · 2026-06-15T19:53:19.000-07:00
* update cublaslt grouped gemm filter Signed-off-by: Xin Yao <xiny@nvidia.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update nvfp4 filter and tests Signed-off-by: Xin Yao <xiny@nvidia.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * test correctness Signed-off-by: Varun Thumbe <vthumbe@nvidia.com> * better test Signed-off-by: Varun Thumbe <vthumbe@nvidia.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Xin Yao <xiny@nvidia.com> Signed-off-by: Varun Thumbe <vthumbe@nvidia.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Varun Thumbe <vthumbe@nvidia.com>
diff --git a/tests/pytorch/test_grouped_linear.py b/tests/pytorch/test_grouped_linear.py
@@ -1497,6 +1497,7 @@ def test_fp8_grouped_gemm(shape, accumulate):
 _FUSED_GROUPED_GEMM_ENV = "NVTE_GROUPED_LINEAR_USE_FUSED_GROUPED_GEMM"
 _ALL_BOOLEAN = all_boolean
 _mxfp8_available, _reason_for_no_mxfp8 = mxfp8_available, reason_for_no_mxfp8
+_nvfp4_available, _reason_for_no_nvfp4 = nvfp4_available, reason_for_no_nvfp4
 
 
 @pytest.fixture(autouse=True)
@@ -1580,26 +1581,40 @@ def _run_grouped_linear_path(
             recipe.MXFP8BlockScaling(),
             marks=pytest.mark.skipif(not _mxfp8_available, reason=_reason_for_no_mxfp8),
         ),
+        pytest.param(
+            recipe.NVFP4BlockScaling(disable_stochastic_rounding=True),
+            marks=pytest.mark.skipif(not _nvfp4_available, reason=_reason_for_no_nvfp4),
+        ),
     ],
-    ids=["bf16", "mxfp8"],
+    ids=["bf16", "mxfp8", "nvfp4"],
 )
 @pytest.mark.parametrize("bias", _ALL_BOOLEAN)
 @pytest.mark.parametrize("fp8_model_params", _ALL_BOOLEAN)
 @pytest.mark.parametrize("delay_wgrad_compute", _ALL_BOOLEAN)
 def test_grouped_linear_grouped_tensor_path_matches_legacy(
     fp8_recipe, bias, fp8_model_params, delay_wgrad_compute, monkeypatch
 ):
-    if torch.cuda.get_device_capability() < (10, 0):
-        pytest.skip("GroupedTensor grouped GEMM path requires SM100+")
-
     use_fp8 = fp8_recipe is not None
+    device_capability = torch.cuda.get_device_capability()
+    if not (9, 0) <= device_capability <= (11, 0):
+        pytest.skip(
+            "GroupedTensor grouped GEMM path requires Hopper (SM90) or Blackwell (SM10x and SM110)."
+        )
+    if use_fp8 and device_capability < (10, 0):
+        pytest.skip("Quantized GroupedTensor grouped GEMM path requires Blackwell (SM100+).")
+    cublaslt_version = tex.get_cublasLt_version()
+    if device_capability < (10, 0) and cublaslt_version < 130400:
+        pytest.skip("Grouped GEMM on Hopper requires cuBLAS 13.4+.")
+    if cublaslt_version < 130300:
+        pytest.skip("Grouped GEMM requires cuBLAS 13.3+.")
+
     if fp8_model_params and not use_fp8:
         pytest.skip("fp8_model_params requires FP8")
 
     dtype = torch.bfloat16
     num_gemms = 3
-    in_features = 64
-    out_features = 64
+    in_features = 128
+    out_features = 128
     m_splits = [128, 256, 384]
     total_tokens = sum(m_splits)
 
@@ -1683,6 +1698,90 @@ def test_grouped_linear_grouped_tensor_path_single_grouped_bias_delay_wgrad(monk
     grouped_linear.backward_dw()
 
 
+@pytest.mark.skipif(not _nvfp4_available, reason=_reason_for_no_nvfp4)
+def test_grouped_linear_grouped_tensor_path_skips_non_rht_nvfp4(monkeypatch):
+    """Non-RHT NVFP4 falls back to the legacy path; check it stays numerically correct.
+
+    Graph-safe grouped quantization currently requires RHT, so requesting NVFP4 with
+    ``disable_rht=True`` while the fused grouped-tensor path is enabled falls back to the
+    legacy path internally. We verify the output and gradients against a reference built from
+    per-GEMM ``te.Linear`` modules that share the same weights and use the same NVFP4 recipe;
+    the grouped GEMM should match the loop of single GEMMs.
+    """
+    if torch.cuda.get_device_capability() < (10, 0):
+        pytest.skip("NVFP4 GroupedTensor grouped GEMM path requires SM100+")
+
+    monkeypatch.setenv(_FUSED_GROUPED_GEMM_ENV, "1")
+    FP8GlobalStateManager.reset()
+
+    dtype = torch.bfloat16
+    num_gemms = 3
+    in_features = 128
+    out_features = 128
+    m_splits = [128, 256, 384]
+    total_tokens = sum(m_splits)
+
+    torch.manual_seed(1234)
+    x_base = (0.1 * torch.randn(total_tokens, in_features, device="cuda")).to(dtype)
+    dy = (0.1 * torch.randn(total_tokens, out_features, device="cuda")).to(dtype)
+    weights = [
+        (0.1 * torch.randn(out_features, in_features, device="cuda")).to(dtype)
+        for _ in range(num_gemms)
+    ]
+
+    fp8_recipe = recipe.NVFP4BlockScaling(
+        disable_rht=True,
+        disable_stochastic_rounding=True,
+    )
+
+    # Grouped path: fused path enabled, but non-RHT NVFP4 falls back to legacy internally.
+    grouped_linear = GroupedLinear(
+        num_gemms,
+        in_features,
+        out_features,
+        bias=False,
+        params_dtype=dtype,
+        device="cuda",
+    )
+    with torch.no_grad():
+        for i in range(num_gemms):
+            getattr(grouped_linear, f"weight{i}").copy_(weights[i])
+
+    x = x_base.detach().clone().requires_grad_(True)
+    with autocast(enabled=True, recipe=fp8_recipe):
+        y = grouped_linear(x, m_splits)
+    y.backward(dy)
+
+    # Reference: one te.Linear per GEMM sharing the same weights and NVFP4 recipe.
+    ref_linears = torch.nn.ModuleList(
+        [
+            Linear(in_features, out_features, bias=False, params_dtype=dtype, device="cuda")
+            for _ in range(num_gemms)
+        ]
+    )
+    with torch.no_grad():
+        for i in range(num_gemms):
+            ref_linears[i].weight.copy_(weights[i])
+
+    x_ref = x_base.detach().clone().requires_grad_(True)
+    with autocast(enabled=True, recipe=fp8_recipe):
+        y_ref = torch.cat(
+            [ref_linears[i](x_i) for i, x_i in enumerate(torch.split(x_ref, m_splits))]
+        )
+    y_ref.backward(dy)
+
+    # The legacy fallback path should match the loop of single te.Linear GEMMs bit-for-bit.
+    tols = dict(rtol=0, atol=0)
+    torch.testing.assert_close(y.float(), y_ref.float(), **tols)
+    torch.testing.assert_close(x.grad.float(), x_ref.grad.float(), **tols)
+    for i in range(num_gemms):
+        torch.testing.assert_close(
+            getattr(grouped_linear, f"weight{i}").grad.float(),
+            ref_linears[i].weight.grad.float(),
+            **tols,
+        )
+
+
 @pytest.mark.parametrize(
     "fp8_recipe",
     [
@@ -1691,19 +1790,33 @@ def test_grouped_linear_grouped_tensor_path_single_grouped_bias_delay_wgrad(monk
             recipe.MXFP8BlockScaling(),
             marks=pytest.mark.skipif(not _mxfp8_available, reason=_reason_for_no_mxfp8),
         ),
+        pytest.param(
+            recipe.NVFP4BlockScaling(disable_stochastic_rounding=True),
+            marks=pytest.mark.skipif(not _nvfp4_available, reason=_reason_for_no_nvfp4),
+        ),
     ],
-    ids=["bf16", "mxfp8"],
+    ids=["bf16", "mxfp8", "nvfp4"],
 )
 @pytest.mark.parametrize("bias", _ALL_BOOLEAN)
 def test_grouped_linear_fused_path_cuda_graph_safe(fp8_recipe, bias, monkeypatch):
     """Fused GroupedTensor GEMM path should be CUDA graph capturable."""
-    if torch.cuda.get_device_capability() < (10, 0):
-        pytest.skip("GroupedTensor grouped GEMM path requires SM100+")
+    use_fp8 = fp8_recipe is not None
+    device_capability = torch.cuda.get_device_capability()
+    if not (9, 0) <= device_capability <= (11, 0):
+        pytest.skip(
+            "GroupedTensor grouped GEMM path requires Hopper (SM90) or Blackwell (SM10x and SM110)."
+        )
+    if use_fp8 and device_capability < (10, 0):
+        pytest.skip("Quantized GroupedTensor grouped GEMM path requires Blackwell (SM100+).")
+    cublaslt_version = tex.get_cublasLt_version()
+    if device_capability < (10, 0) and cublaslt_version < 130400:
+        pytest.skip("Grouped GEMM on Hopper requires cuBLAS 13.4+.")
+    if cublaslt_version < 130300:
+        pytest.skip("Grouped GEMM requires cuBLAS 13.3+.")
 
     monkeypatch.setenv(_FUSED_GROUPED_GEMM_ENV, "1")
     FP8GlobalStateManager.reset()
 
-    use_fp8 = fp8_recipe is not None
     dtype = torch.bfloat16
     device = "cuda"
     num_gemms = 3
diff --git a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu
@@ -228,7 +228,7 @@ struct GroupedGemmSetupWorkspace {
   }
 };
 
-inline bool grouped_gemm_supports_per_group_alpha_beta(int sm) { return sm >= 100; }
+inline bool grouped_gemm_supports_per_group_alpha_beta(int sm) { return sm >= 100 && sm <= 110; }
 
 inline size_t validate_grouped_gemm_inputs(
     size_t num_tensors, std::initializer_list<const transformer_engine::GroupedTensor *> inputs,
@@ -335,7 +335,8 @@ inline void check_grouped_gemm_requirements(const char *api_name) {
   const int sm = transformer_engine::cuda::sm_arch(current_device);
   const int cublas_ver = transformer_engine::cuda::cublas_version();
 #if CUBLAS_VERSION >= CUBLAS_GROUPED_GEMM_HOPPER_VERSION
-  NVTE_CHECK(sm >= 90, api_name, " requires Hopper (SM90) or newer architecture.");
+  NVTE_CHECK(sm >= 90 && sm <= 110, api_name,
+             " requires Hopper (SM90) or Blackwell (SM10x and SM110).");
   NVTE_CHECK(cublas_ver >= CUBLAS_GROUPED_GEMM_VERSION, api_name,
              " requires cuBLAS 13.3+, but run-time cuBLAS version is ", cublas_ver);
   if (sm < 100) {
@@ -344,7 +345,7 @@ inline void check_grouped_gemm_requirements(const char *api_name) {
                cublas_ver);
   }
 #else
-  NVTE_CHECK(sm >= 100, api_name, " requires Blackwell (SM100) or newer architecture.");
+  NVTE_CHECK(sm >= 100 && sm <= 110, api_name, " requires Blackwell (SM10x and SM110).");
   NVTE_CHECK(cublas_ver >= CUBLAS_GROUPED_GEMM_VERSION, api_name,
              " requires cuBLAS 13.3+, but run-time cuBLAS version is ", cublas_ver);
 #endif
@@ -400,7 +401,7 @@ inline void validate_fp8_block_grouped_gemm_support(const GroupedOperandSelectio
              "Grouped GEMM: A and B must both use FP8 block scaling or both not.");
   NVTE_CHECK(sm == 90,
              "Grouped GEMM: FP8 block scaling is only supported on Hopper (SM90); "
-             "use MXFP8 on Blackwell (SM100) or newer.");
+             "use MXFP8 on Blackwell (SM10x and SM110).");
 }
 
 inline bool is_compatible_grouped_scaling_mode(NVTEScalingMode a_mode, NVTEScalingMode b_mode) {
@@ -1567,7 +1568,7 @@ void nvte_grouped_gemm(const NVTEGroupedTensor A, int transa, const NVTEGroupedT
   NVTE_API_CALL(nvte_grouped_gemm);
   using namespace transformer_engine;
 
-  // Grouped GEMM requires Blackwell (SM100) or newer with cuBLAS 13.3+,
+  // Grouped GEMM requires Blackwell (SM10x and SM110) with cuBLAS 13.3+,
   // or Hopper (SM90) with cuBLAS 13.4+.
   check_grouped_gemm_requirements("nvte_grouped_gemm");
 
@@ -1650,7 +1651,7 @@ void nvte_grouped_gemm_with_discrete_inputA(const NVTETensor *A_list, size_t num
   NVTE_API_CALL(nvte_grouped_gemm_with_discrete_inputA);
   using namespace transformer_engine;
 
-  // Grouped GEMM requires Blackwell (SM100) or newer with cuBLAS 13.3+,
+  // Grouped GEMM requires Blackwell (SM10x and SM110) with cuBLAS 13.3+,
   // or Hopper (SM90) with cuBLAS 13.4+.
   check_grouped_gemm_requirements("nvte_grouped_gemm_with_discrete_inputA");
 
@@ -1801,7 +1802,7 @@ void nvte_grouped_gemm_with_discrete_out(const NVTEGroupedTensor A, int transa,
   NVTE_API_CALL(nvte_grouped_gemm_with_discrete_out);
   using namespace transformer_engine;
 
-  // Grouped GEMM requires Blackwell (SM100) or newer with cuBLAS 13.3+,
+  // Grouped GEMM requires Blackwell (SM10x and SM110) with cuBLAS 13.3+,
   // or Hopper (SM90) with cuBLAS 13.4+.
   check_grouped_gemm_requirements("nvte_grouped_gemm_with_discrete_out");
 
diff --git a/transformer_engine/pytorch/module/grouped_linear.py b/transformer_engine/pytorch/module/grouped_linear.py
@@ -55,8 +55,7 @@
 from ..cpu_offload import is_cpu_offload_enabled, mark_not_offload, start_offload
 from ..triton.grouped_dbias_dscales import compute_grouped_dbias
 
-from ..tensor.float8_tensor import Float8CurrentScalingQuantizer, Float8Quantizer
-from ..tensor.mxfp8_tensor import MXFP8Quantizer
+from ..tensor import Float8CurrentScalingQuantizer, Float8Quantizer, MXFP8Quantizer, NVFP4Quantizer
 from ..quantized_tensor import (
     QuantizedTensorStorage,
     Quantizer,
@@ -95,19 +94,29 @@ def _is_grouped_tensor_path_supported(
         save_original_input: bool,
         activation_dtype: torch.dtype,
         input_quantizers: List[Optional[Quantizer]],
-        weight_quantizers: List[Optional[Quantizer]],
         output_quantizers: List[Optional[Quantizer]],
-        grad_output_quantizers: List[Optional[Quantizer]],
     ) -> bool:
-        """Whether to use cublasLt grouped GEMM through GroupedTensor metadata.
+        """Whether to use cuBLASLt grouped GEMM through GroupedTensor metadata.
 
         There are no checks whether split sizes are supported. Splits
         may be in a CUDA tensor, so checking would hurt performance
         and be incompatible with CUDA Graphs.
 
+        Supported Compute Capability (CC) and precisions:
+        * Hopper (CC 9.0): BF16/FP16.
+        * Blackwell (CC 10.x and 11.0): BF16/FP16, MXFP8, and NVFP4 with RHT enabled.
+        FP8 delayed / current scaling, and FP8 block scaling are not supported because the
+        corresponding grouped quantization kernels are missing.
+        Non-RHT NVFP4 falls back to the legacy path because graph-safe grouped quantization
+        currently requires RHT.
+
+        Only input quantizers are inspected; weight/grad_output quantizers are assumed to be of
+        the same type, otherwise the cuBLASLt grouped GEMM check raises a fatal error.
         """
+        # 1. Filter by environment variable
         if not bool(int(os.getenv("NVTE_GROUPED_LINEAR_USE_FUSED_GROUPED_GEMM", "0"))):
             return False
+        # 2. Filter out advanced features
         if (
             debug
             or cpu_offloading
@@ -116,16 +125,18 @@ def _is_grouped_tensor_path_supported(
             or save_original_input
         ):
             return False
-        if get_device_compute_capability() < (10, 0):
+        # 3. Filter by compute capability
+        if not (9, 0) <= get_device_compute_capability() <= (11, 0):
             return False
+        # 4. Output quantization is not supported.
         if any(q is not None for q in output_quantizers):
             return False
+        # 5. Filter by quantization recipes.
         if fp8:
-            return (
-                activation_dtype in (torch.bfloat16, torch.float16)
-                and all(isinstance(q, MXFP8Quantizer) for q in input_quantizers)
-                and all(isinstance(q, MXFP8Quantizer) for q in weight_quantizers)
-                and all(q is None or isinstance(q, MXFP8Quantizer) for q in grad_output_quantizers)
+            if not (10, 0) <= get_device_compute_capability() <= (11, 0):
+                return False
+            return all(isinstance(q, MXFP8Quantizer) for q in input_quantizers) or all(
+                isinstance(q, NVFP4Quantizer) and q.with_rht for q in input_quantizers
             )
         return activation_dtype in (torch.bfloat16, torch.float16)
 
@@ -234,7 +245,7 @@ def _forward_grouped_tensor(
         weights: Tuple[torch.Tensor, ...],
         biases: Tuple[torch.Tensor, ...],
     ) -> Tuple[torch.Tensor, list]:
-        """Forward path backed by GroupedTensor + cublasLt grouped GEMM."""
+        """Forward path backed by GroupedTensor + cuBLASLt grouped GEMM."""
         num_gemms = len(m_splits)
         device = inp.device
         in_features = weights[0].size(-1)
@@ -491,9 +502,7 @@ def forward(
             save_original_input=save_original_input,
             activation_dtype=activation_dtype,
             input_quantizers=input_quantizers,
-            weight_quantizers=weight_quantizers,
             output_quantizers=output_quantizers,
-            grad_output_quantizers=grad_output_quantizers,
         ):
             return _GroupedLinear._forward_grouped_tensor(
                 ctx,
@@ -745,7 +754,7 @@ def _backward_grouped_tensor(
                 columnwise=ctx.weights_requires_grad,
             )
             grad_output_quantizer.optimize_for_gemm = True
-            if ctx.use_bias:
+            if ctx.use_bias and isinstance(grad_output_quantizer, MXFP8Quantizer):
                 grouped_dy, dbias_packed = tex.bgrad_group_quantize(
                     dy_2d,
                     grad_output_quantizer,
diff --git a/transformer_engine/pytorch/ops/basic/grouped_linear.py b/transformer_engine/pytorch/ops/basic/grouped_linear.py