NVIDIA · timmoon10 · Dec 20, 2025 · Nov 26, 2025 · Dec 2, 2025 · Dec 2, 2025
diff --git a/benchmarks/linear/benchmark_grouped_linear.py b/benchmarks/linear/benchmark_grouped_linear.py
@@ -53,7 +53,7 @@
     --set=full \
     --kernel-name "GroupHadamardAmaxTmaKernel" \
     -s 5 -c 5 \
-    python benchmarks/linear/benchmark_grouped_linear.py --profile --recipe nvfp4 --profile
+    python benchmarks/linear/benchmark_grouped_linear.py --profile --recipe nvfp4
 
 """
 
@@ -173,7 +173,9 @@ def benchmark_linear(
     return timing_ms
 
 
-def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4, m_splits=None):
+def run_benchmark_linear(
+    mkns, recipe_name, use_bias, num_gemms=4, m_splits_provided=None, fwd_only=False
+):
     data = []
     assert not use_bias, "Bias is not supported for GroupedLinear benchmark"
 
@@ -182,22 +184,22 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4, m_splits=None
         device = "cuda"
         x = torch.randn((m, k), dtype=torch.bfloat16, device=device, requires_grad=True)
         ws = [torch.randn((n, k), dtype=torch.bfloat16, device=device) for _ in range(num_gemms)]
-        assert m % num_gemms == 0
-        m_splits = [m // num_gemms] * num_gemms if m_splits is None else m_splits
+        m_splits = [m // num_gemms] * num_gemms if m_splits_provided is None else m_splits_provided
         # Bias is not supported for GroupedLinear benchmark
         bias = None
 
         # Run the benchmark
         print(f"fwd_m={m}, fwd_k={k}, fwd_n={n}")
         print(f"m_splits: {m_splits}")
+        print(f"fwd_only: {fwd_only}")
 
         grouped_fwd_bwd_timing_ms = benchmark_linear(
             x,
             ws,
             m_splits,
             bias,
             recipe_name,
-            mode="fwd_bwd",
+            mode="fwd_only" if fwd_only else "fwd_bwd",
             num_gemms=num_gemms,
         )
 
@@ -213,6 +215,8 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4, m_splits=None
             ]
         )
 
+    timing_notation = "grouped_fwd_time_ms" if fwd_only else "grouped_fwd_bwd_time_ms"
+
     df = pd.DataFrame(
         data=data,
         columns=[
@@ -221,7 +225,7 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4, m_splits=None
             "n",
             "recipe",
             "num_gemms",
-            "grouped_fwd_bwd_time_ms",
+            timing_notation,
         ],
     )
 
@@ -234,7 +238,7 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4, m_splits=None
     parser = argparse.ArgumentParser()
     parser.add_argument("--profile", action="store_true", help="Enable profiling mode")
     parser.add_argument(
-        "--output_dir",
+        "--output-dir",
         type=str,
         default="benchmark_output/",
         help="output path for report",
@@ -266,6 +270,12 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4, m_splits=None
         default=2048,
         help="Output dimension to use, default is 2048",
     )
+    parser.add_argument(
+        "--fwd-only",
+        action="store_true",
+        default=False,
+        help="Run forward pass only, default is both forward and backward passes",
+    )
     args = parser.parse_args()
 
     jagged_input_splits = None
@@ -297,7 +307,7 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4, m_splits=None
     if jagged_input_splits is not None:
         num_gemms_list = [len(jagged_input_splits)]
 
-    token_dim_list = [65536]
+    token_dim_list = [16384, 32768, 65536, 98304]
     hidden_dim_list = [7168]
     output_dim_list = [2048]
 
@@ -371,7 +381,8 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4, m_splits=None
                 recipe_name,
                 use_bias,
                 num_gemms=num_gemms,
-                m_splits=jagged_input_splits,
+                m_splits_provided=jagged_input_splits,
+                fwd_only=args.fwd_only,
             )
             df_linears = pd.concat([df_linears, df])
 

diff --git a/tests/pytorch/nvfp4/test_nvfp4_group_quantize.py b/tests/pytorch/nvfp4/test_nvfp4_group_quantize.py
@@ -198,7 +198,7 @@ def check_group_quantization_nvfp4_versus_reference(
 
         for i in range(len(x_qx)):
             if split_sections[i] == 0:
-                # then just assert the same same and dtype because the buffer won't be zero out
+                # then just assert the same shape and dtype because the buffer won't be zeroed out
                 assert_same_shape_and_dtype(x_amax_rowwise[i], x_amax_rowwise_ref[i])
                 assert_same_shape_and_dtype(x_qx[i], x_qx_ref[i])
                 assert_same_shape_and_dtype(x_sx[i], x_sx_ref[i])
@@ -221,7 +221,7 @@ def check_group_quantization_nvfp4_versus_reference(
         # assert with zero tolerance
         for i in range(len(x_qx_t)):
             if split_sections[i] == 0:
-                # then just assert the same same and dtype because the buffer won't be zero out
+                # then just assert the same shape and dtype because the buffer won't be zeroed out
                 assert_same_shape_and_dtype(x_amax_colwise[i], x_amax_colwise_ref[i])
                 assert_same_shape_and_dtype(x_qx_t[i], x_qx_t_ref[i])
                 assert_same_shape_and_dtype(x_sx_t[i], x_sx_t_ref[i])
@@ -247,6 +247,7 @@ def check_group_quantization_nvfp4_versus_reference(
         (1024, 256),
         # larger sizes
         (8192, 1024),
+        (16384, 8192),
         (16384, 16384),
     ],
 )

diff --git a/transformer_engine/common/CMakeLists.txt b/transformer_engine/common/CMakeLists.txt
@@ -174,6 +174,8 @@ list(APPEND transformer_engine_cuda_arch_specific_sources
      hadamard_transform/group_hadamard_transform.cu
      hadamard_transform/hadamard_transform.cu
      hadamard_transform/hadamard_transform_cast_fusion.cu
+     hadamard_transform/group_hadamard_transform_cast_fusion.cu
+     hadamard_transform/group_row_cast_col_hadamard_transform_cast_fusion.cu
      multi_tensor/compute_scale.cu
      recipe/mxfp8_scaling.cu
      transpose/quantize_transpose_square_blockwise.cu

@@ -100,3 +100,18 @@ void nvte_multi_tensor_quantize(const NVTETensor *inputs, NVTETensor *outputs,
     NVTE_CHECK_CUDA(cudaStreamWaitEvent(stream, detail::get_compute_stream_event(s)));
   }
 }
+
+// Grouped NVFP4 quantize entry point; assumes inputs and outputs are contiguous in memory
+// TODO (zhongbo): find a better way to make it a more generalized API
+void nvte_group_nvfp4_quantize_with_amax(const NVTETensor input, NVTETensor *outputs,
+                                         const size_t *split_sections, const size_t num_tensors,
+                                         const NVTEQuantizationConfig quant_config,
+                                         cudaStream_t stream) {
+  NVTE_API_CALL(nvte_group_nvfp4_quantize_with_amax);
+  using namespace transformer_engine;
+
+  constexpr bool IS_ACT = false;  // plain quantize: no activation op (Empty params, null OP)
+
+  dispatch::group_quantize_fwd_helper<IS_ACT, Empty, nullptr>(input, outputs, split_sections,
+                                                              num_tensors, quant_config, stream);
+}
@@ -19,6 +19,7 @@
 #include "../core/common.cuh"
 #include "../fp8/quantize_fp8.cuh"
 #include "../mxfp8/quantize_mxfp8.cuh"
+#include "../nvfp4/group_quantize_transpose_nvfp4.cuh"
 #include "../nvfp4/quantize_nvfp4.cuh"
 #include "../nvfp4/quantize_transpose_nvfp4.cuh"
 
@@ -320,6 +321,70 @@ void quantize_bwd_helper(const NVTETensor grad, const NVTETensor input, NVTETens
   }
 }
 
+// Quantize `input` into the `num_tensors` tensors of `outputs`, forwarding `split_sections` to
+// the kernel. Only NVTE_NVFP4_1D_SCALING is dispatched; the mode is read from `outputs[0]`.
+template <bool IS_ACT, typename ParamOP, float (*OP)(float, const ParamOP &)>
+void group_quantize_fwd_helper(const NVTETensor input, NVTETensor *outputs,
+                               const size_t *split_sections, const size_t num_tensors,
+                               const NVTEQuantizationConfig quant_config, cudaStream_t stream) {
+  using namespace detail;
+
+  // outputs[0] selects the scaling mode below, so an empty output list cannot be dispatched
+  NVTE_CHECK(num_tensors > 0, "Group quantize requires at least one output tensor.");
+
+  const Tensor *input_tensor = convertNVTETensorCheck(input);
+  std::vector<Tensor *> output_tensors;
+  output_tensors.reserve(num_tensors);
+  for (size_t i = 0; i < num_tensors; ++i) {
+    output_tensors.push_back(convertNVTETensorCheck(outputs[i]));
+  }
+
+  // Quantization config
+  QuantizationConfig quant_config_cpp;
+  if (quant_config != nullptr) {
+    quant_config_cpp = *reinterpret_cast<QuantizationConfig *>(quant_config);
+  }
+
+  // Noop flag
+  Tensor dummy_tensor;
+  Tensor *noop_tensor = &dummy_tensor;
+  if (quant_config_cpp.noop_tensor != nullptr) {
+    noop_tensor = convertNVTETensorCheck(quant_config_cpp.noop_tensor);
+  }
+
+  // Scaling mode comes from the first output only; the rest are assumed to match (TODO confirm)
+  const auto scaling_mode = output_tensors[0]->scaling_mode;
+
+  // Check for unsupported options
+  if (quant_config_cpp.stochastic_rounding) {
+    NVTE_CHECK(scaling_mode == NVTE_NVFP4_1D_SCALING,
+               "Stochastic rounding is only supported for NVFP4 quantization.");
+  }
+
+  // Dispatch to quantization kernel depending on data format
+  switch (scaling_mode) {
+    case NVTE_NVFP4_1D_SCALING: {
+      NVTE_CHECK(!IS_ACT, "IS_ACT is not supported by FWD NVTE_NVFP4_1D_SCALING");
+
+      // Check tensors
+      CheckNoopTensor(*noop_tensor, "cast_noop");
+      CheckInputTensor(*input_tensor, "input");
+      // Output tensors are not checked: the list is allowed to contain empty tensors
+
+      NVTE_CHECK(!quant_config_cpp.nvfp4_2d_quantization,
+                 "2D quantization is not supported for group quantize.");
+
+      // Launch NVFP4 group quantize kernel
+      nvfp4::group_quantize_transpose</*use_2d_quantization*/ false>(
+          *input_tensor, noop_tensor, output_tensors, split_sections, num_tensors,
+          &quant_config_cpp, stream);
+      break;
+    }
+    default:
+      NVTE_ERROR("Not implemented scaling mode: " + to_string(scaling_mode) + ".");
+  }
+}
+
 }  // namespace dispatch
 }  // namespace transformer_engine