greptile

zhongbozhu · zhongbozhu · commit e3320f488d6d · 2025-12-03T11:46:55.000-08:00
Signed-off-by: Zhongbo Zhu <zhongboz@nvidia.com>
diff --git a/benchmarks/linear/benchmark_grouped_linear.py b/benchmarks/linear/benchmark_grouped_linear.py
@@ -53,7 +53,7 @@
     --set=full \
     --kernel-name "GroupHadamardAmaxTmaKernel" \
     -s 5 -c 5 \
-    python benchmarks/linear/benchmark_grouped_linear.py --profile --recipe nvfp4 --profile
+    python benchmarks/linear/benchmark_grouped_linear.py --profile --recipe nvfp4
 
 """
 
diff --git a/tests/pytorch/nvfp4/test_nvfp4_group_quantize.py b/tests/pytorch/nvfp4/test_nvfp4_group_quantize.py
@@ -198,7 +198,7 @@ def check_group_quantization_nvfp4_versus_reference(
 
         for i in range(len(x_qx)):
             if split_sections[i] == 0:
-                # then just assert the same same and dtype because the buffer won't be zero out
+                # then just assert the same shape and dtype because the buffer won't be zero out
                 assert_same_shape_and_dtype(x_amax_rowwise[i], x_amax_rowwise_ref[i])
                 assert_same_shape_and_dtype(x_qx[i], x_qx_ref[i])
                 assert_same_shape_and_dtype(x_sx[i], x_sx_ref[i])
@@ -221,7 +221,7 @@ def check_group_quantization_nvfp4_versus_reference(
         # assert with zero tolerance
         for i in range(len(x_qx_t)):
             if split_sections[i] == 0:
-                # then just assert the same same and dtype because the buffer won't be zero out
+                # then just assert the same shape and dtype because the buffer won't be zero out
                 assert_same_shape_and_dtype(x_amax_colwise[i], x_amax_colwise_ref[i])
                 assert_same_shape_and_dtype(x_qx_t[i], x_qx_t_ref[i])
                 assert_same_shape_and_dtype(x_sx_t[i], x_sx_t_ref[i])
diff --git a/transformer_engine/common/hadamard_transform/group_hadamard_transform.cu b/transformer_engine/common/hadamard_transform/group_hadamard_transform.cu
@@ -459,7 +459,7 @@ void group_hadamard_transform_amax(const Tensor& input_, std::vector<Tensor*>& o
   }
 
   // Multi zero out multiple amaxes if needed
-  // Curretly don't support multi-launch when num_tensors is larger than kMaxTensorsPerKernel
+  // Currently don't support multi-launch when num_tensors is larger than kMaxTensorsPerKernel
   // let the number of threads equal to number of tensors, use 1 block, kMaxTensorsPerKernel threads per block
   dim3 block_setup_amax(kMaxTensorsPerKernel);
   dim3 grid_setup_amax(1);
diff --git a/transformer_engine/pytorch/csrc/extensions/cast.cpp b/transformer_engine/pytorch/csrc/extensions/cast.cpp
@@ -723,6 +723,13 @@ void split_quantize_nvfp4_impl(const TensorWrapper &input,
   NVTE_CHECK(quantizers.size() == num_tensors, "Expected ", num_tensors,
              " NVFP4 quantizers, but got ", quantizers.size(), ".");
 
+  // sanity check all the quantizers have the same scaling mode
+  bool all_same_scaling_mode =
+      std::all_of(quantizers.begin(), quantizers.end(), [&](const NVFP4Quantizer *quantizer) {
+        return quantizer->get_scaling_mode() == quantizers.front()->get_scaling_mode();
+      });
+  NVTE_CHECK(all_same_scaling_mode, "All quantizers must have the same scaling mode");
+
   // Trivial cases
   if (num_tensors == 0) {
     return;
diff --git a/transformer_engine/pytorch/csrc/quantizer.cpp b/transformer_engine/pytorch/csrc/quantizer.cpp
@@ -1481,7 +1481,7 @@ void NVFP4Quantizer::quantize_impl(const TensorWrapper& input, TensorWrapper& ou
     quant_config.set_rng_state(te_rng_state.data());
   }
 
-  // Restriction for the RHT cast fusion kernel.
+  // Restriction for the RHT cast fusion kernel because we are using MMA hardware for computing RHT
   bool eligible_for_rht_cast_fusion =
       input.dtype() == DType::kBFloat16 && rows % 64 == 0 && cols % 128 == 0;
 

Original file line number	Diff line number	Diff line change
`@@ -459,7 +459,7 @@ void group_hadamard_transform_amax(const Tensor& input_, std::vector<Tensor*>& o`
`459`	`459`	`}`
`460`	`460`
`461`	`461`	`// Multi zero out multiple amaxes if needed`
`462`		`- // Curretly don't support multi-launch when num_tensors is larger than kMaxTensorsPerKernel`
	`462`	`+ // Currently don't support multi-launch when num_tensors is larger than kMaxTensorsPerKernel`
`463`	`463`	`// let the number of threads equal to number of tensors, use 1 block, kMaxTensorsPerKernel threads per block`
`464`	`464`	`dim3 block_setup_amax(kMaxTensorsPerKernel);`
`465`	`465`	`dim3 grid_setup_amax(1);`
Original file line number	Diff line number	Diff line change
`@@ -1481,7 +1481,7 @@ void NVFP4Quantizer::quantize_impl(const TensorWrapper& input, TensorWrapper& ou`
`1481`	`1481`	`quant_config.set_rng_state(te_rng_state.data());`
`1482`	`1482`	`}`
`1483`	`1483`
`1484`		`- // Restriction for the RHT cast fusion kernel.`
	`1484`	`+ // Restriction for the RHT cast fusion kernel because we are using MMA hardware for computing RHT`
`1485`	`1485`	`bool eligible_for_rht_cast_fusion =`
`1486`	`1486`	`input.dtype() == DType::kBFloat16 && rows % 64 == 0 && cols % 128 == 0;`
`1487`	`1487`