Skip to content

Commit e3320f4

Browse files
committed
greptile
Signed-off-by: Zhongbo Zhu <zhongboz@nvidia.com>
1 parent 15b0970 commit e3320f4

5 files changed

Lines changed: 12 additions & 5 deletions

File tree

benchmarks/linear/benchmark_grouped_linear.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@
5353
--set=full \
5454
--kernel-name "GroupHadamardAmaxTmaKernel" \
5555
-s 5 -c 5 \
56-
python benchmarks/linear/benchmark_grouped_linear.py --profile --recipe nvfp4 --profile
56+
python benchmarks/linear/benchmark_grouped_linear.py --profile --recipe nvfp4
5757
5858
"""
5959

tests/pytorch/nvfp4/test_nvfp4_group_quantize.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ def check_group_quantization_nvfp4_versus_reference(
198198

199199
for i in range(len(x_qx)):
200200
if split_sections[i] == 0:
201-
# then just assert the same same and dtype because the buffer won't be zero out
201+
# then just assert the same shape and dtype because the buffer won't be zeroed out
202202
assert_same_shape_and_dtype(x_amax_rowwise[i], x_amax_rowwise_ref[i])
203203
assert_same_shape_and_dtype(x_qx[i], x_qx_ref[i])
204204
assert_same_shape_and_dtype(x_sx[i], x_sx_ref[i])
@@ -221,7 +221,7 @@ def check_group_quantization_nvfp4_versus_reference(
221221
# assert with zero tolerance
222222
for i in range(len(x_qx_t)):
223223
if split_sections[i] == 0:
224-
# then just assert the same same and dtype because the buffer won't be zero out
224+
# then just assert the same shape and dtype because the buffer won't be zeroed out
225225
assert_same_shape_and_dtype(x_amax_colwise[i], x_amax_colwise_ref[i])
226226
assert_same_shape_and_dtype(x_qx_t[i], x_qx_t_ref[i])
227227
assert_same_shape_and_dtype(x_sx_t[i], x_sx_t_ref[i])

transformer_engine/common/hadamard_transform/group_hadamard_transform.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -459,7 +459,7 @@ void group_hadamard_transform_amax(const Tensor& input_, std::vector<Tensor*>& o
459459
}
460460

461461
// Multi zero out multiple amaxes if needed
462-
// Curretly don't support multi-launch when num_tensors is larger than kMaxTensorsPerKernel
462+
// Multi-launch is currently not supported when num_tensors is larger than kMaxTensorsPerKernel
463463
// let the number of threads equal the number of tensors: use 1 block with kMaxTensorsPerKernel threads per block
464464
dim3 block_setup_amax(kMaxTensorsPerKernel);
465465
dim3 grid_setup_amax(1);

transformer_engine/pytorch/csrc/extensions/cast.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -723,6 +723,13 @@ void split_quantize_nvfp4_impl(const TensorWrapper &input,
723723
NVTE_CHECK(quantizers.size() == num_tensors, "Expected ", num_tensors,
724724
" NVFP4 quantizers, but got ", quantizers.size(), ".");
725725

726+
// sanity check: all the quantizers must have the same scaling mode
727+
bool all_same_scaling_mode =
728+
std::all_of(quantizers.begin(), quantizers.end(), [&](const NVFP4Quantizer *quantizer) {
729+
return quantizer->get_scaling_mode() == quantizers.front()->get_scaling_mode();
730+
});
731+
NVTE_CHECK(all_same_scaling_mode, "All quantizers must have the same scaling mode");
732+
726733
// Trivial cases
727734
if (num_tensors == 0) {
728735
return;

transformer_engine/pytorch/csrc/quantizer.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1481,7 +1481,7 @@ void NVFP4Quantizer::quantize_impl(const TensorWrapper& input, TensorWrapper& ou
14811481
quant_config.set_rng_state(te_rng_state.data());
14821482
}
14831483

1484-
// Restriction for the RHT cast fusion kernel.
1484+
// Restriction for the RHT cast fusion kernel because we are using MMA hardware for computing RHT
14851485
bool eligible_for_rht_cast_fusion =
14861486
input.dtype() == DType::kBFloat16 && rows % 64 == 0 && cols % 128 == 0;
14871487

0 commit comments

Comments
 (0)