[pre-commit.ci] auto fixes from pre-commit.com hooks

pre-commit-ci[bot] · YigongQin · commit 8a3a36d822ae · 2026-04-22T14:08:17.000-07:00
for more information, see https://pre-commit.ci

Signed-off-by: YigongQin <qqqyyy1233@outlook.com>
diff --git a/tests/pytorch/test_nvfp4_pertoken_quant.py b/tests/pytorch/test_nvfp4_pertoken_quant.py
@@ -135,9 +135,9 @@ def test_zero_input(self, dtype):
 
         # When amax=0, compute_global_encode_scaling_factor_FP4 returns 1.0
         # so global_scale = 1/S_enc = 1/1 = 1.0
-        assert (per_token_scales == 1.0).all(), (
-            f"Zero input should give global_scale=1.0 (S_enc fallback), got {per_token_scales}"
-        )
+        assert (
+            per_token_scales == 1.0
+        ).all(), f"Zero input should give global_scale=1.0 (S_enc fallback), got {per_token_scales}"
 
     @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
     def test_uniform_rows_same_scale(self, dtype):
diff --git a/transformer_engine/common/cast/cast.cu b/transformer_engine/common/cast/cast.cu
@@ -166,22 +166,14 @@ void nvte_quantize_nvfp4_pertoken(const NVTETensor input, NVTETensor output_data
 
   if (itype == NVTEDType::kNVTEBFloat16) {
     dispatch::nvfp4::quantize_pertoken_kernel::launch_quantize_pertoken_nvfp4<__nv_bfloat16>(
-        num_rows, num_cols,
-        reinterpret_cast<const __nv_bfloat16 *>(input_ptr),
-        nullptr,
-        reinterpret_cast<uint8_t *>(data_ptr),
-        reinterpret_cast<fp8e4m3 *>(scales_ptr),
-        reinterpret_cast<float *>(pertoken_ptr),
-        stream);
+        num_rows, num_cols, reinterpret_cast<const __nv_bfloat16 *>(input_ptr), nullptr,
+        reinterpret_cast<uint8_t *>(data_ptr), reinterpret_cast<fp8e4m3 *>(scales_ptr),
+        reinterpret_cast<float *>(pertoken_ptr), stream);
   } else if (itype == NVTEDType::kNVTEFloat16) {
     dispatch::nvfp4::quantize_pertoken_kernel::launch_quantize_pertoken_nvfp4<half>(
-        num_rows, num_cols,
-        reinterpret_cast<const half *>(input_ptr),
-        nullptr,
-        reinterpret_cast<uint8_t *>(data_ptr),
-        reinterpret_cast<fp8e4m3 *>(scales_ptr),
-        reinterpret_cast<float *>(pertoken_ptr),
-        stream);
+        num_rows, num_cols, reinterpret_cast<const half *>(input_ptr), nullptr,
+        reinterpret_cast<uint8_t *>(data_ptr), reinterpret_cast<fp8e4m3 *>(scales_ptr),
+        reinterpret_cast<float *>(pertoken_ptr), stream);
   } else {
     NVTE_ERROR(
         "Unsupported input dtype for per-token NVFP4 quantization. "
diff --git a/transformer_engine/common/cast/nvfp4/quantize_pertoken_nvfp4.cuh b/transformer_engine/common/cast/nvfp4/quantize_pertoken_nvfp4.cuh
@@ -119,8 +119,8 @@ __launch_bounds__(BLOCK_SIZE)
   // Block-wide max reduction
   using BlockReduce = cub::BlockReduce<float, BLOCK_SIZE>;
   __shared__ typename BlockReduce::TempStorage temp_storage;
-  float row_amax = BlockReduce(temp_storage).Reduce(thread_max,
-      [](float a, float b) { return fmaxf(a, b); });
+  float row_amax =
+      BlockReduce(temp_storage).Reduce(thread_max, [](float a, float b) { return fmaxf(a, b); });
 
   // Compute and store per-token global scale
   // global_scale = row_amax / (fp8_max * fp4_max)
diff --git a/transformer_engine/pytorch/csrc/extensions/cast.cpp b/transformer_engine/pytorch/csrc/extensions/cast.cpp
@@ -1592,9 +1592,8 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> quantize_nvfp4_pertoken(at::Tenso
   auto te_scales = makeTransformerEngineTensor(output_scales);
   auto te_pertoken = makeTransformerEngineTensor(output_per_token_scales);
 
-  nvte_quantize_nvfp4_pertoken(
-      te_input.data(), te_data.data(), te_scales.data(), te_pertoken.data(),
-      num_rows, num_cols, stream);
+  nvte_quantize_nvfp4_pertoken(te_input.data(), te_data.data(), te_scales.data(),
+                               te_pertoken.data(), num_rows, num_cols, stream);
 
   return {output_data, output_scales, output_per_token_scales};
 }