[pre-commit.ci] auto fixes from pre-commit.com hooks

pre-commit-ci[bot] · pre-commit-ci[bot] · commit 893710fc7518 · 2026-04-21T21:54:20.000Z
for more information, see https://pre-commit.ci
diff --git a/tests/pytorch/test_backward_override.py b/tests/pytorch/test_backward_override.py
@@ -200,7 +200,9 @@ def _maybe_skip_unsupported_recipe_shape(
                 " by 32."
             )
             return
-        if recipe_name in ("nvfp4", "nvfp4_pertoken") and (flat_first_dim % 16 != 0 or last_dim % 16 != 0):
+        if recipe_name in ("nvfp4", "nvfp4_pertoken") and (
+            flat_first_dim % 16 != 0 or last_dim % 16 != 0
+        ):
             pytest.skip(
                 "Linear/LayerNormLinear + NVFP4 requires prod(shape[:-1]) and shape[-1] divisible"
                 " by 16."
@@ -225,7 +227,9 @@ def _maybe_skip_unsupported_recipe_shape(
             pytest.skip(
                 "te_ops.Linear + MXFP8 requires prod(shape[:-1]) and shape[-1] divisible by 32."
             )
-        if recipe_name in ("nvfp4", "nvfp4_pertoken") and (flat_first_dim % 16 != 0 or last_dim % 16 != 0):
+        if recipe_name in ("nvfp4", "nvfp4_pertoken") and (
+            flat_first_dim % 16 != 0 or last_dim % 16 != 0
+        ):
             pytest.skip(
                 "te_ops.Linear + NVFP4 requires prod(shape[:-1]) and shape[-1] divisible by 16."
             )
diff --git a/tests/pytorch/test_nvfp4_pertoken_quant.py b/tests/pytorch/test_nvfp4_pertoken_quant.py
@@ -95,7 +95,9 @@ def test_output_shapes(self, num_rows, num_cols, dtype):
 
         assert data.shape == (num_rows, num_cols // 2), f"data shape: {data.shape}"
         assert scales.shape == (num_rows, num_cols // 16), f"scales shape: {scales.shape}"
-        assert per_token_scales.shape == (num_rows,), f"per_token_scales shape: {per_token_scales.shape}"
+        assert per_token_scales.shape == (
+            num_rows,
+        ), f"per_token_scales shape: {per_token_scales.shape}"
         assert data.dtype == torch.uint8
         assert scales.dtype == torch.uint8
         assert per_token_scales.dtype == torch.float32
diff --git a/transformer_engine/common/cast/cast.cu b/transformer_engine/common/cast/cast.cu
@@ -148,13 +148,9 @@ void nvte_group_nvfp4_quantize_with_amax(const NVTETensor input, NVTETensor *out
       input, outputs, split_sections, num_tensors, quant_config, stream);
 }
 
-void nvte_quantize_nvfp4_pertoken(const NVTETensor input,
-                                  NVTETensor output_data,
-                                  NVTETensor output_scales,
-                                  NVTETensor output_per_token_scales,
-                                  size_t num_rows,
-                                  size_t num_cols,
-                                  cudaStream_t stream) {
+void nvte_quantize_nvfp4_pertoken(const NVTETensor input, NVTETensor output_data,
+                                  NVTETensor output_scales, NVTETensor output_per_token_scales,
+                                  size_t num_rows, size_t num_cols, cudaStream_t stream) {
   NVTE_API_CALL(nvte_quantize_nvfp4_pertoken);
   using namespace transformer_engine;
 
@@ -170,24 +166,21 @@ void nvte_quantize_nvfp4_pertoken(const NVTETensor input,
 
   if (itype == DType::kBFloat16) {
     dispatch::nvfp4::quantize_pertoken_kernel::launch_quantize_pertoken_nvfp4<__nv_bfloat16>(
-        num_rows, num_cols,
-        reinterpret_cast<const __nv_bfloat16 *>(input_tensor.data.dptr),
+        num_rows, num_cols, reinterpret_cast<const __nv_bfloat16 *>(input_tensor.data.dptr),
         nullptr,  // row_offsets
         reinterpret_cast<uint8_t *>(data_tensor->data.dptr),
         reinterpret_cast<fp8e4m3 *>(scales_tensor->data.dptr),
-        reinterpret_cast<float *>(pertoken_tensor->data.dptr),
-        stream);
+        reinterpret_cast<float *>(pertoken_tensor->data.dptr), stream);
   } else if (itype == DType::kFloat16) {
     dispatch::nvfp4::quantize_pertoken_kernel::launch_quantize_pertoken_nvfp4<half>(
-        num_rows, num_cols,
-        reinterpret_cast<const half *>(input_tensor.data.dptr),
+        num_rows, num_cols, reinterpret_cast<const half *>(input_tensor.data.dptr),
         nullptr,  // row_offsets
         reinterpret_cast<uint8_t *>(data_tensor->data.dptr),
         reinterpret_cast<fp8e4m3 *>(scales_tensor->data.dptr),
-        reinterpret_cast<float *>(pertoken_tensor->data.dptr),
-        stream);
+        reinterpret_cast<float *>(pertoken_tensor->data.dptr), stream);
   } else {
-    NVTE_ERROR("Unsupported input dtype for per-token NVFP4 quantization. "
-               "Expected BFloat16 or Float16.");
+    NVTE_ERROR(
+        "Unsupported input dtype for per-token NVFP4 quantization. "
+        "Expected BFloat16 or Float16.");
   }
 }
diff --git a/transformer_engine/common/cast/nvfp4/quantize_pertoken_nvfp4.cuh b/transformer_engine/common/cast/nvfp4/quantize_pertoken_nvfp4.cuh
@@ -25,6 +25,7 @@
 
 #include <cuda.h>
 #include <cuda_runtime.h>
+
 #include <cub/cub.cuh>
 
 #include "../../common.h"
@@ -74,24 +75,20 @@ __global__ void
 __launch_bounds__(BLOCK_SIZE)
 #endif
     quantize_pertoken_nvfp4_kernel(
-        const int num_rows,
-        const int num_cols,
-        const IType *__restrict__ input,
+        const int num_rows, const int num_cols, const IType *__restrict__ input,
         const int *__restrict__ row_offsets,  // optional: nullptr for identity mapping
-        uint8_t *__restrict__ output_data,
-        fp8e4m3 *__restrict__ output_scales,
-        float *__restrict__ output_per_token_scales,
-        const int scale_stride) {
+        uint8_t *__restrict__ output_data, fp8e4m3 *__restrict__ output_scales,
+        float *__restrict__ output_per_token_scales, const int scale_stride) {
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
 
   using namespace detail;
-  constexpr float fp8_max = TypeExtrema<fp8e4m3>::max;   // 448.0f
-  constexpr float fp4_max = TypeExtrema<fp4e2m1>::max;   // 6.0f
+  constexpr float fp8_max = TypeExtrema<fp8e4m3>::max;  // 448.0f
+  constexpr float fp4_max = TypeExtrema<fp4e2m1>::max;  // 6.0f
   constexpr float fp4_max_inv = 1.0f / fp4_max;
 
   // Packed type: 4 elements per float2 pair for FP4 conversion
-  using IType2 = typename std::conditional<std::is_same<IType, half>::value,
-                                           half2, __nv_bfloat162>::type;
+  using IType2 =
+      typename std::conditional<std::is_same<IType, half>::value, half2, __nv_bfloat162>::type;
 
   const int row_idx = blockIdx.x;
   if (row_idx >= num_rows) return;
@@ -167,9 +164,7 @@ __launch_bounds__(BLOCK_SIZE)
     output_scales[row_idx * scale_stride + sf_idx] = S_dec_b;
 
     // Compute inverse block scale for quantization
-    float block_encode_scale = (S_dec_b_f != 0.0f)
-                                   ? __fdividef(S_enc, S_dec_b_f)
-                                   : 0.0f;
+    float block_encode_scale = (S_dec_b_f != 0.0f) ? __fdividef(S_enc, S_dec_b_f) : 0.0f;
 
     // Quantize 16 elements to FP4 and pack into 8 bytes
     uint8_t *out_ptr = output_data + actual_row * (num_cols / 2) + col_start / 2;
@@ -190,30 +185,22 @@ __launch_bounds__(BLOCK_SIZE)
  * Host-side launcher for per-token NVFP4 quantization.
  */
 template <typename IType>
-void launch_quantize_pertoken_nvfp4(
-    const int num_rows,
-    const int num_cols,
-    const IType *input,
-    const int *row_offsets,
-    uint8_t *output_data,
-    fp8e4m3 *output_scales,
-    float *output_per_token_scales,
-    cudaStream_t stream) {
+void launch_quantize_pertoken_nvfp4(const int num_rows, const int num_cols, const IType *input,
+                                    const int *row_offsets, uint8_t *output_data,
+                                    fp8e4m3 *output_scales, float *output_per_token_scales,
+                                    cudaStream_t stream) {
   if (num_rows == 0 || num_cols == 0) return;
 
-  NVTE_CHECK(num_cols % PERTOKEN_SF_VEC_SIZE == 0,
-             "num_cols must be a multiple of ", PERTOKEN_SF_VEC_SIZE,
-             " for per-token NVFP4 quantization, got ", num_cols);
+  NVTE_CHECK(num_cols % PERTOKEN_SF_VEC_SIZE == 0, "num_cols must be a multiple of ",
+             PERTOKEN_SF_VEC_SIZE, " for per-token NVFP4 quantization, got ", num_cols);
 
   const int scale_stride = num_cols / PERTOKEN_SF_VEC_SIZE;
   dim3 grid(num_rows);
   dim3 block(PERTOKEN_BLOCK_SIZE);
 
   quantize_pertoken_nvfp4_kernel<IType, PERTOKEN_BLOCK_SIZE>
-      <<<grid, block, 0, stream>>>(
-          num_rows, num_cols, input, row_offsets,
-          output_data, output_scales, output_per_token_scales,
-          scale_stride);
+      <<<grid, block, 0, stream>>>(num_rows, num_cols, input, row_offsets, output_data,
+                                   output_scales, output_per_token_scales, scale_stride);
   NVTE_CHECK_CUDA(cudaGetLastError());
 }
 
diff --git a/transformer_engine/common/include/transformer_engine/cast.h b/transformer_engine/common/include/transformer_engine/cast.h
@@ -466,13 +466,9 @@ void nvte_group_nvfp4_quantize_with_amax(const NVTETensor input, NVTETensor *out
  *  \param[in]      num_cols                Number of columns (must be multiple of 16).
  *  \param[in]      stream                  CUDA stream.
  */
-void nvte_quantize_nvfp4_pertoken(const NVTETensor input,
-                                  NVTETensor output_data,
-                                  NVTETensor output_scales,
-                                  NVTETensor output_per_token_scales,
-                                  size_t num_rows,
-                                  size_t num_cols,
-                                  cudaStream_t stream);
+void nvte_quantize_nvfp4_pertoken(const NVTETensor input, NVTETensor output_data,
+                                  NVTETensor output_scales, NVTETensor output_per_token_scales,
+                                  size_t num_rows, size_t num_cols, cudaStream_t stream);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/transformer_engine/pytorch/csrc/extensions/cast.cpp b/transformer_engine/pytorch/csrc/extensions/cast.cpp
@@ -1556,8 +1556,7 @@ std::vector<py::object> split_quantize(const at::Tensor &tensor,
   return output_py_list;
 }
 
-std::tuple<at::Tensor, at::Tensor, at::Tensor> quantize_nvfp4_pertoken(
-    at::Tensor input) {
+std::tuple<at::Tensor, at::Tensor, at::Tensor> quantize_nvfp4_pertoken(at::Tensor input) {
   // Input validation
   NVTE_CHECK(input.dim() == 2, "Input must be 2D (num_rows, num_cols)");
   NVTE_CHECK(input.is_cuda(), "Input must be on CUDA device");
@@ -1574,8 +1573,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> quantize_nvfp4_pertoken(
 
   // Allocate outputs
   auto output_data = at::empty({num_rows, num_cols / 2}, options.dtype(at::kByte));
-  auto output_scales = at::empty(
-      {num_rows, (num_cols + 15) / 16}, options.dtype(at::kByte));
+  auto output_scales = at::empty({num_rows, (num_cols + 15) / 16}, options.dtype(at::kByte));
   auto output_per_token_scales = at::empty({num_rows}, options.dtype(at::kFloat));
 
   // Wrap as NVTETensors
@@ -1586,9 +1584,8 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> quantize_nvfp4_pertoken(
 
   auto stream = at::cuda::getCurrentCUDAStream().stream();
 
-  nvte_quantize_nvfp4_pertoken(
-      te_input.data(), te_data.data(), te_scales.data(), te_pertoken.data(),
-      num_rows, num_cols, stream);
+  nvte_quantize_nvfp4_pertoken(te_input.data(), te_data.data(), te_scales.data(),
+                               te_pertoken.data(), num_rows, num_cols, stream);
 
   return {output_data, output_scales, output_per_token_scales};
 }
diff --git a/transformer_engine/pytorch/ops/fused/forward_grouped_mlp.py b/transformer_engine/pytorch/ops/fused/forward_grouped_mlp.py
@@ -751,9 +751,7 @@ def fuser_forward(
             grouped_fc1_x = input_
         else:
             fc1_x = maybe_dequantize(input_, dtype)
-            grouped_fc1_x = tex.group_quantize(
-                fc1_x, fc1_input_quantizer, num_groups, split_sizes
-            )
+            grouped_fc1_x = tex.group_quantize(fc1_x, fc1_input_quantizer, num_groups, split_sizes)
 
         # Pack data tensors for cuDNN kernel
         # NVFP4: data is uint8 (packed FP4), reinterpret as float4_e2m1fn_x2
@@ -785,7 +783,8 @@ def fuser_forward(
         global_scale_tensor = None
         try:
             _, _, fc1_per_token_scales = tex.quantize_nvfp4_pertoken(
-                fc1_x.reshape(in_shape[0], in_shape[1]) if not isinstance(input_, GroupedTensor)
+                fc1_x.reshape(in_shape[0], in_shape[1])
+                if not isinstance(input_, GroupedTensor)
                 else input_.dequantize(dtype=dtype).reshape(in_shape[0], in_shape[1])
             )
             global_scale_tensor = fc1_per_token_scales.reshape(-1, 1, 1)
@@ -831,9 +830,7 @@ def fuser_forward(
 
             fc1_w_data = fc1_weight_for_gemm.rowwise_data
             fc1_w_data = fc1_w_data.view(dtype=torch.float4_e2m1fn_x2)
-            fc1_w_data = fc1_w_data.view(
-                num_groups, fc1_weight_shape[0], fc1_weight_shape[1] // 2
-            )
+            fc1_w_data = fc1_w_data.view(num_groups, fc1_weight_shape[0], fc1_weight_shape[1] // 2)
             fc1_w_data = fc1_w_data.permute(1, 2, 0)
             fc1_w_scales = fc1_weight_for_gemm.scale_inv.view(dtype=torch.float8_e4m3fn)
             fc1_w_scales = fc1_w_scales.view(
@@ -930,9 +927,7 @@ def fuser_forward(
 
             fc2_w_data = fc2_weight_for_gemm.rowwise_data
             fc2_w_data = fc2_w_data.view(dtype=torch.float4_e2m1fn_x2)
-            fc2_w_data = fc2_w_data.view(
-                num_groups, fc2_weight_shape[0], fc2_weight_shape[1] // 2
-            )
+            fc2_w_data = fc2_w_data.view(num_groups, fc2_weight_shape[0], fc2_weight_shape[1] // 2)
             fc2_w_data = fc2_w_data.permute(1, 2, 0)
 
             fc2_w_scales = fc2_weight_for_gemm.scale_inv.view(dtype=torch.float8_e4m3fn)