diff --git a/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh b/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh index 2cf43b5b65..8f37229fd5 100644 --- a/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh +++ b/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh @@ -201,7 +201,7 @@ __device__ __forceinline__ void colwise_scaling(const IType *__restrict__ sIn_pt const int warp = threadIdx.x / THREADS_PER_WARP; const int thread_lane = threadIdx.x % THREADS_PER_WARP; - const int tid_Y_colwise = (thread_lane % 4 + warp) % 4; + const int tid_Y_colwise = (thread_lane / 2 + warp) % 4; const int tid_X_colwise = thread_lane; const int thread_offset_Y_colwise = tid_Y_colwise * SCALE_DIM;