From 94b74fe1b5492c526a632a53bc95eaa53c561243 Mon Sep 17 00:00:00 2001
From: Oleg Goncharov
Date: Mon, 8 Jun 2026 16:02:31 +0000
Subject: [PATCH] Optimized shared memory stores in colwise path

Signed-off-by: Oleg Goncharov
---
 .../nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh b/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh
index 2cf43b5b65..8f37229fd5 100644
--- a/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh
+++ b/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh
@@ -201,7 +201,7 @@ __device__ __forceinline__ void colwise_scaling(const IType *__restrict__ sIn_pt
   const int warp = threadIdx.x / THREADS_PER_WARP;
   const int thread_lane = threadIdx.x % THREADS_PER_WARP;
 
-  const int tid_Y_colwise = (thread_lane % 4 + warp) % 4;
+  const int tid_Y_colwise = (thread_lane / 2 + warp) % 4;
   const int tid_X_colwise = thread_lane;
 
   const int thread_offset_Y_colwise = tid_Y_colwise * SCALE_DIM;