From 94b74fe1b5492c526a632a53bc95eaa53c561243 Mon Sep 17 00:00:00 2001
From: Oleg Goncharov <ogoncharov@nvidia.com>
Date: Mon, 8 Jun 2026 16:02:31 +0000
Subject: [PATCH] Optimized shared memory stores in colwise path

Signed-off-by: Oleg Goncharov <ogoncharov@nvidia.com>
---
 .../nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh b/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh
index 2cf43b5b65..8f37229fd5 100644
--- a/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh
+++ b/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh
@@ -201,7 +201,7 @@ __device__ __forceinline__ void colwise_scaling(const IType *__restrict__ sIn_pt
   const int warp = threadIdx.x / THREADS_PER_WARP;
   const int thread_lane = threadIdx.x % THREADS_PER_WARP;
 
-  const int tid_Y_colwise = (thread_lane % 4 + warp) % 4;
+  const int tid_Y_colwise = (thread_lane / 2 + warp) % 4;
   const int tid_X_colwise = thread_lane;
 
   const int thread_offset_Y_colwise = tid_Y_colwise * SCALE_DIM;