NVIDIA · sudhakarsingh27 · Apr 7, 2026 · Apr 7, 2026 · Apr 7, 2026 · Apr 7, 2026
diff --git a/tests/pytorch/attention/run_attention_with_cp.py b/tests/pytorch/attention/run_attention_with_cp.py
@@ -626,9 +626,8 @@ def run_dpa_with_cp(
             cu_seqlens_q = get_cu_seqlens_on_cp_rank(
                 cu_seqlens_q, cu_seqlens_q_padded, world_size, rank, True, True
             )
-            num_pads_q = (cu_seqlens_q_padded - cu_seqlens_q)[1:] - (
-                cu_seqlens_q_padded - cu_seqlens_q
-            )[:-1]
+            cu_pads_q = cu_seqlens_q_padded - cu_seqlens_q
+            num_pads_q = cu_pads_q[1:] - cu_pads_q[:-1]
             cu_seqlens_kv_padded = cu_seqlens_kv_padded // world_size
             cu_seqlens_kv = get_cu_seqlens_on_cp_rank(
                 cu_seqlens_kv, cu_seqlens_kv_padded, world_size, rank, True, True

diff --git a/tests/pytorch/attention/test_attention_with_cp.py b/tests/pytorch/attention/test_attention_with_cp.py
@@ -319,6 +319,14 @@ def test_cp_with_flash_attention(cp_pool, dtype, model, qkv_format, cp_comm_type
         if cp_comm_type == "a2a+p2p":
             pytest.skip("pad_between_seqs is not yet supported with A2A+P2P CP comm type!")
 
+    if pad_between_seqs:
+        if qkv_format != "thd":
+            pytest.skip("pad_between_seqs only applies to THD format!")
+        if not FlashAttentionUtils.v3_is_installed:
+            pytest.skip("pad_between_seqs with CP requires Flash Attention v3!")
+        if cp_comm_type == "a2a+p2p":
+            pytest.skip("pad_between_seqs is not yet supported with A2A+P2P CP comm type!")
+
     config = model_configs_flash_attn[model]
     config.context_parallel = True
     config.cp_comm_type = cp_comm_type
@@ -328,8 +336,17 @@ def test_cp_with_flash_attention(cp_pool, dtype, model, qkv_format, cp_comm_type
     if config.attn_bias_type != "no_bias" and cp_comm_type in ["all_gather", "a2a", "a2a+p2p"]:
         pytest.skip("No support for bias with cp_comm_type={all_gather, a2a, a2a+p2p}!")
 
-    if qkv_format == "thd" and cp_comm_type in ["all_gather", "a2a+p2p"]:
-        pytest.skip("No support for THD format with cp_comm_type={all_gather, a2a+p2p}!")
+    if qkv_format == "thd":
+        if cp_comm_type == "a2a+p2p":
+            pytest.skip(
+                "CP implementation with QKVO A2A+P2P (Hierarchical A2A) does not support THD format"
+                " yet!"
+            )
+        if cp_comm_type == "all_gather" and not FlashAttentionUtils.v3_is_installed:
+            pytest.skip(
+                "THD + all_gather requires FA3 (seqused_k) to separate tensor offsets from"
+                " visibility limits in the gathered KV buffer."
+            )
 
     if (
         config.window_size != (-1, 0)
@@ -538,8 +555,12 @@ def test_cp_with_fused_attention(
     if config.attn_bias_type != "no_bias" and cp_comm_type in ["all_gather", "a2a", "a2a+p2p"]:
         pytest.skip("No support for bias with cp_comm_type={all_gather, a2a, a2a+p2p}!")
 
-    if qkv_format == "thd" and cp_comm_type in ["all_gather", "a2a+p2p"]:
-        pytest.skip("No support for THD format with cp_comm_type={all_gather, a2a+p2p}!")
+    if qkv_format == "thd":
+        if cp_comm_type == "a2a+p2p":
+            pytest.skip(
+                "CP implementation with QKVO A2A+P2P (Hierarchical A2A) does not support THD format"
+                " yet!"
+            )
 
     if (config.window_size[0] != -1 or config.window_size[1] not in [-1, 0]) and cp_comm_type in [
         "p2p",

@@ -77,6 +77,18 @@ __forceinline__ __device__ int binary_search(int target, int *array, int len) {
   return left - 1;
 }
 
+// Dual-chunk source index for THD CP partitioning: row in the contiguous tensor of `rank`'s local
+// token_id. cu_seqlens_s must already be divided by world_size. Single source of truth shared by
+// thd_partition_indices_kernel and thd_reorder_kernel so the two never diverge.
+__forceinline__ __device__ int thd_partition_src_index(int token_id, int *cu_seqlens_s, int batch,
+                                                       int world_size, int rank) {
+  int seq_id = binary_search(token_id, cu_seqlens_s, batch + 1);  // sequence index for token_id
+  int seq_len = cu_seqlens_s[seq_id + 1] - cu_seqlens_s[seq_id];  // per-rank sequence length
+  int index = token_id - cu_seqlens_s[seq_id];                    // offset inside that sequence
+  int offset = index < seq_len / 2 ? rank : (world_size - 1) * 2 - rank;  // shift, in chunks
+  return index + cu_seqlens_s[seq_id] * world_size + seq_len / 2 * offset;
+}
+
 /***************************************************************************************************
  * Support THD format for Context Parallel: Generate partitioned indices for input tokens
  **************************************************************************************************/
@@ -96,12 +108,82 @@ __global__ void thd_partition_indices_kernel(int *output, int *cu_seqlens, int b
   int num_threads = blockDim.x * gridDim.x;
 
   for (int token_id = tid; token_id < total_tokens / world_size; token_id += num_threads) {
-    int seq_id = binary_search(token_id, cu_seqlens_s, batch + 1);
-    int seq_len = cu_seqlens_s[seq_id + 1] - cu_seqlens_s[seq_id];
-    int index = token_id - cu_seqlens_s[seq_id];
-    int offset = index < seq_len / 2 ? rank : (world_size - 1) * 2 - rank;
-    index += cu_seqlens_s[seq_id] * world_size + seq_len / 2 * offset;
-    output[token_id] = index;
+    output[token_id] = thd_partition_src_index(token_id, cu_seqlens_s, batch, world_size, rank);
+  }
+}
+
+/***************************************************************************************************
+ * Support THD format for Context Parallel: fused dual-chunk reorder (gather/scatter)
+ * out[gi] = inp[src(gi)] (gather, to_rank_sharded) or out[src(gi)] = inp[gi] (scatter,
+ * to_contiguous). src is computed inline (no materialized index tensor). Modeled on
+ * thd_read_half_tensor_kernel: warp-per-token, cu_seqlens_s in shared, float4 vectorized copy.
+ * hidden_size_in_bytes must be a multiple of 16 (same assumption as thd_read_half_tensor).
+ **************************************************************************************************/
+__global__ void thd_reorder_kernel(void *out, void *inp, int *cu_seqlens, int batch,
+                                   int total_tokens, int world_size, int hidden_size_in_bytes,
+                                   bool scatter) {
+  extern __shared__ int cu_seqlens_s[];
+  for (int i = threadIdx.x; i <= batch; i += blockDim.x) {
+    cu_seqlens_s[i] = cu_seqlens[i] / world_size;  // per-rank sequence boundaries
+  }
+  __syncthreads();
+
+  int warpid = (blockIdx.x * blockDim.x + threadIdx.x) / 32;  // warp index across the grid
+  int laneid = threadIdx.x % 32;                              // lane within the warp
+  int num_warps = (blockDim.x * gridDim.x) / 32;              // warps in the whole grid
+  int tpr = total_tokens / world_size;                        // tokens per rank
+  int num_float4s_per_token = hidden_size_in_bytes / sizeof(float4);
+
+  for (int gi = warpid; gi < total_tokens; gi += num_warps) {  // gi: row in rank-sharded layout
+    int rank = gi / tpr;
+    int token_id = gi % tpr;
+    int src = thd_partition_src_index(token_id, cu_seqlens_s, batch, world_size, rank);
+    int rd = scatter ? gi : src;  // row read from inp
+    int wr = scatter ? src : gi;  // row written to out
+    float4 *src_tok = reinterpret_cast<float4 *>(reinterpret_cast<char *>(inp) +
+                                                 static_cast<size_t>(rd) * hidden_size_in_bytes);
+    float4 *dst_tok = reinterpret_cast<float4 *>(reinterpret_cast<char *>(out) +
+                                                 static_cast<size_t>(wr) * hidden_size_in_bytes);
+    for (int idx = laneid; idx < num_float4s_per_token; idx += 32) dst_tok[idx] = src_tok[idx];
+  }
+}
+
+/***************************************************************************************************
+ * Support THD format for Context Parallel: copy the VALID token rows of a per-step output/grad
+ * into the destination accumulator, leaving padded tails untouched. Sync-free replacement for the
+ * per-batch `.item()` slice-copy loops in the AllGather CP THD fwd/bwd. cu_seqlens_padded gives a
+ * token's segment + local offset in the padded layout; cu_seqlens gives each segment's valid
+ * length. Warp-per-token, float4 vectorized, modeled on thd_reorder_kernel.
+ **************************************************************************************************/
+__global__ void thd_valid_copy_kernel(void *out, void *inp, int *cu_seqlens_padded, int *cu_seqlens,
+                                      int batch, int total_tokens, int hidden_size_in_bytes) {
+  extern __shared__ int padded_s[];       // [0..batch] padded boundaries
+  int *valid_s = padded_s + (batch + 1);  // [0..batch] valid boundaries
+  for (int i = threadIdx.x; i <= batch; i += blockDim.x) {
+    padded_s[i] = cu_seqlens_padded[i];
+    valid_s[i] = cu_seqlens[i];
+  }
+  __syncthreads();
+
+  int warpid = (blockIdx.x * blockDim.x + threadIdx.x) / 32;  // warp index across the grid
+  int laneid = threadIdx.x % 32;                              // lane within the warp
+  int num_warps = (blockDim.x * gridDim.x) / 32;              // warps in the whole grid
+  int num_float4s_per_token = hidden_size_in_bytes / sizeof(float4);
+
+  for (int token_id = warpid; token_id < total_tokens; token_id += num_warps) {
+    int seq_id = binary_search(token_id, padded_s, batch + 1);
+    int local = token_id - padded_s[seq_id];                // offset from segment's padded start
+    int valid_len = valid_s[seq_id + 1] - valid_s[seq_id];  // non-padding rows in the segment
+    // local can be negative when a segment's padded start is shifted past earlier tokens (e.g.
+    // step-1 chunks: cu_seqlens_padded[:-1] += chunk_size). Those tokens are outside any valid
+    // run, so skip them -- otherwise the first chunk's already-written rows get clobbered.
+    if (local >= 0 && local < valid_len) {
+      float4 *src_tok = reinterpret_cast<float4 *>(
+          reinterpret_cast<char *>(inp) + static_cast<size_t>(token_id) * hidden_size_in_bytes);
+      float4 *dst_tok = reinterpret_cast<float4 *>(
+          reinterpret_cast<char *>(out) + static_cast<size_t>(token_id) * hidden_size_in_bytes);
+      for (int idx = laneid; idx < num_float4s_per_token; idx += 32) dst_tok[idx] = src_tok[idx];
+    }
   }
 }
 
@@ -678,6 +760,57 @@ void thd_get_partitioned_indices(const Tensor &cu_seqlens, Tensor output, int to
   NVTE_CHECK_CUDA(cudaGetLastError());
 }
 
+void thd_reorder(const Tensor &inp, const Tensor &cu_seqlens, Tensor &out, int world_size,
+                 bool scatter, int total_tokens, cudaStream_t stream) {
+  using namespace transformer_engine;
+  NVTE_CHECK(cu_seqlens.dtype() == DType::kInt32);
+  NVTE_CHECK(cu_seqlens.dim() == 1);
+  auto cu_seqlens_shape = cu_seqlens.shape();
+  NVTE_CHECK(cu_seqlens_shape[0] >= 2);  // batch + 1 entries, so at least one sequence
+  NVTE_CHECK(world_size > 0);
+  NVTE_CHECK(total_tokens > 0 && total_tokens % (world_size * 2) == 0);  // 2 chunks per rank
+
+  auto inp_shape = inp.shape();
+  size_t row_elems = 1;
+  for (int i = 1; i < inp.dim(); i++) row_elems *= inp_shape[i];  // elements per token row
+  int hidden_size_in_bytes = (row_elems * typeToNumBits(inp.dtype())) / 8;  // bits -> bytes
+  NVTE_CHECK(hidden_size_in_bytes % 16 == 0);  // 128-bit load/store
+
+  int batch = cu_seqlens_shape[0] - 1;
+  constexpr unsigned int block = 256;  // 8 warps per block; grid below yields one warp per token
+  unsigned int grid = (static_cast<unsigned int>(total_tokens) * 32 + block - 1) / block;
+  thd_reorder_kernel<<<grid, block, sizeof(int) * (batch + 1), stream>>>(  // smem: cu_seqlens_s
+      out.data.dptr, inp.data.dptr, reinterpret_cast<int *>(cu_seqlens.data.dptr), batch,
+      total_tokens, world_size, hidden_size_in_bytes, scatter);
+  NVTE_CHECK_CUDA(cudaGetLastError());
+}
+
+void thd_valid_copy(const Tensor &inp, const Tensor &cu_seqlens_padded, const Tensor &cu_seqlens,
+                    Tensor &out, int total_tokens, cudaStream_t stream) {
+  using namespace transformer_engine;
+  NVTE_CHECK(cu_seqlens.dtype() == DType::kInt32);
+  NVTE_CHECK(cu_seqlens_padded.dtype() == DType::kInt32);
+  NVTE_CHECK(cu_seqlens.dim() == 1 && cu_seqlens_padded.dim() == 1);
+  auto cu_seqlens_shape = cu_seqlens.shape();
+  NVTE_CHECK(cu_seqlens_shape[0] >= 2);  // batch + 1 entries, so at least one sequence
+  NVTE_CHECK(cu_seqlens_padded.shape()[0] == cu_seqlens_shape[0]);  // same batch in both arrays
+  NVTE_CHECK(total_tokens > 0);
+
+  auto inp_shape = inp.shape();
+  size_t row_elems = 1;
+  for (int i = 1; i < inp.dim(); i++) row_elems *= inp_shape[i];  // elements per token row
+  int hidden_size_in_bytes = (row_elems * typeToNumBits(inp.dtype())) / 8;  // bits -> bytes
+  NVTE_CHECK(hidden_size_in_bytes % 16 == 0);  // 128-bit load/store
+
+  int batch = cu_seqlens_shape[0] - 1;
+  constexpr unsigned int block = 256;  // 8 warps per block; grid below yields one warp per token
+  unsigned int grid = (static_cast<unsigned int>(total_tokens) * 32 + block - 1) / block;
+  thd_valid_copy_kernel<<<grid, block, sizeof(int) * 2 * (batch + 1), stream>>>(  // 2 smem arrays
+      out.data.dptr, inp.data.dptr, reinterpret_cast<int *>(cu_seqlens_padded.data.dptr),
+      reinterpret_cast<int *>(cu_seqlens.data.dptr), batch, total_tokens, hidden_size_in_bytes);
+  NVTE_CHECK_CUDA(cudaGetLastError());
+}
+
 }  // namespace context_parallel
 }  // namespace transformer_engine
 
@@ -750,3 +883,24 @@ void nvte_cp_thd_get_partitioned_indices(const NVTETensor &cu_seqlens, NVTETenso
                                                 *convertNVTETensorCheck(output), total_tokens,
                                                 world_size, rank, stream);
 }
+
+void nvte_cp_thd_reorder(const NVTETensor &inp, const NVTETensor &cu_seqlens, NVTETensor out,
+                         int world_size, int scatter, int total_tokens, cudaStream_t stream) {
+  NVTE_API_CALL(nvte_cp_thd_reorder);
+  using namespace transformer_engine;
+  // Convert the NVTETensor handles and forward to the C++ implementation (scatter: int -> bool).
+  context_parallel::thd_reorder(*convertNVTETensorCheck(inp), *convertNVTETensorCheck(cu_seqlens),
+                                *convertNVTETensorCheck(out), world_size, scatter != 0,
+                                total_tokens, stream);
+}
+
+void nvte_cp_thd_valid_copy(const NVTETensor &inp, const NVTETensor &cu_seqlens_padded,
+                            const NVTETensor &cu_seqlens, NVTETensor out, int total_tokens,
+                            cudaStream_t stream) {
+  NVTE_API_CALL(nvte_cp_thd_valid_copy);
+  using namespace transformer_engine;
+  // Convert the NVTETensor handles and forward to the C++ implementation on `stream`.
+  context_parallel::thd_valid_copy(
+      *convertNVTETensorCheck(inp), *convertNVTETensorCheck(cu_seqlens_padded),
+      *convertNVTETensorCheck(cu_seqlens), *convertNVTETensorCheck(out), total_tokens, stream);
+}
@@ -533,6 +533,41 @@ void nvte_cp_thd_get_partitioned_indices(const NVTETensor &cu_seqlens, NVTETenso
                                          int total_tokens, int world_size, int rank,
                                          cudaStream_t stream);
 
+/*!  \brief Fused dual-chunk THD reorder for Context Parallel (gather or scatter).
+ *
+ * Computes the dual-chunk source index inline (no materialized index tensor) and copies each
+ * token row. scatter=0: out[gi]=inp[src(gi)] (contiguous->rank-sharded); scatter=1:
+ * out[src(gi)]=inp[gi] (rank-sharded->contiguous). Row size must be a multiple of 16 bytes.
+ *
+ *  \param[in]     inp           Input THD tensor [total_tokens, ...].
+ *  \param[in]     cu_seqlens    Padded cumulative sequence lengths, [batch_size + 1], int32.
+ *  \param[out]    out           Output tensor, same shape/dtype as inp.
+ *  \param[in]     world_size    Context-parallel size; must be positive.
+ *  \param[in]     scatter       0 = gather (rank-sharded), 1 = scatter (contiguous).
+ *  \param[in]     total_tokens  Total padded tokens (= inp.shape[0]); multiple of 2 * world_size.
+ *  \param[in]     stream        CUDA stream used for this operation.
+ */
+void nvte_cp_thd_reorder(const NVTETensor &inp, const NVTETensor &cu_seqlens, NVTETensor out,
+                         int world_size, int scatter, int total_tokens, cudaStream_t stream);
+
+/*!  \brief Copy valid token rows of a per-step THD output/grad into an accumulator (CP AllGather).
+ *
+ * Sync-free replacement for the per-batch `.item()` slice-copy loops in the AllGather CP THD
+ * fwd/bwd. For each segment, copies rows [cu_seqlens_padded[b], cu_seqlens_padded[b]+valid_len_b)
+ * from inp to out at identical indices, leaving padded tails of out untouched. Row size must be a
+ * multiple of 16 bytes.
+ *
+ *  \param[in]     inp                 Per-step THD source tensor [total_tokens, ...].
+ *  \param[in]     cu_seqlens_padded   Padded cumulative sequence lengths, [batch_size + 1], int32.
+ *  \param[in]     cu_seqlens          Valid cumulative sequence lengths, [batch_size + 1], int32.
+ *  \param[in,out] out                 Destination accumulator, same shape/dtype as inp.
+ *  \param[in]     total_tokens        Total padded tokens (= inp.shape[0]); must be positive.
+ *  \param[in]     stream              CUDA stream used for this operation.
+ */
+void nvte_cp_thd_valid_copy(const NVTETensor &inp, const NVTETensor &cu_seqlens_padded,
+                            const NVTETensor &cu_seqlens, NVTETensor out, int total_tokens,
+                            cudaStream_t stream);
+
 /*!  \brief Convert tensor from THD to BSHD format.
  *
  * \warning   This API is **experimental** and subject to change.