[PyTorch] Preserve router leading dimensions

harryzhou2000 · harryzhou2000 · commit 49c6553641e6 · 2026-06-15T20:31:07.000-07:00
Signed-off-by: Harry Zhou <hhanyu@nvidia.com>
diff --git a/tests/pytorch/test_fused_router.py b/tests/pytorch/test_fused_router.py
@@ -390,6 +390,33 @@ def test_topk_softmax(
     )
 
 
+@pytest.mark.parametrize("topk_index_dtype", [None, torch.int16])  # without / with index buffer
+def test_topk_preserves_leading_dims(topk_index_dtype):  # 3-D logits: outputs keep leading dims
+    num_tokens = 128
+    num_experts = 32
+    topk = 4
+    logits = torch.randn(num_tokens, 2, num_experts, device="cuda", dtype=torch.float32)
+    topk_indices = None  # stays None when no index dtype is requested
+    if topk_index_dtype is not None:  # preallocate a [num_tokens, 2, topk] index buffer
+        topk_indices = torch.empty(num_tokens, 2, topk, device="cuda", dtype=topk_index_dtype)
+
+    probs, routing_output = fused_topk_with_score_function(
+        logits=logits,
+        topk=topk,
+        use_pre_softmax=False,
+        num_groups=None,
+        group_topk=None,
+        scaling_factor=None,
+        score_function="softmax",
+        expert_bias=None,
+        topk_indices=topk_indices,
+    )
+
+    assert probs.shape == logits.shape  # probs keep the full 3-D input shape
+    expected_routing_shape = topk_indices.shape if topk_indices is not None else logits.shape
+    assert routing_output.shape == expected_routing_shape  # index-buffer shape, else logits shape
+
+
 @pytest.mark.parametrize("dtype", [torch.float32])
 @pytest.mark.parametrize("num_tokens", [2048, 7168])
 @pytest.mark.parametrize("num_experts", [1024, 256, 128, 32])
diff --git a/transformer_engine/pytorch/csrc/extensions/router.cpp b/transformer_engine/pytorch/csrc/extensions/router.cpp
@@ -47,19 +47,18 @@ static bool is_supported_dense_index_dtype(at::ScalarType dtype) {
 }
 
 static void check_dense_topk_indices(const at::Tensor &topk_indices, const at::Tensor &ref,
-                                     int64_t num_tokens, int topk) {
+                                     c10::IntArrayRef leading_dims, int topk) {
   TORCH_CHECK(topk_indices.is_cuda(), "topk_indices must be a CUDA tensor");
   TORCH_CHECK(topk_indices.device() == ref.device(), "topk_indices must be on the same device as ",
               "the logits/grad tensor");
   TORCH_CHECK(topk_indices.is_contiguous(), "topk_indices must be contiguous");
   TORCH_CHECK(is_supported_dense_index_dtype(topk_indices.scalar_type()),
               "topk_indices dtype must be int16, int32, or int64, got ",
               topk_indices.scalar_type());
-  TORCH_CHECK(topk_indices.numel() == num_tokens * static_cast<int64_t>(topk),
-              "topk_indices must contain num_tokens * topk elements, got ", topk_indices.numel(),
-              " but expected ", num_tokens * static_cast<int64_t>(topk));
-  TORCH_CHECK(topk_indices.dim() >= 1 && topk_indices.size(-1) == topk,
-              "topk_indices last dimension must be topk=", topk, ", got shape ",
+  std::vector<int64_t> expected_shape(leading_dims.begin(), leading_dims.end());
+  expected_shape.push_back(static_cast<int64_t>(topk));  // now [*leading_dims, topk]
+  TORCH_CHECK(topk_indices.sizes() == expected_shape,  // every dim must match exactly
+              "topk_indices shape must be [*leading_dims, topk]=", expected_shape, ", got ",
               topk_indices.sizes());
 }
 
@@ -97,7 +96,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> fused_topk_with_score_function_fw
     TORCH_CHECK(routing_map_format == NVTE_ROUTING_MAP_FORMAT_BYTEMAP,
                 "topk_indices output cannot be combined with non-default routing_map_format; "
                 "dense top-k indices are returned instead of a routing map.");
-    check_dense_topk_indices(topk_indices.value(), logits, num_tokens, topk);
+    check_dense_topk_indices(topk_indices.value(), logits, sizes.slice(0, sizes.size() - 1), topk);
   }
 
   // Reformat the input to make it compatible with the kernel
@@ -179,7 +178,7 @@ void fused_topk_with_score_function_bwd(at::Tensor routing_map, at::Tensor inter
   TORCH_CHECK(topk > 0 && topk <= num_experts, "topk must be in [1, num_experts], got topk=", topk,
               " num_experts=", num_experts);
   if (use_dense_indices) {
-    check_dense_topk_indices(routing_map, grad_probs, num_tokens, topk);
+    check_dense_topk_indices(routing_map, grad_probs, sizes.slice(0, sizes.size() - 1), topk);
   }
 
   auto scaling_factor_value = scaling_factor.has_value() ? scaling_factor.value() : 1.0f;
diff --git a/transformer_engine/pytorch/router.py b/transformer_engine/pytorch/router.py
@@ -86,10 +86,6 @@ def forward(
         topk_indices: Optional[torch.Tensor],
     ):
         # pylint: disable=missing-function-docstring
-        tensor_shape = logits.shape
-        logits = logits.view(-1, tensor_shape[-1])
-        num_tokens = logits.size(0)
-        num_experts = logits.size(1)
         probs, routing_output, intermediate_output = tex.fused_topk_with_score_function_fwd(
             logits,
             topk,
@@ -104,31 +100,25 @@ def forward(
         )
         if topk_indices is not None:
             routing_output = topk_indices
-        probs = probs.view(tensor_shape)
         if topk_indices is not None:
             ctx.mark_dirty(topk_indices)
         ctx.mark_non_differentiable(routing_output)
         ctx.save_for_backward(routing_output, intermediate_output)
-        ctx.num_tokens = num_tokens
-        ctx.num_experts = num_experts
-        ctx.tensor_shape = tensor_shape
         ctx.use_pre_softmax = use_pre_softmax
         ctx.topk = topk
         ctx.scaling_factor = scaling_factor
         ctx.score_function = score_function
         ctx.routing_map_format = routing_map_format
-        ctx.logits_dtype = logits.dtype
         ctx.use_dense_indices = topk_indices is not None
         return probs, routing_output
 
     @staticmethod
     def backward(ctx, grad_probs, _):
         # pylint: disable=missing-function-docstring
         routing_map, intermediate_output = ctx.saved_tensors
-        grad_probs = grad_probs.contiguous().view(-1, ctx.tensor_shape[-1])
-        grad_logits = torch.empty(
-            (ctx.num_tokens, ctx.num_experts), dtype=ctx.logits_dtype, device=grad_probs.device
-        )
+        if not grad_probs.is_contiguous():
+            grad_probs = grad_probs.contiguous()
+        grad_logits = torch.empty_like(grad_probs)
         tex.fused_topk_with_score_function_bwd(
             routing_map,
             intermediate_output,
@@ -141,7 +131,6 @@ def backward(ctx, grad_probs, _):
             ctx.use_dense_indices,
             ctx.routing_map_format,
         )
-        grad_logits = grad_logits.view(ctx.tensor_shape)
         return grad_logits, None, None, None, None, None, None, None, None, None