@@ -439,15 +439,20 @@ def fn(a):
 MXFP8_GROUPED_MSG = "MXFP8 grouped GEMM is only supported when PyTorch is built with USE_FBGEMM_GENAI=1 on SM100+"
 
 @requiresCUDA
-def test_scaled_grouped_mm_3d2d_rowwise():
+@pytest.mark.parametrize(
+    "group_sizes,k,n",
+    [
+        ([8, 8], 16, 16),
+        ([16, 16], 16, 16),
+    ],
+)
+def test_scaled_grouped_mm_2d3d_rowwise(group_sizes, k, n):
+    """Test 2D x 3D grouped matmul with various dimensions."""
     if not bool(PLATFORM_SUPPORTS_FP8_GROUPED_GEMM):
         pytest.skip(F8_GROUPED_MSG)
     device = "cuda"
-    group_sizes = [16, 16]
     groups = len(group_sizes)
     total_rows = sum(group_sizes)
-    k = 16
-    n = 16
 
     mat_a = torch.randn(total_rows, k, device=device, dtype=torch.bfloat16).to(torch.float8_e4m3fn)
     mat_b = torch.randn(groups, n, k, device=device, dtype=torch.bfloat16).to(torch.float8_e4m3fn)
@@ -475,21 +480,29 @@ def fn(a, b, scale_a, scale_b, offs):
     assert_consistency_of_compiletime_and_runtime(jitted, result)
 
 @requiresCUDA
-def test_scaled_grouped_mm_2d3d_rowwise():
+@pytest.mark.parametrize(
+    "group_sizes,m,k,n",
+    [
+        ([8, 8], 16, 32, 16),  # k != n to catch the dimension check bug
+        ([8, 8], 16, 16, 16),  # k == n edge case
+    ],
+)
+def test_scaled_grouped_mm_3d2d_rowwise(group_sizes, m, k, n):
+    """Test 3D x 2D grouped matmul with various dimensions.
+
+    Note: k != n in the first test case specifically catches the bug where
+    mat_a.shape[2] was incorrectly compared with mat_b.shape[1].
+    """
     if not bool(PLATFORM_SUPPORTS_FP8_GROUPED_GEMM):
         pytest.skip(F8_GROUPED_MSG)
     device = "cuda"
-    group_sizes = [8, 8]
     groups = len(group_sizes)
-    total_rows = sum(group_sizes)
-    k = 16
-    n = 16
 
-    mat_a = torch.randn(total_rows, k, device=device, dtype=torch.bfloat16).to(torch.float8_e4m3fn)
-    mat_b = torch.randn(groups, n, k, device=device, dtype=torch.bfloat16).to(torch.float8_e4m3fn)
+    mat_a = torch.randn(groups, m, k, device=device, dtype=torch.bfloat16).to(torch.float8_e4m3fn)
+    mat_b = torch.randn(n, k, device=device, dtype=torch.bfloat16).to(torch.float8_e4m3fn)
     offs = torch.tensor(group_sizes, device=device, dtype=torch.int32).cumsum(0, dtype=torch.int32)
-    scale_a = torch.ones(total_rows, device=device, dtype=torch.float32)
-    scale_b = torch.ones(groups, n, device=device, dtype=torch.float32)
+    scale_a = torch.ones(groups, m, device=device, dtype=torch.float32)
+    scale_b = torch.ones(n, device=device, dtype=torch.float32)
 
     def fn(a, b, scale_a, scale_b, offs):
         return torch.nn.functional.scaled_grouped_mm(
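The docstring's rationale for the k != n parametrization can be shown in isolation: when k and n happen to be equal, a shape check that reads the wrong axis of mat_b still passes by coincidence, so only a k != n case surfaces the mix-up. The snippet below is a minimal, self-contained illustration of that effect only; it is not the code this PR touches and does not claim which comparison the fix uses.

    # Illustrative only: why equal k and n can mask a swapped-axis shape check.
    import torch

    # k == n == 16: comparing mat_a's k against the wrong axis of mat_b
    # "works" by accident, so a buggy check would not be caught.
    mat_a = torch.empty(2, 16, 16)  # (groups, m, k)
    mat_b = torch.empty(16, 16)     # (n, k)
    assert mat_a.shape[2] == mat_b.shape[0]  # k vs n, but 16 == 16 hides the mistake

    # k = 32, n = 16: the same wrong comparison now fails, which is what the
    # ([8, 8], 16, 32, 16) parametrization above relies on to expose the bug.
    mat_a = torch.empty(2, 16, 32)  # (groups, m, k)
    mat_b = torch.empty(16, 32)     # (n, k)
    assert mat_a.shape[2] != mat_b.shape[0]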