[ExecuTorch][WebGPU] 2D compute dispatch tests — prefill golden + fold unit test

JCNTH · JCNTH · commit 3d20719c7abc · 2026-07-04T10:35:16.000-07:00
Pull Request resolved: #20584

**Test coverage for the 2D dispatch fold, stacked above the cap-lift op.**

**Problem**: The 2D fold is load-bearing index math — a wrong `{x, y}` means out-of-bounds writes or dropped threads — and the prefill shapes that exercise it previously threw at the 1D cap, so they were untested.

**Solution**: A device-free unit test for the fold arithmetic, plus two single-shot prefill SDPA golden configs that fold each kernel family.

- **Before**: no coverage for >65535-workgroup dispatch; `llama1b_prefill_512`/`_2048` shapes threw at the cap
- **After**: `fold_workgroup_count_2d` unit-tested at the cap boundaries, and the two prefill shapes run as goldens

**Implementation**:

- `test/native/test_dispatch_2d.cpp` — device-free unit test for `utils::fold_workgroup_count_2d`: the 1D fast path (`{count, 1}` for counts up to and including the 65535 cap), the near-square 2D fold just above the cap and at prefill-scale QK counts (131072 and 2097152), and the needs-3rd-dimension throw; asserts each folded `{x, y}` fits the per-dimension cap, covers `[0, count)`, and over-launches by fewer than `2*ceil(sqrt(count))` workgroups
- `llama1b_prefill_512` + `llama1b_prefill_2048` configs appended to the byte-mirrored `CONFIGS` (`test_sdpa.py`) and `kSdpaConfigs` (`test_webgpu_native.cpp`)
- Registers `webgpu_dispatch_2d_test` in CMake + the native CI script

**Constraints**:

- The Python/C++ config entries byte-mirror each other (kept in sync)
- `add` shares the element-form path with QK, so it is covered structurally; a dedicated >16M-element `add` fold case is omitted as disproportionate

Co-authored-with: Claude Code.

ghstack-source-id: 399812923
@exported-using-ghexport

Differential Revision: [D109517683](https://our.internmc.facebook.com/intern/diff/D109517683/)
diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
@@ -201,6 +201,14 @@ if(EXECUTORCH_BUILD_WEBGPU_TEST)
       webgpu_dynamic_shape_test test/native/test_dynamic_shape.cpp
     )
     target_link_libraries(webgpu_dynamic_shape_test PRIVATE GTest::gtest)
+
+    # Device-free fold unit test (gtest_main provides main; no device needed).
+    add_webgpu_native_test(
+      webgpu_dispatch_2d_test test/native/test_dispatch_2d.cpp
+    )
+    target_link_libraries(
+      webgpu_dispatch_2d_test PRIVATE GTest::gtest GTest::gtest_main
+    )
   endif()
   add_webgpu_native_test(webgpu_index_test test/native/test_index.cpp)
 endif()
diff --git a/backends/webgpu/scripts/test_webgpu_native_ci.sh b/backends/webgpu/scripts/test_webgpu_native_ci.sh
@@ -143,7 +143,7 @@ cmake \
     "${EXECUTORCH_ROOT}"
 
 # ── Build + run every native test target that exists in this tree ────────────
-TARGETS=(webgpu_native_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test webgpu_index_test)
+TARGETS=(webgpu_native_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test webgpu_index_test webgpu_dispatch_2d_test)
 BIN_DIR="${BUILD_DIR}/backends/webgpu"
 
 # Which targets are defined depends on which diffs are landed (native_test +
@@ -212,6 +212,8 @@ if [[ "${INDEX_OK}" == "1" && -x "${BIN_DIR}/webgpu_index_test" ]]; then
   "${BIN_DIR}/webgpu_index_test" "${INDEX_DIR}"
 fi
 [[ -x "${BIN_DIR}/webgpu_scratch_buffer_test" ]] && "${BIN_DIR}/webgpu_scratch_buffer_test"
+# Device-free: pure 2D workgroup-count fold unit test (no .pte, no GPU).
+[[ -x "${BIN_DIR}/webgpu_dispatch_2d_test" ]] && "${BIN_DIR}/webgpu_dispatch_2d_test"
 
 echo "=== WebGPU native tests on Dawn: all run targets passed ==="
 
diff --git a/backends/webgpu/test/native/test_dispatch_2d.cpp b/backends/webgpu/test/native/test_dispatch_2d.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Device-free unit test for the pure 2D workgroup-count fold that lifts the
+// 65535 per-dim dispatch cap. Exercises the fold arithmetic only — no GPU.
+
+#include <executorch/backends/webgpu/runtime/WebGPUUtils.h>
+
+#include <gtest/gtest.h>
+
+#include <cmath>
+#include <cstdint>
+
+using executorch::backends::webgpu::utils::fold_workgroup_count_2d;
+using executorch::backends::webgpu::utils::WgCount;
+
+namespace {
+
+constexpr uint32_t kMax = 65535u;
+
+// count <= max (cap included) -> {count, 1}: the unfolded 1D fast path.
+TEST(DispatchFold, FastPath1D) {
+  for (uint32_t count : {1u, kMax - 1u, kMax}) {
+    const WgCount got = fold_workgroup_count_2d(count, kMax, "test");
+    EXPECT_EQ(got.x, count) << "count=" << count; // tag which input failed
+    EXPECT_EQ(got.y, 1u) << "count=" << count;
+  }
+}
+
+// count > max -> near-square {x, y}: each dim <= cap, x*y >= count, and waste
+// (x*y - count) < 2*ceil(sqrt(count)); a flat {max, div_up} can idle ~half.
+TEST(DispatchFold, NearSquareFold) {
+  // 131072 = Hq*ceil(S/4)*ceil(ctx/4)/wg at S=ctx=2048 (32*512*512/64);
+  // 2097152 = large-S stress. NOTE(review): that formula gives 8192 (< cap) at
+  // S=512 -- confirm the "S=512 folds QK" note on the SDPA prefill configs.
+  for (uint32_t count :
+       {kMax + 1u, 2u * kMax, 2u * kMax + 1u, 131072u, 2097152u}) {
+    const WgCount got = fold_workgroup_count_2d(count, kMax, "test");
+    const uint64_t launched = static_cast<uint64_t>(got.x) * got.y;
+    const uint32_t root =
+        static_cast<uint32_t>(std::ceil(std::sqrt(static_cast<double>(count))));
+    EXPECT_LE(got.x, kMax) << "count=" << count;
+    EXPECT_LE(got.y, kMax) << "count=" << count;
+    EXPECT_GE(launched, count) << "count=" << count;
+    EXPECT_LT(launched, count + 2ull * root) // sum form: no unsigned wrap
+        << "count=" << count << " launched=" << launched;
+  }
+}
+
+// count > max^2 needs a 3rd dim (out of scope) -> throws; no uint32_t wrap.
+TEST(DispatchFold, ThrowsWhenNeeds3rdDimension) {
+  EXPECT_ANY_THROW(fold_workgroup_count_2d(kMax * kMax + 1u, kMax, "test"));
+}
+
+} // namespace
diff --git a/backends/webgpu/test/ops/test_sdpa.py b/backends/webgpu/test/ops/test_sdpa.py
@@ -61,6 +61,9 @@ class SdpaConfig:
     SdpaConfig("llama1b_decode", 32, 8, 64, 1, 512, 127),
     # D=6 is not a multiple of 4: the WebGPU head_dim%4 guard must reject it at load.
     SdpaConfig("reject_d6", 4, 4, 6, 4, 16, 0),
+    # 2D-dispatch cap (>65535 wg): S=512 folds QK; S=2048 folds QK+softmax+AV (cap+1).
+    SdpaConfig("llama1b_prefill_512", 32, 8, 64, 512, 512, 0),
+    SdpaConfig("llama1b_prefill_2048", 32, 8, 64, 2048, 2048, 0),
 ]
 
 
diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp
@@ -758,6 +758,18 @@ static const SdpaConfig kSdpaConfigs[] = {
      16.0f,
      /*required=*/false,
      /*expect_reject=*/true},
+    // 2D-dispatch cap (>65535 wg): S=512 folds QK; S=2048 folds QK+softmax+AV
+    // (cap+1).
+    {"llama1b_prefill_512", 32, 8, 64, 512, 512, 0, 16.0f, /*required=*/true},
+    {"llama1b_prefill_2048",
+     32,
+     8,
+     64,
+     2048,
+     2048,
+     0,
+     16.0f,
+     /*required=*/true},
 };
 
 // Ramp denominator; mirror of test_sdpa.py::_RAMP_DENOM (keep in sync).

Original file line number	Diff line number	Diff line change
`@@ -61,6 +61,9 @@ class SdpaConfig:`
`61`	`61`	`SdpaConfig("llama1b_decode", 32, 8, 64, 1, 512, 127),`
`62`	`62`	`# D=6 is not a multiple of 4: the WebGPU head_dim%4 guard must reject it at load.`
`63`	`63`	`SdpaConfig("reject_d6", 4, 4, 6, 4, 16, 0),`
	`64`	`+ # 2D-dispatch cap (>65535 wg): S=512 folds QK; S=2048 folds QK+softmax+AV (cap+1).`
	`65`	`+ SdpaConfig("llama1b_prefill_512", 32, 8, 64, 512, 512, 0),`
	`66`	`+ SdpaConfig("llama1b_prefill_2048", 32, 8, 64, 2048, 2048, 0),`
`64`	`67`	`]`
`65`	`68`
`66`	`69`