pytorch · JulianCloudNTH · Jun 27, 2026 · Jun 23, 2026 · Jun 25, 2026 · Jun 26, 2026
@@ -194,4 +194,5 @@ if(EXECUTORCH_BUILD_WEBGPU_TEST)
     target_compile_options(webgpu_op_test_util_test PRIVATE -fexceptions)
     set_property(TARGET webgpu_op_test_util_test PROPERTY CXX_STANDARD 17)
   endif()
+  add_webgpu_native_test(webgpu_index_test test/native/test_index.cpp)
 endif()
diff --git a/backends/webgpu/scripts/test_webgpu_native_ci.sh b/backends/webgpu/scripts/test_webgpu_native_ci.sh
@@ -45,6 +45,8 @@ DISPATCH_ORDER_DIR="/tmp/dispatch_order"
 DISPATCH_ORDER_OK=1
 UPDATE_CACHE_DIR="/tmp/update_cache"
 UPDATE_CACHE_OK=1
+INDEX_DIR="/tmp/index"
+INDEX_OK=1
 EMBEDDING_MODEL="/tmp/webgpu_embedding_q4gsw.pte"
 EMBEDDING_INDICES="/tmp/webgpu_embedding_q4gsw_indices.bin"
 EMBEDDING_GOLDEN="/tmp/webgpu_embedding_q4gsw_golden.bin"
@@ -104,6 +106,11 @@ export_update_cache_replay('${UPDATE_CACHE_DIR}')
 export_update_cache_negative('${UPDATE_CACHE_DIR}')
 " || { echo "WARN: update_cache export failed; skipping update_cache native test"; UPDATE_CACHE_OK=0; }
 
+# Non-fatal: a failed index export only clears INDEX_OK, which gates the
+# webgpu_index_test run step further down.
+$PYTHON_EXECUTABLE -c "
+from executorch.backends.webgpu.test.ops.index.test_index import export_all_index_models
+export_all_index_models('${INDEX_DIR}')
+" || { echo "WARN: index export failed; skipping index native test"; INDEX_OK=0; }
+
 # Non-fatal: a failed sdpa export makes the required 4k/8k configs hard-fail in
 # webgpu_native_test below (precise per-config error), so don't exit/mask here.
 $PYTHON_EXECUTABLE -c "
@@ -136,7 +143,7 @@ cmake \
     "${EXECUTORCH_ROOT}"
 
 # ── Build + run every native test target that exists in this tree ────────────
-TARGETS=(webgpu_native_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test)
+TARGETS=(webgpu_native_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test webgpu_index_test)
 BIN_DIR="${BUILD_DIR}/backends/webgpu"
 
 # Which targets are defined depends on which diffs are landed (native_test +
@@ -201,6 +208,9 @@ fi
 if [[ "${DISPATCH_ORDER_OK}" == "1" && -x "${BIN_DIR}/webgpu_dispatch_order_test" ]]; then
   "${BIN_DIR}/webgpu_dispatch_order_test" "${DISPATCH_ORDER_DIR}"
 fi
+if [[ "${INDEX_OK}" == "1" && -x "${BIN_DIR}/webgpu_index_test" ]]; then
+  "${BIN_DIR}/webgpu_index_test" "${INDEX_DIR}"
+fi
 [[ -x "${BIN_DIR}/webgpu_scratch_buffer_test" ]] && "${BIN_DIR}/webgpu_scratch_buffer_test"
 
 echo "=== WebGPU native tests on Dawn: all run targets passed ==="

diff --git a/backends/webgpu/test/TARGETS b/backends/webgpu/test/TARGETS
@@ -17,6 +17,19 @@ python_unittest(
     ],
 )
 
+# Export-side unit test for aten.index.Tensor. ops/index/test_index.py imports
+# torch, VulkanPartitioner and executorch.exir, hence the deps below.
+python_unittest(
+    name = "test_index",
+    srcs = [
+        "ops/index/test_index.py",
+    ],
+    deps = [
+        "//caffe2:torch",
+        "//executorch/backends/vulkan/partitioner:vulkan_partitioner",
+        "//executorch/backends/vulkan:vulkan_preprocess",
+        "//executorch/exir:lib",
+    ],
+)
+
 runtime.python_library(
     name = "tester",
     srcs = ["tester.py"],

diff --git a/backends/webgpu/test/native/test_index.cpp b/backends/webgpu/test/native/test_index.cpp
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUDevice.h>
+#include <executorch/extension/module/module.h>
+#include <executorch/extension/tensor/tensor.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <string>
+#include <vector>
+
+using namespace executorch::backends::webgpu;
+using namespace executorch::extension;
+using namespace executorch::runtime;
+
+namespace {
+
+// Case base names: one per test_index.py CONFIGS key, with the "index_" prefix
+// the exporter adds. For each name the case directory must hold <name>.pte,
+// <name>.self.bin, <name>.idx.bin and <name>.golden.bin. This list is
+// hard-coded, so update it by hand whenever CONFIGS changes.
+constexpr const char* kIndexCases[] = {
+    "index_n16_m5",
+    "index_n8_rev",
+    "index_n32_m3",
+    "index_n4_rep",
+};
+
+// Reads a whole binary file as a packed array of T (raw bytes, no conversion).
+// Trailing bytes that do not fill a complete element are ignored. Returns an
+// empty vector when the file cannot be opened, its size cannot be determined,
+// or fewer bytes than expected could be read, so callers can treat "empty" as
+// "unusable input" rather than comparing against zero-filled elements.
+template <typename T>
+std::vector<T> read_pod_bin(const std::string& path) {
+  std::ifstream f(path, std::ios::binary | std::ios::ate);
+  if (!f) {
+    return {};
+  }
+  // Opened at the end (ios::ate), so tellg() is the file size; -1 on failure.
+  const std::streamoff size = f.tellg();
+  if (size < 0) {
+    return {};
+  }
+  const size_t count = static_cast<size_t>(size) / sizeof(T);
+  std::vector<T> data(count);
+  if (count == 0) {
+    return data;
+  }
+  const std::streamsize bytes =
+      static_cast<std::streamsize>(count * sizeof(T));
+  f.seekg(0);
+  // A short read would leave value-initialized tail elements; reject it.
+  if (!f.read(reinterpret_cast<char*>(data.data()), bytes) ||
+      f.gcount() != bytes) {
+    return {};
+  }
+  return data;
+}
+
+// Reads `path` as raw float32 values; empty on any I/O error.
+std::vector<float> read_f32_bin(const std::string& path) {
+  return read_pod_bin<float>(path);
+}
+
+// Reads `path` as raw int32 values; empty on any I/O error.
+std::vector<int32_t> read_i32_bin(const std::string& path) {
+  return read_pod_bin<int32_t>(path);
+}
+
+// Runs one exported index.Tensor case end to end and compares with the golden.
+//
+// Loads <dir>/<name>.pte together with <name>.self.bin (fp32 self),
+// <name>.idx.bin (int32 indices) and <name>.golden.bin (fp32 expected output),
+// executes forward(self, idx) and checks the single 1-D output element-wise.
+// Returns true when the case passes; every failure prints a "FAIL: ..." line
+// and returns false.
+bool run_case(const std::string& dir, const char* name) {
+  // Bound applied to both the max absolute and the max relative error.
+  constexpr float kTolerance = 1e-3f;
+
+  printf("\n--- Test: %s ---\n", name);
+  const std::string base = dir + "/" + name;
+  std::vector<float> self_data = read_f32_bin(base + ".self.bin");
+  std::vector<int32_t> idx32 = read_i32_bin(base + ".idx.bin");
+  std::vector<float> golden = read_f32_bin(base + ".golden.bin");
+  if (self_data.empty() || idx32.empty() || golden.empty()) {
+    printf("FAIL: could not read self/idx/golden for %s\n", name);
+    return false;
+  }
+
+  Module module(base + ".pte");
+  if (module.load_forward() != Error::Ok) {
+    printf("FAIL: could not load %s.pte\n", name);
+    return false;
+  }
+
+  const int32_t n = static_cast<int32_t>(self_data.size());
+  const int32_t m = static_cast<int32_t>(idx32.size());
+  auto x = make_tensor_ptr({n}, std::vector<float>(self_data));
+  // int64 at the program boundary; copy_inputs narrows to the int32 buffer.
+  auto idx =
+      make_tensor_ptr({m}, std::vector<int64_t>(idx32.begin(), idx32.end()));
+
+  auto result = module.forward({EValue(x), EValue(idx)});
+  if (!result.ok()) {
+    printf(
+        "FAIL: forward failed (error %d)\n", static_cast<int>(result.error()));
+    return false;
+  }
+
+  const auto& outputs = result.get();
+  // index.Tensor has exactly one output of shape [num_indices]; fail loud else.
+  if (outputs.size() != 1 || !outputs[0].isTensor()) {
+    printf("FAIL: expected exactly one tensor output\n");
+    return false;
+  }
+  const auto& out_tensor = outputs[0].toTensor();
+  if (out_tensor.dim() != 1 || out_tensor.size(0) != m) {
+    printf(
+        "FAIL: output shape mismatch (dim %d size0 %d, expected [%d])\n",
+        static_cast<int>(out_tensor.dim()),
+        static_cast<int>(out_tensor.dim() == 1 ? out_tensor.size(0) : -1),
+        m);
+    return false;
+  }
+  if (static_cast<size_t>(out_tensor.numel()) != golden.size()) {
+    printf(
+        "FAIL: output numel %zu != golden %zu\n",
+        static_cast<size_t>(out_tensor.numel()),
+        golden.size());
+    return false;
+  }
+  const float* out_data = out_tensor.const_data_ptr<float>();
+
+  float max_abs_err = 0.0f;
+  float max_rel_err = 0.0f;
+  for (size_t i = 0; i < golden.size(); i++) {
+    const float abs_err = std::abs(out_data[i] - golden[i]);
+    // std::max(a, NaN) returns a, so a NaN output would never raise the
+    // running maxima and the case would pass; reject it explicitly.
+    if (std::isnan(abs_err)) {
+      printf("FAIL: %s produced NaN at element %zu\n", name, i);
+      return false;
+    }
+    max_abs_err = std::max(max_abs_err, abs_err);
+    // Floor the denominator so a zero golden value does not divide by zero.
+    const float denom = std::max(std::abs(golden[i]), 1e-6f);
+    max_rel_err = std::max(max_rel_err, abs_err / denom);
+  }
+  printf(
+      "Max abs error: %e   Max rel error: %e (%zu elements)\n",
+      max_abs_err,
+      max_rel_err,
+      golden.size());
+  if (max_abs_err > kTolerance || max_rel_err > kTolerance) {
+    printf("FAIL: %s exceeds tolerance 1e-3\n", name);
+    return false;
+  }
+  printf("PASS: %s\n", name);
+  return true;
+}
+
+} // namespace
+
+// Entry point. Usage: webgpu_index_test [case_dir]
+//
+// The case directory is "/tmp/index" unless argv[1] is given; a set
+// WEBGPU_INDEX_DIR environment variable overrides both. Exits 0 when every case
+// passes or when no WebGPU context can be created (reported as SKIP), and 1 if
+// any case fails.
+int main(int argc, char** argv) {
+  std::string dir = (argc > 1) ? argv[1] : "/tmp/index";
+  const char* const env_dir = std::getenv("WEBGPU_INDEX_DIR");
+  if (env_dir != nullptr) {
+    dir = env_dir;
+  }
+
+  WebGPUContext ctx;
+  try {
+    ctx = create_webgpu_context();
+  } catch (const std::exception& err) {
+    printf("SKIP: %s\n", err.what());
+    return 0;
+  }
+  set_default_webgpu_context(&ctx);
+  printf("WebGPU device acquired (native); case dir: %s\n", dir.c_str());
+
+  // Run every case even after a failure so the log covers all of them.
+  bool all_passed = true;
+  for (const char* case_name : kIndexCases) {
+    if (!run_case(dir, case_name)) {
+      all_passed = false;
+    }
+  }
+
+  set_default_webgpu_context(nullptr);
+  destroy_webgpu_context(ctx);
+
+  if (all_passed) {
+    printf("\nAll index tests passed\n");
+    return 0;
+  }
+  return 1;
+}
diff --git a/backends/webgpu/test/ops/index/__init__.py b/backends/webgpu/test/ops/index/__init__.py
diff --git a/backends/webgpu/test/ops/index/test_index.py b/backends/webgpu/test/ops/index/test_index.py
@@ -0,0 +1,106 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""`aten.index.Tensor` export + goldens for the WebGPU backend.
+
+Exports the 1D-self advanced-index form `self[idx]` through VulkanPartitioner --
+the only delegated index.Tensor (the 2D mask/freqs gathers are CPU fallbacks; see
+op_registry.py:1427). It is a flat gather out[i]=self[index[i]]; the int64 index
+serializes as int32 (downcast_64_bit). Distinct self values + reorder/repeat
+indices make a wrong-gather bug visible. Each config writes `index_<name>.pte`,
+`index_<name>.self.bin` (fp32 self), `index_<name>.idx.bin` (int32 index), and
+`index_<name>.golden.bin`; the native `test_index` loads them by name from its
+own hard-coded case list, which must be kept in sync with CONFIGS.
+"""
+
+import os
+import unittest
+
+import torch
+
+from executorch.backends.vulkan import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# name -> (self_len, index_values)
+# Each key becomes the `index_<name>` file stem written by
+# export_all_index_models. The cases cover repeated, reversed, boundary and
+# more-indices-than-elements gathers; all index values lie in [0, self_len).
+CONFIGS = {
+    "n16_m5": (16, [0, 15, 7, 7, 2]),
+    "n8_rev": (8, [7, 6, 5, 4, 3, 2, 1, 0]),
+    "n32_m3": (32, [31, 0, 16]),
+    "n4_rep": (4, [2, 2, 2, 2, 0, 1]),
+}
+
+
+class IndexModule(torch.nn.Module):
+    """Minimal module whose forward is the advanced-index gather `x[idx]`."""
+
+    def forward(self, x: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
+        # Kept as plain tensor indexing: this is the expression the tests in
+        # this file expect to see exported as index.Tensor.
+        return x[idx]
+
+
+def _inputs(self_len, index_values):
+    """Build the (self, index) example inputs for one config.
+
+    `self` is float32 with values 3*i + 0.5, so every element is distinct and a
+    gather from the wrong position changes the result; the index is int64.
+    """
+    positions = torch.arange(self_len, dtype=torch.float32)
+    self_values = positions.mul(3.0).add(0.5)
+    gather_index = torch.tensor(index_values, dtype=torch.int64)
+    return self_values, gather_index
+
+
+def _lower(x, idx):
+    """Export IndexModule on (x, idx) and lower it through VulkanPartitioner."""
+    module = IndexModule().eval()
+    exported = torch.export.export(module, (x, idx))
+    return to_edge_transform_and_lower(exported, partitioner=[VulkanPartitioner()])
+
+
+def _export(x, idx):
+    """Lower IndexModule for (x, idx) and convert the result with to_executorch()."""
+    edge = _lower(x, idx)
+    return edge.to_executorch()
+
+
+def _delegated(et) -> bool:
+    """Return True if any execution plan carries a "VulkanBackend" delegate."""
+    for plan in et.executorch_program.execution_plan:
+        for delegate in plan.delegates:
+            if delegate.id == "VulkanBackend":
+                return True
+    return False
+
+
+def _op_delegated(edge, op_substr: str) -> bool:
+    """Return True when no top-level graph node's target contains `op_substr`.
+
+    A surviving top-level node means the op was left as a CPU fallback instead
+    of being absorbed into the delegate.
+    """
+    graph_module = edge.exported_program().graph_module
+    for node in graph_module.graph.nodes:
+        target_name = str(getattr(node, "target", ""))
+        if op_substr in target_name:
+            return False
+    return True
+
+
+class TestIndex(unittest.TestCase):
+    """Export-side checks for the 1D `self[idx]` index.Tensor configs."""
+
+    def test_export_delegates(self) -> None:
+        """Every config yields a VulkanBackend delegate that absorbs index.Tensor."""
+        for name, (n, iv) in CONFIGS.items():
+            # subTest reports each failing config instead of stopping at the first.
+            with self.subTest(config=name):
+                edge = _lower(*_inputs(n, iv))
+                et = edge.to_executorch()
+                self.assertTrue(
+                    _delegated(et),
+                    f"Expected a VulkanBackend delegate (index {name})",
+                )
+                self.assertTrue(
+                    _op_delegated(edge, "index.Tensor"),
+                    f"index.Tensor not delegated (fell back to CPU) for {name}",
+                )
+
+    def test_golden_matches_eager(self) -> None:
+        """Eager `x[idx]` equals a gather assembled one element at a time."""
+        for name, (n, iv) in CONFIGS.items():
+            with self.subTest(config=name):
+                x, idx = _inputs(n, iv)
+                # Reference built from scalar reads so the check does not
+                # compare x[idx] with itself.
+                expected = torch.tensor(
+                    [x[i].item() for i in iv], dtype=torch.float32
+                )
+                torch.testing.assert_close(IndexModule()(x, idx), expected)
+
+
+def export_all_index_models(out_dir: str) -> None:
+    """Export every CONFIGS entry into `out_dir`, creating the directory if needed.
+
+    Per config this writes `index_<name>.pte` plus three raw little-endian
+    dumps: `.self.bin` (float32 self), `.idx.bin` (index cast to int32) and
+    `.golden.bin` (float32 eager result of `x[idx]`).
+    """
+    os.makedirs(out_dir, exist_ok=True)
+    for cfg_name, (self_len, index_values) in CONFIGS.items():
+        self_tensor, index_tensor = _inputs(self_len, index_values)
+        eager_out = self_tensor[index_tensor].contiguous().detach()
+        expected = eager_out.numpy().astype("<f4")
+        program = _export(self_tensor, index_tensor)
+        stem = os.path.join(out_dir, f"index_{cfg_name}")
+        with open(stem + ".pte", "wb") as pte_file:
+            pte_file.write(program.buffer)
+        self_tensor.numpy().astype("<f4").tofile(stem + ".self.bin")
+        index_tensor.numpy().astype("<i4").tofile(stem + ".idx.bin")
+        expected.tofile(stem + ".golden.bin")
+        print(
+            f"Exported {stem}.pte; self {self_len} -> golden {expected.size} floats"
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()