Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions backends/webgpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -194,4 +194,5 @@ if(EXECUTORCH_BUILD_WEBGPU_TEST)
target_compile_options(webgpu_op_test_util_test PRIVATE -fexceptions)
set_property(TARGET webgpu_op_test_util_test PROPERTY CXX_STANDARD 17)
endif()
add_webgpu_native_test(webgpu_index_test test/native/test_index.cpp)
endif()
12 changes: 11 additions & 1 deletion backends/webgpu/scripts/test_webgpu_native_ci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ DISPATCH_ORDER_DIR="/tmp/dispatch_order"
DISPATCH_ORDER_OK=1
UPDATE_CACHE_DIR="/tmp/update_cache"
UPDATE_CACHE_OK=1
INDEX_DIR="/tmp/index"
INDEX_OK=1
EMBEDDING_MODEL="/tmp/webgpu_embedding_q4gsw.pte"
EMBEDDING_INDICES="/tmp/webgpu_embedding_q4gsw_indices.bin"
EMBEDDING_GOLDEN="/tmp/webgpu_embedding_q4gsw_golden.bin"
Expand Down Expand Up @@ -104,6 +106,11 @@ export_update_cache_replay('${UPDATE_CACHE_DIR}')
export_update_cache_negative('${UPDATE_CACHE_DIR}')
" || { echo "WARN: update_cache export failed; skipping update_cache native test"; UPDATE_CACHE_OK=0; }

# Export the index models plus their self/idx/golden bins into INDEX_DIR.
# Non-fatal: on failure INDEX_OK is cleared so the webgpu_index_test run
# further down is skipped rather than failing the whole script here.
$PYTHON_EXECUTABLE -c "
from executorch.backends.webgpu.test.ops.index.test_index import export_all_index_models
export_all_index_models('${INDEX_DIR}')
" || { echo "WARN: index export failed; skipping index native test"; INDEX_OK=0; }

# Non-fatal: a failed sdpa export makes the required 4k/8k configs hard-fail in
# webgpu_native_test below (precise per-config error), so don't exit/mask here.
$PYTHON_EXECUTABLE -c "
Expand Down Expand Up @@ -136,7 +143,7 @@ cmake \
"${EXECUTORCH_ROOT}"

# ── Build + run every native test target that exists in this tree ────────────
TARGETS=(webgpu_native_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test)
TARGETS=(webgpu_native_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test webgpu_index_test)
BIN_DIR="${BUILD_DIR}/backends/webgpu"

# Which targets are defined depends on which diffs are landed (native_test +
Expand Down Expand Up @@ -201,6 +208,9 @@ fi
if [[ "${DISPATCH_ORDER_OK}" == "1" && -x "${BIN_DIR}/webgpu_dispatch_order_test" ]]; then
"${BIN_DIR}/webgpu_dispatch_order_test" "${DISPATCH_ORDER_DIR}"
fi
# Run the index test only when its export succeeded (INDEX_OK) and the binary
# exists and is executable; the export directory is passed as argv[1].
if [[ "${INDEX_OK}" == "1" && -x "${BIN_DIR}/webgpu_index_test" ]]; then
"${BIN_DIR}/webgpu_index_test" "${INDEX_DIR}"
fi
[[ -x "${BIN_DIR}/webgpu_scratch_buffer_test" ]] && "${BIN_DIR}/webgpu_scratch_buffer_test"

echo "=== WebGPU native tests on Dawn: all run targets passed ==="
Expand Down
13 changes: 13 additions & 0 deletions backends/webgpu/test/TARGETS
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,19 @@ python_unittest(
],
)

# Python unit test for the index export (ops/index/test_index.py). The Vulkan
# partitioner/preprocess deps are needed because that test lowers through
# VulkanPartitioner.
python_unittest(
name = "test_index",
srcs = [
"ops/index/test_index.py",
],
deps = [
"//caffe2:torch",
"//executorch/backends/vulkan/partitioner:vulkan_partitioner",
"//executorch/backends/vulkan:vulkan_preprocess",
"//executorch/exir:lib",
],
)

runtime.python_library(
name = "tester",
srcs = ["tester.py"],
Expand Down
174 changes: 174 additions & 0 deletions backends/webgpu/test/native/test_index.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/backends/webgpu/runtime/WebGPUDevice.h>
#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor.h>

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <string>
#include <vector>

using namespace executorch::backends::webgpu;
using namespace executorch::extension;
using namespace executorch::runtime;

namespace {

// Case names, one per exported config. Each name is the file stem under the
// case directory: run_case() loads `<name>.pte`, `<name>.self.bin`,
// `<name>.idx.bin` and `<name>.golden.bin`.
// Names mirror test_index.py CONFIGS (self/idx/golden bins written per case);
// this list is hard-coded, so it has to be updated by hand when CONFIGS changes.
constexpr const char* kIndexCases[] = {
"index_n16_m5",
"index_n8_rev",
"index_n32_m3",
"index_n4_rep",
};

// Loads a raw binary file of fp32 values into a vector.
//
// Bytes are copied verbatim into host floats (no byte-order conversion), and
// trailing bytes that do not form a whole float are ignored. Returns an empty
// vector when the file cannot be opened, its size cannot be determined, it
// holds no complete float, or the payload cannot be read in full.
std::vector<float> read_f32_bin(const std::string& path) {
  std::ifstream f(path, std::ios::binary | std::ios::ate);
  if (!f) {
    return {};
  }
  // tellg() reports -1 on failure; casting that straight to size_t would ask
  // for a near-SIZE_MAX allocation below.
  const std::streamoff end = f.tellg();
  if (end <= 0) {
    return {};
  }
  const size_t count = static_cast<size_t>(end) / sizeof(float);
  if (count == 0) {
    return {};
  }
  std::vector<float> data(count);
  f.seekg(0);
  // A short read would leave a zero-filled tail that looks like real data, so
  // treat it as a read failure instead of returning a partially filled vector.
  if (!f.read(
          reinterpret_cast<char*>(data.data()),
          static_cast<std::streamsize>(count * sizeof(float)))) {
    return {};
  }
  return data;
}

// Loads a raw binary file of int32 values into a vector.
//
// Bytes are copied verbatim into host int32_t (no byte-order conversion), and
// trailing bytes that do not form a whole element are ignored. Returns an
// empty vector when the file cannot be opened, its size cannot be determined,
// it holds no complete element, or the payload cannot be read in full.
std::vector<int32_t> read_i32_bin(const std::string& path) {
  std::ifstream f(path, std::ios::binary | std::ios::ate);
  if (!f) {
    return {};
  }
  // tellg() reports -1 on failure; casting that straight to size_t would ask
  // for a near-SIZE_MAX allocation below.
  const std::streamoff end = f.tellg();
  if (end <= 0) {
    return {};
  }
  const size_t count = static_cast<size_t>(end) / sizeof(int32_t);
  if (count == 0) {
    return {};
  }
  std::vector<int32_t> data(count);
  f.seekg(0);
  // A short read would leave a zero-filled tail, i.e. bogus index 0 entries,
  // so treat it as a read failure instead of returning partial data.
  if (!f.read(
          reinterpret_cast<char*>(data.data()),
          static_cast<std::streamsize>(count * sizeof(int32_t)))) {
    return {};
  }
  return data;
}

// Runs one exported index case end to end and checks it against its golden.
//
// Reads `<dir>/<name>.self.bin`, `.idx.bin` and `.golden.bin`, loads
// `<dir>/<name>.pte`, calls forward(self, idx) and requires exactly one 1-D
// tensor output with one element per index whose values match the golden
// within 1e-3 absolute and relative error. Prints a PASS/FAIL line for the
// case and returns true only on PASS.
bool run_case(const std::string& dir, const char* name) {
  printf("\n--- Test: %s ---\n", name);
  const std::string base = dir + "/" + name;
  std::vector<float> self_data = read_f32_bin(base + ".self.bin");
  std::vector<int32_t> idx32 = read_i32_bin(base + ".idx.bin");
  std::vector<float> golden = read_f32_bin(base + ".golden.bin");
  if (self_data.empty() || idx32.empty() || golden.empty()) {
    printf("FAIL: could not read self/idx/golden for %s\n", name);
    return false;
  }

  Module module(base + ".pte");
  if (module.load_forward() != Error::Ok) {
    printf("FAIL: could not load %s.pte\n", name);
    return false;
  }

  const int32_t n = static_cast<int32_t>(self_data.size());
  const int32_t m = static_cast<int32_t>(idx32.size());
  auto x = make_tensor_ptr({n}, std::vector<float>(self_data));
  // int64 at the program boundary; copy_inputs narrows to the int32 buffer.
  // The widened vector is built directly as the argument (no named copy).
  auto idx =
      make_tensor_ptr({m}, std::vector<int64_t>(idx32.begin(), idx32.end()));

  auto result = module.forward({EValue(x), EValue(idx)});
  if (!result.ok()) {
    printf(
        "FAIL: forward failed (error %d)\n", static_cast<int>(result.error()));
    return false;
  }

  const auto& outputs = result.get();
  // index.Tensor has exactly one output of shape [num_indices]; fail loud else.
  if (outputs.size() != 1 || !outputs[0].isTensor()) {
    printf("FAIL: expected exactly one tensor output\n");
    return false;
  }
  const auto& out_tensor = outputs[0].toTensor();
  if (out_tensor.dim() != 1 || out_tensor.size(0) != m) {
    printf(
        "FAIL: output shape mismatch (dim %d size0 %d, expected [%d])\n",
        static_cast<int>(out_tensor.dim()),
        static_cast<int>(out_tensor.dim() == 1 ? out_tensor.size(0) : -1),
        m);
    return false;
  }
  if (static_cast<size_t>(out_tensor.numel()) != golden.size()) {
    printf(
        "FAIL: output numel %zu != golden %zu\n",
        static_cast<size_t>(out_tensor.numel()),
        golden.size());
    return false;
  }
  // NOTE(review): the output is read as float without checking its dtype here.
  const float* out_data = out_tensor.const_data_ptr<float>();

  float max_abs_err = 0.0f;
  float max_rel_err = 0.0f;
  size_t non_finite = 0;
  for (size_t i = 0; i < golden.size(); i++) {
    const float abs_err = std::abs(out_data[i] - golden[i]);
    // std::max(acc, NaN) returns acc, so a NaN error would never reach the
    // running maxima and the case would pass; count such elements explicitly.
    if (!std::isfinite(abs_err)) {
      non_finite++;
      continue;
    }
    max_abs_err = std::max(max_abs_err, abs_err);
    // Floor the denominator so a zero golden does not divide by zero.
    const float denom = std::max(std::abs(golden[i]), 1e-6f);
    max_rel_err = std::max(max_rel_err, abs_err / denom);
  }
  printf(
      "Max abs error: %e Max rel error: %e (%zu elements)\n",
      max_abs_err,
      max_rel_err,
      golden.size());
  if (non_finite != 0) {
    printf(
        "FAIL: %s has %zu element(s) with a non-finite (NaN/Inf) error\n",
        name,
        non_finite);
    return false;
  }
  if (max_abs_err > 1e-3f || max_rel_err > 1e-3f) {
    printf("FAIL: %s exceeds tolerance 1e-3\n", name);
    return false;
  }
  printf("PASS: %s\n", name);
  return true;
}

} // namespace

// Entry point: runs every case in kIndexCases on a native WebGPU device.
//
// The case directory defaults to /tmp/index, is replaced by argv[1] when one
// is given, and WEBGPU_INDEX_DIR (when set) overrides both. Returns 0 when all
// cases pass, and also when no WebGPU context can be created (reported as
// SKIP); returns 1 when any case fails.
int main(int argc, char** argv) {
  const char* const env_dir = std::getenv("WEBGPU_INDEX_DIR");
  const std::string case_dir =
      env_dir != nullptr ? env_dir : (argc > 1 ? argv[1] : "/tmp/index");

  WebGPUContext ctx;
  try {
    ctx = create_webgpu_context();
  } catch (const std::exception& err) {
    // No usable device: report it and exit successfully rather than failing.
    printf("SKIP: %s\n", err.what());
    return 0;
  }
  set_default_webgpu_context(&ctx);
  printf("WebGPU device acquired (native); case dir: %s\n", case_dir.c_str());

  // Keep going after a failure so the log covers every case.
  int failed_cases = 0;
  for (const char* case_name : kIndexCases) {
    if (!run_case(case_dir, case_name)) {
      ++failed_cases;
    }
  }

  // Unregister the default context before destroying it.
  set_default_webgpu_context(nullptr);
  destroy_webgpu_context(ctx);

  if (failed_cases == 0) {
    printf("\nAll index tests passed\n");
    return 0;
  }
  return 1;
}
Empty file.
106 changes: 106 additions & 0 deletions backends/webgpu/test/ops/index/test_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""`aten.index.Tensor` export + goldens for the WebGPU backend.

Exports the 1D-self advanced-index form `self[idx]` through VulkanPartitioner --
the only delegated index.Tensor (the 2D mask/freqs gathers are CPU fallbacks; see
op_registry.py:1427). It is a flat gather out[i]=self[index[i]]; the int64 index
serializes as int32 (downcast_64_bit). Distinct self values + reorder/repeat
indices make a wrong-gather bug visible. Each config writes `index_<name>.pte`,
`index_<name>.self.bin` (fp32 self), `index_<name>.idx.bin` (int32 index), and
`index_<name>.golden.bin`; the native `test_index` loads them by case name from
its own hard-coded list, which must be kept in sync with CONFIGS.
"""

import os
import unittest

import torch

from executorch.backends.vulkan import VulkanPartitioner
from executorch.exir import to_edge_transform_and_lower

# name -> (self_len, index_values)
# Each name becomes the `index_<name>` file stem written by
# export_all_index_models. index_values are positions into a 1-D self tensor
# of length self_len.
CONFIGS = {
# First, last and a repeated middle element.
"n16_m5": (16, [0, 15, 7, 7, 2]),
# Full reversal: as many indices as self elements.
"n8_rev": (8, [7, 6, 5, 4, 3, 2, 1, 0]),
# Few indices into a longer self: last, first, middle.
"n32_m3": (32, [31, 0, 16]),
# More indices than self elements, mostly repeats.
"n4_rep": (4, [2, 2, 2, 2, 0, 1]),
}


class IndexModule(torch.nn.Module):
    """Minimal module whose forward is the advanced-index gather ``x[idx]``."""

    def forward(self, x: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
        # Deliberately plain tensor-subscript indexing: the tests in this file
        # expect this to show up as `index.Tensor` after export.
        return x[idx]


def _inputs(self_len, index_values):
# Distinct self values so a wrong-index gather is visible.
x = torch.arange(self_len, dtype=torch.float32) * 3.0 + 0.5
idx = torch.tensor(index_values, dtype=torch.int64)
return x, idx


def _lower(x, idx):
    """Export ``IndexModule`` on ``(x, idx)`` and lower it with VulkanPartitioner.

    Returns whatever ``to_edge_transform_and_lower`` produces; callers in this
    file use its ``exported_program()`` and ``to_executorch()`` methods.
    """
    example_args = (x, idx)
    exported = torch.export.export(IndexModule().eval(), example_args)
    return to_edge_transform_and_lower(exported, partitioner=[VulkanPartitioner()])


def _export(x, idx):
    """Lower ``IndexModule`` for ``(x, idx)`` and convert it with ``to_executorch()``."""
    lowered = _lower(x, idx)
    return lowered.to_executorch()


def _delegated(et) -> bool:
return any(
d.id == "VulkanBackend"
for plan in et.executorch_program.execution_plan
for d in plan.delegates
)


def _op_delegated(edge, op_substr: str) -> bool:
# op must be absorbed into the delegate, not left as a top-level CPU-fallback node.
gm = edge.exported_program().graph_module
return all(op_substr not in str(getattr(n, "target", "")) for n in gm.graph.nodes)


class TestIndex(unittest.TestCase):
    """Export-time checks for the 1-D ``self[idx]`` index configs in CONFIGS."""

    def test_export_delegates(self) -> None:
        """Every config lowers to a VulkanBackend delegate that absorbs index.Tensor."""
        for name, (n, iv) in CONFIGS.items():
            # subTest keeps going after a failure so every config is reported.
            with self.subTest(config=name):
                edge = _lower(*_inputs(n, iv))
                et = edge.to_executorch()
                self.assertTrue(
                    _delegated(et), f"Expected a VulkanBackend delegate (index {name})"
                )
                self.assertTrue(
                    _op_delegated(edge, "index.Tensor"),
                    f"index.Tensor not delegated (fell back to CPU) for {name}",
                )

    def test_golden_matches_eager(self) -> None:
        """The eager module output equals an element-by-element gather."""
        for name, (n, iv) in CONFIGS.items():
            with self.subTest(config=name):
                x, idx = _inputs(n, iv)
                # Build the expectation one element at a time from the Python
                # index list. Comparing against x[idx] would repeat the exact
                # expression IndexModule evaluates and could never fail.
                expected = torch.stack([x[i] for i in iv])
                torch.testing.assert_close(IndexModule()(x, idx), expected)


def export_all_index_models(out_dir: str) -> None:
    """Write the program and its data files for every entry in CONFIGS.

    For each config this creates, under ``out_dir`` (created if missing),
    ``index_<name>.pte`` plus three raw little-endian sidecars: ``.self.bin``
    (fp32 self), ``.idx.bin`` (index narrowed to int32) and ``.golden.bin``
    (fp32 eager result of ``x[idx]``).
    """
    os.makedirs(out_dir, exist_ok=True)
    for name, (self_len, index_values) in CONFIGS.items():
        x, idx = _inputs(self_len, index_values)
        golden = x[idx].contiguous().detach().numpy().astype("<f4")
        program = _export(x, idx)
        base = os.path.join(out_dir, f"index_{name}")

        with open(base + ".pte", "wb") as pte_file:
            pte_file.write(program.buffer)

        # Suffix -> array, written in the same order as the file names above.
        sidecars = (
            (".self.bin", x.numpy().astype("<f4")),
            (".idx.bin", idx.numpy().astype("<i4")),
            (".golden.bin", golden),
        )
        for suffix, array in sidecars:
            array.tofile(base + suffix)

        print(f"Exported {base}.pte; self {self_len} -> golden {golden.size} floats")


# Running this file directly executes the unittest cases above; the export
# itself is triggered separately by calling export_all_index_models().
if __name__ == "__main__":
    unittest.main()
Loading