[ExecuTorch][WebGPU] Add permute_copy + IntList graph support (aten.permute_copy.default) (#20396)

JulianCloudNTH · web-flow · commit 11567a34bf90 · 2026-06-26T10:19:58.000-07:00
Stack from [ghstack](https://github.com/ezyang/ghstack/tree/0.15.0) (oldest at bottom): * #20465 * #20464 * #20463 * #20435 * #20399 * #20398 * #20397 * __->__ #20396 * #20395 * #20394 * #20393 * #20392 * #20391 * #20390 * #20363 * #20362 * #20361 * #20360 * #20359

Adds `aten.permute_copy.default` (a coordinate-reorder gather) to the WebGPU delegate, together with the `IntList` graph value type it needs in order to read its `dims` argument.

Composition:
- `runtime/WebGPUGraph.{h,cpp}` — adds `ValueType::IntList`, backed by `std::vector<std::vector<int64_t>> int_lists_` and exposed through `get_int_list(int)`; `build()` deserializes `vkgraph::GraphTypes::IntList` via `value_as_IntList()->items()` (int64, matching the FlatBuffer `[long]`), mirroring the existing scalar value plumbing.
- `runtime/ops/permute/Permute.cpp` — reads the permutation via `get_int_list`, normalizes negative dims, validates that it is a permutation of `[0, ndim)`, builds two `TensorMeta` UBOs plus a `PermuteParams{perm: vec4<u32>}` uniform, guards fp32 and rank ≤ 4, dispatches `compute_1d_workgroup_count(out.numel)` workgroups with the `wg_size` override, and releases its references to the uniform buffers once the bind group has been created.
- `runtime/ops/permute/permute.wgsl` — delinearizes the output index over the contiguous output strides and, for each dim `d`, reads `input` using `in_meta.strides[params.perm[d]]` (mirrors Vulkan `permute_buffer.glsl`).
- Registers both `aten.permute_copy.default` and `aten.permute.default` to the same handler.

@exported-using-ghexport

Differential Revision: [D108793162](https://our.internmc.facebook.com/intern/diff/D108793162/)
diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
@@ -48,6 +48,7 @@ set(WEBGPU_SRCS
     runtime/ops/squeeze/Squeeze.cpp
     runtime/ops/unsqueeze/Unsqueeze.cpp
     runtime/ops/slice/Slice.cpp
+    runtime/ops/permute/Permute.cpp
 )
 
 add_library(webgpu_backend ${WEBGPU_SRCS})
diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -245,6 +245,7 @@ void WebGPUGraph::build(
   tensors_.resize(num_vals);
   tensor_mem_obj_ids_.resize(num_vals, -1);
   ints_.resize(num_vals, 0);
+  int_lists_.resize(num_vals);
   doubles_.resize(num_vals, 0.0);
   bools_.resize(num_vals, false);
   value_lists_.resize(num_vals);
@@ -375,6 +376,14 @@ void WebGPUGraph::build(
         ints_[i] = val->value_as_Int()->int_val();
         break;
       }
+      case vkgraph::GraphTypes::IntList: {
+        value_types_[i] = ValueType::IntList;
+        const auto* items = val->value_as_IntList()->items();
+        if (items) {
+          int_lists_[i].assign(items->cbegin(), items->cend());
+        }
+        break;
+      }
       case vkgraph::GraphTypes::Double: {
         value_types_[i] = ValueType::Double;
         doubles_[i] = val->value_as_Double()->double_val();
diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h
@@ -131,6 +131,11 @@ class WebGPUGraph {
   int64_t get_int(int id) const {
     return ints_[id];
   }
+  // Returns the elements of the serialized IntList stored at value-id `id`
+  // (e.g. permute dims). Elements are int64 (FlatBuffer [long]) to match the
+  // schema and the get_int convention. `id` is used as a raw index with no
+  // bounds or value-type check; callers should test get_value_type(id) first.
+  const std::vector<int64_t>& get_int_list(int id) const {
+    return int_lists_[id];
+  }
   bool get_bool(int id) const {
     return bools_[id];
   }
@@ -258,7 +263,8 @@ class WebGPUGraph {
     Null,
     String,
     SymInt,
-    ValueList
+    ValueList,
+    IntList
   };
 
   ValueType get_value_type(int id) const {
@@ -275,6 +281,7 @@ class WebGPUGraph {
   std::vector<ValueType> value_types_;
   std::vector<WebGPUTensor> tensors_;
   std::vector<int64_t> ints_;
+  std::vector<std::vector<int64_t>> int_lists_;
   std::vector<double> doubles_;
   std::vector<bool> bools_;
   std::vector<std::vector<int>> value_lists_;
diff --git a/backends/webgpu/runtime/ops/permute/Permute.cpp b/backends/webgpu/runtime/ops/permute/Permute.cpp
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+#include <executorch/backends/webgpu/runtime/WebGPUUtils.h>
+#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
+#include <executorch/backends/webgpu/runtime/ops/TensorMeta.h>
+#include <executorch/backends/webgpu/runtime/ops/permute/permute_wgsl.h>
+
+#include <webgpu/webgpu.h>
+
+#include <cstdint>
+#include <cstring>
+#include <stdexcept>
+#include <vector>
+
+namespace executorch::backends::webgpu {
+
+namespace {
+
+// Host-side mirror of the shader's `Params` uniform. perm[d] is the input
+// dimension that output dimension d is gathered from.
+struct PermuteParams {
+  uint32_t perm[kTensorMetaMaxNdim];
+};
+// Pins the layout: the uniform is uploaded as raw bytes, so this struct must
+// stay exactly one vec4<u32> wide.
+static_assert(
+    sizeof(PermuteParams) == 16,
+    "PermuteParams must match the WGSL Params vec4<u32> (16 bytes)");
+
+// Records one compute dispatch for aten.permute / aten.permute_copy.
+// Gather form: out coord d -> in coord perm[d] (Vulkan permute_buffer.glsl,
+// NCHW).
+//
+// args: [self, dims, out] value-ids; out is always the last id.
+//
+// Throws std::runtime_error when:
+//   - self/out is not a Tensor or dims is not an IntList,
+//   - the rank exceeds kTensorMetaMaxNdim,
+//   - dims is not a permutation of [0, ndim) (negative entries are wrapped
+//     once by ndim before the check),
+//   - the output shape is not the permuted input shape, or
+//   - an operand's byte size is not numel * sizeof(float).
+// args.at() throws std::out_of_range when args has fewer than two entries.
+void permute_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  const int in_id = args.at(0);
+  const int dims_id = args.at(1);
+  const int out_id = args.at(args.size() - 1);
+
+  if (graph.get_value_type(in_id) != WebGPUGraph::ValueType::Tensor ||
+      graph.get_value_type(out_id) != WebGPUGraph::ValueType::Tensor) {
+    throw std::runtime_error("permute: in/out arg is not a tensor");
+  }
+  if (graph.get_value_type(dims_id) != WebGPUGraph::ValueType::IntList) {
+    throw std::runtime_error("permute: dims arg is not an IntList");
+  }
+
+  WGPUDevice device = graph.device();
+  const auto& in_tensor = graph.get_tensor(in_id);
+  const auto& out_tensor = graph.get_tensor(out_id);
+  const int ndim = static_cast<int>(in_tensor.dims.size());
+  constexpr int kMaxNdim = static_cast<int>(kTensorMetaMaxNdim);
+
+  // Reject unsupported ranks before touching the fixed-size arrays below.
+  if (ndim > kMaxNdim) {
+    throw std::runtime_error("permute: tensor rank exceeds 4 (MAX_NDIM)");
+  }
+
+  const std::vector<int64_t>& dims = graph.get_int_list(dims_id);
+  if (static_cast<int>(dims.size()) != ndim ||
+      static_cast<int>(out_tensor.dims.size()) != ndim) {
+    throw std::runtime_error("permute: perm length != input/output rank");
+  }
+
+  // Normalize negative dims and verify perm is a permutation of [0, ndim).
+  // The result is written straight into the uniform payload.
+  PermuteParams params = {};
+  bool seen[kTensorMetaMaxNdim] = {};
+  for (int d = 0; d < ndim; d++) {
+    int64_t p = dims[d];
+    if (p < 0) {
+      p += ndim;
+    }
+    if (p < 0 || p >= ndim || seen[p]) {
+      throw std::runtime_error("permute: dims is not a valid permutation");
+    }
+    seen[p] = true;
+    params.perm[d] = static_cast<uint32_t>(p);
+  }
+  // Dims past the tensor rank map to themselves so the vec4 is fully defined.
+  for (int d = ndim; d < kMaxNdim; d++) {
+    params.perm[d] = static_cast<uint32_t>(d);
+  }
+
+  // The shader delinearizes over the output strides and indexes the input
+  // with in_meta.strides[perm[d]]. If the output is not shaped as the permuted
+  // input, it would gather wrong (possibly out-of-range) elements, so fail
+  // loudly here instead.
+  for (int d = 0; d < ndim; d++) {
+    if (out_tensor.dims[d] != in_tensor.dims[params.perm[d]]) {
+      throw std::runtime_error(
+          "permute: output shape != permuted input shape");
+    }
+  }
+
+  TensorMeta out_meta;
+  TensorMeta in_meta;
+  fill_tensor_meta(out_tensor, &out_meta);
+  fill_tensor_meta(in_tensor, &in_meta);
+  // The shader binds both buffers as array<f32>; a byte size other than
+  // numel * 4 means the operand is not fp32.
+  if (out_tensor.nbytes !=
+          static_cast<size_t>(out_meta.numel) * sizeof(float) ||
+      in_tensor.nbytes != static_cast<size_t>(in_meta.numel) * sizeof(float)) {
+    throw std::runtime_error("permute: non-fp32 operand (nbytes != numel * 4)");
+  }
+
+  // One invocation per output element, laid out along X.
+  uint32_t wg_size =
+      utils::clamp_workgroup_size(device, kPermuteWorkgroupSizeX);
+  uint32_t workgroup_count = utils::compute_1d_workgroup_count(
+      device, out_meta.numel, wg_size, "permute");
+
+  // Feeds the clamped size into the shader's `override wg_size`.
+  WGPUConstantEntry wg_size_constant = {};
+  wg_size_constant.key = {"wg_size", WGPU_STRLEN};
+  wg_size_constant.value = static_cast<double>(wg_size);
+
+  WGPUBuffer out_meta_buf =
+      utils::make_uniform(device, &out_meta, sizeof(TensorMeta));
+  WGPUBuffer in_meta_buf =
+      utils::make_uniform(device, &in_meta, sizeof(TensorMeta));
+  WGPUBuffer params_buf =
+      utils::make_uniform(device, &params, sizeof(PermuteParams));
+  graph.add_uniform_buffer_bytes(
+      2 * sizeof(TensorMeta) + sizeof(PermuteParams));
+
+  WGPUShaderSourceWGSL wgsl_desc = {};
+  wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL;
+  wgsl_desc.code = {kPermuteWGSL, WGPU_STRLEN};
+  WGPUShaderModuleDescriptor shader_desc = {};
+  shader_desc.nextInChain = &wgsl_desc.chain;
+  WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device, &shader_desc);
+
+  // Bind group: in, out (rw), out_meta, in_meta, params (3 uniforms).
+  WGPUBindGroupLayoutEntry entries[5] = {};
+  entries[0].binding = 0;
+  entries[0].visibility = WGPUShaderStage_Compute;
+  entries[0].buffer.type = WGPUBufferBindingType_ReadOnlyStorage;
+  entries[1].binding = 1;
+  entries[1].visibility = WGPUShaderStage_Compute;
+  entries[1].buffer.type = WGPUBufferBindingType_Storage;
+  entries[2].binding = 2;
+  entries[2].visibility = WGPUShaderStage_Compute;
+  entries[2].buffer.type = WGPUBufferBindingType_Uniform;
+  entries[3].binding = 3;
+  entries[3].visibility = WGPUShaderStage_Compute;
+  entries[3].buffer.type = WGPUBufferBindingType_Uniform;
+  entries[4].binding = 4;
+  entries[4].visibility = WGPUShaderStage_Compute;
+  entries[4].buffer.type = WGPUBufferBindingType_Uniform;
+
+  WGPUBindGroupLayoutDescriptor bgl_desc = {};
+  bgl_desc.entryCount = 5;
+  bgl_desc.entries = entries;
+  WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bgl_desc);
+
+  WGPUPipelineLayoutDescriptor pl_desc = {};
+  pl_desc.bindGroupLayoutCount = 1;
+  pl_desc.bindGroupLayouts = &bgl;
+  WGPUPipelineLayout pipeline_layout =
+      wgpuDeviceCreatePipelineLayout(device, &pl_desc);
+
+  WGPUComputePipelineDescriptor pipeline_desc = {};
+  pipeline_desc.layout = pipeline_layout;
+  pipeline_desc.compute.module = shader;
+  pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN};
+  pipeline_desc.compute.constantCount = 1;
+  pipeline_desc.compute.constants = &wg_size_constant;
+  WGPUComputePipeline pipeline =
+      wgpuDeviceCreateComputePipeline(device, &pipeline_desc);
+
+  WGPUBindGroupEntry bg_entries[5] = {};
+  bg_entries[0].binding = 0;
+  bg_entries[0].buffer = in_tensor.buffer;
+  bg_entries[0].size = in_tensor.nbytes;
+  bg_entries[1].binding = 1;
+  bg_entries[1].buffer = out_tensor.buffer;
+  bg_entries[1].size = out_tensor.nbytes;
+  bg_entries[2].binding = 2;
+  bg_entries[2].buffer = out_meta_buf;
+  bg_entries[2].size = sizeof(TensorMeta);
+  bg_entries[3].binding = 3;
+  bg_entries[3].buffer = in_meta_buf;
+  bg_entries[3].size = sizeof(TensorMeta);
+  bg_entries[4].binding = 4;
+  bg_entries[4].buffer = params_buf;
+  bg_entries[4].size = sizeof(PermuteParams);
+
+  WGPUBindGroupDescriptor bg_desc = {};
+  bg_desc.layout = bgl;
+  bg_desc.entryCount = 5;
+  bg_desc.entries = bg_entries;
+  WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc);
+
+  // pipeline and bind_group are handed to the graph and not released here.
+  // NOTE(review): presumably the graph owns them from this point -- confirm.
+  graph.add_dispatch({pipeline, bind_group, workgroup_count});
+
+  wgpuShaderModuleRelease(shader);
+  wgpuBindGroupLayoutRelease(bgl);
+  wgpuPipelineLayoutRelease(pipeline_layout);
+  // Drop our refs; the bind group keeps the uniforms alive until release.
+  wgpuBufferRelease(out_meta_buf);
+  wgpuBufferRelease(in_meta_buf);
+  wgpuBufferRelease(params_buf);
+}
+
+} // namespace
+
+// Registers permute_impl as the handler for both aten.permute_copy.default
+// and aten.permute.default.
+WEBGPU_REGISTER_OPERATORS {
+  WEBGPU_REGISTER_OP(aten.permute_copy.default, permute_impl);
+  WEBGPU_REGISTER_OP(aten.permute.default, permute_impl);
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/ops/permute/permute.wgsl b/backends/webgpu/runtime/ops/permute/permute.wgsl
@@ -0,0 +1,36 @@
+@group(0) @binding(0) var<storage, read> input: array<f32>;
+@group(0) @binding(1) var<storage, read_write> output: array<f32>;
+
+struct TensorMeta {
+  ndim: u32,
+  numel: u32,
+  sizes: vec4<u32>,
+  strides: vec4<u32>,
+}
+@group(0) @binding(2) var<uniform> out_meta: TensorMeta;
+@group(0) @binding(3) var<uniform> in_meta: TensorMeta;
+
+struct Params {
+  perm: vec4<u32>,
+}
+@group(0) @binding(4) var<uniform> params: Params;
+
+override wg_size: u32 = 64u;
+
+@compute @workgroup_size(wg_size, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let out_bufi = gid.x;
+    if (out_bufi >= out_meta.numel) {
+        return;
+    }
+
+    // Gather: out coord d -> in coord perm[d] (Vulkan permute_buffer.glsl).
+    var rem = out_bufi;
+    var in_bufi: u32 = 0u;
+    for (var d: u32 = 0u; d < out_meta.ndim; d = d + 1u) {
+        let coord = rem / out_meta.strides[d];
+        rem = rem % out_meta.strides[d];
+        in_bufi = in_bufi + coord * in_meta.strides[params.perm[d]];
+    }
+    output[out_bufi] = input[in_bufi];
+}
diff --git a/backends/webgpu/runtime/ops/permute/permute_wgsl.h b/backends/webgpu/runtime/ops/permute/permute_wgsl.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace executorch::backends::webgpu {
+
+// @generated from permute.wgsl - DO NOT EDIT.
+// wgsl-sha256: d34f59730cda7317589b6ed5691a1ccab8666b9c94e17ac2cb3658b036300197
+inline constexpr const char* kPermuteWGSL = R"(
+@group(0) @binding(0) var<storage, read> input: array<f32>;
+@group(0) @binding(1) var<storage, read_write> output: array<f32>;
+
+struct TensorMeta {
+  ndim: u32,
+  numel: u32,
+  sizes: vec4<u32>,
+  strides: vec4<u32>,
+}
+@group(0) @binding(2) var<uniform> out_meta: TensorMeta;
+@group(0) @binding(3) var<uniform> in_meta: TensorMeta;
+
+struct Params {
+  perm: vec4<u32>,
+}
+@group(0) @binding(4) var<uniform> params: Params;
+
+override wg_size: u32 = 64u;
+
+@compute @workgroup_size(wg_size, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let out_bufi = gid.x;
+    if (out_bufi >= out_meta.numel) {
+        return;
+    }
+
+    // Gather: out coord d -> in coord perm[d] (Vulkan permute_buffer.glsl).
+    var rem = out_bufi;
+    var in_bufi: u32 = 0u;
+    for (var d: u32 = 0u; d < out_meta.ndim; d = d + 1u) {
+        let coord = rem / out_meta.strides[d];
+        rem = rem % out_meta.strides[d];
+        in_bufi = in_bufi + coord * in_meta.strides[params.perm[d]];
+    }
+    output[out_bufi] = input[in_bufi];
+}
+)";
+
+inline constexpr uint32_t kPermuteWorkgroupSizeX = 64;
+inline constexpr uint32_t kPermuteWorkgroupSizeY = 1;
+inline constexpr uint32_t kPermuteWorkgroupSizeZ = 1;
+
+} // namespace executorch::backends::webgpu

Original file line number	Diff line number	Diff line change
`@@ -48,6 +48,7 @@ set(WEBGPU_SRCS`
`48`	`48`	`runtime/ops/squeeze/Squeeze.cpp`
`49`	`49`	`runtime/ops/unsqueeze/Unsqueeze.cpp`
`50`	`50`	`runtime/ops/slice/Slice.cpp`
	`51`	`+ runtime/ops/permute/Permute.cpp`
`51`	`52`	`)`
`52`	`53`
`53`	`54`	`add_library(webgpu_backend ${WEBGPU_SRCS})`