[ExecuTorch][WebGPU] Dynamic resize hooks for add and mul

JCNTH · JCNTH · commit fd16aade5b1a · 2026-07-04T10:35:39.000-07:00
Pull Request resolved: #20577 **Make the elementwise add and mul ops serve any live shape from one graph.** **Problem:** `aten.add.Tensor` and `aten.mul.Tensor` baked their element count, param UBO(s), and output shape at `build()` time for the max shape. On a dynamic-shape graph running at a smaller live shape, they would over-dispatch and leave the output sized at the max. **Solution:** - Before: one fixed dispatch sized for the build-time shape. - After: each op registers a resize hook on BOTH operands (either operand may be the dynamic one, depending on argument order). The hook recomputes the live element count, rewrites the param UBO(s), updates the dispatch `workgroup_count_x`, and sets the output `cur_dims`. The hook is inert until an operand is resized. **Implementation:** - `add`: the output follows the operand with more elements (robust when one input is a static residual and the other is the dynamic-S tensor); the hook rewrites `AddParams` and throws if the smaller operand's element count is zero or does not evenly divide the larger's. - `mul`: recomputes the broadcast output shape (throwing on incompatible dims) and rebuilds all three `TensorMeta` UBOs via `fill_tensor_meta_broadcast`. - Each op keeps its uniform buffer(s) alive via `own_uniform_buffer` instead of releasing them at build. - Mirrors the Vulkan per-op `resize_*_node` pattern (recompute sizes and dispatch on each execute). **Constraints:** Behavior-neutral on static graphs (the hook fires only when an operand's live shape differs from the max). No kernel/WGSL/numerics change. Co-authored-with: Claude Code. ghstack-source-id: 399812828 @exported-using-ghexport Differential Revision: [D109906093](https://our.internmc.facebook.com/intern/diff/D109906093/)
diff --git a/backends/webgpu/runtime/ops/add/BinaryOp.cpp b/backends/webgpu/runtime/ops/add/BinaryOp.cpp
@@ -159,13 +159,48 @@ void add_impl(WebGPUGraph& graph, const std::vector<int>& args) {
   WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc);
 
   graph.add_dispatch({pipeline, bind_group, workgroup_count});
+  const size_t dispatch_idx = graph.num_dispatches() - 1;
+
+  // Dynamic shapes: recompute numel/dispatch; out follows the larger operand.
+  WGPUBuffer params_buf = uniform_buffer;
+  auto add_resize = [in1_id,
+                     in2_id,
+                     out_id,
+                     alpha,
+                     wg_size,
+                     dispatch_idx,
+                     params_buf](WebGPUGraph& g) {
+    const auto& d1 = g.cur_dims(in1_id);
+    const auto& d2 = g.cur_dims(in2_id);
+    const uint64_t n1 = utils::numel_of(d1);
+    const uint64_t n2 = utils::numel_of(d2);
+    const uint64_t numel = n2 > n1 ? n2 : n1;
+    const uint64_t n_min = n2 > n1 ? n1 : n2;
+    // The flat add follows the larger operand and broadcasts the smaller; valid
+    // only when the smaller tiles evenly into it (rejects e.g. [4,1] vs [1,3],
+    // whose true [4,3] result this flat kernel cannot produce).
+    if (n_min == 0u || numel % n_min != 0u) {
+      throw std::runtime_error(
+          "add(resize): operands are not broadcast-compatible by numel");
+    }
+    g.set_cur_dims(out_id, n2 > n1 ? d2 : d1);
+    AddParams p = {};
+    p.num_elements = static_cast<uint32_t>(numel);
+    p.alpha = alpha;
+    wgpuQueueWriteBuffer(g.queue(), params_buf, 0, &p, sizeof(p));
+    g.dispatch_at(dispatch_idx).workgroup_count_x =
+        utils::compute_1d_workgroup_count(
+            g.device(), static_cast<uint32_t>(numel), wg_size, "add(resize)");
+  };
+  graph.add_tensor_resize_hook(in1_id, add_resize);
+  graph.add_tensor_resize_hook(in2_id, add_resize);
 
   // Release intermediate objects (pipeline and bind_group are kept by dispatch)
   wgpuShaderModuleRelease(shader);
   wgpuBindGroupLayoutRelease(bgl);
   wgpuPipelineLayoutRelease(pipeline_layout);
-  // Drop our ref; the bind group keeps the uniform buffer alive until release.
-  wgpuBufferRelease(uniform_buffer);
+  // Graph owns it so a resize hook can rewrite it; freed in the dtor.
+  graph.own_uniform_buffer(uniform_buffer);
 }
 
 } // namespace
diff --git a/backends/webgpu/runtime/ops/mul/BinaryOp.cpp b/backends/webgpu/runtime/ops/mul/BinaryOp.cpp
@@ -14,6 +14,7 @@
 
 #include <webgpu/webgpu.h>
 
+#include <algorithm>
 #include <stdexcept>
 #include <vector>
 
@@ -164,15 +165,54 @@ void mul_impl(WebGPUGraph& graph, const std::vector<int>& args) {
   bg_desc.entries = bg_entries;
   WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc);
 
-  graph.add_dispatch({pipeline, bind_group, workgroup_count});
+  const size_t dispatch_idx =
+      graph.add_dispatch({pipeline, bind_group, workgroup_count});
+
+  // Dynamic shapes: rebuild all 3 broadcast TensorMeta UBOs + dispatch.
+  WGPUBuffer o_buf = out_meta_buf, a_buf = in1_meta_buf, b_buf = in2_meta_buf;
+  auto mul_resize =
+      [in1_id, in2_id, out_id, wg_size, dispatch_idx, o_buf, a_buf, b_buf](
+          WebGPUGraph& g) {
+        const auto& a = g.cur_dims(in1_id);
+        const auto& b = g.cur_dims(in2_id);
+        const size_t r = std::max(a.size(), b.size());
+        std::vector<int64_t> out_d(r, 1);
+        for (size_t i = 0; i < r; i++) {
+          const int64_t av = (i + a.size() < r) ? 1 : a[i - (r - a.size())];
+          const int64_t bv = (i + b.size() < r) ? 1 : b[i - (r - b.size())];
+          if (av != bv && av != 1 && bv != 1) {
+            throw std::runtime_error(
+                "mul(resize): operands are not broadcast-compatible");
+          }
+          out_d[i] = av > bv ? av : bv;
+        }
+        g.set_cur_dims(out_id, out_d);
+        const uint32_t out_ndim = static_cast<uint32_t>(r);
+        WebGPUTensor ta, tb, to;
+        ta.dims = a;
+        tb.dims = b;
+        to.dims = out_d;
+        TensorMeta om, am, bm;
+        fill_tensor_meta_broadcast(to, out_ndim, &om);
+        fill_tensor_meta_broadcast(ta, out_ndim, &am);
+        fill_tensor_meta_broadcast(tb, out_ndim, &bm);
+        wgpuQueueWriteBuffer(g.queue(), o_buf, 0, &om, sizeof(om));
+        wgpuQueueWriteBuffer(g.queue(), a_buf, 0, &am, sizeof(am));
+        wgpuQueueWriteBuffer(g.queue(), b_buf, 0, &bm, sizeof(bm));
+        g.dispatch_at(dispatch_idx).workgroup_count_x =
+            utils::compute_1d_workgroup_count(
+                g.device(), om.numel, wg_size, "mul(resize)");
+      };
+  graph.add_tensor_resize_hook(in1_id, mul_resize);
+  graph.add_tensor_resize_hook(in2_id, mul_resize);
 
   wgpuShaderModuleRelease(shader);
   wgpuBindGroupLayoutRelease(bgl);
   wgpuPipelineLayoutRelease(pipeline_layout);
-  // Drop our refs; the bind group keeps the uniforms alive until release.
-  wgpuBufferRelease(out_meta_buf);
-  wgpuBufferRelease(in1_meta_buf);
-  wgpuBufferRelease(in2_meta_buf);
+  // Graph owns them so a resize hook can rewrite them; freed in the dtor.
+  graph.own_uniform_buffer(out_meta_buf);
+  graph.own_uniform_buffer(in1_meta_buf);
+  graph.own_uniform_buffer(in2_meta_buf);
 }
 
 } // namespace