pytorch
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp
Lines changed: 27 additions & 1 deletion b/backends/vulkan/runtime/graph/ops/impl/Q8taBinary.cpp
Lines changed: 27 additions & 1 deletion
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp
Lines changed: 50 additions & 2 deletions b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp
Lines changed: 50 additions & 2 deletions
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h
Lines changed: 6 additions & 1 deletion b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h
Lines changed: 6 additions & 1 deletion
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp
Lines changed: 47 additions & 5 deletions b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp
Lines changed: 47 additions & 5 deletions
diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dIm2Col.cpp
Lines changed: 67 additions & 5 deletions b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dIm2Col.cpp
Lines changed: 67 additions & 5 deletions
@@ -9,10 +9,36 @@
 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {
 
+//
+// Resize
+//
+
+// Resize callback for the q8ta elementwise binary op: virtual_resize()s the
+// output (args[0].refs[0]) to the broadcast of the current sizes of the two
+// inputs (args[1].refs[0] and args[1].refs[1]). resize_args holds only
+// { block_config_ref } and is not read here. Without this callback the
+// DynamicDispatchNode freezes the output at the build-time upper bound.
+// Mirrors the fp32 resize_binary_op_node, which uses the same arg layout.
+void resize_q8ta_binary_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)resize_args;
+  const ValueRef out = args.at(0).refs.at(0);
+  const ValueRef in_a = args.at(1).refs.at(0);
+  const ValueRef in_b = args.at(1).refs.at(1);
+
+  const std::vector<int64_t> a_sizes = graph->sizes_of(in_a);
+  const std::vector<int64_t> b_sizes = graph->sizes_of(in_b);
+  graph->virtual_resize(
+      out, calculate_broadcasted_output_size(a_sizes, b_sizes));
+}
+
 //
 // Dispatch nodes
 //
@@ -111,7 +137,7 @@ void add_q8ta_binary_node(
       // Resize args
       {block_config_ref},
       // Resizing Logic
-      nullptr));
+      resize_q8ta_binary_node));
 }
 
 //
 
@@ -13,6 +13,7 @@
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/ConvolutionUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {
@@ -218,6 +219,51 @@ ValueRef prepack_quantized_conv2d_weight(
   return packed_weight;
 }
 
+//
+// Resize
+//
+
+// Resize callback for q8ta_conv2d: recomputes the output's H and W from the
+// current input sizes and virtual_resize()s the output (args[0].refs[0]).
+// resize_args = { input, kernel_size, stride, padding, dilation }
+//
+// The output is allocated at the build-time upper-bound shape. Without this
+// callback a dynamic-shape graph keeps it frozen there: e.g. a 238-row input in
+// a 241-row buffer leaves garbage rows that GroupNorm's global statistics then
+// smear across the whole tensor. Only the last two dims (H, W) are rewritten;
+// the leading dims (N, C) are shape-independent and stay as allocated.
+void resize_q8ta_conv2d_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  const ValueRef out = args.at(0).refs.at(0);
+  const ValueRef in = resize_args.at(0);
+  const ValueRef kernel_size = resize_args.at(1);
+  const ValueRef stride = resize_args.at(2);
+  const ValueRef padding = resize_args.at(3);
+  const ValueRef dilation = resize_args.at(4);
+
+  const std::vector<int64_t> in_sizes = graph->sizes_of(in);
+
+  // Output H/W via the shared conv-output helper. Kernel dims are read from the
+  // kernel_size IntList (kernel_size_only=true). With transposed=false the 4th
+  // list entry is only consulted as an optional bool ceil_mode; passing the
+  // non-bool dilation there a second time makes it resolve to false.
+  const std::vector<int64_t> out_hw = calc_out_sizes_hw(
+      *graph,
+      in_sizes,
+      kernel_size,
+      /*kernel_size_only=*/true,
+      {stride, padding, dilation, dilation},
+      /*transposed=*/false);
+
+  std::vector<int64_t> new_sizes = graph->sizes_of(out);
+  const size_t ndim = new_sizes.size();
+  new_sizes.at(ndim - 2) = out_hw.at(0);
+  new_sizes.at(ndim - 1) = out_hw.at(1);
+  graph->virtual_resize(out, new_sizes);
+}
+
 //
 // Dispatch nodes
 //
@@ -327,8 +373,10 @@ void add_q8ta_conv2d_node(
       push_constants,
       // Specialization Constants
       spec_constants,
-      // Resize args
-      {}));
+      // Resize args: { input, kernel_size, stride, padding, dilation }
+      {packed_int8_input, kernel_size, stride, padding, dilation},
+      // Resize function: propagate dynamic H/W to the output.
+      resize_q8ta_conv2d_node));
 }
 
 //
 
@@ -123,7 +123,12 @@ void add_q8ta_conv2d_pw_node(
     const ValueRef packed_bias,
     const uint32_t activation_type,
     const ValueRef packed_int8_output,
-    const int32_t groups = 1);
+    const int32_t groups = 1,
+    const ValueRef conv_input = kDummyValueRef,
+    const ValueRef kernel_size = kDummyValueRef,
+    const ValueRef stride = kDummyValueRef,
+    const ValueRef padding = kDummyValueRef,
+    const ValueRef dilation = kDummyValueRef);
 
 std::vector<int64_t> calculate_q8ta_im2col_sizes(
     ComputeGraph* graph,
 
@@ -12,6 +12,7 @@
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/ConvolutionUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {
@@ -172,6 +173,45 @@ ValueRef prepack_quantized_conv2d_dw_weight(
   return packed_weight;
 }
 
+//
+// Resize
+//
+
+// Resize callback for the q8ta depthwise conv2d nodes: recomputes the output's
+// last two dims (H, W) from the current input sizes and virtual_resize()s the
+// output (args[0].refs[0]). N/C stay as allocated (depthwise keeps the channel
+// count: groups == in_channels == out_channels).
+// resize_args = { input, kernel_size, stride, padding, dilation }
+// Same H/W formula as a regular conv (see resize_q8ta_conv2d_node). Without it
+// the DynamicDispatchNode freezes the output at the build-time upper bound.
+void resize_q8ta_conv2d_dw_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  const ValueRef out = args.at(0).refs.at(0);
+  const ValueRef in = resize_args.at(0);
+  const ValueRef kernel_size = resize_args.at(1);
+  const ValueRef stride = resize_args.at(2);
+  const ValueRef padding = resize_args.at(3);
+  const ValueRef dilation = resize_args.at(4);
+
+  const std::vector<int64_t> in_sizes = graph->sizes_of(in);
+
+  const std::vector<int64_t> out_hw = calc_out_sizes_hw(
+      *graph,
+      in_sizes,
+      kernel_size,
+      /*kernel_size_only=*/true,
+      {stride, padding, dilation, dilation},
+      /*transposed=*/false);
+
+  std::vector<int64_t> new_sizes = graph->sizes_of(out);
+  const size_t ndim = new_sizes.size();
+  new_sizes.at(ndim - 2) = out_hw.at(0);
+  new_sizes.at(ndim - 1) = out_hw.at(1);
+  graph->virtual_resize(out, new_sizes);
+}
+
 //
 // Dispatch nodes
 //
@@ -258,10 +298,10 @@ void add_conv2d_dw_q8ta_q8csw_q8to_4w4c_node(
       push_constants,
       // Specialization Constants
       spec_constants,
-      // Resize args
-      {},
+      // Resize args: { input, kernel_size, stride, padding, dilation }
+      {packed_int8_input, kernel_size, stride, padding, dilation},
       // Resizing Logic
-      nullptr));
+      resize_q8ta_conv2d_dw_node));
 }
 
 void add_q8ta_conv2d_dw_node(
@@ -363,8 +403,10 @@ void add_q8ta_conv2d_dw_node(
       push_constants,
       // Specialization Constants
       spec_constants,
-      // Resize args
-      {}));
+      // Resize args: { input, kernel_size, stride, padding, dilation }
+      {packed_int8_input, kernel_size, stride, padding, dilation},
+      // Resizing Logic
+      resize_q8ta_conv2d_dw_node));
 }
 
 //
 
@@ -13,6 +13,7 @@
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/ConvolutionUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {
@@ -95,6 +96,59 @@ std::vector<int64_t> calculate_q8ta_im2col_sizes(
   return {K, H, W};
 }
 
+//
+// Resize
+//
+
+// Resize callback for the q8ta im2col node: virtual_resize()s the im2col
+// scratch tensor (args[0].refs[0]) to [K, H_out, align_up_4(W_out)].
+// resize_args = { input, kernel_size, stride, padding, dilation, groups }
+//
+// K (the flattened conv window) depends only on channels, kernel and groups;
+// H_out/W_out are the conv output spatial dims, recomputed from the CURRENT
+// input (NOT the conv output tensor, which may still be frozen at this point
+// in the resize order). Without this the scratch stays at its build-time upper
+// bound and feeds garbage rows into the PW GEMM, which is resized separately.
+void resize_q8ta_im2col_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  const ValueRef im2col_out = args.at(0).refs.at(0);
+  const ValueRef in = resize_args.at(0);
+  const ValueRef kernel_size = resize_args.at(1);
+  const ValueRef stride = resize_args.at(2);
+  const ValueRef padding = resize_args.at(3);
+  const ValueRef dilation = resize_args.at(4);
+  const ValueRef groups = resize_args.at(5);
+
+  const std::vector<int64_t> in_sizes = graph->sizes_of(in);
+
+  // Conv output H/W, computed from the current input sizes.
+  const std::vector<int64_t> out_hw = calc_out_sizes_hw(
+      *graph,
+      in_sizes,
+      kernel_size,
+      /*kernel_size_only=*/true,
+      {stride, padding, dilation, dilation},
+      /*transposed=*/false);
+  const int64_t out_height = out_hw.at(0);
+  const int64_t out_width = out_hw.at(1);
+
+  // K = align_up_4(in_channels/groups * kernel[0] * kernel[1]) * groups, the
+  // same computation calculate_q8ta_im2col_sizes performs.
+  const int64_t in_channels = utils::val_at(-3, in_sizes);
+  const int64_t groups_val = graph->extract_scalar<int64_t>(groups);
+  const int64_t in_channels_per_group = in_channels / groups_val;
+  const auto kernel_size_list = graph->get_int_list(kernel_size);
+  const int64_t flattened_kernel_len = utils::align_up_4(
+      in_channels_per_group * kernel_size_list->at(0) *
+      kernel_size_list->at(1));
+  const int64_t K = flattened_kernel_len * groups_val;
+  const int64_t W = utils::align_up_4(out_width);
+
+  graph->virtual_resize(im2col_out, {K, out_height, W});
+}
+
 //
 // Dispatch nodes
 //
@@ -168,10 +222,11 @@ void add_q8ta_im2col_node(
       push_constants,
       // Specialization Constants
       spec_constants,
-      // Resize args
-      {},
-      // Resizing Logic
-      nullptr));
+      // Resize args: { input, kernel_size, stride, padding, dilation, groups }
+      {packed_int8_input, kernel_size, stride, padding, dilation, groups},
+      // Resizing Logic: recompute the im2col scratch dims from the current
+      // input
+      resize_q8ta_im2col_node));
 }
 
 //
@@ -272,7 +327,14 @@ void q8ta_conv2d_im2col(
       packed_bias,
       activation_type_val,
       packed_int8_output,
-      groups_val);
+      groups_val,
+      // Original activation + conv geometry so the PW output H/W is recomputed
+      // from the true conv result, not the width-padded im2col scratch.
+      packed_int8_input,
+      kernel_size,
+      stride,
+      padding,
+      dilation);
 }
 
 REGISTER_OPERATORS {