diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d.glsl new file mode 100644 index 00000000000..623de3a5d9a --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d.glsl @@ -0,0 +1,249 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +${define_required_extensions("buffer", DTYPE)} + +#extension GL_EXT_control_flow_attributes : require +#extension GL_EXT_integer_dot_product : require + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_load_type(DTYPE, "buffer")} +#define T ${texel_load_component_type(DTYPE, "buffer")} + +${define_active_storage_type("buffer")} + +layout(std430) buffer; + +#include "indexing.glslh" +#include "common.glslh" +#include "conv2d_common.glslh" + +${layout_declare_tensor(B, "w", "t_packed_int8_output", "int", "buffer", is_scalar_array=True)} +${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", "buffer", is_scalar_array=True)} +${layout_declare_tensor(B, "r", "t_packed_int8_weight", "int", "texture2d", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)} + +// Metadata for input/output tensors (memory layout agnostic) +${layout_declare_ubo(B, "BufferMetadata", "outp")} +${layout_declare_ubo(B, "BufferMetadata", "inp")} +${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} + +layout(push_constant) uniform restrict Block { + float input_scale; + int input_zp; + float output_inv_scale; + int output_zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +${layout_declare_spec_const(C, "int", "apply_bias", "1")} + +// Layout specialization constants +${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")} +${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")} + +// Load weight block for a given (ic4, kx, ky, oc4) position. +// Weight texture layout (from pack_q8_conv2d_weights.glsl): +// block_x = oc4 * K_w + kx +// block_y = ky * IC4 + ic4 +// Each texel ivec4 has 4 components (4 output channels), each component is +// a packed int32 containing 4 int8 values for 4 consecutive input channels. 
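The block addressing described in the comment above is easy to get wrong when cross-checking the packing shader against this consumer, so the following host-side sketch spells out the mapping. The helper is illustrative only; in particular, the assumption that the int8 lane inside each packed int32 is ic % 4 follows from the comment ("4 consecutive input channels") rather than from any runtime API.

#include <cstdint>

// Illustrative helper: where weight element (oc, ic, ky, kx) lands in the
// packed 2D weight texture described above.
struct PackedWeightCoord {
  int texel_x;   // block_x = oc4 * K_w + kx
  int texel_y;   // block_y = ky * IC4 + ic4
  int component; // output channel within the 4-wide block (oc % 4)
  int byte_lane; // input channel within the 4-wide block (assumed ic % 4)
};

PackedWeightCoord locate_packed_weight(
    int oc, int ic, int ky, int kx, int K_w, int IC4) {
  PackedWeightCoord c;
  c.texel_x = (oc / 4) * K_w + kx;
  c.texel_y = ky * IC4 + (ic / 4);
  c.component = oc % 4;
  c.byte_lane = ic % 4;
  return c;
}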
+ivec4 load_weight_block(int ic4, int kx, int ky, int oc4, int IC4, int KW) { + const int block_x = oc4 * KW + kx; + const int block_y = ky * IC4 + ic4; + return texelFetch(t_packed_int8_weight, ivec2(block_x, block_y), 0); +} + +ivec4 quantize(const vec4 texel, const float inv_scale, const int zp) { + vec4 quantized = round(texel * inv_scale) + zp; + return clamp(ivec4(quantized), -128, 127); +} + +void main() { + // Thread mapping + int oc4 = int(gl_GlobalInvocationID.z); + int w4 = int(gl_GlobalInvocationID.x); + + // Initialize output tensor index (WHCN order) + // Each thread handles 4 adjacent widths starting at base_out_w + TensorIndex4D outp_tidx; + outp_tidx.data[0] = w4 * 4; + outp_tidx.data[1] = int(gl_GlobalInvocationID.y); + outp_tidx.data[2] = oc4 * 4; + outp_tidx.data[3] = 0; + + const int W = int(outp.sizes[0][0]); + const int OC = int(outp.sizes[0][2]); + const int OC4 = int(div_up_4(OC)); + + // Bounds check + if (any(greaterThanEqual(outp_tidx.data, ivec4(outp.sizes[0])))) { + return; + } + + // Input dimensions + const int inp_W = int(inp.sizes[0][0]); + const int inp_H = int(inp.sizes[0][1]); + const int IC = int(inp.sizes[0][2]); + + // Compute channels per group + const int OC_per_group = OC / conv2d_params.groups; + const int IC_per_group = IC / conv2d_params.groups; + const int IC4_per_group = div_up_4(IC_per_group); + + // Determine which group this output channel block belongs to + const int group_idx = outp_tidx.data[2] / OC_per_group; + const int ic_group_start = group_idx * IC_per_group; + + // Get strides for efficient indexing + const int inp_w_stride = int(inp.strides[0][0]); + const int inp_h_stride = int(inp.strides[0][1]); + const int inp_c_stride = int(inp.strides[0][2]); + const int w_texel_step = conv2d_params.dilation.x * inp_w_stride; + const int h_texel_step = conv2d_params.dilation.y * inp_h_stride; + const int subtile_w_step = conv2d_params.stride.x * inp_w_stride; + + // Compute base input position (for subtile_w=0, ic4=0) + TensorIndex4D inp_tidx; + inp_tidx.data[0] = outp_tidx.data[0] * conv2d_params.stride.x - conv2d_params.padding.x; + inp_tidx.data[1] = outp_tidx.data[1] * conv2d_params.stride.y - conv2d_params.padding.y; + inp_tidx.data[2] = ic_group_start; + inp_tidx.data[3] = 0; + + int base_inp_texel_idx; + if (get_outer_packed_dim_block_size(inp_layout) == 1) { + base_inp_texel_idx = tensor4d_idx_to_texel_idx(inp, inp_tidx, inp_layout); + } + + // Store base width to reset at beginning of each loop + const int base_inp_w = inp_tidx.data[0]; + + // Create packed input zero point (4 copies of input_zp packed into int32) + const int input_zp_packed = pack_into_int32(ivec4(input_zp)); + + // Initialize accumulators for 4 width positions × 4 output channels each + ivec4 acc[4]; + [[unroll]] for (int i = 0; i < 4; ++i) { + acc[i] = ivec4(0); + } + + // Perform convolution using packed int8 dot products + for (int ky = 0; ky < conv2d_params.kernel_size.y; ky++) { + const bool h_in_bounds = (inp_tidx.data[1] >= 0 && inp_tidx.data[1] < inp_H); + + // Process input channels in blocks of 4 + for (int ic4 = 0; ic4 < IC4_per_group; ic4++) { + // Input channel index for this block (base channel of the 4-channel block) + inp_tidx.data[2] = ic_group_start + ic4 * 4; + + // Reset width coordinate at start of each ic4 iteration + inp_tidx.data[0] = base_inp_w; + + for (int kx = 0; kx < conv2d_params.kernel_size.x; kx++) { + // Load weight block: 4 output channels × 4 input channels + // weight_block[oc] contains packed weights for ic4*4 to ic4*4+3 -> oc 
+ const ivec4 weight_block = load_weight_block(ic4, kx, ky, oc4, IC4_per_group, conv2d_params.kernel_size.x); + + // Process 4 adjacent width positions + [[unroll]] for (int subtile_w = 0; subtile_w < 4; ++subtile_w) { + // Load packed input (4 consecutive channels packed into one int32) + // Use input_zp_packed for out-of-bounds positions + int packed_input = input_zp_packed; + if (h_in_bounds && inp_tidx.data[0] >= 0 && inp_tidx.data[0] < inp_W) { + // Compute input texel index using base + offsets + int inp_texel_idx; + if (get_outer_packed_dim_block_size(inp_layout) == 1) { + inp_texel_idx = base_inp_texel_idx + ic4 * inp_c_stride + kx * w_texel_step + subtile_w * subtile_w_step; + } else { + // inp_texel_idx = tensor4d_idx_to_texel_idx(inp, inp_tidx, inp_layout); + const int w4 = div_4(inp_tidx.data[0]); + const int inp_c4 = div_4(inp_tidx.data[2]); + inp_texel_idx = (inp_tidx.data[1] * inp_h_stride + w4 * inp_w_stride + inp_c4) * 4 + mod_4(inp_tidx.data[0]); + } + packed_input = t_packed_int8_input[inp_texel_idx]; + } + + // Accumulate using packed int8 dot product for each output channel + // dotPacked4x8AccSatEXT computes: acc + dot(unpack(a), unpack(b)) + [[unroll]] for (int oc_offset = 0; oc_offset < 4; ++oc_offset) { + acc[subtile_w][oc_offset] = dotPacked4x8AccSatEXT( + packed_input, + weight_block[oc_offset], + acc[subtile_w][oc_offset]); + } + + // Advance to next output position's input coordinate + inp_tidx.data[0] += conv2d_params.stride.x; + } + + // Adjust for net dilation step + inp_tidx.data[0] += conv2d_params.dilation.x - 4 * conv2d_params.stride.x; + } + } + + // Advance height by dilation for next kernel row + inp_tidx.data[1] += conv2d_params.dilation.y; + + if (get_outer_packed_dim_block_size(inp_layout) == 1) { + // Advance base index by height step for next kernel row + base_inp_texel_idx += h_texel_step; + } + } + + // Apply input zero point correction via weight_sums + const vec4 weight_sums = vec4(t_weight_sums[oc4]); + const vec4 weight_scales = vec4(t_weight_scales[oc4]); + + // Convert to float, apply dequantization, and optionally add bias + vec4 facc[4]; + [[unroll]] for (int subtile_w = 0; subtile_w < 4; ++subtile_w) { + facc[subtile_w] = vec4(acc[subtile_w]); + facc[subtile_w] -= weight_sums * input_zp; + facc[subtile_w] *= weight_scales * input_scale; + } + + // Apply bias if enabled + if (apply_bias > 0) { + const vec4 bias = vec4(t_bias[oc4]); + [[unroll]] for (int subtile_w = 0; subtile_w < 4; ++subtile_w) { + facc[subtile_w] += bias; + } + } + + // Compute base output texel index (for subtile_w=0) + const int base_outp_texel_idx = tensor4d_idx_to_texel_idx(outp, outp_tidx, outp_layout); + const int out_w_stride = int(outp.strides[0][0]); + + // Quantize and store outputs using stride offsets + [[unroll]] for (int subtile_w = 0; subtile_w < 4; ++subtile_w) { + // Skip out-of-bounds width positions + if (outp_tidx.data[0] >= W) { + continue; + } + + const ivec4 quantized_out = quantize(facc[subtile_w], output_inv_scale, output_zp); + const int packed_out = pack_into_int32(quantized_out); + + // Store using stride offset from base + int outp_texel_idx; + if (get_outer_packed_dim_block_size(outp_layout) == 1) { + outp_texel_idx = base_outp_texel_idx + subtile_w * out_w_stride; + } else { + outp_texel_idx = base_outp_texel_idx + subtile_w; + } + + t_packed_int8_output[outp_texel_idx] = packed_out; + + outp_tidx.data[0] += 1; + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d.yaml 
b/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d.yaml new file mode 100644 index 00000000000..dc21e6da0c5 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d.yaml @@ -0,0 +1,14 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +q8ta_conv2d: + parameter_names_with_default_values: + DTYPE: float + generate_variant_forall: + DTYPE: + - VALUE: float + shader_variants: + - NAME: q8ta_conv2d diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp new file mode 100644 index 00000000000..2ee57551235 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp @@ -0,0 +1,398 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include +#include +#include + +namespace vkcompute { + +bool q8ta_conv2d_check_packed_dim_info(const api::PackedDimInfo& info) { + return info.packed_dim == WHCN::kChannelsDim && + info.packed_dim_block_size == 4 && + info.outer_packed_dim == WHCN::kWidthDim && + (info.outer_packed_dim_block_size == 1 || + info.outer_packed_dim_block_size == 4); +} + +// +// Workgroup size selection functions +// + +/** + * Computes a global workgroup size for q8ta_conv2d where: + * - For channels-fastest output (e.g., 4C): x = C4, y = H, z = W4 + * - For width-fastest output (e.g., 4C1W): x = W4, y = H, z = C4 + * + * The x/z assignment matches the shader's dynamic thread assignment based on + * fastest_dim (dim_order[0]), ensuring consecutive threads access consecutive + * elements along the fastest moving dimension for optimal memory coalescing. + * + * Each thread processes a 4Wx4C tile of output elements. + */ +utils::uvec3 pick_q8ta_conv2d_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + (void)shader; + (void)resize_args; + + const ValueRef output = args.at(0).refs.at(0); + + const uint32_t W = graph->size_at(-1, output); + const uint32_t H = graph->size_at(-2, output); + const uint32_t C = graph->size_at(-3, output); + + // Each thread processes 4 adjacent width positions and 4 channels (4Wx4C + // tile) + const uint32_t W4 = utils::div_up_4(W); + const uint32_t C4 = utils::div_up_4(C); + + return {W4, H, C4}; +} + +/** + * Picks a local workgroup size for q8ta_conv2d with adaptive sizing based on + * tensor dimensions. 
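As a quick illustration of the dispatch arithmetic in pick_q8ta_conv2d_global_wg_size: each invocation owns a 4-wide-by-4-channel output tile, so the grid shrinks by a factor of 4 along x and z. The div_up_4 stand-in below assumes the same ceil-divide behaviour as utils::div_up_4; numbers in the trailing comment are illustrative.

#include <array>
#include <cstdint>

// Stand-in for utils::div_up_4 (assumed behaviour: ceil(n / 4)).
static inline uint32_t div_up_4(uint32_t n) {
  return (n + 3u) / 4u;
}

// Global workgroup size for an output of shape [C, H, W]:
// x indexes groups of 4 adjacent widths, y indexes rows, z indexes groups of
// 4 output channels, mirroring the {W4, H, C4} returned above.
std::array<uint32_t, 3> q8ta_conv2d_global_size(uint32_t W, uint32_t H, uint32_t C) {
  return {div_up_4(W), H, div_up_4(C)};
}

// Example: an 81x81 output with 64 channels dispatches {21, 81, 16}
// invocations, each producing one 4Wx4C block of packed int8 outputs.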
Uses experimentation results: + * - {4, 2, 8} for medium tensors: +57% improvement on 81x81 + * - {8, 1, 8} for very large tensors: best baseline performance + * - {64, 1, 1} for narrow channel dimensions: minimize inactive invocations + */ +utils::uvec3 pick_q8ta_conv2d_local_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& resize_args) { + (void)shader; + (void)resize_args; + + const ValueRef output = args.at(0).refs.at(0); + + // Get actual tensor dimensions for adaptive sizing + const uint32_t H = graph->size_at(-2, output); + + // For very large tensors (H >= 100 and large x/z), use {8, 1, 8} + // This configuration performed best for 128x128 tensors in experiments + if (H >= 100 && global_workgroup_size[0u] >= 24 && + global_workgroup_size[2u] >= 24) { + return {8u, 1u, 8u}; + } + + // For medium-sized tensors, use {4, 2, 8} for better height parallelism + // This configuration showed +57% improvement on 81x81 tensors + if (global_workgroup_size[0u] >= 4 && global_workgroup_size[1u] >= 2 && + global_workgroup_size[2u] >= 8) { + return {4u, 2u, 8u}; + } + + // For tensors with sufficient x and z dimensions, use square configuration + if (global_workgroup_size[0u] >= 6 && global_workgroup_size[2u] >= 6) { + return {8u, 1u, 8u}; + } + + // If x dimension is very small, bias towards z dimension + if (global_workgroup_size[0u] < 2u) { + return {1u, 1u, 64u}; + } + + // If z dimension is very small, bias towards x dimension + if (global_workgroup_size[2u] < 2u) { + return {64u, 1u, 1u}; + } + + return {16u, 1u, 4u}; +} + +// +// Prepack nodes +// + +ValueRef prepack_quantized_conv2d_weight( + ComputeGraph& graph, + const QuantizationConfig& weight_quant_config, + const ValueRef weight_data, + const ValueRef input, + const ValueRef output, + const ValueRef groups, + const ValueRef kernel_size) { + VK_CHECK_COND(weight_quant_config.nbits == 8); + VK_CHECK_COND(weight_quant_config.is_symmetric); + + const int32_t groups_val = graph.get_int(groups); + + const int64_t OC = graph.size_at(-3, output); + const int64_t IC = graph.size_at(-3, input) / groups_val; + + int64_t K_h; + int64_t K_w; + + { + const auto kernel_size_list = graph.get_int_list(kernel_size); + K_h = kernel_size_list->at(0); + K_w = kernel_size_list->at(1); + } + + const int64_t num_blocks_OC = utils::div_up_4(OC); + const int64_t num_blocks_IC = utils::div_up_4(IC); + + const int64_t num_blocks_y = num_blocks_IC * K_h; + const int64_t num_blocks_x = K_w * num_blocks_OC; + + // The packed tensor arranges blocks as [OC_blocks * K_total, IC_blocks] + const int64_t output_height = num_blocks_y; + const int64_t output_width = num_blocks_x * 4; + + // Store the original sizes of the weight data to pass to the shader + utils::ivec4 orig_sizes = { + utils::safe_downcast(OC), + utils::safe_downcast(K_h), + utils::safe_downcast(K_w), + utils::safe_downcast(IC)}; + + std::vector packed_weight_sizes{output_height, output_width}; + + utils::StorageType storage_type = utils::kTexture2D; + uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); + if (output_width > max_extent * 4 || output_height > max_extent) { + storage_type = utils::kBuffer; + } + + ValueRef packed_weight = graph.add_tensor( + packed_weight_sizes, + vkcompute::vkapi::kInt, + storage_type, + utils::kWidthPacked); + + utils::uvec3 global_wg_size = { + utils::safe_downcast(num_blocks_x), + utils::safe_downcast(num_blocks_y), + 1u}; + + std::string 
kernel_name = "pack_q8_conv2d_weights"; + add_storage_type_suffix(kernel_name, storage_type); + + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_wg_size, + graph.create_local_wg_size(global_wg_size), + // Inputs and Outputs + weight_data, + packed_weight, + // UBOs + {}, + // Specialization Constants + {}, + // Push Constants + {graph.sizes_pc_of(packed_weight), + PushConstantDataInfo(&orig_sizes, sizeof(utils::ivec4))})); + + return packed_weight; +} + +// +// Dispatch nodes +// + +void add_q8ta_conv2d_node( + ComputeGraph& graph, + const ValueRef packed_int8_input, + const ValueRef packed_int8_input_im2col, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef packed_weight, + const ValueRef packed_weight_sums, + const ValueRef packed_weight_scales, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef bias_data, + const ValueRef packed_bias, + const ValueRef kernel_size, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef groups, + const ValueRef packed_int8_output) { + (void)packed_int8_input_im2col; // Not used in general shader + + Conv2DParams conv_params = create_conv2d_params( + graph, + packed_int8_input, + packed_int8_output, + kernel_size, + stride, + padding, + dilation, + groups); + + // The implementation requires that for grouped convolutions, the input + // channels per group is a multiple of 4. + if (conv_params.groups > 1) { + VK_CHECK_COND(conv_params.in_channels_per_group % 4 == 0); + } + + // Validate packed dim info for input and output tensors + VK_CHECK_COND(q8ta_conv2d_check_packed_dim_info( + graph.packed_dim_info_of(packed_int8_input))); + VK_CHECK_COND(q8ta_conv2d_check_packed_dim_info( + graph.packed_dim_info_of(packed_int8_output))); + + // Validate dtype is kInt8x4 + VK_CHECK_COND(graph.dtype_of(packed_int8_input) == vkapi::kInt8x4); + VK_CHECK_COND(graph.dtype_of(packed_int8_output) == vkapi::kInt8x4); + + float input_scale_val = graph.extract_scalar(input_scale); + int32_t input_zp_val = graph.extract_scalar(input_zp); + + float output_inv_scale_val = 1.0f / graph.extract_scalar(output_scale); + int32_t output_zp_val = graph.extract_scalar(output_zp); + + uint32_t apply_bias = 1; + if (graph.val_is_none(bias_data)) { + apply_bias = 0; + } + + std::vector push_constants = { + PushConstantDataInfo(&input_scale_val, sizeof(input_scale_val)), + PushConstantDataInfo(&input_zp_val, sizeof(input_zp_val)), + PushConstantDataInfo(&output_inv_scale_val, sizeof(output_inv_scale_val)), + PushConstantDataInfo(&output_zp_val, sizeof(output_zp_val)), + }; + + // Select shader based on layout + std::string kernel_name = "q8ta_conv2d"; + add_dtype_suffix(kernel_name, graph.dtype_of(packed_weight_scales)); + + // Pass metadata for both output and input tensors + vkapi::ParamsBindList param_buffers = { + graph.buffer_meta_ubo(packed_int8_output), + graph.buffer_meta_ubo(packed_int8_input), + graph.create_params_buffer(conv_params)}; + + // Build spec constants: apply_bias + layout constants + vkapi::SpecVarList spec_constants = { + apply_bias, + // Layout specialization constants + graph.hashed_layout_of(packed_int8_input), + graph.hashed_layout_of(packed_int8_output), + }; + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + pick_q8ta_conv2d_global_wg_size, + pick_q8ta_conv2d_local_wg_size, + // Inputs and Outputs + {{packed_int8_output, vkapi::kWrite}, + {{packed_int8_input, + 
packed_weight, + packed_weight_sums, + packed_weight_scales, + packed_bias}, + vkapi::kRead}}, + // Shader params buffers + param_buffers, + // Push Constants + push_constants, + // Specialization Constants + spec_constants, + // Resize args + {})); +} + +// +// High level operator impl +// + +void q8ta_conv2d(ComputeGraph& graph, const std::vector& args) { + int32_t idx = 0; + const ValueRef packed_int8_input = args.at(idx++); + const ValueRef input_scale = args.at(idx++); + const ValueRef input_zp = args.at(idx++); + const ValueRef weight_data = args.at(idx++); + const ValueRef weight_sums_data = args.at(idx++); + const ValueRef weight_scales_data = args.at(idx++); + const ValueRef output_scale = args.at(idx++); + const ValueRef output_zp = args.at(idx++); + const ValueRef bias_data = args.at(idx++); + const ValueRef kernel_size = args.at(idx++); + const ValueRef stride = args.at(idx++); + const ValueRef padding = args.at(idx++); + const ValueRef dilation = args.at(idx++); + const ValueRef groups = args.at(idx++); + const ValueRef packed_int8_output = args.at(idx++); + + QuantizationConfig weight_quant_config(8, kPerChannel, {}); + + // Prepack weight using the conv2d weight packing for the general shader + ValueRef packed_weight = prepack_quantized_conv2d_weight( + graph, + weight_quant_config, + weight_data, + packed_int8_input, + packed_int8_output, + groups, + kernel_size); + + ValueRef packed_weight_sums = prepack_standard( + graph, weight_sums_data, utils::kBuffer, utils::kWidthPacked); + + ValueRef packed_weight_scales = prepack_standard( + graph, weight_scales_data, utils::kBuffer, utils::kWidthPacked); + + // Create a dummy tensor to fill the binding slot of the bias tensor if it is + // not provided. This helps simplify dispatch logic and makes it so that + // fewer shader variants need to be generated. 
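As an aside on the weight_sums tensor prepacked above: it exists because of a standard rewrite of the quantized accumulation. With symmetric weights, sum_ic (x_q - x_zp) * w_q = sum_ic x_q * w_q - x_zp * sum_ic w_q, which is why the shader can accumulate raw x_q * w_q dot products and subtract weight_sums * input_zp afterwards. A scalar sketch of that identity (illustrative names, not runtime code):

#include <cassert>
#include <cstdint>
#include <vector>

// What the shader effectively computes: raw accumulation plus a correction
// using the precomputed per-output-channel weight sum.
int32_t conv_acc_with_correction(const std::vector<int8_t>& x_q,
                                 const std::vector<int8_t>& w_q,
                                 int32_t input_zp) {
  int32_t acc = 0;   // what dotPacked4x8AccSatEXT accumulates
  int32_t w_sum = 0; // what t_weight_sums stores per output channel
  for (size_t i = 0; i < x_q.size(); ++i) {
    acc += int32_t(x_q[i]) * int32_t(w_q[i]);
    w_sum += int32_t(w_q[i]);
  }
  return acc - input_zp * w_sum;
}

// The mathematically equivalent form with the zero point subtracted up front.
int32_t conv_acc_reference(const std::vector<int8_t>& x_q,
                           const std::vector<int8_t>& w_q,
                           int32_t input_zp) {
  int32_t acc = 0;
  for (size_t i = 0; i < x_q.size(); ++i) {
    acc += (int32_t(x_q[i]) - input_zp) * int32_t(w_q[i]);
  }
  return acc;
}

// Usage: assert(conv_acc_with_correction(x, w, zp) == conv_acc_reference(x, w, zp));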
+ TmpTensor dummy_bias( + &graph, + {}, + graph.dtype_of(weight_scales_data), + utils::kBuffer, + utils::kWidthPacked); + + ValueRef packed_bias = dummy_bias.vref; + if (graph.val_is_not_none(bias_data)) { + packed_bias = + prepack_standard(graph, bias_data, utils::kBuffer, utils::kWidthPacked); + } + + // The general q8ta_conv2d shader does not use im2col, so pass input as im2col + add_q8ta_conv2d_node( + graph, + packed_int8_input, + packed_int8_input, // packed_int8_input_im2col - not used in general + // shader + input_scale, + input_zp, + packed_weight, + packed_weight_sums, + packed_weight_scales, + output_scale, + output_zp, + bias_data, + packed_bias, + kernel_size, + stride, + padding, + dilation, + groups, + packed_int8_output); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(etvk.q8ta_conv2d.default, q8ta_conv2d); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h index 5f028caec12..2c66537acc5 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h @@ -13,6 +13,17 @@ namespace vkcompute { +bool q8ta_conv2d_check_packed_dim_info(const api::PackedDimInfo& info); + +ValueRef prepack_quantized_conv2d_weight( + ComputeGraph& graph, + const QuantizationConfig& weight_quant_config, + const ValueRef weight_data, + const ValueRef input, + const ValueRef output, + const ValueRef groups, + const ValueRef kernel_size); + ValueRef prepack_quantized_conv2d_weight( ComputeGraph& graph, const QuantizationConfig& weight_quant_config, diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp index 5e1e8aab599..121a577555f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp @@ -292,6 +292,16 @@ void add_q8ta_conv2d_dw_node( dilation, groups); + // Validate packed dim info for input and output tensors + VK_CHECK_COND(q8ta_conv2d_check_packed_dim_info( + graph.packed_dim_info_of(packed_int8_input))); + VK_CHECK_COND(q8ta_conv2d_check_packed_dim_info( + graph.packed_dim_info_of(packed_int8_output))); + + // Validate dtype is kInt8x4 + VK_CHECK_COND(graph.dtype_of(packed_int8_input) == vkapi::kInt8x4); + VK_CHECK_COND(graph.dtype_of(packed_int8_output) == vkapi::kInt8x4); + // Verify this is actually a depthwise convolution const int64_t groups_val = graph.extract_scalar(groups); const int64_t in_channels = graph.size_at(-3, packed_int8_input); diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp index a4d959aec41..1bfff6f1342 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp @@ -364,93 +364,6 @@ utils::uvec3 pick_static_quantized_conv2d_local_wg_size( graph, shader, global_workgroup_size, args, resize_args); } -// -// Prepack nodes -// - -ValueRef prepack_quantized_conv2d_weight( - ComputeGraph& graph, - const QuantizationConfig& weight_quant_config, - const ValueRef weight_data, - const ValueRef input, - const ValueRef output, - const ValueRef groups, - const ValueRef kernel_size) { - VK_CHECK_COND(weight_quant_config.nbits == 8); - VK_CHECK_COND(weight_quant_config.is_symmetric); - - const int32_t groups_val = graph.get_int(groups); - - const int64_t OC = graph.size_at(-3, output); - const int64_t IC = 
graph.size_at(-3, input) / groups_val; - - int64_t K_h; - int64_t K_w; - - { - const auto kernel_size_list = graph.get_int_list(kernel_size); - K_h = kernel_size_list->at(0); - K_w = kernel_size_list->at(1); - } - - const int64_t num_blocks_OC = utils::div_up_4(OC); - const int64_t num_blocks_IC = utils::div_up_4(IC); - - const int64_t num_blocks_y = num_blocks_IC * K_h; - const int64_t num_blocks_x = K_w * num_blocks_OC; - - // The packed tensor arranges blocks as [OC_blocks * K_total, IC_blocks] - const int64_t output_height = num_blocks_y; - const int64_t output_width = num_blocks_x * 4; - - // Store the original sizes of the weight data to pass to the shader - utils::ivec4 orig_sizes = { - utils::safe_downcast(OC), - utils::safe_downcast(K_h), - utils::safe_downcast(K_w), - utils::safe_downcast(IC)}; - - std::vector packed_weight_sizes{output_height, output_width}; - - utils::StorageType storage_type = utils::kTexture2D; - uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); - if (output_width > max_extent * 4 || output_height > max_extent) { - storage_type = utils::kBuffer; - } - - ValueRef packed_weight = graph.add_tensor( - packed_weight_sizes, - vkcompute::vkapi::kInt, - storage_type, - utils::kWidthPacked); - - utils::uvec3 global_wg_size = { - utils::safe_downcast(num_blocks_x), - utils::safe_downcast(num_blocks_y), - 1u}; - - std::string kernel_name = "pack_q8_conv2d_weights"; - add_storage_type_suffix(kernel_name, storage_type); - - graph.prepack_nodes().emplace_back(new PrepackNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_wg_size, - graph.create_local_wg_size(global_wg_size), - // Inputs and Outputs - weight_data, - packed_weight, - // UBOs - {}, - // Specialization Constants - {}, - // Push Constants - {graph.sizes_pc_of(packed_weight), - PushConstantDataInfo(&orig_sizes, sizeof(utils::ivec4))})); - - return packed_weight; -} - // // Dispatch nodes // @@ -824,7 +737,7 @@ void add_conv2d_q8ta_q8csw_linear_node( nullptr)); } -void add_conv2d_q8ta_q8csw_q8to_node( +void add_conv2d_q8ta_q8csw_q8to_4w4c_node( ComputeGraph& graph, const ValueRef packed_int8_input, const ValueRef packed_int8_input_im2col, @@ -853,38 +766,38 @@ void add_conv2d_q8ta_q8csw_q8to_node( dilation, groups); - const bool use_im2col = should_use_im2col(&graph, kernel_size, groups); - float input_scale_val = graph.extract_scalar(input_scale); int32_t input_zp_val = graph.extract_scalar(input_zp); float output_inv_scale_val = 1.0f / graph.extract_scalar(output_scale); int32_t output_zp_val = graph.extract_scalar(output_zp); + uint32_t apply_bias = 1; + if (graph.val_is_none(bias_data)) { + apply_bias = 0; + } + + std::vector push_constants = { + PushConstantDataInfo(&input_scale_val, sizeof(input_scale_val)), + PushConstantDataInfo(&input_zp_val, sizeof(input_zp_val)), + PushConstantDataInfo(&output_inv_scale_val, sizeof(output_inv_scale_val)), + PushConstantDataInfo(&output_zp_val, sizeof(output_zp_val)), + }; + + // Use the optimized im2col or direct shader for 4W4C layout + const bool use_im2col = should_use_im2col(&graph, kernel_size, groups); + std::string kernel_name = use_im2col ? 
"conv2d_q8ta_q8csw_q8to_linear_tiled" : "conv2d_q8ta_q8csw_q8to"; add_storage_type_suffix( kernel_name, graph.storage_type_of(packed_int8_output)); add_storage_type_suffix(kernel_name, graph.storage_type_of(packed_weight)); add_dtype_suffix(kernel_name, graph.dtype_of(packed_weight_scales)); - vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); vkapi::ParamsBindList param_buffers = { graph.sizes_ubo(packed_int8_output), graph.sizes_ubo(packed_int8_input_im2col)}; - std::vector push_constants = { - PushConstantDataInfo(&input_scale_val, sizeof(input_scale_val)), - PushConstantDataInfo(&input_zp_val, sizeof(input_zp_val)), - PushConstantDataInfo(&output_inv_scale_val, sizeof(output_inv_scale_val)), - PushConstantDataInfo(&output_zp_val, sizeof(output_zp_val)), - }; - - uint32_t apply_bias = 1; - if (graph.val_is_none(bias_data)) { - apply_bias = 0; - } - vkapi::SpecVarList spec_constants = GenerateSpecConstants(graph, conv_params, groups, apply_bias); @@ -913,6 +826,78 @@ void add_conv2d_q8ta_q8csw_q8to_node( nullptr)); } +void add_conv2d_q8ta_q8csw_q8to_node( + ComputeGraph& graph, + const ValueRef packed_int8_input, + const ValueRef packed_int8_input_im2col, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef packed_weight, + const ValueRef packed_weight_sums, + const ValueRef packed_weight_scales, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef bias_data, + const ValueRef packed_bias, + const ValueRef kernel_size, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef groups, + const ValueRef packed_int8_output) { + // Check if the input/output layout is 4W4C (optimized path) + const utils::GPUMemoryLayout inp_layout = + graph.estimate_memory_layout_of(packed_int8_input); + const utils::GPUMemoryLayout outp_layout = + graph.estimate_memory_layout_of(packed_int8_output); + + const bool use_optimized_shader = + (inp_layout == utils::kPackedInt8_4W4C && + outp_layout == utils::kPackedInt8_4W4C); + + if (use_optimized_shader) { + add_conv2d_q8ta_q8csw_q8to_4w4c_node( + graph, + packed_int8_input, + packed_int8_input_im2col, + input_scale, + input_zp, + packed_weight, + packed_weight_sums, + packed_weight_scales, + output_scale, + output_zp, + bias_data, + packed_bias, + kernel_size, + stride, + padding, + dilation, + groups, + packed_int8_output); + } else { + add_q8ta_conv2d_node( + graph, + packed_int8_input, + packed_int8_input_im2col, + input_scale, + input_zp, + packed_weight, + packed_weight_sums, + packed_weight_scales, + output_scale, + output_zp, + bias_data, + packed_bias, + kernel_size, + stride, + padding, + dilation, + groups, + packed_int8_output); + } +} + // // High level operator impl // @@ -1171,7 +1156,18 @@ void static_quantized_conv2d_impl( // same 4Wx4C block have the same group index. 
const bool is_depthwise = (groups_val == in_channels); - const bool use_im2col = should_use_im2col(&graph, kernel_size, groups); + // Check if input/output layouts are 4W4C (optimized im2col path) + const utils::GPUMemoryLayout inp_layout = + graph.estimate_memory_layout_of(packed_int8_input); + const utils::GPUMemoryLayout outp_layout = + graph.estimate_memory_layout_of(packed_int8_output); + const bool is_optimized_layout = + (inp_layout == utils::kPackedInt8_4W4C && + outp_layout == utils::kPackedInt8_4W4C); + + // Only use im2col path for 4W4C layouts + const bool use_im2col = + is_optimized_layout && should_use_im2col(&graph, kernel_size, groups); // For pointwise convolution with stride = 1, padding = 0, dilation = 1, the // input tensor is already equivalent to its im2col representation. In this // case we can skip the im2col procedure and pass in the input image to the diff --git a/backends/vulkan/test/custom_ops/CMakeLists.txt b/backends/vulkan/test/custom_ops/CMakeLists.txt index 0deea46c292..5e794e3b48b 100644 --- a/backends/vulkan/test/custom_ops/CMakeLists.txt +++ b/backends/vulkan/test/custom_ops/CMakeLists.txt @@ -99,7 +99,7 @@ if(TARGET vulkan_backend) add_operator_prototype(choose_qparams_per_row) add_operator_prototype(test_q8ta_qdq) add_operator_prototype(test_q8ta_clone) - add_operator_prototype(q8ta_q8csw_q8to_conv2d) + add_operator_prototype(test_q8ta_conv2d) add_operator_prototype(test_q8ta_conv2d_dw) add_operator_prototype(q8ta_q8ta_q8to_add) endif() diff --git a/backends/vulkan/test/custom_ops/impl/TestQ8taConv2d.cpp b/backends/vulkan/test/custom_ops/impl/TestQ8taConv2d.cpp index 861f25c0606..fca82ef3eee 100644 --- a/backends/vulkan/test/custom_ops/impl/TestQ8taConv2d.cpp +++ b/backends/vulkan/test/custom_ops/impl/TestQ8taConv2d.cpp @@ -89,8 +89,83 @@ void test_q8ta_conv2d_dw( graph, packed_int8_output, output_scale, output_zp, fp_output); } +void test_q8ta_conv2d(ComputeGraph& graph, const std::vector& args) { + int32_t idx = 0; + const ValueRef fp_input = args.at(idx++); + const ValueRef input_scale = args.at(idx++); + const ValueRef input_zp = args.at(idx++); + const ValueRef weight_data = args.at(idx++); + const ValueRef weight_sums_data = args.at(idx++); + const ValueRef weight_scales_data = args.at(idx++); + const ValueRef output_scale = args.at(idx++); + const ValueRef output_zp = args.at(idx++); + const ValueRef bias_data = args.at(idx++); + const ValueRef kernel_size = args.at(idx++); + const ValueRef stride = args.at(idx++); + const ValueRef padding = args.at(idx++); + const ValueRef dilation = args.at(idx++); + const ValueRef groups = args.at(idx++); + const ValueRef layout_int = args.at(idx++); + const ValueRef impl_selector_str = args.at(idx++); + const ValueRef fp_output = args.at(idx++); + + // Extract the layout parameter and cast to GPUMemoryLayout + int32_t layout_value = graph.extract_scalar(layout_int); + utils::GPUMemoryLayout layout = + static_cast(layout_value); + + // Extract the impl_selector string + std::string impl_selector = graph.extract_string(impl_selector_str); + + // Create temporary packed int8 tensors for input and output + TmpTensor packed_int8_input( + &graph, graph.sizes_of(fp_input), vkapi::kInt8x4, utils::kBuffer, layout); + + TmpTensor packed_int8_output( + &graph, + graph.sizes_of(fp_output), + vkapi::kInt8x4, + utils::kBuffer, + layout); + + // Quantize floating point input to packed int8 + add_q8ta_quantize_node( + graph, fp_input, input_scale, input_zp, packed_int8_input); + + // Build args for conv operator + 
std::vector conv_args = { + packed_int8_input, + input_scale, + input_zp, + weight_data, + weight_sums_data, + weight_scales_data, + output_scale, + output_zp, + bias_data, + kernel_size, + stride, + padding, + dilation, + groups, + packed_int8_output}; + + if (impl_selector == "legacy_4w4c") { + // Use the general quantized conv2d operator for legacy path + VK_GET_OP_FN("et_vk.conv2d_q8ta_q8csw_q8to.default")(graph, conv_args); + } else { + // Use the new general q8ta_conv2d operator + VK_GET_OP_FN("etvk.q8ta_conv2d.default")(graph, conv_args); + } + + // Dequantize packed int8 output to floating point + add_q8ta_dequantize_node( + graph, packed_int8_output, output_scale, output_zp, fp_output); +} + REGISTER_OPERATORS { VK_REGISTER_OP(test_etvk.test_q8ta_conv2d_dw.default, test_q8ta_conv2d_dw); + VK_REGISTER_OP(test_etvk.test_q8ta_conv2d.default, test_q8ta_conv2d); } } // namespace vkcompute diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl index 1bf8be41854..bb84a94399b 100644 --- a/backends/vulkan/test/custom_ops/targets.bzl +++ b/backends/vulkan/test/custom_ops/targets.bzl @@ -93,6 +93,6 @@ def define_common_targets(is_fbcode = False): define_custom_op_test_binary("q4gsw_linear") define_custom_op_test_binary("test_q8ta_qdq") define_custom_op_test_binary("test_q8ta_clone") - define_custom_op_test_binary("q8ta_q8csw_q8to_conv2d") + define_custom_op_test_binary("test_q8ta_conv2d") define_custom_op_test_binary("test_q8ta_conv2d_dw") define_custom_op_test_binary("q8ta_q8ta_q8to_add") diff --git a/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp b/backends/vulkan/test/custom_ops/test_q8ta_conv2d.cpp similarity index 83% rename from backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp rename to backends/vulkan/test/custom_ops/test_q8ta_conv2d.cpp index daf99db06c9..6fe5db293c8 100644 --- a/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp +++ b/backends/vulkan/test/custom_ops/test_q8ta_conv2d.cpp @@ -24,24 +24,13 @@ using namespace vkcompute; static constexpr int64_t kRefDimSizeLimit = 100; // Utility function to create a test case from a Conv2dConfig -TestCase create_test_case_from_config( +static TestCase create_test_case_from_config( const Conv2dConfig& config, vkapi::ScalarType input_dtype, utils::StorageType fp_storage_type, - utils::StorageType int8_storage_type) { + utils::GPUMemoryLayout int8_memory_layout, + const std::string& impl_selector = "general") { TestCase test_case; - test_case.set_name(config.test_case_name); - - std::string operator_suffix = ".test"; - if (int8_storage_type == utils::kTexture3D) { - operator_suffix += "_texture"; - } else { - operator_suffix += "_buffer"; - } - - // Set the operator name for the test case - std::string operator_name = "etvk." + config.op_name + operator_suffix; - test_case.set_operator_name(operator_name); // Calculate output dimensions int64_t H_out = config.get_output_height(); @@ -55,6 +44,26 @@ TestCase create_test_case_from_config( ? 
utils::kWidthPacked : utils::kChannelsPacked; + // Create test case name + // Format: ACCU/PERF OC->IC I=H,W g=groups k=kernel Tex(CP)->Buf(4C1W) + std::string prefix = config.test_case_name.substr(0, 4); // "ACCU" or "PERF" + std::string test_name = prefix + " " + std::to_string(config.channels.out) + + "->" + std::to_string(config.channels.in) + " " + + "I=" + std::to_string(config.input_size.h) + "," + + std::to_string(config.input_size.w) + " " + + "g=" + std::to_string(config.groups) + " " + + "k=" + std::to_string(config.kernel.h) + " " + + repr_str(fp_storage_type, fp_memory_layout) + "->" + + repr_str(utils::kBuffer, int8_memory_layout); + if (!impl_selector.empty()) { + test_name += " [" + impl_selector + "]"; + } + test_case.set_name(test_name); + + // Set the operator name for the test case - use the unified test operator + std::string operator_name = "test_etvk.test_q8ta_conv2d.default"; + test_case.set_operator_name(operator_name); + ValueSpec input_tensor( input_size, input_dtype, @@ -170,10 +179,26 @@ TestCase create_test_case_from_config( test_case.add_input_spec(dilation); test_case.add_input_spec(groups); + // Add memory layout parameter for the quantized tensors + ValueSpec layout_int(static_cast(int8_memory_layout)); + test_case.add_input_spec(layout_int); + + // Add impl_selector string + ValueSpec impl_selector_spec = ValueSpec::make_string(impl_selector); + test_case.add_input_spec(impl_selector_spec); + test_case.add_output_spec(output); test_case.set_abs_tolerance(output_scale_val + 1e-4f); + // Filter out quantize/dequantize shaders from timing measurements + test_case.set_shader_filter({ + "nchw_to", + "to_nchw", + "q8ta_quantize", + "q8ta_dequantize", + }); + return test_case; } @@ -184,25 +209,41 @@ std::vector generate_quantized_conv2d_easy_cases() { // Single simple configuration for debugging Conv2dConfig config = { OutInChannels(16, 8), // channels (out, in) - InputSize2D(21, 17), // input_size (h, w) + InputSize2D(5, 5), // input_size (h, w) KernelSize(3, 3), // kernel Stride(1, 1), // stride Padding(1, 1), // padding Dilation(1, 1), // dilation - 2, // groups + 1, // groups }; config.op_name = "conv2d_q8ta_q8csw_q8to"; - std::vector storage_types = { + std::vector fp_storage_types = { utils::kTexture3D, utils::kBuffer}; + // Memory layouts for int8 tensors - test both optimized (4W4C) and general + // paths + std::vector int8_memory_layouts = { + utils::kPackedInt8_4C1W, utils::kPackedInt8_4W4C, utils::kPackedInt8_4C}; + // Generate test cases for each combination - for (const utils::StorageType fp_storage_type : storage_types) { - for (const utils::StorageType int8_storage_type : storage_types) { - config.test_case_name = make_test_case_name( - config, false, fp_storage_type, int8_storage_type); + for (const utils::StorageType fp_storage_type : fp_storage_types) { + for (const utils::GPUMemoryLayout int8_memory_layout : + int8_memory_layouts) { + config.test_case_name = + make_test_case_name(config, false, fp_storage_type, utils::kBuffer); test_cases.push_back(create_test_case_from_config( - config, vkapi::kFloat, fp_storage_type, int8_storage_type)); + config, vkapi::kFloat, fp_storage_type, int8_memory_layout)); + + // For 4W4C layout, also test the legacy implementation + if (int8_memory_layout == utils::kPackedInt8_4W4C) { + test_cases.push_back(create_test_case_from_config( + config, + vkapi::kFloat, + fp_storage_type, + int8_memory_layout, + /*impl_selector=*/"legacy_4w4c")); + } } } @@ -210,7 +251,7 @@ std::vector 
generate_quantized_conv2d_easy_cases() { } // Generate test cases for quantized conv2d operation -std::vector generate_quantized_conv2d_test_cases() { +static std::vector generate_quantized_conv2d_test_cases() { std::vector test_cases; if (!vkcompute::api::context()->adapter_ptr()->supports_int8_dot_product()) { return test_cases; @@ -371,10 +412,15 @@ std::vector generate_quantized_conv2d_test_cases() { Dilation(1, 1), 4}}; - // Test with different storage types and data types - std::vector storage_types = { + // Test with different storage types and memory layouts + std::vector fp_storage_types = { utils::kTexture3D, utils::kBuffer}; + // Memory layouts for int8 tensors - test both optimized (4W4C) and general + // paths + std::vector int8_memory_layouts = { + utils::kPackedInt8_4C1W, utils::kPackedInt8_4W4C, utils::kPackedInt8_4C}; + // Generate test cases for each combination for (auto& config : configs) { bool is_performance = config.channels.out > kRefDimSizeLimit || @@ -384,12 +430,23 @@ std::vector generate_quantized_conv2d_test_cases() { config.op_name = "conv2d_q8ta_q8csw_q8to"; - for (const utils::StorageType fp_storage_type : storage_types) { - for (const utils::StorageType int8_storage_type : storage_types) { + for (const utils::StorageType fp_storage_type : fp_storage_types) { + for (const utils::GPUMemoryLayout int8_memory_layout : + int8_memory_layouts) { config.test_case_name = make_test_case_name( - config, is_performance, fp_storage_type, int8_storage_type); + config, is_performance, fp_storage_type, utils::kBuffer); test_cases.push_back(create_test_case_from_config( - config, vkapi::kFloat, fp_storage_type, int8_storage_type)); + config, vkapi::kFloat, fp_storage_type, int8_memory_layout)); + + // For 4W4C layout, also test the legacy implementation + if (int8_memory_layout == utils::kPackedInt8_4W4C) { + test_cases.push_back(create_test_case_from_config( + config, + vkapi::kFloat, + fp_storage_type, + int8_memory_layout, + /*impl_selector=*/"legacy_4w4c")); + } } } } @@ -398,7 +455,7 @@ std::vector generate_quantized_conv2d_test_cases() { } // Reference implementation for activation, weight, and output quantized conv2d -void conv2d_q8ta_q8csw_q8to_reference_impl(TestCase& test_case) { +static void conv2d_q8ta_q8csw_q8to_reference_impl(TestCase& test_case) { // Extract input specifications int32_t idx = 0; const ValueSpec& input_spec = test_case.inputs()[idx++]; @@ -416,6 +473,10 @@ void conv2d_q8ta_q8csw_q8to_reference_impl(TestCase& test_case) { const ValueSpec& padding_spec = test_case.inputs()[idx++]; const ValueSpec& dilation_spec = test_case.inputs()[idx++]; const ValueSpec& groups_spec = test_case.inputs()[idx++]; + const ValueSpec& layout_spec = test_case.inputs()[idx++]; + (void)layout_spec; // Not used in reference implementation + const ValueSpec& impl_selector_spec = test_case.inputs()[idx++]; + (void)impl_selector_spec; // Not used in reference implementation // Extract output specification (mutable reference) ValueSpec& output_spec = test_case.outputs()[0]; @@ -591,12 +652,12 @@ void conv2d_q8ta_q8csw_q8to_reference_impl(TestCase& test_case) { } } -void reference_impl(TestCase& test_case) { +static void reference_impl(TestCase& test_case) { conv2d_q8ta_q8csw_q8to_reference_impl(test_case); } // Custom FLOP calculator for quantized conv2d operation -int64_t quantized_conv2d_flop_calculator(const TestCase& test_case) { +static int64_t quantized_conv2d_flop_calculator(const TestCase& test_case) { int kernel_idx = 9; // kernel_size is at index 9 for 
q8ta_q8csw_q8to // Get input and weight dimensions
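For context on what the reference implementation above has to reproduce, here is a scalar sketch of the q8ta/q8csw/q8to arithmetic for a single output element: the int32 accumulation over the receptive field is dequantized with the input scale and per-output-channel weight scale, the float bias is added, and the result is requantized with the output scale and zero point. Function and variable names are illustrative, not the test harness API.

#include <algorithm>
#include <cmath>
#include <cstdint>

// One output element of an activation/weight/output quantized conv2d:
//   acc   = sum over the receptive field of (x_q - x_zp) * w_q   (int32)
//   real  = acc * x_scale * w_scale[oc] + bias[oc]               (float)
//   out_q = clamp(round(real / out_scale) + out_zp, -128, 127)   (int8)
int8_t requantize_output(int32_t acc,
                         float x_scale,
                         float w_scale_oc,
                         float bias_oc,
                         float out_scale,
                         int32_t out_zp) {
  const float real = float(acc) * x_scale * w_scale_oc + bias_oc;
  const int32_t q = int32_t(std::lround(real / out_scale)) + out_zp;
  return int8_t(std::clamp(q, -128, 127));
}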