diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d.glsl new file mode 100644 index 00000000000..623de3a5d9a --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d.glsl @@ -0,0 +1,249 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +${define_required_extensions("buffer", DTYPE)} + +#extension GL_EXT_control_flow_attributes : require +#extension GL_EXT_integer_dot_product : require + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_load_type(DTYPE, "buffer")} +#define T ${texel_load_component_type(DTYPE, "buffer")} + +${define_active_storage_type("buffer")} + +layout(std430) buffer; + +#include "indexing.glslh" +#include "common.glslh" +#include "conv2d_common.glslh" + +${layout_declare_tensor(B, "w", "t_packed_int8_output", "int", "buffer", is_scalar_array=True)} +${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", "buffer", is_scalar_array=True)} +${layout_declare_tensor(B, "r", "t_packed_int8_weight", "int", "texture2d", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)} + +// Metadata for input/output tensors (memory layout agnostic) +${layout_declare_ubo(B, "BufferMetadata", "outp")} +${layout_declare_ubo(B, "BufferMetadata", "inp")} +${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} + +layout(push_constant) uniform restrict Block { + float input_scale; + int input_zp; + float output_inv_scale; + int output_zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +${layout_declare_spec_const(C, "int", "apply_bias", "1")} + +// Layout specialization constants +${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")} +${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")} + +// Load weight block for a given (ic4, kx, ky, oc4) position. +// Weight texture layout (from pack_q8_conv2d_weights.glsl): +// block_x = oc4 * K_w + kx +// block_y = ky * IC4 + ic4 +// Each texel ivec4 has 4 components (4 output channels), each component is +// a packed int32 containing 4 int8 values for 4 consecutive input channels. 
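The block addressing described in the comment above is easy to get wrong when cross-checking the packing shader against this consumer, so the following host-side sketch spells out the mapping. The helper is illustrative only; in particular, the assumption that the int8 lane inside each packed int32 is ic % 4 follows from the comment ("4 consecutive input channels") rather than from any runtime API.

#include <cstdint>

// Illustrative helper: where weight element (oc, ic, ky, kx) lands in the
// packed 2D weight texture described above.
struct PackedWeightCoord {
  int texel_x;   // block_x = oc4 * K_w + kx
  int texel_y;   // block_y = ky * IC4 + ic4
  int component; // output channel within the 4-wide block (oc % 4)
  int byte_lane; // input channel within the 4-wide block (assumed ic % 4)
};

PackedWeightCoord locate_packed_weight(
    int oc, int ic, int ky, int kx, int K_w, int IC4) {
  PackedWeightCoord c;
  c.texel_x = (oc / 4) * K_w + kx;
  c.texel_y = ky * IC4 + (ic / 4);
  c.component = oc % 4;
  c.byte_lane = ic % 4;
  return c;
}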
+ivec4 load_weight_block(int ic4, int kx, int ky, int oc4, int IC4, int KW) { + const int block_x = oc4 * KW + kx; + const int block_y = ky * IC4 + ic4; + return texelFetch(t_packed_int8_weight, ivec2(block_x, block_y), 0); +} + +ivec4 quantize(const vec4 texel, const float inv_scale, const int zp) { + vec4 quantized = round(texel * inv_scale) + zp; + return clamp(ivec4(quantized), -128, 127); +} + +void main() { + // Thread mapping + int oc4 = int(gl_GlobalInvocationID.z); + int w4 = int(gl_GlobalInvocationID.x); + + // Initialize output tensor index (WHCN order) + // Each thread handles 4 adjacent widths starting at base_out_w + TensorIndex4D outp_tidx; + outp_tidx.data[0] = w4 * 4; + outp_tidx.data[1] = int(gl_GlobalInvocationID.y); + outp_tidx.data[2] = oc4 * 4; + outp_tidx.data[3] = 0; + + const int W = int(outp.sizes[0][0]); + const int OC = int(outp.sizes[0][2]); + const int OC4 = int(div_up_4(OC)); + + // Bounds check + if (any(greaterThanEqual(outp_tidx.data, ivec4(outp.sizes[0])))) { + return; + } + + // Input dimensions + const int inp_W = int(inp.sizes[0][0]); + const int inp_H = int(inp.sizes[0][1]); + const int IC = int(inp.sizes[0][2]); + + // Compute channels per group + const int OC_per_group = OC / conv2d_params.groups; + const int IC_per_group = IC / conv2d_params.groups; + const int IC4_per_group = div_up_4(IC_per_group); + + // Determine which group this output channel block belongs to + const int group_idx = outp_tidx.data[2] / OC_per_group; + const int ic_group_start = group_idx * IC_per_group; + + // Get strides for efficient indexing + const int inp_w_stride = int(inp.strides[0][0]); + const int inp_h_stride = int(inp.strides[0][1]); + const int inp_c_stride = int(inp.strides[0][2]); + const int w_texel_step = conv2d_params.dilation.x * inp_w_stride; + const int h_texel_step = conv2d_params.dilation.y * inp_h_stride; + const int subtile_w_step = conv2d_params.stride.x * inp_w_stride; + + // Compute base input position (for subtile_w=0, ic4=0) + TensorIndex4D inp_tidx; + inp_tidx.data[0] = outp_tidx.data[0] * conv2d_params.stride.x - conv2d_params.padding.x; + inp_tidx.data[1] = outp_tidx.data[1] * conv2d_params.stride.y - conv2d_params.padding.y; + inp_tidx.data[2] = ic_group_start; + inp_tidx.data[3] = 0; + + int base_inp_texel_idx; + if (get_outer_packed_dim_block_size(inp_layout) == 1) { + base_inp_texel_idx = tensor4d_idx_to_texel_idx(inp, inp_tidx, inp_layout); + } + + // Store base width to reset at beginning of each loop + const int base_inp_w = inp_tidx.data[0]; + + // Create packed input zero point (4 copies of input_zp packed into int32) + const int input_zp_packed = pack_into_int32(ivec4(input_zp)); + + // Initialize accumulators for 4 width positions × 4 output channels each + ivec4 acc[4]; + [[unroll]] for (int i = 0; i < 4; ++i) { + acc[i] = ivec4(0); + } + + // Perform convolution using packed int8 dot products + for (int ky = 0; ky < conv2d_params.kernel_size.y; ky++) { + const bool h_in_bounds = (inp_tidx.data[1] >= 0 && inp_tidx.data[1] < inp_H); + + // Process input channels in blocks of 4 + for (int ic4 = 0; ic4 < IC4_per_group; ic4++) { + // Input channel index for this block (base channel of the 4-channel block) + inp_tidx.data[2] = ic_group_start + ic4 * 4; + + // Reset width coordinate at start of each ic4 iteration + inp_tidx.data[0] = base_inp_w; + + for (int kx = 0; kx < conv2d_params.kernel_size.x; kx++) { + // Load weight block: 4 output channels × 4 input channels + // weight_block[oc] contains packed weights for ic4*4 to ic4*4+3 -> oc 
+ const ivec4 weight_block = load_weight_block(ic4, kx, ky, oc4, IC4_per_group, conv2d_params.kernel_size.x); + + // Process 4 adjacent width positions + [[unroll]] for (int subtile_w = 0; subtile_w < 4; ++subtile_w) { + // Load packed input (4 consecutive channels packed into one int32) + // Use input_zp_packed for out-of-bounds positions + int packed_input = input_zp_packed; + if (h_in_bounds && inp_tidx.data[0] >= 0 && inp_tidx.data[0] < inp_W) { + // Compute input texel index using base + offsets + int inp_texel_idx; + if (get_outer_packed_dim_block_size(inp_layout) == 1) { + inp_texel_idx = base_inp_texel_idx + ic4 * inp_c_stride + kx * w_texel_step + subtile_w * subtile_w_step; + } else { + // inp_texel_idx = tensor4d_idx_to_texel_idx(inp, inp_tidx, inp_layout); + const int w4 = div_4(inp_tidx.data[0]); + const int inp_c4 = div_4(inp_tidx.data[2]); + inp_texel_idx = (inp_tidx.data[1] * inp_h_stride + w4 * inp_w_stride + inp_c4) * 4 + mod_4(inp_tidx.data[0]); + } + packed_input = t_packed_int8_input[inp_texel_idx]; + } + + // Accumulate using packed int8 dot product for each output channel + // dotPacked4x8AccSatEXT computes: acc + dot(unpack(a), unpack(b)) + [[unroll]] for (int oc_offset = 0; oc_offset < 4; ++oc_offset) { + acc[subtile_w][oc_offset] = dotPacked4x8AccSatEXT( + packed_input, + weight_block[oc_offset], + acc[subtile_w][oc_offset]); + } + + // Advance to next output position's input coordinate + inp_tidx.data[0] += conv2d_params.stride.x; + } + + // Adjust for net dilation step + inp_tidx.data[0] += conv2d_params.dilation.x - 4 * conv2d_params.stride.x; + } + } + + // Advance height by dilation for next kernel row + inp_tidx.data[1] += conv2d_params.dilation.y; + + if (get_outer_packed_dim_block_size(inp_layout) == 1) { + // Advance base index by height step for next kernel row + base_inp_texel_idx += h_texel_step; + } + } + + // Apply input zero point correction via weight_sums + const vec4 weight_sums = vec4(t_weight_sums[oc4]); + const vec4 weight_scales = vec4(t_weight_scales[oc4]); + + // Convert to float, apply dequantization, and optionally add bias + vec4 facc[4]; + [[unroll]] for (int subtile_w = 0; subtile_w < 4; ++subtile_w) { + facc[subtile_w] = vec4(acc[subtile_w]); + facc[subtile_w] -= weight_sums * input_zp; + facc[subtile_w] *= weight_scales * input_scale; + } + + // Apply bias if enabled + if (apply_bias > 0) { + const vec4 bias = vec4(t_bias[oc4]); + [[unroll]] for (int subtile_w = 0; subtile_w < 4; ++subtile_w) { + facc[subtile_w] += bias; + } + } + + // Compute base output texel index (for subtile_w=0) + const int base_outp_texel_idx = tensor4d_idx_to_texel_idx(outp, outp_tidx, outp_layout); + const int out_w_stride = int(outp.strides[0][0]); + + // Quantize and store outputs using stride offsets + [[unroll]] for (int subtile_w = 0; subtile_w < 4; ++subtile_w) { + // Skip out-of-bounds width positions + if (outp_tidx.data[0] >= W) { + continue; + } + + const ivec4 quantized_out = quantize(facc[subtile_w], output_inv_scale, output_zp); + const int packed_out = pack_into_int32(quantized_out); + + // Store using stride offset from base + int outp_texel_idx; + if (get_outer_packed_dim_block_size(outp_layout) == 1) { + outp_texel_idx = base_outp_texel_idx + subtile_w * out_w_stride; + } else { + outp_texel_idx = base_outp_texel_idx + subtile_w; + } + + t_packed_int8_output[outp_texel_idx] = packed_out; + + outp_tidx.data[0] += 1; + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d.yaml 
b/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d.yaml new file mode 100644 index 00000000000..dc21e6da0c5 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d.yaml @@ -0,0 +1,14 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +q8ta_conv2d: + parameter_names_with_default_values: + DTYPE: float + generate_variant_forall: + DTYPE: + - VALUE: float + shader_variants: + - NAME: q8ta_conv2d diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp new file mode 100644 index 00000000000..2ee57551235 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp @@ -0,0 +1,398 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include +#include +#include + +namespace vkcompute { + +bool q8ta_conv2d_check_packed_dim_info(const api::PackedDimInfo& info) { + return info.packed_dim == WHCN::kChannelsDim && + info.packed_dim_block_size == 4 && + info.outer_packed_dim == WHCN::kWidthDim && + (info.outer_packed_dim_block_size == 1 || + info.outer_packed_dim_block_size == 4); +} + +// +// Workgroup size selection functions +// + +/** + * Computes a global workgroup size for q8ta_conv2d where: + * - For channels-fastest output (e.g., 4C): x = C4, y = H, z = W4 + * - For width-fastest output (e.g., 4C1W): x = W4, y = H, z = C4 + * + * The x/z assignment matches the shader's dynamic thread assignment based on + * fastest_dim (dim_order[0]), ensuring consecutive threads access consecutive + * elements along the fastest moving dimension for optimal memory coalescing. + * + * Each thread processes a 4Wx4C tile of output elements. + */ +utils::uvec3 pick_q8ta_conv2d_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + (void)shader; + (void)resize_args; + + const ValueRef output = args.at(0).refs.at(0); + + const uint32_t W = graph->size_at(-1, output); + const uint32_t H = graph->size_at(-2, output); + const uint32_t C = graph->size_at(-3, output); + + // Each thread processes 4 adjacent width positions and 4 channels (4Wx4C + // tile) + const uint32_t W4 = utils::div_up_4(W); + const uint32_t C4 = utils::div_up_4(C); + + return {W4, H, C4}; +} + +/** + * Picks a local workgroup size for q8ta_conv2d with adaptive sizing based on + * tensor dimensions. 
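As a quick illustration of the dispatch arithmetic in pick_q8ta_conv2d_global_wg_size: each invocation owns a 4-wide-by-4-channel output tile, so the grid shrinks by a factor of 4 along x and z. The div_up_4 stand-in below assumes the same ceil-divide behaviour as utils::div_up_4; numbers in the trailing comment are illustrative.

#include <array>
#include <cstdint>

// Stand-in for utils::div_up_4 (assumed behaviour: ceil(n / 4)).
static inline uint32_t div_up_4(uint32_t n) {
  return (n + 3u) / 4u;
}

// Global workgroup size for an output of shape [C, H, W]:
// x indexes groups of 4 adjacent widths, y indexes rows, z indexes groups of
// 4 output channels, mirroring the {W4, H, C4} returned above.
std::array<uint32_t, 3> q8ta_conv2d_global_size(uint32_t W, uint32_t H, uint32_t C) {
  return {div_up_4(W), H, div_up_4(C)};
}

// Example: an 81x81 output with 64 channels dispatches {21, 81, 16}
// invocations, each producing one 4Wx4C block of packed int8 outputs.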
Uses experimentation results: + * - {4, 2, 8} for medium tensors: +57% improvement on 81x81 + * - {8, 1, 8} for very large tensors: best baseline performance + * - {64, 1, 1} for narrow channel dimensions: minimize inactive invocations + */ +utils::uvec3 pick_q8ta_conv2d_local_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& resize_args) { + (void)shader; + (void)resize_args; + + const ValueRef output = args.at(0).refs.at(0); + + // Get actual tensor dimensions for adaptive sizing + const uint32_t H = graph->size_at(-2, output); + + // For very large tensors (H >= 100 and large x/z), use {8, 1, 8} + // This configuration performed best for 128x128 tensors in experiments + if (H >= 100 && global_workgroup_size[0u] >= 24 && + global_workgroup_size[2u] >= 24) { + return {8u, 1u, 8u}; + } + + // For medium-sized tensors, use {4, 2, 8} for better height parallelism + // This configuration showed +57% improvement on 81x81 tensors + if (global_workgroup_size[0u] >= 4 && global_workgroup_size[1u] >= 2 && + global_workgroup_size[2u] >= 8) { + return {4u, 2u, 8u}; + } + + // For tensors with sufficient x and z dimensions, use square configuration + if (global_workgroup_size[0u] >= 6 && global_workgroup_size[2u] >= 6) { + return {8u, 1u, 8u}; + } + + // If x dimension is very small, bias towards z dimension + if (global_workgroup_size[0u] < 2u) { + return {1u, 1u, 64u}; + } + + // If z dimension is very small, bias towards x dimension + if (global_workgroup_size[2u] < 2u) { + return {64u, 1u, 1u}; + } + + return {16u, 1u, 4u}; +} + +// +// Prepack nodes +// + +ValueRef prepack_quantized_conv2d_weight( + ComputeGraph& graph, + const QuantizationConfig& weight_quant_config, + const ValueRef weight_data, + const ValueRef input, + const ValueRef output, + const ValueRef groups, + const ValueRef kernel_size) { + VK_CHECK_COND(weight_quant_config.nbits == 8); + VK_CHECK_COND(weight_quant_config.is_symmetric); + + const int32_t groups_val = graph.get_int(groups); + + const int64_t OC = graph.size_at(-3, output); + const int64_t IC = graph.size_at(-3, input) / groups_val; + + int64_t K_h; + int64_t K_w; + + { + const auto kernel_size_list = graph.get_int_list(kernel_size); + K_h = kernel_size_list->at(0); + K_w = kernel_size_list->at(1); + } + + const int64_t num_blocks_OC = utils::div_up_4(OC); + const int64_t num_blocks_IC = utils::div_up_4(IC); + + const int64_t num_blocks_y = num_blocks_IC * K_h; + const int64_t num_blocks_x = K_w * num_blocks_OC; + + // The packed tensor arranges blocks as [OC_blocks * K_total, IC_blocks] + const int64_t output_height = num_blocks_y; + const int64_t output_width = num_blocks_x * 4; + + // Store the original sizes of the weight data to pass to the shader + utils::ivec4 orig_sizes = { + utils::safe_downcast(OC), + utils::safe_downcast(K_h), + utils::safe_downcast(K_w), + utils::safe_downcast(IC)}; + + std::vector packed_weight_sizes{output_height, output_width}; + + utils::StorageType storage_type = utils::kTexture2D; + uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); + if (output_width > max_extent * 4 || output_height > max_extent) { + storage_type = utils::kBuffer; + } + + ValueRef packed_weight = graph.add_tensor( + packed_weight_sizes, + vkcompute::vkapi::kInt, + storage_type, + utils::kWidthPacked); + + utils::uvec3 global_wg_size = { + utils::safe_downcast(num_blocks_x), + utils::safe_downcast(num_blocks_y), + 1u}; + + std::string 
kernel_name = "pack_q8_conv2d_weights"; + add_storage_type_suffix(kernel_name, storage_type); + + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_wg_size, + graph.create_local_wg_size(global_wg_size), + // Inputs and Outputs + weight_data, + packed_weight, + // UBOs + {}, + // Specialization Constants + {}, + // Push Constants + {graph.sizes_pc_of(packed_weight), + PushConstantDataInfo(&orig_sizes, sizeof(utils::ivec4))})); + + return packed_weight; +} + +// +// Dispatch nodes +// + +void add_q8ta_conv2d_node( + ComputeGraph& graph, + const ValueRef packed_int8_input, + const ValueRef packed_int8_input_im2col, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef packed_weight, + const ValueRef packed_weight_sums, + const ValueRef packed_weight_scales, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef bias_data, + const ValueRef packed_bias, + const ValueRef kernel_size, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef groups, + const ValueRef packed_int8_output) { + (void)packed_int8_input_im2col; // Not used in general shader + + Conv2DParams conv_params = create_conv2d_params( + graph, + packed_int8_input, + packed_int8_output, + kernel_size, + stride, + padding, + dilation, + groups); + + // The implementation requires that for grouped convolutions, the input + // channels per group is a multiple of 4. + if (conv_params.groups > 1) { + VK_CHECK_COND(conv_params.in_channels_per_group % 4 == 0); + } + + // Validate packed dim info for input and output tensors + VK_CHECK_COND(q8ta_conv2d_check_packed_dim_info( + graph.packed_dim_info_of(packed_int8_input))); + VK_CHECK_COND(q8ta_conv2d_check_packed_dim_info( + graph.packed_dim_info_of(packed_int8_output))); + + // Validate dtype is kInt8x4 + VK_CHECK_COND(graph.dtype_of(packed_int8_input) == vkapi::kInt8x4); + VK_CHECK_COND(graph.dtype_of(packed_int8_output) == vkapi::kInt8x4); + + float input_scale_val = graph.extract_scalar(input_scale); + int32_t input_zp_val = graph.extract_scalar(input_zp); + + float output_inv_scale_val = 1.0f / graph.extract_scalar(output_scale); + int32_t output_zp_val = graph.extract_scalar(output_zp); + + uint32_t apply_bias = 1; + if (graph.val_is_none(bias_data)) { + apply_bias = 0; + } + + std::vector push_constants = { + PushConstantDataInfo(&input_scale_val, sizeof(input_scale_val)), + PushConstantDataInfo(&input_zp_val, sizeof(input_zp_val)), + PushConstantDataInfo(&output_inv_scale_val, sizeof(output_inv_scale_val)), + PushConstantDataInfo(&output_zp_val, sizeof(output_zp_val)), + }; + + // Select shader based on layout + std::string kernel_name = "q8ta_conv2d"; + add_dtype_suffix(kernel_name, graph.dtype_of(packed_weight_scales)); + + // Pass metadata for both output and input tensors + vkapi::ParamsBindList param_buffers = { + graph.buffer_meta_ubo(packed_int8_output), + graph.buffer_meta_ubo(packed_int8_input), + graph.create_params_buffer(conv_params)}; + + // Build spec constants: apply_bias + layout constants + vkapi::SpecVarList spec_constants = { + apply_bias, + // Layout specialization constants + graph.hashed_layout_of(packed_int8_input), + graph.hashed_layout_of(packed_int8_output), + }; + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + pick_q8ta_conv2d_global_wg_size, + pick_q8ta_conv2d_local_wg_size, + // Inputs and Outputs + {{packed_int8_output, vkapi::kWrite}, + {{packed_int8_input, + 
packed_weight, + packed_weight_sums, + packed_weight_scales, + packed_bias}, + vkapi::kRead}}, + // Shader params buffers + param_buffers, + // Push Constants + push_constants, + // Specialization Constants + spec_constants, + // Resize args + {})); +} + +// +// High level operator impl +// + +void q8ta_conv2d(ComputeGraph& graph, const std::vector& args) { + int32_t idx = 0; + const ValueRef packed_int8_input = args.at(idx++); + const ValueRef input_scale = args.at(idx++); + const ValueRef input_zp = args.at(idx++); + const ValueRef weight_data = args.at(idx++); + const ValueRef weight_sums_data = args.at(idx++); + const ValueRef weight_scales_data = args.at(idx++); + const ValueRef output_scale = args.at(idx++); + const ValueRef output_zp = args.at(idx++); + const ValueRef bias_data = args.at(idx++); + const ValueRef kernel_size = args.at(idx++); + const ValueRef stride = args.at(idx++); + const ValueRef padding = args.at(idx++); + const ValueRef dilation = args.at(idx++); + const ValueRef groups = args.at(idx++); + const ValueRef packed_int8_output = args.at(idx++); + + QuantizationConfig weight_quant_config(8, kPerChannel, {}); + + // Prepack weight using the conv2d weight packing for the general shader + ValueRef packed_weight = prepack_quantized_conv2d_weight( + graph, + weight_quant_config, + weight_data, + packed_int8_input, + packed_int8_output, + groups, + kernel_size); + + ValueRef packed_weight_sums = prepack_standard( + graph, weight_sums_data, utils::kBuffer, utils::kWidthPacked); + + ValueRef packed_weight_scales = prepack_standard( + graph, weight_scales_data, utils::kBuffer, utils::kWidthPacked); + + // Create a dummy tensor to fill the binding slot of the bias tensor if it is + // not provided. This helps simplify dispatch logic and makes it so that + // fewer shader variants need to be generated. 
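As an aside on the weight_sums tensor prepacked above: it exists because of a standard rewrite of the quantized accumulation. With symmetric weights, sum_ic (x_q - x_zp) * w_q = sum_ic x_q * w_q - x_zp * sum_ic w_q, which is why the shader can accumulate raw x_q * w_q dot products and subtract weight_sums * input_zp afterwards. A scalar sketch of that identity (illustrative names, not runtime code):

#include <cassert>
#include <cstdint>
#include <vector>

// What the shader effectively computes: raw accumulation plus a correction
// using the precomputed per-output-channel weight sum.
int32_t conv_acc_with_correction(const std::vector<int8_t>& x_q,
                                 const std::vector<int8_t>& w_q,
                                 int32_t input_zp) {
  int32_t acc = 0;   // what dotPacked4x8AccSatEXT accumulates
  int32_t w_sum = 0; // what t_weight_sums stores per output channel
  for (size_t i = 0; i < x_q.size(); ++i) {
    acc += int32_t(x_q[i]) * int32_t(w_q[i]);
    w_sum += int32_t(w_q[i]);
  }
  return acc - input_zp * w_sum;
}

// The mathematically equivalent form with the zero point subtracted up front.
int32_t conv_acc_reference(const std::vector<int8_t>& x_q,
                           const std::vector<int8_t>& w_q,
                           int32_t input_zp) {
  int32_t acc = 0;
  for (size_t i = 0; i < x_q.size(); ++i) {
    acc += (int32_t(x_q[i]) - input_zp) * int32_t(w_q[i]);
  }
  return acc;
}

// Usage: assert(conv_acc_with_correction(x, w, zp) == conv_acc_reference(x, w, zp));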
+ TmpTensor dummy_bias( + &graph, + {}, + graph.dtype_of(weight_scales_data), + utils::kBuffer, + utils::kWidthPacked); + + ValueRef packed_bias = dummy_bias.vref; + if (graph.val_is_not_none(bias_data)) { + packed_bias = + prepack_standard(graph, bias_data, utils::kBuffer, utils::kWidthPacked); + } + + // The general q8ta_conv2d shader does not use im2col, so pass input as im2col + add_q8ta_conv2d_node( + graph, + packed_int8_input, + packed_int8_input, // packed_int8_input_im2col - not used in general + // shader + input_scale, + input_zp, + packed_weight, + packed_weight_sums, + packed_weight_scales, + output_scale, + output_zp, + bias_data, + packed_bias, + kernel_size, + stride, + padding, + dilation, + groups, + packed_int8_output); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(etvk.q8ta_conv2d.default, q8ta_conv2d); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h index 5f028caec12..2c66537acc5 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h @@ -13,6 +13,17 @@ namespace vkcompute { +bool q8ta_conv2d_check_packed_dim_info(const api::PackedDimInfo& info); + +ValueRef prepack_quantized_conv2d_weight( + ComputeGraph& graph, + const QuantizationConfig& weight_quant_config, + const ValueRef weight_data, + const ValueRef input, + const ValueRef output, + const ValueRef groups, + const ValueRef kernel_size); + ValueRef prepack_quantized_conv2d_weight( ComputeGraph& graph, const QuantizationConfig& weight_quant_config, diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp index 5e1e8aab599..121a577555f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dDW.cpp @@ -292,6 +292,16 @@ void add_q8ta_conv2d_dw_node( dilation, groups); + // Validate packed dim info for input and output tensors + VK_CHECK_COND(q8ta_conv2d_check_packed_dim_info( + graph.packed_dim_info_of(packed_int8_input))); + VK_CHECK_COND(q8ta_conv2d_check_packed_dim_info( + graph.packed_dim_info_of(packed_int8_output))); + + // Validate dtype is kInt8x4 + VK_CHECK_COND(graph.dtype_of(packed_int8_input) == vkapi::kInt8x4); + VK_CHECK_COND(graph.dtype_of(packed_int8_output) == vkapi::kInt8x4); + // Verify this is actually a depthwise convolution const int64_t groups_val = graph.extract_scalar(groups); const int64_t in_channels = graph.size_at(-3, packed_int8_input); diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp index a4d959aec41..1bfff6f1342 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp @@ -364,93 +364,6 @@ utils::uvec3 pick_static_quantized_conv2d_local_wg_size( graph, shader, global_workgroup_size, args, resize_args); } -// -// Prepack nodes -// - -ValueRef prepack_quantized_conv2d_weight( - ComputeGraph& graph, - const QuantizationConfig& weight_quant_config, - const ValueRef weight_data, - const ValueRef input, - const ValueRef output, - const ValueRef groups, - const ValueRef kernel_size) { - VK_CHECK_COND(weight_quant_config.nbits == 8); - VK_CHECK_COND(weight_quant_config.is_symmetric); - - const int32_t groups_val = graph.get_int(groups); - - const int64_t OC = graph.size_at(-3, output); - const int64_t IC = 
graph.size_at(-3, input) / groups_val; - - int64_t K_h; - int64_t K_w; - - { - const auto kernel_size_list = graph.get_int_list(kernel_size); - K_h = kernel_size_list->at(0); - K_w = kernel_size_list->at(1); - } - - const int64_t num_blocks_OC = utils::div_up_4(OC); - const int64_t num_blocks_IC = utils::div_up_4(IC); - - const int64_t num_blocks_y = num_blocks_IC * K_h; - const int64_t num_blocks_x = K_w * num_blocks_OC; - - // The packed tensor arranges blocks as [OC_blocks * K_total, IC_blocks] - const int64_t output_height = num_blocks_y; - const int64_t output_width = num_blocks_x * 4; - - // Store the original sizes of the weight data to pass to the shader - utils::ivec4 orig_sizes = { - utils::safe_downcast(OC), - utils::safe_downcast(K_h), - utils::safe_downcast(K_w), - utils::safe_downcast(IC)}; - - std::vector packed_weight_sizes{output_height, output_width}; - - utils::StorageType storage_type = utils::kTexture2D; - uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); - if (output_width > max_extent * 4 || output_height > max_extent) { - storage_type = utils::kBuffer; - } - - ValueRef packed_weight = graph.add_tensor( - packed_weight_sizes, - vkcompute::vkapi::kInt, - storage_type, - utils::kWidthPacked); - - utils::uvec3 global_wg_size = { - utils::safe_downcast(num_blocks_x), - utils::safe_downcast(num_blocks_y), - 1u}; - - std::string kernel_name = "pack_q8_conv2d_weights"; - add_storage_type_suffix(kernel_name, storage_type); - - graph.prepack_nodes().emplace_back(new PrepackNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_wg_size, - graph.create_local_wg_size(global_wg_size), - // Inputs and Outputs - weight_data, - packed_weight, - // UBOs - {}, - // Specialization Constants - {}, - // Push Constants - {graph.sizes_pc_of(packed_weight), - PushConstantDataInfo(&orig_sizes, sizeof(utils::ivec4))})); - - return packed_weight; -} - // // Dispatch nodes // @@ -824,7 +737,7 @@ void add_conv2d_q8ta_q8csw_linear_node( nullptr)); } -void add_conv2d_q8ta_q8csw_q8to_node( +void add_conv2d_q8ta_q8csw_q8to_4w4c_node( ComputeGraph& graph, const ValueRef packed_int8_input, const ValueRef packed_int8_input_im2col, @@ -853,38 +766,38 @@ void add_conv2d_q8ta_q8csw_q8to_node( dilation, groups); - const bool use_im2col = should_use_im2col(&graph, kernel_size, groups); - float input_scale_val = graph.extract_scalar(input_scale); int32_t input_zp_val = graph.extract_scalar(input_zp); float output_inv_scale_val = 1.0f / graph.extract_scalar(output_scale); int32_t output_zp_val = graph.extract_scalar(output_zp); + uint32_t apply_bias = 1; + if (graph.val_is_none(bias_data)) { + apply_bias = 0; + } + + std::vector push_constants = { + PushConstantDataInfo(&input_scale_val, sizeof(input_scale_val)), + PushConstantDataInfo(&input_zp_val, sizeof(input_zp_val)), + PushConstantDataInfo(&output_inv_scale_val, sizeof(output_inv_scale_val)), + PushConstantDataInfo(&output_zp_val, sizeof(output_zp_val)), + }; + + // Use the optimized im2col or direct shader for 4W4C layout + const bool use_im2col = should_use_im2col(&graph, kernel_size, groups); + std::string kernel_name = use_im2col ? 
"conv2d_q8ta_q8csw_q8to_linear_tiled" : "conv2d_q8ta_q8csw_q8to"; add_storage_type_suffix( kernel_name, graph.storage_type_of(packed_int8_output)); add_storage_type_suffix(kernel_name, graph.storage_type_of(packed_weight)); add_dtype_suffix(kernel_name, graph.dtype_of(packed_weight_scales)); - vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); vkapi::ParamsBindList param_buffers = { graph.sizes_ubo(packed_int8_output), graph.sizes_ubo(packed_int8_input_im2col)}; - std::vector push_constants = { - PushConstantDataInfo(&input_scale_val, sizeof(input_scale_val)), - PushConstantDataInfo(&input_zp_val, sizeof(input_zp_val)), - PushConstantDataInfo(&output_inv_scale_val, sizeof(output_inv_scale_val)), - PushConstantDataInfo(&output_zp_val, sizeof(output_zp_val)), - }; - - uint32_t apply_bias = 1; - if (graph.val_is_none(bias_data)) { - apply_bias = 0; - } - vkapi::SpecVarList spec_constants = GenerateSpecConstants(graph, conv_params, groups, apply_bias); @@ -913,6 +826,78 @@ void add_conv2d_q8ta_q8csw_q8to_node( nullptr)); } +void add_conv2d_q8ta_q8csw_q8to_node( + ComputeGraph& graph, + const ValueRef packed_int8_input, + const ValueRef packed_int8_input_im2col, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef packed_weight, + const ValueRef packed_weight_sums, + const ValueRef packed_weight_scales, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef bias_data, + const ValueRef packed_bias, + const ValueRef kernel_size, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef groups, + const ValueRef packed_int8_output) { + // Check if the input/output layout is 4W4C (optimized path) + const utils::GPUMemoryLayout inp_layout = + graph.estimate_memory_layout_of(packed_int8_input); + const utils::GPUMemoryLayout outp_layout = + graph.estimate_memory_layout_of(packed_int8_output); + + const bool use_optimized_shader = + (inp_layout == utils::kPackedInt8_4W4C && + outp_layout == utils::kPackedInt8_4W4C); + + if (use_optimized_shader) { + add_conv2d_q8ta_q8csw_q8to_4w4c_node( + graph, + packed_int8_input, + packed_int8_input_im2col, + input_scale, + input_zp, + packed_weight, + packed_weight_sums, + packed_weight_scales, + output_scale, + output_zp, + bias_data, + packed_bias, + kernel_size, + stride, + padding, + dilation, + groups, + packed_int8_output); + } else { + add_q8ta_conv2d_node( + graph, + packed_int8_input, + packed_int8_input_im2col, + input_scale, + input_zp, + packed_weight, + packed_weight_sums, + packed_weight_scales, + output_scale, + output_zp, + bias_data, + packed_bias, + kernel_size, + stride, + padding, + dilation, + groups, + packed_int8_output); + } +} + // // High level operator impl // @@ -1171,7 +1156,18 @@ void static_quantized_conv2d_impl( // same 4Wx4C block have the same group index. 
const bool is_depthwise = (groups_val == in_channels); - const bool use_im2col = should_use_im2col(&graph, kernel_size, groups); + // Check if input/output layouts are 4W4C (optimized im2col path) + const utils::GPUMemoryLayout inp_layout = + graph.estimate_memory_layout_of(packed_int8_input); + const utils::GPUMemoryLayout outp_layout = + graph.estimate_memory_layout_of(packed_int8_output); + const bool is_optimized_layout = + (inp_layout == utils::kPackedInt8_4W4C && + outp_layout == utils::kPackedInt8_4W4C); + + // Only use im2col path for 4W4C layouts + const bool use_im2col = + is_optimized_layout && should_use_im2col(&graph, kernel_size, groups); // For pointwise convolution with stride = 1, padding = 0, dilation = 1, the // input tensor is already equivalent to its im2col representation. In this // case we can skip the im2col procedure and pass in the input image to the diff --git a/backends/vulkan/test/custom_ops/CMakeLists.txt b/backends/vulkan/test/custom_ops/CMakeLists.txt index 0deea46c292..5e794e3b48b 100644 --- a/backends/vulkan/test/custom_ops/CMakeLists.txt +++ b/backends/vulkan/test/custom_ops/CMakeLists.txt @@ -99,7 +99,7 @@ if(TARGET vulkan_backend) add_operator_prototype(choose_qparams_per_row) add_operator_prototype(test_q8ta_qdq) add_operator_prototype(test_q8ta_clone) - add_operator_prototype(q8ta_q8csw_q8to_conv2d) + add_operator_prototype(test_q8ta_conv2d) add_operator_prototype(test_q8ta_conv2d_dw) add_operator_prototype(q8ta_q8ta_q8to_add) endif() diff --git a/backends/vulkan/test/custom_ops/impl/TestQ8taConv2d.cpp b/backends/vulkan/test/custom_ops/impl/TestQ8taConv2d.cpp index 861f25c0606..fca82ef3eee 100644 --- a/backends/vulkan/test/custom_ops/impl/TestQ8taConv2d.cpp +++ b/backends/vulkan/test/custom_ops/impl/TestQ8taConv2d.cpp @@ -89,8 +89,83 @@ void test_q8ta_conv2d_dw( graph, packed_int8_output, output_scale, output_zp, fp_output); } +void test_q8ta_conv2d(ComputeGraph& graph, const std::vector& args) { + int32_t idx = 0; + const ValueRef fp_input = args.at(idx++); + const ValueRef input_scale = args.at(idx++); + const ValueRef input_zp = args.at(idx++); + const ValueRef weight_data = args.at(idx++); + const ValueRef weight_sums_data = args.at(idx++); + const ValueRef weight_scales_data = args.at(idx++); + const ValueRef output_scale = args.at(idx++); + const ValueRef output_zp = args.at(idx++); + const ValueRef bias_data = args.at(idx++); + const ValueRef kernel_size = args.at(idx++); + const ValueRef stride = args.at(idx++); + const ValueRef padding = args.at(idx++); + const ValueRef dilation = args.at(idx++); + const ValueRef groups = args.at(idx++); + const ValueRef layout_int = args.at(idx++); + const ValueRef impl_selector_str = args.at(idx++); + const ValueRef fp_output = args.at(idx++); + + // Extract the layout parameter and cast to GPUMemoryLayout + int32_t layout_value = graph.extract_scalar(layout_int); + utils::GPUMemoryLayout layout = + static_cast(layout_value); + + // Extract the impl_selector string + std::string impl_selector = graph.extract_string(impl_selector_str); + + // Create temporary packed int8 tensors for input and output + TmpTensor packed_int8_input( + &graph, graph.sizes_of(fp_input), vkapi::kInt8x4, utils::kBuffer, layout); + + TmpTensor packed_int8_output( + &graph, + graph.sizes_of(fp_output), + vkapi::kInt8x4, + utils::kBuffer, + layout); + + // Quantize floating point input to packed int8 + add_q8ta_quantize_node( + graph, fp_input, input_scale, input_zp, packed_int8_input); + + // Build args for conv operator + 
std::vector conv_args = { + packed_int8_input, + input_scale, + input_zp, + weight_data, + weight_sums_data, + weight_scales_data, + output_scale, + output_zp, + bias_data, + kernel_size, + stride, + padding, + dilation, + groups, + packed_int8_output}; + + if (impl_selector == "legacy_4w4c") { + // Use the general quantized conv2d operator for legacy path + VK_GET_OP_FN("et_vk.conv2d_q8ta_q8csw_q8to.default")(graph, conv_args); + } else { + // Use the new general q8ta_conv2d operator + VK_GET_OP_FN("etvk.q8ta_conv2d.default")(graph, conv_args); + } + + // Dequantize packed int8 output to floating point + add_q8ta_dequantize_node( + graph, packed_int8_output, output_scale, output_zp, fp_output); +} + REGISTER_OPERATORS { VK_REGISTER_OP(test_etvk.test_q8ta_conv2d_dw.default, test_q8ta_conv2d_dw); + VK_REGISTER_OP(test_etvk.test_q8ta_conv2d.default, test_q8ta_conv2d); } } // namespace vkcompute diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl index 1bf8be41854..bb84a94399b 100644 --- a/backends/vulkan/test/custom_ops/targets.bzl +++ b/backends/vulkan/test/custom_ops/targets.bzl @@ -93,6 +93,6 @@ def define_common_targets(is_fbcode = False): define_custom_op_test_binary("q4gsw_linear") define_custom_op_test_binary("test_q8ta_qdq") define_custom_op_test_binary("test_q8ta_clone") - define_custom_op_test_binary("q8ta_q8csw_q8to_conv2d") + define_custom_op_test_binary("test_q8ta_conv2d") define_custom_op_test_binary("test_q8ta_conv2d_dw") define_custom_op_test_binary("q8ta_q8ta_q8to_add") diff --git a/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp b/backends/vulkan/test/custom_ops/test_q8ta_conv2d.cpp similarity index 83% rename from backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp rename to backends/vulkan/test/custom_ops/test_q8ta_conv2d.cpp index daf99db06c9..6fe5db293c8 100644 --- a/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp +++ b/backends/vulkan/test/custom_ops/test_q8ta_conv2d.cpp @@ -24,24 +24,13 @@ using namespace vkcompute; static constexpr int64_t kRefDimSizeLimit = 100; // Utility function to create a test case from a Conv2dConfig -TestCase create_test_case_from_config( +static TestCase create_test_case_from_config( const Conv2dConfig& config, vkapi::ScalarType input_dtype, utils::StorageType fp_storage_type, - utils::StorageType int8_storage_type) { + utils::GPUMemoryLayout int8_memory_layout, + const std::string& impl_selector = "general") { TestCase test_case; - test_case.set_name(config.test_case_name); - - std::string operator_suffix = ".test"; - if (int8_storage_type == utils::kTexture3D) { - operator_suffix += "_texture"; - } else { - operator_suffix += "_buffer"; - } - - // Set the operator name for the test case - std::string operator_name = "etvk." + config.op_name + operator_suffix; - test_case.set_operator_name(operator_name); // Calculate output dimensions int64_t H_out = config.get_output_height(); @@ -55,6 +44,26 @@ TestCase create_test_case_from_config( ? 
utils::kWidthPacked : utils::kChannelsPacked; + // Create test case name + // Format: ACCU/PERF OC->IC I=H,W g=groups k=kernel Tex(CP)->Buf(4C1W) + std::string prefix = config.test_case_name.substr(0, 4); // "ACCU" or "PERF" + std::string test_name = prefix + " " + std::to_string(config.channels.out) + + "->" + std::to_string(config.channels.in) + " " + + "I=" + std::to_string(config.input_size.h) + "," + + std::to_string(config.input_size.w) + " " + + "g=" + std::to_string(config.groups) + " " + + "k=" + std::to_string(config.kernel.h) + " " + + repr_str(fp_storage_type, fp_memory_layout) + "->" + + repr_str(utils::kBuffer, int8_memory_layout); + if (!impl_selector.empty()) { + test_name += " [" + impl_selector + "]"; + } + test_case.set_name(test_name); + + // Set the operator name for the test case - use the unified test operator + std::string operator_name = "test_etvk.test_q8ta_conv2d.default"; + test_case.set_operator_name(operator_name); + ValueSpec input_tensor( input_size, input_dtype, @@ -170,10 +179,26 @@ TestCase create_test_case_from_config( test_case.add_input_spec(dilation); test_case.add_input_spec(groups); + // Add memory layout parameter for the quantized tensors + ValueSpec layout_int(static_cast(int8_memory_layout)); + test_case.add_input_spec(layout_int); + + // Add impl_selector string + ValueSpec impl_selector_spec = ValueSpec::make_string(impl_selector); + test_case.add_input_spec(impl_selector_spec); + test_case.add_output_spec(output); test_case.set_abs_tolerance(output_scale_val + 1e-4f); + // Filter out quantize/dequantize shaders from timing measurements + test_case.set_shader_filter({ + "nchw_to", + "to_nchw", + "q8ta_quantize", + "q8ta_dequantize", + }); + return test_case; } @@ -184,25 +209,41 @@ std::vector generate_quantized_conv2d_easy_cases() { // Single simple configuration for debugging Conv2dConfig config = { OutInChannels(16, 8), // channels (out, in) - InputSize2D(21, 17), // input_size (h, w) + InputSize2D(5, 5), // input_size (h, w) KernelSize(3, 3), // kernel Stride(1, 1), // stride Padding(1, 1), // padding Dilation(1, 1), // dilation - 2, // groups + 1, // groups }; config.op_name = "conv2d_q8ta_q8csw_q8to"; - std::vector storage_types = { + std::vector fp_storage_types = { utils::kTexture3D, utils::kBuffer}; + // Memory layouts for int8 tensors - test both optimized (4W4C) and general + // paths + std::vector int8_memory_layouts = { + utils::kPackedInt8_4C1W, utils::kPackedInt8_4W4C, utils::kPackedInt8_4C}; + // Generate test cases for each combination - for (const utils::StorageType fp_storage_type : storage_types) { - for (const utils::StorageType int8_storage_type : storage_types) { - config.test_case_name = make_test_case_name( - config, false, fp_storage_type, int8_storage_type); + for (const utils::StorageType fp_storage_type : fp_storage_types) { + for (const utils::GPUMemoryLayout int8_memory_layout : + int8_memory_layouts) { + config.test_case_name = + make_test_case_name(config, false, fp_storage_type, utils::kBuffer); test_cases.push_back(create_test_case_from_config( - config, vkapi::kFloat, fp_storage_type, int8_storage_type)); + config, vkapi::kFloat, fp_storage_type, int8_memory_layout)); + + // For 4W4C layout, also test the legacy implementation + if (int8_memory_layout == utils::kPackedInt8_4W4C) { + test_cases.push_back(create_test_case_from_config( + config, + vkapi::kFloat, + fp_storage_type, + int8_memory_layout, + /*impl_selector=*/"legacy_4w4c")); + } } } @@ -210,7 +251,7 @@ std::vector 
generate_quantized_conv2d_easy_cases() { } // Generate test cases for quantized conv2d operation -std::vector generate_quantized_conv2d_test_cases() { +static std::vector generate_quantized_conv2d_test_cases() { std::vector test_cases; if (!vkcompute::api::context()->adapter_ptr()->supports_int8_dot_product()) { return test_cases; @@ -371,10 +412,15 @@ std::vector generate_quantized_conv2d_test_cases() { Dilation(1, 1), 4}}; - // Test with different storage types and data types - std::vector storage_types = { + // Test with different storage types and memory layouts + std::vector fp_storage_types = { utils::kTexture3D, utils::kBuffer}; + // Memory layouts for int8 tensors - test both optimized (4W4C) and general + // paths + std::vector int8_memory_layouts = { + utils::kPackedInt8_4C1W, utils::kPackedInt8_4W4C, utils::kPackedInt8_4C}; + // Generate test cases for each combination for (auto& config : configs) { bool is_performance = config.channels.out > kRefDimSizeLimit || @@ -384,12 +430,23 @@ std::vector generate_quantized_conv2d_test_cases() { config.op_name = "conv2d_q8ta_q8csw_q8to"; - for (const utils::StorageType fp_storage_type : storage_types) { - for (const utils::StorageType int8_storage_type : storage_types) { + for (const utils::StorageType fp_storage_type : fp_storage_types) { + for (const utils::GPUMemoryLayout int8_memory_layout : + int8_memory_layouts) { config.test_case_name = make_test_case_name( - config, is_performance, fp_storage_type, int8_storage_type); + config, is_performance, fp_storage_type, utils::kBuffer); test_cases.push_back(create_test_case_from_config( - config, vkapi::kFloat, fp_storage_type, int8_storage_type)); + config, vkapi::kFloat, fp_storage_type, int8_memory_layout)); + + // For 4W4C layout, also test the legacy implementation + if (int8_memory_layout == utils::kPackedInt8_4W4C) { + test_cases.push_back(create_test_case_from_config( + config, + vkapi::kFloat, + fp_storage_type, + int8_memory_layout, + /*impl_selector=*/"legacy_4w4c")); + } } } } @@ -398,7 +455,7 @@ std::vector generate_quantized_conv2d_test_cases() { } // Reference implementation for activation, weight, and output quantized conv2d -void conv2d_q8ta_q8csw_q8to_reference_impl(TestCase& test_case) { +static void conv2d_q8ta_q8csw_q8to_reference_impl(TestCase& test_case) { // Extract input specifications int32_t idx = 0; const ValueSpec& input_spec = test_case.inputs()[idx++]; @@ -416,6 +473,10 @@ void conv2d_q8ta_q8csw_q8to_reference_impl(TestCase& test_case) { const ValueSpec& padding_spec = test_case.inputs()[idx++]; const ValueSpec& dilation_spec = test_case.inputs()[idx++]; const ValueSpec& groups_spec = test_case.inputs()[idx++]; + const ValueSpec& layout_spec = test_case.inputs()[idx++]; + (void)layout_spec; // Not used in reference implementation + const ValueSpec& impl_selector_spec = test_case.inputs()[idx++]; + (void)impl_selector_spec; // Not used in reference implementation // Extract output specification (mutable reference) ValueSpec& output_spec = test_case.outputs()[0]; @@ -591,12 +652,12 @@ void conv2d_q8ta_q8csw_q8to_reference_impl(TestCase& test_case) { } } -void reference_impl(TestCase& test_case) { +static void reference_impl(TestCase& test_case) { conv2d_q8ta_q8csw_q8to_reference_impl(test_case); } // Custom FLOP calculator for quantized conv2d operation -int64_t quantized_conv2d_flop_calculator(const TestCase& test_case) { +static int64_t quantized_conv2d_flop_calculator(const TestCase& test_case) { int kernel_idx = 9; // kernel_size is at index 9 for 
q8ta_q8csw_q8to // Get input and weight dimensions
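For context on what the reference implementation above has to reproduce, here is a scalar sketch of the q8ta/q8csw/q8to arithmetic for a single output element: the int32 accumulation over the receptive field is dequantized with the input scale and per-output-channel weight scale, the float bias is added, and the result is requantized with the output scale and zero point. Function and variable names are illustrative, not the test harness API.

#include <algorithm>
#include <cmath>
#include <cstdint>

// One output element of an activation/weight/output quantized conv2d:
//   acc   = sum over the receptive field of (x_q - x_zp) * w_q   (int32)
//   real  = acc * x_scale * w_scale[oc] + bias[oc]               (float)
//   out_q = clamp(round(real / out_scale) + out_zp, -128, 127)   (int8)
int8_t requantize_output(int32_t acc,
                         float x_scale,
                         float w_scale_oc,
                         float bias_oc,
                         float out_scale,
                         int32_t out_zp) {
  const float real = float(acc) * x_scale * w_scale_oc + bias_oc;
  const int32_t q = int32_t(std::lround(real / out_scale)) + out_zp;
  return int8_t(std::clamp(q, -128, 127));
}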