diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp index bbfa4c9d0392dc..c5d05bd6292c4a 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp @@ -269,12 +269,14 @@ REGISTER_FACTORY(v13, BitwiseOr); REGISTER_FACTORY(v13, BitwiseXor); REGISTER_FACTORY(v13, FakeConvert); + // ------------------------------ Supported v15 ops ----------------------------- // REGISTER_FACTORY(v15, ROIAlignRotated); REGISTER_FACTORY(v15, BitwiseRightShift); REGISTER_FACTORY(v15, BitwiseLeftShift); REGISTER_FACTORY(v15, SearchSorted); REGISTER_FACTORY(v15, STFT); +REGISTER_FACTORY(v15, Col2Im); // --------------------------- Supported internal ops --------------------------- // REGISTER_FACTORY(internal, NonMaxSuppressionIEInternal); diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/col_to_im.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/col_to_im.hpp new file mode 100644 index 00000000000000..922457ea0ba1d3 --- /dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/col_to_im.hpp @@ -0,0 +1,104 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include "primitive.hpp" + +namespace cldnn { + +/// @brief +/// @details +struct col_to_im : public primitive_base { + CLDNN_DECLARE_PRIMITIVE(col_to_im) + + col_to_im() : primitive_base("", {}) {} + + /// @brief Constructs col_to_im primitive. + /// @param id This primitive id. + /// @param input Input dictionary primitive id. + /// @param output_size Input + /// @param kernel_size Input + /// @param stride Defines shift in input buffer + /// @param dilation Defines gaps in the input + /// @param padding_begin Defines a padding added to input image on left (x axis) and top (y axis). 
+ /// @param padding_end Defines a padding added to input image on right (x axis) and bottom (y axis). + /// @param output_shape Defines the output tensor the output image + /// @param kernel_shape Defines size of the sliding blocks + col_to_im(const primitive_id& id, + const input_info& input, + const input_info& output_size, + const input_info& kernel_size, + ov::Strides stride, + ov::Strides dilation, + ov::CoordinateDiff padding_begin, + ov::CoordinateDiff padding_end, + ov::Shape output_shape, + ov::Shape kernel_shape) + : primitive_base(id, {input, output_size, kernel_size}) + , stride(stride) + , dilation(dilation) + , padding_begin(padding_begin) + , padding_end(padding_end) + , output_shape(output_shape) + , kernel_shape(kernel_shape) {} + + /// @brief Defines shift in input buffer + ov::Strides stride; + // @brief Defines gaps in the input + ov::Strides dilation; + /// @param padding_begin Defines a padding added to input image on left (x axis) and top (y axis). + ov::CoordinateDiff padding_begin; + /// @param padding_end Defines a padding added to input image on right (x axis) and bottom (y axis). 
+ ov::CoordinateDiff padding_end; + ov::Shape output_shape; + ov::Shape kernel_shape; + + size_t hash() const override { + size_t seed = primitive::hash(); + seed = hash_range(seed, padding_end.begin(), padding_end.end()); + seed = hash_range(seed, padding_begin.begin(), padding_begin.end()); + seed = hash_range(seed, dilation.begin(), dilation.end()); + seed = hash_range(seed, stride.begin(), stride.end()); + seed = hash_range(seed, output_shape.begin(), output_shape.end()); + seed = hash_range(seed, kernel_shape.begin(), kernel_shape.end()); + return seed; + } + + bool operator==(const primitive& rhs) const override { + if (!compare_common_params(rhs)) + return false; + + auto rhs_casted = downcast(rhs); + + #define cmp_fields(name) name == rhs_casted.name + return cmp_fields(stride) && + cmp_fields(dilation) && + cmp_fields(padding_begin) && + cmp_fields(padding_end) && + cmp_fields(output_shape) && + cmp_fields(kernel_shape); + #undef cmp_fields + } + + void save(BinaryOutputBuffer& ob) const override { + primitive_base::save(ob); + ob << stride; + ob << dilation; + ob << padding_begin; + ob << padding_end; + ob << output_shape; + ob << kernel_shape; + } + + void load(BinaryInputBuffer& ib) override { + primitive_base::load(ib); + ib >> stride; + ib >> dilation; + ib >> padding_begin; + ib >> padding_end; + ib >> output_shape; + ib >> kernel_shape; + } +}; +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/col_to_im.cpp b/src/plugins/intel_gpu/src/graph/col_to_im.cpp new file mode 100644 index 00000000000000..4d96813bd7561d --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/col_to_im.cpp @@ -0,0 +1,101 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "col_to_im_inst.h" +#include "col2im_shape_inference.hpp" + +#include "primitive_type_base.h" +#include "intel_gpu/runtime/error_handler.hpp" +#include "json_object.h" +#include + +namespace cldnn { +GPU_DEFINE_PRIMITIVE_TYPE_ID(col_to_im) + 
+layout col_to_im_inst::calc_output_layout(col_to_im_node const& node, kernel_impl_params const& impl_param) { + auto desc = impl_param.typed_desc(); + + auto input_layout = impl_param.get_input_layout(); + auto input_format = input_layout.format; + + // TODO : do sth here for col2im.(Copied dummy from depth_to_space) + auto out_size = input_layout.get_tensor(); + if (format::spatial_num(input_layout.format) == 3) { + // const size_t feature = input_layout.feature() / block_size / block_size / block_size; + // const size_t z = input_layout.spatial(2) * block_size; + // const size_t y = input_layout.spatial(1) * block_size; + // const size_t x = input_layout.spatial(0) * block_size; + // out_size = tensor(TensorValue(input_layout.batch()), TensorValue(feature), TensorValue(x), TensorValue(y), TensorValue(z)); + } else { + // const size_t feature = input_layout.feature() / block_size / block_size; + // const size_t y = input_layout.spatial(1) * block_size; + // const size_t x = input_layout.spatial(0) * block_size; + // out_size = tensor(TensorValue(input_layout.batch()), TensorValue(feature), TensorValue(x), TensorValue(y)); + } + + if (impl_param.has_fused_primitives()) { + input_layout.data_type = impl_param.get_output_element_type(); + } + + return layout{input_layout.data_type, input_format, out_size}; +} + +template +std::vector col_to_im_inst::calc_output_layouts(col_to_im_node const& node, kernel_impl_params const& impl_param) { + auto desc = impl_param.typed_desc(); + auto input_layout = impl_param.get_input_layout(0); + auto output_type = desc->output_data_types[0].value_or(input_layout.data_type); + auto output_format = input_layout.format; + + ov::op::v15::Col2Im op; + + std::vector input_shapes = { + input_layout.get(), + impl_param.get_input_layout(1).get(), + impl_param.get_input_layout(2).get(), + }; + std::vector output_shapes = ov::op::v15::shape_infer(&op, input_shapes); + + // XXX: quick and dirty implementation of output shape inference. 
It should have been fed into shape_infer function + output_shapes[0][-1] = node.get_primitive()->output_shape[1]; + output_shapes[0][-2] = node.get_primitive()->output_shape[0]; + size_t prod = 1; + for (auto t: node.get_primitive()->kernel_shape) { + prod *= t; + } + auto C = input_shapes[0][-2] / prod; + output_shapes[0][-3] = C; + + // std::cout << __FILE__ << ":" << __LINE__ << " " << node.id() << " " << output_shapes[0] << " " << input_shapes[1] << " x "<< std::endl; + return { layout{output_shapes[0], output_type, output_format} }; +} + +template std::vector col_to_im_inst::calc_output_layouts(col_to_im_node const& node, const kernel_impl_params& impl_param); + +std::string col_to_im_inst::to_string(col_to_im_node const& node) { + auto desc = node.get_primitive(); + auto node_info = node.desc_to_json(); + auto& input = node.input(); + + auto strd = desc->stride; + + std::stringstream primitive_description; + + json_composite col_to_im_info; + col_to_im_info.add("input id", input.id()); + col_to_im_info.add("stride", cldnn::to_string(strd)); + col_to_im_info.add("dilation", cldnn::to_string(desc->dilation)); + col_to_im_info.add("padding begin", cldnn::to_string(desc->padding_begin)); + col_to_im_info.add("padding end", cldnn::to_string(desc->padding_end)); + + node_info->add("col_to_im info", col_to_im_info); + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +col_to_im_inst::typed_primitive_inst(network& network, col_to_im_node const& node) + : parent(network, node) {} + +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index 622b7cff4101ad..a8f7715c5a9574 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -371,6 +371,9 @@ void 
prepare_primitive_fusing::fuse_bias(program &p) { if (replace_candidate.is_type()) { auto& conv = replace_candidate.as(); auto desc = conv.get_primitive(); + // XXX: deformable convolution does not support bias fusing at this moment. It is just not tested and deformable_mode value is not properly handled below. + if (desc->deformable_mode) + continue; primitive_id biases = bias_name; // If the primitive has biases, then we try to combine the values, or do nothing and keep as fused sum. diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp index 2eba5148acfa26..7a98f3c37ab6c9 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp @@ -115,7 +115,7 @@ struct travel_direction_wrapper { }; static format get_target_output_format(layout_optimizer& lo, const std::map& fmt_map, program_node *node, program_node *next) { - auto user_idx = node->get_user_index(*next); + auto user_idx = next->get_dependency_output_port(*node); // 1. 
Check selected preferred_output_format auto ret = node->get_preferred_output_fmt(user_idx); @@ -541,16 +541,11 @@ static bool is_weights_dependency(program_node* predecessor, program_node* succe // If there is layout mismatch between two layers, add reorder template void insert_reorders_in_dir(program& p, const std::map& fmt_map, reorder_factory& rf, layout_optimizer& lo, program_node* node) { - auto fmt = fmt_map.at(node); - auto next_cpy = travel_direction_wrapper::next_nodes(node); for (auto next : next_cpy) { if (!next->is_in_data_flow()) continue; - if (fmt_map.count(next) > 0 && fmt_map.at(next) == fmt) - continue; - if (is_weights_dependency(node, next)) continue; @@ -567,6 +562,8 @@ void insert_reorders_in_dir(program& p, const std::mapid() << " --> " << next->id() << " ## " << fmt_to_str(in_layout.format) << " --> " << fmt_to_str(out_layout.format) << std::endl; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/col_to_im.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/col_to_im.cpp new file mode 100644 index 00000000000000..ed4f862902d04c --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/col_to_im.cpp @@ -0,0 +1,75 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "primitive_base.hpp" + +#include "col_to_im_inst.h" +#include "col_to_im/col_to_im_kernel_selector.h" +#include "col_to_im/col_to_im_kernel_ref.h" + +#include "intel_gpu/plugin/common_utils.hpp" + +namespace cldnn { +namespace ocl { +struct col_to_im_impl : typed_primitive_impl_ocl { + using parent = typed_primitive_impl_ocl; + using parent::parent; + using kernel_selector_t = kernel_selector::col_to_im_kernel_selector; + using kernel_params_t = kernel_selector::col_to_im_params; + + DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::col_to_im_impl) + + std::unique_ptr clone() const override { + return make_deep_copy(*this); + } + + static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) { + const auto& 
primitive = impl_param.typed_desc(); + auto col2im_params = get_default_params(impl_param); + + // Attributes + uint32_t stride_x, stride_y, stride_z; + uint32_t dilation_x, dilation_y, dilation_z; + std::tie(stride_x, stride_y, stride_z) = ov::intel_gpu::get_xyz(primitive->stride, 1); + col2im_params.stride = {stride_x, stride_y, stride_z}; + std::tie(dilation_x, dilation_y, dilation_z) = ov::intel_gpu::get_xyz(primitive->dilation, 1); + col2im_params.dilation = {dilation_x, dilation_y, dilation_z}; + + // padding begin & end + uint32_t pad_begin_x, pad_begin_y, pad_begin_z; + std::tie(pad_begin_x, pad_begin_y, pad_begin_z) = ov::intel_gpu::get_xyz(primitive->padding_begin, 0); + col2im_params.padding_begin = {pad_begin_x, pad_begin_y, pad_begin_z}; + uint32_t pad_end_x, pad_end_y, pad_end_z; + std::tie(pad_end_x, pad_end_y, pad_end_z) = ov::intel_gpu::get_xyz(primitive->padding_end, 0); + col2im_params.padding_end = {pad_end_x, pad_end_y, pad_end_z}; + + // Col2Im-15 implementation : required + // output size is 1D tensor of two positive integer numbers (height and width) + std::vector output_size(primitive->output_shape.begin(), primitive->output_shape.end()); + std::vector kernel_size(primitive->kernel_shape.begin(), primitive->kernel_shape.end()); + col2im_params.output_size = {output_size[0], output_size[1], (uint32_t)1}; + col2im_params.kernel_size = {kernel_size[0], kernel_size[1], (uint32_t)1}; + + return col2im_params; + } +}; + +namespace detail { + +attach_col_to_im_impl::attach_col_to_im_impl() { + std::vector dt = { + data_types::f16, + }; + std::vector fmt = { + format::bfyx, + }; + implementation_map::add(impl_types::ocl, typed_primitive_impl_ocl::create, dt, fmt); +} + +} // namespace detail +} // namespace ocl +} // namespace cldnn + +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::ocl::col_to_im_impl) +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::col_to_im) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp 
b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp index cfb01239f7709e..3a2115ae92f296 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp @@ -15,6 +15,7 @@ void register_implementations() { REGISTER_OCL(border); REGISTER_OCL(broadcast); REGISTER_OCL(bucketize); + REGISTER_OCL(col_to_im); REGISTER_OCL(concatenation); REGISTER_OCL(crop); REGISTER_OCL(custom_gpu_primitive); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp index 4f975539a003fc..612b75e76a1dd9 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp @@ -10,6 +10,7 @@ #include "intel_gpu/primitives/border.hpp" #include "intel_gpu/primitives/broadcast.hpp" #include "intel_gpu/primitives/bucketize.hpp" +#include "intel_gpu/primitives/col_to_im.hpp" #include "intel_gpu/primitives/concatenation.hpp" #include "intel_gpu/primitives/convert_color.hpp" #include "intel_gpu/primitives/crop.hpp" @@ -88,6 +89,7 @@ REGISTER_OCL(batch_to_space); REGISTER_OCL(border); REGISTER_OCL(broadcast); REGISTER_OCL(bucketize); +REGISTER_OCL(col_to_im); REGISTER_OCL(concatenation); REGISTER_OCL(crop); REGISTER_OCL(custom_gpu_primitive); diff --git a/src/plugins/intel_gpu/src/graph/include/col_to_im_inst.h b/src/plugins/intel_gpu/src/graph/include/col_to_im_inst.h new file mode 100644 index 00000000000000..c843e92605ee6d --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/include/col_to_im_inst.h @@ -0,0 +1,45 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include "intel_gpu/primitives/col_to_im.hpp" +#include "primitive_inst.h" + +#include +#include + +namespace cldnn { +template <> +struct typed_program_node : public typed_program_node_base { + using parent = typed_program_node_base; + +public: + using parent::parent; + + program_node& 
input(size_t index = 0) const { return get_dependency(index); } + std::shared_ptr get_fuse_params() const override { + return std::make_shared(col_to_im::type_id()); + } + std::vector get_shape_infer_dependencies() const override { return {}; } +}; + +using col_to_im_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base { + using parent = typed_primitive_inst_base; + using parent::parent; + +public: + template + static std::vector calc_output_layouts(col_to_im_node const& node, kernel_impl_params const& impl_param); + static layout calc_output_layout(col_to_im_node const& node, kernel_impl_params const& impl_param); + + static std::string to_string(col_to_im_node const& node); + + typed_primitive_inst(network& network, col_to_im_node const& desc); +}; + +using col_to_im_inst = typed_primitive_inst; +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/registry/registry.hpp b/src/plugins/intel_gpu/src/graph/registry/registry.hpp index 1eec48b9eb9971..11cc58cebddf89 100644 --- a/src/plugins/intel_gpu/src/graph/registry/registry.hpp +++ b/src/plugins/intel_gpu/src/graph/registry/registry.hpp @@ -163,6 +163,7 @@ REGISTER_DEFAULT_IMPLS(adaptive_pooling, OCL_S); REGISTER_DEFAULT_IMPLS(batch_to_space, OCL_S); REGISTER_DEFAULT_IMPLS(border, OCL_S, OCL_D); REGISTER_DEFAULT_IMPLS(bucketize, OCL_S); +REGISTER_DEFAULT_IMPLS(col_to_im, OCL_S); REGISTER_DEFAULT_IMPLS(custom_gpu_primitive, OCL_S); REGISTER_DEFAULT_IMPLS(data, COMMON_S, COMMON_D); REGISTER_DEFAULT_IMPLS(depth_to_space, OCL_S); diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_opt.cl new file mode 100644 index 00000000000000..be3390317d0f84 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_opt.cl @@ -0,0 +1,49 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include 
"include/batch_headers/fetch_data.cl" + +KERNEL(col_to_im_opt)(const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output +#if HAS_FUSED_OPS_DECLS + , FUSED_OPS_DECLS +#endif +) +{ + const uint output_size[2] = {OUT_SIZE_X, OUT_SIZE_Y}; + const uint kernel_size[2] = {KERNEL_SIZE_X, KERNEL_SIZE_Y}; + const uint strides[2] = {STRIDE_SIZE_X, STRIDE_SIZE_Y}; + const uint dilations[2] = {DILATION_SIZE_X, DILATION_SIZE_Y}; + const uint pads_begin[2] = {PAD_BEGIN_SIZE_X, PAD_BEGIN_SIZE_Y}; + + const uint batch = get_global_id(2); + const uint channel_idx = get_global_id(0); + + const int channel_offset = batch * NUM_CHANNELS + channel_idx; + + for (int idx = 0; idx < KERNEL_PRODUCT; ++idx) { + const int width_offset = idx % kernel_size[1]; + const int height_offset = (idx / kernel_size[1]) % kernel_size[0]; + const int column = channel_idx * KERNEL_PRODUCT + idx; + const int column_offset = batch * NUM_ELEMENTS_FOR_BLOCK + column; + + for (int column_height_idx = 0; column_height_idx < ORIG_HEIGHT; ++column_height_idx) { + const int image_height_idx = column_height_idx * strides[0] - pads_begin[0] + height_offset * dilations[0]; + + if (image_height_idx >= 0 && image_height_idx < output_size[0]) { + for (int column_width_idx = 0; column_width_idx < ORIG_WIDTH; ++column_width_idx) { + const int image_width_idx = column_width_idx * strides[1] - pads_begin[1] + width_offset * dilations[1]; + + if (image_width_idx >= 0 && image_width_idx < output_size[1]) { + const int img_idx = (channel_offset * output_size[0] + image_height_idx) * output_size[1] + image_width_idx; + const int data_idx = (column_offset * ORIG_HEIGHT + column_height_idx) * ORIG_WIDTH + column_width_idx; + + // sum the overlapping values + output[img_idx] += input[data_idx]; + } + } + } + } + } +} \ No newline at end of file diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_ref.cl new file mode 100644 index 
00000000000000..7a4f52060ca671 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_ref.cl @@ -0,0 +1,48 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/batch_headers/fetch_data.cl" + +KERNEL(col_to_im_ref)(const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output +#if HAS_FUSED_OPS_DECLS + , FUSED_OPS_DECLS +#endif +) +{ + const uint output_size[2] = {OUT_SIZE_X, OUT_SIZE_Y}; + const uint kernel_size[2] = {KERNEL_SIZE_X, KERNEL_SIZE_Y}; + const uint strides[2] = {STRIDE_SIZE_X, STRIDE_SIZE_Y}; + const uint dilations[2] = {DILATION_SIZE_X, DILATION_SIZE_Y}; + const uint pads_begin[2] = {PAD_BEGIN_SIZE_X, PAD_BEGIN_SIZE_Y}; + + const uint num_blocks = INPUT0_SIZE_Y; + const uint kernel_product = KERNEL_SIZE_X * KERNEL_SIZE_Y; + const uint channels_per_column = INPUT0_FEATURE_NUM; + const uint channel_count = channels_per_column / KERNEL_PRODUCT; + + const uint batch = get_global_id(2); + + for (int column = 0; column < channels_per_column; ++column) { + const int width_offset = column % kernel_size[1]; + const int height_offset = (column / kernel_size[1]) % kernel_size[0]; + const int channel_idx = column / kernel_product; + + for (int column_height_idx = 0; column_height_idx < ORIG_HEIGHT; ++column_height_idx) { + const int image_height_idx = column_height_idx * strides[0] - pads_begin[0] + height_offset * dilations[0]; + if (image_height_idx >= 0 && image_height_idx < output_size[0]) { + for (int column_width_idx = 0; column_width_idx < ORIG_WIDTH; ++column_width_idx) { + const int image_width_idx = column_width_idx * strides[1] - pads_begin[1] + width_offset * dilations[1]; + if (image_width_idx >= 0 && image_width_idx < output_size[1]) { + const int img_idx = ((batch * channel_count + channel_idx) * output_size[0] + image_height_idx) * output_size[1] + image_width_idx; + const int data_idx = ((batch * channels_per_column + column) * ORIG_HEIGHT + 
column_height_idx) * ORIG_WIDTH + column_width_idx; + + // sum the overlapping values + output[img_idx] += input[data_idx]; + } + } + } + } + } +} \ No newline at end of file diff --git a/src/plugins/intel_gpu/src/kernel_selector/common_types.h b/src/plugins/intel_gpu/src/kernel_selector/common_types.h index 2797212463f087..caca5b3e961b7b 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/common_types.h +++ b/src/plugins/intel_gpu/src/kernel_selector/common_types.h @@ -104,7 +104,8 @@ enum class KernelType { ROPE, DYNAMIC_QUANTIZE, SEARCH_SORTED, - STFT + STFT, + COL_TO_IM }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.cpp new file mode 100644 index 00000000000000..c39e47288dd157 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.cpp @@ -0,0 +1,83 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "col_to_im_kernel_base.h" +#include "kernel_selector_utils.h" +#include +#include + +namespace kernel_selector { + +bool ColToImKernelBase::Validate(const Params& p) const { + if (p.GetType() != KernelType::COL_TO_IM) { + return false; + } + + const col_to_im_params& params = static_cast(p); + for (auto& fused_op : params.fused_ops) { + if (!IsFusedPrimitiveSupported(fused_op)) + return false; + } + + if (params.inputs[0].Dimentions() > 5) + return false; + + return true; +} + +JitConstants ColToImKernelBase::GetJitConstants(const col_to_im_params& params) const { + JitConstants jit = MakeBaseParamsJitConstants(params); + auto input = params.inputs[0]; + const auto& output_size = params.output_size; + const auto& stride = params.stride; + const auto& dilation = params.dilation; + const auto& pads_begin = 
params.padding_begin; + const auto& pads_end = params.padding_end; + + const auto orig_height = (output_size.x + pads_begin.x + pads_end.x - (dilation.x * (params.kernel_size.x - 1) + 1)) / stride.x + 1; + const auto orig_width = (output_size.y + pads_begin.y + pads_end.y - (dilation.y * (params.kernel_size.y - 1) + 1)) / stride.y + 1; + jit.AddConstant(MakeJitConstant("ORIG_HEIGHT", orig_height)); + jit.AddConstant(MakeJitConstant("ORIG_WIDTH", orig_width)); + + // Consider input tensor : (N, C * Product(kernel_size), L) + const auto num_elements_for_block = input.Feature().v; + const auto num_blocks = input.Y().v; + const auto kernel_product = params.kernel_size.x * params.kernel_size.y; + const auto num_channels = num_elements_for_block / kernel_product; + jit.AddConstant(MakeJitConstant("NUM_ELEMENTS_FOR_BLOCK", num_elements_for_block)); + jit.AddConstant(MakeJitConstant("KERNEL_PRODUCT", kernel_product)); + jit.AddConstant(MakeJitConstant("NUM_CHANNELS", num_channels)); + jit.AddConstant(MakeJitConstant("NUM_BLOCKS", num_blocks)); + + jit.AddConstant(MakeJitConstant("OUT", params.output_size)); + jit.AddConstant(MakeJitConstant("KERNEL", params.kernel_size)); + jit.AddConstant(MakeJitConstant("STRIDE", stride)); + jit.AddConstant(MakeJitConstant("DILATION", dilation)); + jit.AddConstant(MakeJitConstant("PAD_BEGIN", pads_begin)); + jit.AddConstant(MakeJitConstant("PAD_END", pads_end)); + + return jit; +} + +KernelsData ColToImKernelBase::GetCommonKernelsData(const Params& params) const { + KernelData kd = KernelData::Default(params); + col_to_im_params& newParams = *static_cast(kd.params.get()); + + if (!Validate(params)) { + return {}; + } + + auto dispatchData = SetDefault(newParams); + auto entry_point = GetEntryPoint(kernelName, newParams.layerID, params); + auto cldnn_jit = GetJitConstants(newParams); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); + + auto& kernel = kd.kernels[0]; + + FillCLKernelData(kernel, dispatchData, params.engineInfo, 
kernelName, jit, entry_point, + EXE_MODE_DEFAULT, false, false, 1, GetFusedPrimitiveInputsCount(params)); + + return { kd }; +} +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.h new file mode 100644 index 00000000000000..f3d3904c139cbf --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.h @@ -0,0 +1,48 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_base_opencl.h" +#include "kernel_selector_params.h" + +namespace kernel_selector { +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// col_to_im_params +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +struct col_to_im_params : public base_params { + col_to_im_params() + : base_params(KernelType::COL_TO_IM) {} + // Required + uSize output_size; + uSize kernel_size; + // Optional + uSize stride; + uSize dilation; + uSize padding_begin; + uSize padding_end; +}; + +struct col_to_im_fuse_params : fuse_params { + col_to_im_fuse_params() : fuse_params(KernelType::COL_TO_IM) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// ColToImKernelBase +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +class ColToImKernelBase : public KernelBaseOpenCL { +public: + using KernelBaseOpenCL::KernelBaseOpenCL; + virtual ~ColToImKernelBase() {} + + struct DispatchData : public CommonDispatchData { + }; + +protected: + bool Validate(const Params&) const override; + virtual JitConstants GetJitConstants(const col_to_im_params& params) const; + virtual 
CommonDispatchData SetDefault(const col_to_im_params& params) const = 0; + KernelsData GetCommonKernelsData(const Params& params) const; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_opt.cpp new file mode 100644 index 00000000000000..9a9b3e3b996eee --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_opt.cpp @@ -0,0 +1,73 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "col_to_im_kernel_opt.h" +#include "kernel_selector_utils.h" +#include +#include + +namespace kernel_selector { + +ParamsKey ColToImKernelOpt::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + k.EnableInputDataType(Datatype::UINT8); + k.EnableInputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::UINT8); + k.EnableOutputDataType(Datatype::INT8); + k.EnableDifferentTypes(); + k.EnableAllInputLayout(); + k.EnableAllOutputLayout(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + return k; +} + +CommonDispatchData ColToImKernelOpt::SetDefault(const col_to_im_params& params) const { + CommonDispatchData dispatchData; + + auto input = params.inputs[0]; + const auto num_elements_for_block = input.Feature().v; + const auto kernel_product = params.kernel_size.x * params.kernel_size.y; + const auto num_channels = num_elements_for_block / kernel_product; + + dispatchData.gws = {num_channels, 1, params.outputs[0].Batch().v}; + dispatchData.lws = {1, 1, 1}; + + return dispatchData; +} + +KernelsData ColToImKernelOpt::GetKernelsData(const Params& params) const { + return GetCommonKernelsData(params); +} + +KernelsPriority 
ColToImKernelOpt::GetKernelsPriority(const Params& /*params*/) const { + return FORCE_PRIORITY_2; +} + +JitConstants ColToImKernelOpt::GetJitConstants(const col_to_im_params& params) const { + auto jit = Parent::GetJitConstants(params); + auto input = params.inputs[0]; + auto input_dt = input.GetDType(); + + // TODO : implement for col_to_im_gpu_opt + if (!params.fused_ops.empty()) { + std::vector idx_order; + if (input.Dimentions() == 5) { + idx_order = {"batch", "feature", "z", "y", "x"}; + } else if (input.Dimentions() == 4) { + idx_order = {"batch", "feature", "y", "x"}; + } + FusedOpsConfiguration conf = {"", idx_order, "in_val", input_dt, 1}; + jit.Merge(MakeFusedOpsJitConstants(params, {conf})); + } + + return jit; +} + +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_opt.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_opt.h new file mode 100644 index 00000000000000..e05ec67b8db9e2 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_opt.h @@ -0,0 +1,31 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "col_to_im_kernel_base.h" + +namespace kernel_selector { +class ColToImKernelOpt : public ColToImKernelBase { +public: + using Parent = ColToImKernelBase; + + ColToImKernelOpt() : ColToImKernelBase("col_to_im_opt") {} + virtual ~ColToImKernelOpt() {} + + CommonDispatchData SetDefault(const col_to_im_params& params) const override; + KernelsData GetKernelsData(const Params& params) const override; + KernelsPriority GetKernelsPriority(const Params& params) const override; + ParamsKey GetSupportedKey() const override; + +protected: + JitConstants GetJitConstants(const col_to_im_params& params) const override; + std::vector GetSupportedFusedOps() const override { + return { FusedOpType::ELTWISE, + FusedOpType::QUANTIZE, + FusedOpType::REORDER, + 
FusedOpType::ACTIVATION };
+    }
+};
+}  // namespace kernel_selector
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_ref.cpp
new file mode 100644
index 00000000000000..eb5208d24437fd
--- /dev/null
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_ref.cpp
@@ -0,0 +1,67 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "col_to_im_kernel_ref.h"
+#include "kernel_selector_utils.h"
+// NOTE(review): stripped header names restored as <string>/<vector> to match
+// the symbols used below — confirm against upstream.
+#include <string>
+#include <vector>
+
+namespace kernel_selector {
+
+// Declares the data types / layouts this reference Col2Im kernel accepts.
+ParamsKey ColToImKernelRef::GetSupportedKey() const {
+    ParamsKey k;
+    k.EnableInputDataType(Datatype::F16);
+    k.EnableInputDataType(Datatype::F32);
+    k.EnableInputDataType(Datatype::UINT8);
+    k.EnableInputDataType(Datatype::INT8);
+    k.EnableOutputDataType(Datatype::F16);
+    k.EnableOutputDataType(Datatype::F32);
+    k.EnableOutputDataType(Datatype::UINT8);
+    k.EnableOutputDataType(Datatype::INT8);
+    k.EnableDifferentTypes();
+    k.EnableAllInputLayout();
+    k.EnableAllOutputLayout();
+    k.EnableTensorOffset();
+    k.EnableTensorPitches();
+    k.EnableBatching();
+    return k;
+}
+
+// Reference dispatch: a single work-item per batch; all spatial work is
+// serialized inside the kernel.
+CommonDispatchData ColToImKernelRef::SetDefault(const col_to_im_params& params) const {
+    CommonDispatchData dispatchData;
+
+    dispatchData.gws = {1, 1, params.outputs[0].Batch().v};
+    dispatchData.lws = {1, 1, 1};
+
+    return dispatchData;
+}
+
+KernelsData ColToImKernelRef::GetKernelsData(const Params& params) const {
+    return GetCommonKernelsData(params);
+}
+
+KernelsPriority ColToImKernelRef::GetKernelsPriority(const Params& /*params*/) const {
+    return FORCE_PRIORITY_9;
+}
+
+JitConstants ColToImKernelRef::GetJitConstants(const col_to_im_params& params) const {
+    auto jit = Parent::GetJitConstants(params);
+    auto input = params.inputs[0];
+    auto input_dt = input.GetDType();
+
+    if (!params.fused_ops.empty()) {
+        std::vector<std::string> idx_order;
+        if (input.Dimentions() == 5) {
+            idx_order = {"batch", "feature", "z", "y", "x"};
+        } else if (input.Dimentions() == 4) {
+            idx_order = {"batch", "feature", "y", "x"};
+        }
+        FusedOpsConfiguration conf = {"", idx_order, "in_val", input_dt, 1};
+        jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
+    }
+
+    return jit;
+}
+
+}  // namespace kernel_selector
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_ref.h
new file mode 100644
index 00000000000000..3d507f1be090d9
--- /dev/null
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_ref.h
@@ -0,0 +1,31 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "col_to_im_kernel_base.h"
+
+namespace kernel_selector {
+class ColToImKernelRef : public ColToImKernelBase {
+public:
+    using Parent = ColToImKernelBase;
+
+    ColToImKernelRef() : ColToImKernelBase("col_to_im_ref") {}
+    virtual ~ColToImKernelRef() {}
+
+    CommonDispatchData SetDefault(const col_to_im_params& params) const override;
+    KernelsData GetKernelsData(const Params& params) const override;
+    KernelsPriority GetKernelsPriority(const Params& params) const override;
+    ParamsKey GetSupportedKey() const override;
+
+protected:
+    JitConstants GetJitConstants(const col_to_im_params& params) const override;
+    std::vector<FusedOpType> GetSupportedFusedOps() const override {
+        return { FusedOpType::ELTWISE,
+                 FusedOpType::QUANTIZE,
+                 FusedOpType::REORDER,
+                 FusedOpType::ACTIVATION };
+    }
+};
+}  // namespace kernel_selector
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_selector.cpp
new file mode 100644
index 00000000000000..a10a632f5d4c4e
--- /dev/null
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_selector.cpp
@@ -0,0 +1,19 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "col_to_im_kernel_selector.h"
+#include "col_to_im_kernel_ref.h"
+#include "col_to_im_kernel_opt.h"
+
+namespace kernel_selector {
+
+col_to_im_kernel_selector::col_to_im_kernel_selector() {
+    // Template arguments were stripped by markup mangling; restored from the
+    // two kernel headers included above (ref first, then opt).
+    Attach<ColToImKernelRef>();
+    Attach<ColToImKernelOpt>();
+}
+
+KernelsData col_to_im_kernel_selector::GetBestKernels(const Params& params) const {
+    return GetNaiveBestKernel(params, KernelType::COL_TO_IM);
+}
+}  // namespace kernel_selector
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_selector.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_selector.h
new file mode 100644
index 00000000000000..00e936f9c9195e
--- /dev/null
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_selector.h
@@ -0,0 +1,23 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "kernel_selector.h"
+
+namespace kernel_selector {
+class col_to_im_kernel_selector : public kernel_selector_base {
+public:
+    static col_to_im_kernel_selector& Instance() {
+        static col_to_im_kernel_selector instance_;
+        return instance_;
+    }
+
+    col_to_im_kernel_selector();
+
+    virtual ~col_to_im_kernel_selector() {}
+
+    KernelsData GetBestKernels(const Params& params) const override;
+};
+}  // namespace kernel_selector
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/deformable_convolution_kernel_bfyx_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/deformable_convolution_kernel_bfyx_ref.cpp
index cf5581afda8e19..52c226f95562b2 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/deformable_convolution_kernel_bfyx_ref.cpp
+++
b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/deformable_convolution_kernel_bfyx_ref.cpp
@@ -17,10 +17,12 @@ ParamsKey DeformableConvolutionKernel_bfyx_Ref::GetSupportedKey() const {
     k.EnableInputWeightsType(WeightsType::F16);
     k.EnableInputWeightsType(WeightsType::F32);
     k.EnableInputWeightsType(WeightsType::INT8);
-    k.EnableInputLayout(DataLayout::bfyx);
-    k.EnableOutputLayout(DataLayout::bfyx);
-    k.EnableInputLayout(DataLayout::byxf);
-    k.EnableOutputLayout(DataLayout::byxf);
+    // k.EnableInputLayout(DataLayout::bfyx);
+    // k.EnableOutputLayout(DataLayout::bfyx);
+    // k.EnableInputLayout(DataLayout::byxf);
+    // k.EnableOutputLayout(DataLayout::byxf);
+    k.EnableAllInputLayout();  // XXX: this is hack to run e2e network. Not sure whether it is a correct fix or not.
+    k.EnableAllOutputLayout();
     k.EnableTensorOffset();
     k.EnableTensorPitches();
     k.EnableDilation();
diff --git a/src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp b/src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp
new file mode 100644
index 00000000000000..616274bc48ce4f
--- /dev/null
+++ b/src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp
@@ -0,0 +1,68 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "intel_gpu/plugin/program_builder.hpp"
+#include "intel_gpu/plugin/common_utils.hpp"
+
+#include "openvino/op/col2im.hpp"
+
+#include "intel_gpu/primitives/col_to_im.hpp"
+#include "openvino/op/constant.hpp"
+
+namespace ov {
+namespace intel_gpu {
+
+// Translates an ov::op::v15::Col2Im node into the cldnn::col_to_im primitive.
+// output_size (input 1) and kernel_size (input 2) must be Constant nodes here.
+static void CreateCol2ImOp(ProgramBuilder& p, const std::shared_ptr<ov::op::v15::Col2Im>& op) {
+    validate_inputs_count(op, {3});  // Col2Im-15 defines exactly 3 inputs: data, output_size, kernel_size
+    auto inputPrimitives = p.GetInputInfo(op);
+    std::string layerName = layer_type_name_ID(op);
+
+    // The total number of blocks calculated(L) = product from d=1 to 2 of
+    // floor((output_size[d] + pads_begin[d] + pads_end[d] - dilation[d] * (kernel_size[d] - 1) - 1) / stride[d] + 1)
+    // d : all spatial dimension
+    auto strides = op->get_strides();
+    auto dilations = op->get_dilations();
+    ov::CoordinateDiff padding_begin;
+    ov::CoordinateDiff padding_end;
+
+    // ov::Shape pads are unsigned; widen element-wise into CoordinateDiff.
+    // Loop variables renamed from 'p' — the original shadowed the ProgramBuilder parameter.
+    for (auto pad : op->get_pads_begin())
+        padding_begin.push_back(pad);
+    for (auto pad : op->get_pads_end())
+        padding_end.push_back(pad);
+
+    auto output_shape_const = ov::as_type_ptr<ov::op::v0::Constant>(op->get_input_node_shared_ptr(1));
+    auto vec_output_shape = output_shape_const->cast_vector<size_t>();
+    ov::Shape output_shape(vec_output_shape);
+
+    auto kernel_size_const = ov::as_type_ptr<ov::op::v0::Constant>(op->get_input_node_shared_ptr(2));
+    auto kernel_size = kernel_size_const->cast_vector<size_t>();
+    ov::Shape kernel_shape(kernel_size);
+
+    // Create col2im prim
+    // inputs : data, output size, kernel_size (required)
+    auto colToImPrim = cldnn::col_to_im(layerName,
+                                        inputPrimitives[0],
+                                        inputPrimitives[1],
+                                        inputPrimitives[2],
+                                        strides,
+                                        dilations,
+                                        padding_begin,
+                                        padding_end,
+                                        output_shape,
+                                        kernel_shape);
+
+    p.add_primitive(*op, colToImPrim);
+}
+
+REGISTER_FACTORY_IMPL(v15, Col2Im);
+
+}  // namespace intel_gpu
+}  // namespace ov
diff --git a/src/plugins/intel_gpu/src/plugin/ops/depth_to_space.cpp b/src/plugins/intel_gpu/src/plugin/ops/depth_to_space.cpp
index eb8977c11ed7b1..f5f0853137260b 100644
--- a/src/plugins/intel_gpu/src/plugin/ops/depth_to_space.cpp
+++ b/src/plugins/intel_gpu/src/plugin/ops/depth_to_space.cpp
@@ -9,7 +9,8 @@
 #include "intel_gpu/primitives/depth_to_space.hpp"
-namespace ov::intel_gpu {
+namespace ov {
+namespace intel_gpu {
 static cldnn::depth_to_space_mode GetDepthMode(ov::op::v0::DepthToSpace::DepthToSpaceMode mode) {
     switch (mode) {
@@ -40,4 +41,5 @@ static void CreateDepthToSpaceOp(ProgramBuilder& p,
const std::shared_ptr