Implement Col2im primitive and support relevant models #29529

Closed · 30 commits
9af86eb
[GPU] Implement col2im
byungilm Mar 13, 2025
ad115ad
fix for build
isanghao Mar 12, 2025
c0ad041
keep three inputs for shape inference
isanghao Mar 12, 2025
e7421cd
col2im shape_infer succeeds. Not sure whether it is correct or not
isanghao Mar 12, 2025
f5524a1
now ocl build fails
isanghao Mar 13, 2025
114b2ea
[GPU] Implement col2im reference kernel
byungilm Mar 19, 2025
8475c23
[GPU] Init draft for col2im opt kernel
byungilm Mar 19, 2025
f6d8d8c
Update col2im opt kernel
byungilm Mar 20, 2025
28070b9
[GPU] reorder_inputs fixed
isanghao Mar 23, 2025
b7c241a
[GPU] update calc_output_layout for col_to_im
byungilm Mar 26, 2025
d09c64b
[GPU] Resolve acc issue by fusing prelu
byungilm Mar 26, 2025
cbd3fc2
[GPU] Fix conv fusing logic
byungilm Mar 26, 2025
8dd80b8
[GPU] clear cpplint
byungilm Mar 26, 2025
e1ecf28
[GPU] Added unit-test for col-to-im
byungilm Mar 26, 2025
7c12840
[GPU] Fix comment for prelu fusing
byungilm Mar 26, 2025
31554ba
[GPU] clear cpplint
byungilm Mar 26, 2025
7fffcbe
[GPU] clear CI failure from prelu fusing
byungilm Mar 26, 2025
65085e3
[GPU] code clean up of col-to-im test code
byungilm Mar 26, 2025
8a77be3
[GPU] Modified test-code for prelu fusing issue
byungilm Mar 27, 2025
56730ad
[GPU] Modified test-code for col-to-im impl
byungilm Mar 27, 2025
00f5d9e
[GPU] Code clean up for col2im impl
byungilm Mar 27, 2025
76903aa
[GPU] Apply comment for prelu fusing
byungilm Mar 27, 2025
67b14d2
[GPU] Apply comments for col2im
byungilm Mar 27, 2025
6407f86
[GPU] Fix unit-tests for col_to_im
byungilm Mar 27, 2025
a716b5e
[GPU] Changed naming col_to_im
byungilm Mar 28, 2025
3f08feb
[GPU] Removed needs_calculate_L
byungilm Mar 31, 2025
5ec5b70
[GPU] Bugfix and add func test for col2im
byungilm Apr 2, 2025
e200492
[GPU] Fixed CI failure
byungilm Apr 3, 2025
93f0cf5
[GPU] Implement col2im to ocl_v2
byungilm Apr 4, 2025
d87f144
[GPU] modified calc_output_layout to use shape_infer
byungilm Apr 4, 2025
13 changes: 13 additions & 0 deletions src/core/include/openvino/op/col2im.hpp
@@ -42,6 +42,19 @@ class OPENVINO_API Col2Im : public ov::op::Op {
const Shape& get_pads_begin() const;
const Shape& get_pads_end() const;

void set_strides(const Strides& strides) {
m_strides = strides;
}
void set_dilations(const Strides& dilations) {
m_dilations = dilations;
}
void set_pads_begin(const Shape& pads_begin) {
m_pads_begin = pads_begin;
}
void set_pads_end(const Shape& pads_end) {
m_pads_end = pads_end;
}

private:
Strides m_strides;
Strides m_dilations;
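These setters let a caller construct a default ov::op::v15::Col2Im and fill in its attributes before running shape inference; the GPU plugin's calc_output_layouts in col2im.cpp below uses exactly this pattern. A minimal sketch (values are illustrative):

    ov::op::v15::Col2Im op;
    op.set_strides(ov::Strides{1, 1});
    op.set_dilations(ov::Strides{1, 1});
    op.set_pads_begin(ov::Shape{0, 0});
    op.set_pads_end(ov::Shape{0, 0});
    // The populated op is then handed to ov::op::v15::shape_infer together
    // with the input shapes and constant data, as done in calc_output_layouts.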
@@ -269,12 +269,14 @@ REGISTER_FACTORY(v13, BitwiseOr);
REGISTER_FACTORY(v13, BitwiseXor);
REGISTER_FACTORY(v13, FakeConvert);


// ------------------------------ Supported v15 ops ----------------------------- //
REGISTER_FACTORY(v15, ROIAlignRotated);
REGISTER_FACTORY(v15, BitwiseRightShift);
REGISTER_FACTORY(v15, BitwiseLeftShift);
REGISTER_FACTORY(v15, SearchSorted);
REGISTER_FACTORY(v15, STFT);
REGISTER_FACTORY(v15, Col2Im);

// --------------------------- Supported internal ops --------------------------- //
REGISTER_FACTORY(internal, NonMaxSuppressionIEInternal);
100 changes: 100 additions & 0 deletions src/plugins/intel_gpu/include/intel_gpu/primitives/col2im.hpp
@@ -0,0 +1,100 @@
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once
#include "primitive.hpp"

namespace cldnn {

/// @brief Col2Im primitive, rearranging sliding column blocks back into an image.
/// @details Overlapping values are summed into the output image, following ov::op::v15::Col2Im.
struct col2im : public primitive_base<col2im> {
CLDNN_DECLARE_PRIMITIVE(col2im)

col2im() : primitive_base("", {}) {}

/// @brief Constructs col2im primitive.
/// @param id This primitive id.
/// @param input Input primitive id.
/// @param stride Defines the stride of the sliding blocks over the output image.
/// @param dilation Defines the dilation of the sliding blocks (gaps between kernel elements).
/// @param padding_begin Defines padding added to the image on the left (x axis) and top (y axis).
/// @param padding_end Defines padding added to the image on the right (x axis) and bottom (y axis).
/// @param output_shape Defines the spatial shape (height and width) of the output image.
/// @param kernel_shape Defines the size of the sliding blocks.
col2im(const primitive_id& id,
const input_info& input,
ov::Strides stride,
ov::Strides dilation,
ov::CoordinateDiff padding_begin,
ov::CoordinateDiff padding_end,
ov::Shape output_shape,
ov::Shape kernel_shape)
: primitive_base(id, {input})
, stride(stride)
, dilation(dilation)
, padding_begin(padding_begin)
, padding_end(padding_end)
, output_shape(output_shape)
, kernel_shape(kernel_shape) {}

/// @brief Defines the stride of the sliding blocks over the output image.
ov::Strides stride;
/// @brief Defines the dilation of the sliding blocks (gaps between kernel elements).
ov::Strides dilation;
/// @brief Defines padding added to the image on the left (x axis) and top (y axis).
ov::CoordinateDiff padding_begin;
/// @brief Defines padding added to the image on the right (x axis) and bottom (y axis).
ov::CoordinateDiff padding_end;
/// @brief Spatial shape (height and width) of the output image.
ov::Shape output_shape;
/// @brief Size of the sliding blocks.
ov::Shape kernel_shape;

size_t hash() const override {
size_t seed = primitive::hash();
seed = hash_range(seed, padding_end.begin(), padding_end.end());
seed = hash_range(seed, padding_begin.begin(), padding_begin.end());
seed = hash_range(seed, dilation.begin(), dilation.end());
seed = hash_range(seed, stride.begin(), stride.end());
seed = hash_range(seed, output_shape.begin(), output_shape.end());
seed = hash_range(seed, kernel_shape.begin(), kernel_shape.end());
return seed;
}

bool operator==(const primitive& rhs) const override {
if (!compare_common_params(rhs))
return false;

auto rhs_casted = downcast<const col2im>(rhs);

#define cmp_fields(name) name == rhs_casted.name
return cmp_fields(stride) &&
cmp_fields(dilation) &&
cmp_fields(padding_begin) &&
cmp_fields(padding_end) &&
cmp_fields(output_shape) &&
cmp_fields(kernel_shape);
#undef cmp_fields
}

void save(BinaryOutputBuffer& ob) const override {
primitive_base<col2im>::save(ob);
ob << stride;
ob << dilation;
ob << padding_begin;
ob << padding_end;
ob << output_shape;
ob << kernel_shape;
}

void load(BinaryInputBuffer& ib) override {
primitive_base<col2im>::load(ib);
ib >> stride;
ib >> dilation;
ib >> padding_begin;
ib >> padding_end;
ib >> output_shape;
ib >> kernel_shape;
}
};
} // namespace cldnn
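For reference, a minimal usage sketch of the new primitive (hypothetical ids and shapes, assuming the standard cldnn topology API used by the unit tests; in_layout is an existing cldnn::layout):

    cldnn::topology topology;
    topology.add(cldnn::input_layout("input", in_layout));
    // Hypothetical case: 2x2 sliding blocks reassembled into a 4x4 image,
    // unit stride and dilation, no padding.
    topology.add(cldnn::col2im("col2im",
                               cldnn::input_info("input"),
                               ov::Strides{1, 1},         // stride
                               ov::Strides{1, 1},         // dilation
                               ov::CoordinateDiff{0, 0},  // padding_begin
                               ov::CoordinateDiff{0, 0},  // padding_end
                               ov::Shape{4, 4},           // output_shape (H, W)
                               ov::Shape{2, 2}));         // kernel_shape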
113 changes: 113 additions & 0 deletions src/plugins/intel_gpu/src/graph/col2im.cpp
@@ -0,0 +1,113 @@
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "col2im_inst.h"
#include "col2im_shape_inference.hpp"

#include "primitive_type_base.h"
#include "intel_gpu/runtime/error_handler.hpp"
#include "json_object.h"
#include <string>

namespace cldnn {
GPU_DEFINE_PRIMITIVE_TYPE_ID(col2im)

bool col2im_inst::validate_num_blocks(kernel_impl_params const& impl_param, size_t candidate_num_blocks) {
constexpr size_t spatial_dims = 2;
auto desc = impl_param.typed_desc<col2im>();

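// Expected number of blocks per the Col2Im specification:
//   L = prod_d(((output_size[d] + pads_begin[d] + pads_end[d]
//                - dilation[d] * (kernel_size[d] - 1) - 1) / stride[d]) + 1)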
size_t L_calculated = 1;
for (size_t d = 0; d < spatial_dims; ++d) {
L_calculated *= ((desc->output_shape[d] + desc->padding_begin[d] + desc->padding_end[d] -
(desc->dilation[d] * (desc->kernel_shape[d] - 1)) - 1) / desc->stride[d]) + 1;
}

return (candidate_num_blocks == L_calculated);
}

layout col2im_inst::calc_output_layout(col2im_node const& node, kernel_impl_params const& impl_param) {
auto output = calc_output_layouts<ov::PartialShape>(node, impl_param);
return output[0];
}

template<typename ShapeType>
std::vector<layout> col2im_inst::calc_output_layouts(col2im_node const& node, kernel_impl_params const& impl_param) {
auto desc = impl_param.typed_desc<col2im>();
auto input_layout = impl_param.get_input_layout(0);
auto output_type = desc->output_data_types[0].value_or(input_layout.data_type);
auto output_format = input_layout.format;

auto reshaped_input = input_layout;
if (input_layout.get_rank() >= 4) {
bool is_batched = true;
auto num_blocks_l = input_layout.spatial(1);
if (num_blocks_l == 1 && !validate_num_blocks(impl_param, input_layout.spatial(1))) {
is_batched = false;
num_blocks_l = input_layout.feature();
}

const auto batch = is_batched ? input_layout.batch() : 1;
const auto num_elements = is_batched ? input_layout.feature() : input_layout.batch();

if (is_batched)
reshaped_input.set_partial_shape({batch, num_elements, num_blocks_l});
else
reshaped_input.set_partial_shape({num_elements, num_blocks_l});
}

ov::op::v15::Col2Im op;
op.set_strides(desc->stride);
op.set_dilations(desc->dilation);
op.set_pads_begin(ov::Shape(desc->padding_begin.begin(), desc->padding_begin.end()));
op.set_pads_end(ov::Shape(desc->padding_end.begin(), desc->padding_end.end()));

// output_size is 1D tensor of two positive integer numbers (height and width).
std::vector<size_t> output_size = {desc->output_shape[0], desc->output_shape[1]};
// kernel_size is 1D tensor of two positive integer numbers (kernel height and width).
std::vector<size_t> kernel_size = {desc->kernel_shape[0], desc->kernel_shape[1]};

auto output_tensor = ov::Tensor(ov::element::Type_t::u64, ov::Shape{ output_size.size() }, output_size.data());
auto kernel_tensor = ov::Tensor(ov::element::Type_t::u64, ov::Shape{ kernel_size.size() }, kernel_size.data());

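// Inputs 1 and 2 of Col2Im carry output_size and kernel_size; exposing them through
// the tensor accessor lets shape_infer treat them as constants and derive a static shape.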
std::unordered_map<size_t, ov::Tensor> const_data;
const_data.emplace(1, output_tensor);
const_data.emplace(2, kernel_tensor);

std::vector<ShapeType> input_shapes = {
reshaped_input.get<ShapeType>(),
output_tensor.get_shape(),
kernel_tensor.get_shape(),
};

std::vector<ShapeType> output_shapes;
output_shapes = ov::op::v15::shape_infer(&op, input_shapes, ov::make_tensor_accessor(const_data));

return { layout{output_shapes[0], output_type, output_format} };
}

template std::vector<layout> col2im_inst::calc_output_layouts<ov::PartialShape>(col2im_node const& node, const kernel_impl_params& impl_param);

std::string col2im_inst::to_string(col2im_node const& node) {
auto desc = node.get_primitive();
auto node_info = node.desc_to_json();
auto& input = node.input();

auto strd = desc->stride;

std::stringstream primitive_description;

json_composite col2im_info;
col2im_info.add("input id", input.id());
col2im_info.add("stride", cldnn::to_string(strd));
col2im_info.add("dilation", cldnn::to_string(desc->dilation));
col2im_info.add("padding begin", cldnn::to_string(desc->padding_begin));
col2im_info.add("padding end", cldnn::to_string(desc->padding_end));

node_info->add("col2im info", col2im_info);
node_info->dump(primitive_description);

return primitive_description.str();
}

} // namespace cldnn
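As a worked check of the block-count formula in validate_num_blocks above (illustrative values, not part of the diff): for a 4x4 output image, a 2x2 kernel, unit stride and dilation, and no padding, each spatial dimension yields ((4 + 0 + 0 - 1 * (2 - 1) - 1) / 1) + 1 = 3, so the input must supply L = 3 * 3 = 9 column blocks per channel group.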
@@ -371,6 +371,10 @@ void prepare_primitive_fusing::fuse_bias(program &p) {
if (replace_candidate.is_type<convolution>()) {
auto& conv = replace_candidate.as<convolution>();
auto desc = conv.get_primitive();
// XXX: deformable convolution does not support bias fusing at this moment.
// It is just not tested and deformable_mode value is not properly handled below.
if (desc->deformable_mode)
continue;
primitive_id biases = bias_name;

// If the primitive has biases, then we try to combine the values, or do nothing and keep as fused sum.
@@ -742,6 +746,15 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) {
// prelu fusion is not implemented in oneDNN3.1 (CVS-108233)
return;
}

// Fusing prelu into a multi-batch oneDNN convolution caused an accuracy issue, so fusing is blocked for that case.
auto input_layout = input.get_output_layout();
if (input.is_type<convolution>() && (lo.get_preferred_impl_type(input, format::any /*dummy*/) == impl_types::onednn) &&
activation_func == cldnn::activation_func::relu_negative_slope &&
input_layout.is_static() && input_layout.batch() > 1) {
return;
}

// Activation should not be fused if oneDNN does NOT support it
if (lo.is_primitive_implemented_for_onednn(input)) {
#ifdef ENABLE_ONEDNN_FOR_GPU
@@ -115,7 +115,7 @@ struct travel_direction_wrapper<direction_e::backwards> {
};

static format get_target_output_format(layout_optimizer& lo, const std::map<program_node*, format::type>& fmt_map, program_node *node, program_node *next) {
auto user_idx = node->get_user_index(*next);
auto user_idx = next->get_dependency_output_port(*node);

// 1. Check selected preferred_output_format
auto ret = node->get_preferred_output_fmt(user_idx);
@@ -541,16 +541,11 @@ static bool is_weights_dependency(program_node* predecessor, program_node* successor)
// If there is layout mismatch between two layers, add reorder
template <direction_e dir>
void insert_reorders_in_dir(program& p, const std::map<program_node*, format::type>& fmt_map, reorder_factory& rf, layout_optimizer& lo, program_node* node) {
auto fmt = fmt_map.at(node);

auto next_cpy = travel_direction_wrapper<dir>::next_nodes(node);
for (auto next : next_cpy) {
if (!next->is_in_data_flow())
continue;

if (fmt_map.count(next) > 0 && fmt_map.at(next) == fmt)
continue;

if (is_weights_dependency(node, next))
continue;

@@ -567,6 +562,8 @@

in_layout.format = get_target_output_format(lo, fmt_map, predecessor, successor);
out_layout.format = get_target_input_format(lo, fmt_map, successor, predecessor);
if (in_layout.format == out_layout.format)
continue;

GPU_DEBUG_LOG << dir_msg(dir) << " " << node->id() << " --> " << next->id() << " ## "
<< fmt_to_str(in_layout.format) << " --> " << fmt_to_str(out_layout.format) << std::endl;
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp
@@ -10,6 +10,7 @@
#include "intel_gpu/primitives/border.hpp"
#include "intel_gpu/primitives/broadcast.hpp"
#include "intel_gpu/primitives/bucketize.hpp"
#include "intel_gpu/primitives/col2im.hpp"
#include "intel_gpu/primitives/concatenation.hpp"
#include "intel_gpu/primitives/convert_color.hpp"
#include "intel_gpu/primitives/crop.hpp"
40 changes: 40 additions & 0 deletions src/plugins/intel_gpu/src/graph/impls/ocl_v2/col2im.cl
@@ -0,0 +1,40 @@
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "include/batch_headers/fetch_data.cl"

KERNEL(col2im)(const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output
)
{
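// One work-item per (batch, channel) pair: gws[0] maps to channels, gws[2] to batch.
// Each work-item scatter-adds every column entry of its channel into the output image;
// work-items touch disjoint channel slices, so the unsynchronized += below is race-free.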
const uint batch = get_global_id(2);
const uint channel_idx = get_global_id(0);

const int channel_offset = batch * NUM_CHANNELS + channel_idx;

for (int idx = 0; idx < KERNEL_PRODUCT; ++idx) {
const int width_offset = idx % KERNEL_SIZE_1;
const int height_offset = (idx / KERNEL_SIZE_1) % KERNEL_SIZE_0;
const int column = channel_idx * KERNEL_PRODUCT + idx;
const int column_offset = batch * NUM_ELEMENTS_FOR_BLOCK + column;

for (int column_height_idx = 0; column_height_idx < ORIG_HEIGHT; ++column_height_idx) {
const int image_height_idx = column_height_idx * STRIDE_0 - PAD_BEGIN_0 + height_offset * DILATION_0;

if (image_height_idx >= 0 && image_height_idx < OUT_SIZE_0) {
for (int column_width_idx = 0; column_width_idx < ORIG_WIDTH; ++column_width_idx) {
const int image_width_idx = column_width_idx * STRIDE_1 - PAD_BEGIN_1 + width_offset * DILATION_1;

if (image_width_idx >= 0 && image_width_idx < OUT_SIZE_1) {
const int img_idx = (channel_offset * OUT_SIZE_0 + image_height_idx) * OUT_SIZE_1 + image_width_idx;
const int data_idx = (column_offset * ORIG_HEIGHT + column_height_idx) * ORIG_WIDTH + column_width_idx;

// sum the overlapping values
output[img_idx] += input[data_idx];
}
}
}
}
}
}
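For validating the kernel, a host-side reference of the same scatter-add semantics can be handy. Below is a minimal single-batch C++ sketch; the names, the scalar stride/pad/dilation, and the layout assumptions are illustrative and not part of the PR:

    #include <cstddef>
    #include <vector>

    // Reference col2im for a single batch: cols is laid out as
    // [C * KH * KW, LH * LW], img as [C, OH, OW]. Overlapping positions
    // are accumulated, matching the kernel above.
    std::vector<float> col2im_ref(const std::vector<float>& cols,
                                  std::size_t C, std::size_t KH, std::size_t KW,  // channels, kernel
                                  std::size_t OH, std::size_t OW,                 // output image
                                  std::size_t LH, std::size_t LW,                 // blocks per dim
                                  std::size_t stride, std::size_t pad, std::size_t dilation) {
        std::vector<float> img(C * OH * OW, 0.0f);
        for (std::size_t c = 0; c < C; ++c)
            for (std::size_t kh = 0; kh < KH; ++kh)
                for (std::size_t kw = 0; kw < KW; ++kw)
                    for (std::size_t bh = 0; bh < LH; ++bh)
                        for (std::size_t bw = 0; bw < LW; ++bw) {
                            // Same index arithmetic as the kernel:
                            // block origin + dilated kernel offset - padding.
                            const long h = static_cast<long>(bh * stride + kh * dilation) - static_cast<long>(pad);
                            const long w = static_cast<long>(bw * stride + kw * dilation) - static_cast<long>(pad);
                            if (h < 0 || w < 0 || h >= static_cast<long>(OH) || w >= static_cast<long>(OW))
                                continue;
                            const std::size_t col = (c * KH + kh) * KW + kw;
                            img[(c * OH + static_cast<std::size_t>(h)) * OW + static_cast<std::size_t>(w)] +=
                                cols[(col * LH + bh) * LW + bw];
                        }
        return img;
    }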