Implement Col2im primitive and support relevant models #29529

Closed · 30 commits
9af86eb
[GPU] Implement col2im
byungilm Mar 13, 2025
ad115ad
fix for build
isanghao Mar 12, 2025
c0ad041
keep three inputs for shape inference
isanghao Mar 12, 2025
e7421cd
col2im shape_infer succeeds. Not sure whether it is correct or not
isanghao Mar 12, 2025
f5524a1
now ocl build fails
isanghao Mar 13, 2025
114b2ea
[GPU] Implement col2im reference kernel
byungilm Mar 19, 2025
8475c23
[GPU] Init draft for col2im opt kernel
byungilm Mar 19, 2025
f6d8d8c
Update col2im opt kernel
byungilm Mar 20, 2025
28070b9
[GPU] reorder_inputs fixed
isanghao Mar 23, 2025
b7c241a
[GPU] update calc_output_layout for col_to_im
byungilm Mar 26, 2025
d09c64b
[GPU] Resolve acc issue by fusing prelu
byungilm Mar 26, 2025
cbd3fc2
[GPU] Fix conv fusing logic
byungilm Mar 26, 2025
8dd80b8
[GPU] clear cpplint
byungilm Mar 26, 2025
e1ecf28
[GPU] Added unit-test for col-to-im
byungilm Mar 26, 2025
7c12840
[GPU] Fix comment for prelu fusing
byungilm Mar 26, 2025
31554ba
[GPU] clear cpplint
byungilm Mar 26, 2025
7fffcbe
[GPU] clear CI failure from prelu fusing
byungilm Mar 26, 2025
65085e3
[GPU] code clean up of col-to-im test code
byungilm Mar 26, 2025
8a77be3
[GPU] Modified test-code for prelu fusing issue
byungilm Mar 27, 2025
56730ad
[GPU] Modified test-code for col-to-im impl
byungilm Mar 27, 2025
00f5d9e
[GPU] Code clean up for col2im impl
byungilm Mar 27, 2025
76903aa
[GPU] Apply comment for prelu fusing
byungilm Mar 27, 2025
67b14d2
[GPU] Apply comments for col2im
byungilm Mar 27, 2025
6407f86
[GPU] Fix unit-tests for col_to_im
byungilm Mar 27, 2025
a716b5e
[GPU] Changed naming col_to_im
byungilm Mar 28, 2025
3f08feb
[GPU] Removed needs_calculate_L
byungilm Mar 31, 2025
5ec5b70
[GPU] Bugfix and add func test for col2im
byungilm Apr 2, 2025
e200492
[GPU] Fixed CI failure
byungilm Apr 3, 2025
93f0cf5
[GPU] Implement col2im to ocl_v2
byungilm Apr 4, 2025
d87f144
[GPU] modified calc_output_layout to use shape_infer
byungilm Apr 4, 2025
13 changes: 13 additions & 0 deletions src/core/include/openvino/op/col2im.hpp
@@ -42,6 +42,19 @@ class OPENVINO_API Col2Im : public ov::op::Op {
const Shape& get_pads_begin() const;
const Shape& get_pads_end() const;

void set_strides(const Strides& strides) {
m_strides = strides;
}
void set_dilations(const Strides& dilations) {
m_dilations = dilations;
}
void set_pads_begin(const Shape& pads_begin) {
m_pads_begin = pads_begin;
}
void set_pads_end(const Shape& pads_end) {
m_pads_end = pads_end;
}

private:
Strides m_strides;
Strides m_dilations;
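These setters let a caller construct a default ov::op::v15::Col2Im and fill in its attributes before running shape inference; the GPU plugin's calc_output_layouts in col2im.cpp below uses exactly this pattern. A minimal sketch (values are illustrative):

    ov::op::v15::Col2Im op;
    op.set_strides(ov::Strides{1, 1});
    op.set_dilations(ov::Strides{1, 1});
    op.set_pads_begin(ov::Shape{0, 0});
    op.set_pads_end(ov::Shape{0, 0});
    // The populated op is then handed to ov::op::v15::shape_infer together
    // with the input shapes and constant data, as done in calc_output_layouts.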
@@ -269,12 +269,14 @@ REGISTER_FACTORY(v13, BitwiseOr);
REGISTER_FACTORY(v13, BitwiseXor);
REGISTER_FACTORY(v13, FakeConvert);


// ------------------------------ Supported v15 ops ----------------------------- //
REGISTER_FACTORY(v15, ROIAlignRotated);
REGISTER_FACTORY(v15, BitwiseRightShift);
REGISTER_FACTORY(v15, BitwiseLeftShift);
REGISTER_FACTORY(v15, SearchSorted);
REGISTER_FACTORY(v15, STFT);
REGISTER_FACTORY(v15, Col2Im);

// --------------------------- Supported internal ops --------------------------- //
REGISTER_FACTORY(internal, NonMaxSuppressionIEInternal);
100 changes: 100 additions & 0 deletions src/plugins/intel_gpu/include/intel_gpu/primitives/col2im.hpp
@@ -0,0 +1,100 @@
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once
#include "primitive.hpp"

namespace cldnn {

/// @brief Col2Im primitive, rearranging sliding column blocks back into an image.
/// @details Overlapping values are summed into the output image, following ov::op::v15::Col2Im.
struct col2im : public primitive_base<col2im> {
CLDNN_DECLARE_PRIMITIVE(col2im)

col2im() : primitive_base("", {}) {}

/// @brief Constructs col2im primitive.
/// @param id This primitive id.
/// @param input Input primitive id.
/// @param stride Defines the stride of the sliding blocks over the output image.
/// @param dilation Defines the dilation of the sliding blocks (gaps between kernel elements).
/// @param padding_begin Defines padding added to the image on the left (x axis) and top (y axis).
/// @param padding_end Defines padding added to the image on the right (x axis) and bottom (y axis).
/// @param output_shape Defines the spatial shape (height and width) of the output image.
/// @param kernel_shape Defines the size of the sliding blocks.
col2im(const primitive_id& id,
const input_info& input,
ov::Strides stride,
ov::Strides dilation,
ov::CoordinateDiff padding_begin,
ov::CoordinateDiff padding_end,
ov::Shape output_shape,
ov::Shape kernel_shape)
: primitive_base(id, {input})
, stride(stride)
, dilation(dilation)
, padding_begin(padding_begin)
, padding_end(padding_end)
, output_shape(output_shape)
, kernel_shape(kernel_shape) {}

/// @brief Defines the stride of the sliding blocks over the output image.
ov::Strides stride;
/// @brief Defines the dilation of the sliding blocks (gaps between kernel elements).
ov::Strides dilation;
/// @brief Defines padding added to the image on the left (x axis) and top (y axis).
ov::CoordinateDiff padding_begin;
/// @brief Defines padding added to the image on the right (x axis) and bottom (y axis).
ov::CoordinateDiff padding_end;
/// @brief Spatial shape (height and width) of the output image.
ov::Shape output_shape;
/// @brief Size of the sliding blocks.
ov::Shape kernel_shape;

size_t hash() const override {
size_t seed = primitive::hash();
seed = hash_range(seed, padding_end.begin(), padding_end.end());
seed = hash_range(seed, padding_begin.begin(), padding_begin.end());
seed = hash_range(seed, dilation.begin(), dilation.end());
seed = hash_range(seed, stride.begin(), stride.end());
seed = hash_range(seed, output_shape.begin(), output_shape.end());
seed = hash_range(seed, kernel_shape.begin(), kernel_shape.end());
return seed;
}

bool operator==(const primitive& rhs) const override {
if (!compare_common_params(rhs))
return false;

auto rhs_casted = downcast<const col2im>(rhs);

#define cmp_fields(name) name == rhs_casted.name
return cmp_fields(stride) &&
cmp_fields(dilation) &&
cmp_fields(padding_begin) &&
cmp_fields(padding_end) &&
cmp_fields(output_shape) &&
cmp_fields(kernel_shape);
#undef cmp_fields
}

void save(BinaryOutputBuffer& ob) const override {
primitive_base<col2im>::save(ob);
ob << stride;
ob << dilation;
ob << padding_begin;
ob << padding_end;
ob << output_shape;
ob << kernel_shape;
}

void load(BinaryInputBuffer& ib) override {
primitive_base<col2im>::load(ib);
ib >> stride;
ib >> dilation;
ib >> padding_begin;
ib >> padding_end;
ib >> output_shape;
ib >> kernel_shape;
}
};
} // namespace cldnn
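For reference, a minimal usage sketch of the new primitive (hypothetical ids and shapes, assuming the standard cldnn topology API used by the unit tests; in_layout is an existing cldnn::layout):

    cldnn::topology topology;
    topology.add(cldnn::input_layout("input", in_layout));
    // Hypothetical case: 2x2 sliding blocks reassembled into a 4x4 image,
    // unit stride and dilation, no padding.
    topology.add(cldnn::col2im("col2im",
                               cldnn::input_info("input"),
                               ov::Strides{1, 1},         // stride
                               ov::Strides{1, 1},         // dilation
                               ov::CoordinateDiff{0, 0},  // padding_begin
                               ov::CoordinateDiff{0, 0},  // padding_end
                               ov::Shape{4, 4},           // output_shape (H, W)
                               ov::Shape{2, 2}));         // kernel_shape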
113 changes: 113 additions & 0 deletions src/plugins/intel_gpu/src/graph/col2im.cpp
@@ -0,0 +1,113 @@
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "col2im_inst.h"
#include "col2im_shape_inference.hpp"

#include "primitive_type_base.h"
#include "intel_gpu/runtime/error_handler.hpp"
#include "json_object.h"
#include <string>

namespace cldnn {
GPU_DEFINE_PRIMITIVE_TYPE_ID(col2im)

bool col2im_inst::validate_num_blocks(kernel_impl_params const& impl_param, size_t candidate_num_blocks) {
constexpr size_t spatial_dims = 2;
auto desc = impl_param.typed_desc<col2im>();

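// Expected number of blocks per the Col2Im specification:
//   L = prod_d(((output_size[d] + pads_begin[d] + pads_end[d]
//                - dilation[d] * (kernel_size[d] - 1) - 1) / stride[d]) + 1)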
size_t L_calculated = 1;
for (size_t d = 0; d < spatial_dims; ++d) {
L_calculated *= ((desc->output_shape[d] + desc->padding_begin[d] + desc->padding_end[d] -
(desc->dilation[d] * (desc->kernel_shape[d] - 1)) - 1) / desc->stride[d]) + 1;
}

return (candidate_num_blocks == L_calculated);
}

layout col2im_inst::calc_output_layout(col2im_node const& node, kernel_impl_params const& impl_param) {
auto output = calc_output_layouts<ov::PartialShape>(node, impl_param);
return output[0];
}

template<typename ShapeType>
std::vector<layout> col2im_inst::calc_output_layouts(col2im_node const& node, kernel_impl_params const& impl_param) {
auto desc = impl_param.typed_desc<col2im>();
auto input_layout = impl_param.get_input_layout(0);
auto output_type = desc->output_data_types[0].value_or(input_layout.data_type);
auto output_format = input_layout.format;

auto reshaped_input = input_layout;
if (input_layout.get_rank() >= 4) {
bool is_batched = true;
auto num_blocks_l = input_layout.spatial(1);
if (num_blocks_l == 1 && !validate_num_blocks(impl_param, input_layout.spatial(1))) {
is_batched = false;
num_blocks_l = input_layout.feature();
}

const auto batch = is_batched ? input_layout.batch() : 1;
const auto num_elements = is_batched ? input_layout.feature() : input_layout.batch();

if (is_batched)
reshaped_input.set_partial_shape({batch, num_elements, num_blocks_l});
else
reshaped_input.set_partial_shape({num_elements, num_blocks_l});
}

ov::op::v15::Col2Im op;
op.set_strides(desc->stride);
op.set_dilations(desc->dilation);
op.set_pads_begin(ov::Shape(desc->padding_begin.begin(), desc->padding_begin.end()));
op.set_pads_end(ov::Shape(desc->padding_end.begin(), desc->padding_end.end()));

// output_size is 1D tensor of two positive integer numbers (height and width).
std::vector<size_t> output_size = {desc->output_shape[0], desc->output_shape[1]};
// kernel_size is 1D tensor of two positive integer numbers (kernel height and width).
std::vector<size_t> kernel_size = {desc->kernel_shape[0], desc->kernel_shape[1]};

auto output_tensor = ov::Tensor(ov::element::Type_t::u64, ov::Shape{ output_size.size() }, output_size.data());
auto kernel_tensor = ov::Tensor(ov::element::Type_t::u64, ov::Shape{ kernel_size.size() }, kernel_size.data());

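// Inputs 1 and 2 of Col2Im carry output_size and kernel_size; exposing them through
// the tensor accessor lets shape_infer treat them as constants and derive a static shape.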
std::unordered_map<size_t, ov::Tensor> const_data;
const_data.emplace(1, output_tensor);
const_data.emplace(2, kernel_tensor);

std::vector<ShapeType> input_shapes = {
reshaped_input.get<ShapeType>(),
output_tensor.get_shape(),
kernel_tensor.get_shape(),
};

std::vector<ShapeType> output_shapes;
output_shapes = ov::op::v15::shape_infer(&op, input_shapes, ov::make_tensor_accessor(const_data));

return { layout{output_shapes[0], output_type, output_format} };
}

template std::vector<layout> col2im_inst::calc_output_layouts<ov::PartialShape>(col2im_node const& node, const kernel_impl_params& impl_param);

std::string col2im_inst::to_string(col2im_node const& node) {
auto desc = node.get_primitive();
auto node_info = node.desc_to_json();
auto& input = node.input();

auto strd = desc->stride;

std::stringstream primitive_description;

json_composite col2im_info;
col2im_info.add("input id", input.id());
col2im_info.add("stride", cldnn::to_string(strd));
col2im_info.add("dilation", cldnn::to_string(desc->dilation));
col2im_info.add("padding begin", cldnn::to_string(desc->padding_begin));
col2im_info.add("padding end", cldnn::to_string(desc->padding_end));

node_info->add("col2im info", col2im_info);
node_info->dump(primitive_description);

return primitive_description.str();
}

} // namespace cldnn
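As a worked check of the block-count formula in validate_num_blocks above (illustrative values, not part of the diff): for a 4x4 output image, a 2x2 kernel, unit stride and dilation, and no padding, each spatial dimension yields ((4 + 0 + 0 - 1 * (2 - 1) - 1) / 1) + 1 = 3, so the input must supply L = 3 * 3 = 9 column blocks per channel group.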
@@ -371,6 +371,10 @@ void prepare_primitive_fusing::fuse_bias(program &p) {
if (replace_candidate.is_type<convolution>()) {
auto& conv = replace_candidate.as<convolution>();
auto desc = conv.get_primitive();
// XXX: deformable convolution does not support bias fusing at this moment.
// It is just not tested and deformable_mode value is not properly handled below.
if (desc->deformable_mode)
continue;
primitive_id biases = bias_name;

// If the primitive has biases, then we try to combine the values, or do nothing and keep as fused sum.
@@ -742,6 +746,15 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) {
// prelu fusion is not implemented in oneDNN3.1 (CVS-108233)
return;
}

// Fusing prelu into a multi-batch oneDNN convolution caused an accuracy issue, so fusing is blocked for that case.
auto input_layout = input.get_output_layout();
if (input.is_type<convolution>() && (lo.get_preferred_impl_type(input, format::any /*dummy*/) == impl_types::onednn) &&
activation_func == cldnn::activation_func::relu_negative_slope &&
input_layout.is_static() && input_layout.batch() > 1) {
return;
}

// Activation should not be fused if oneDNN does NOT support it
if (lo.is_primitive_implemented_for_onednn(input)) {
#ifdef ENABLE_ONEDNN_FOR_GPU
@@ -115,7 +115,7 @@ struct travel_direction_wrapper<direction_e::backwards> {
};

static format get_target_output_format(layout_optimizer& lo, const std::map<program_node*, format::type>& fmt_map, program_node *node, program_node *next) {
auto user_idx = node->get_user_index(*next);
auto user_idx = next->get_dependency_output_port(*node);

// 1. Check selected preferred_output_format
auto ret = node->get_preferred_output_fmt(user_idx);
@@ -541,16 +541,11 @@ static bool is_weights_dependency(program_node* predecessor, program_node* successor)
// If there is layout mismatch between two layers, add reorder
template <direction_e dir>
void insert_reorders_in_dir(program& p, const std::map<program_node*, format::type>& fmt_map, reorder_factory& rf, layout_optimizer& lo, program_node* node) {
auto fmt = fmt_map.at(node);

auto next_cpy = travel_direction_wrapper<dir>::next_nodes(node);
for (auto next : next_cpy) {
if (!next->is_in_data_flow())
continue;

if (fmt_map.count(next) > 0 && fmt_map.at(next) == fmt)
continue;

if (is_weights_dependency(node, next))
continue;

@@ -567,6 +562,8 @@

in_layout.format = get_target_output_format(lo, fmt_map, predecessor, successor);
out_layout.format = get_target_input_format(lo, fmt_map, successor, predecessor);
if (in_layout.format == out_layout.format)
continue;

GPU_DEBUG_LOG << dir_msg(dir) << " " << node->id() << " --> " << next->id() << " ## "
<< fmt_to_str(in_layout.format) << " --> " << fmt_to_str(out_layout.format) << std::endl;
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp
@@ -10,6 +10,7 @@
#include "intel_gpu/primitives/border.hpp"
#include "intel_gpu/primitives/broadcast.hpp"
#include "intel_gpu/primitives/bucketize.hpp"
#include "intel_gpu/primitives/col2im.hpp"
#include "intel_gpu/primitives/concatenation.hpp"
#include "intel_gpu/primitives/convert_color.hpp"
#include "intel_gpu/primitives/crop.hpp"
40 changes: 40 additions & 0 deletions src/plugins/intel_gpu/src/graph/impls/ocl_v2/col2im.cl
@@ -0,0 +1,40 @@
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "include/batch_headers/fetch_data.cl"

KERNEL(col2im)(const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output
)
{
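// One work-item per (batch, channel) pair: gws[0] maps to channels, gws[2] to batch.
// Each work-item scatter-adds every column entry of its channel into the output image;
// work-items touch disjoint channel slices, so the unsynchronized += below is race-free.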
const uint batch = get_global_id(2);
const uint channel_idx = get_global_id(0);

const int channel_offset = batch * NUM_CHANNELS + channel_idx;

for (int idx = 0; idx < KERNEL_PRODUCT; ++idx) {
const int width_offset = idx % KERNEL_SIZE_1;
const int height_offset = (idx / KERNEL_SIZE_1) % KERNEL_SIZE_0;
const int column = channel_idx * KERNEL_PRODUCT + idx;
const int column_offset = batch * NUM_ELEMENTS_FOR_BLOCK + column;

for (int column_height_idx = 0; column_height_idx < ORIG_HEIGHT; ++column_height_idx) {
const int image_height_idx = column_height_idx * STRIDE_0 - PAD_BEGIN_0 + height_offset * DILATION_0;

if (image_height_idx >= 0 && image_height_idx < OUT_SIZE_0) {
for (int column_width_idx = 0; column_width_idx < ORIG_WIDTH; ++column_width_idx) {
const int image_width_idx = column_width_idx * STRIDE_1 - PAD_BEGIN_1 + width_offset * DILATION_1;

if (image_width_idx >= 0 && image_width_idx < OUT_SIZE_1) {
const int img_idx = (channel_offset * OUT_SIZE_0 + image_height_idx) * OUT_SIZE_1 + image_width_idx;
const int data_idx = (column_offset * ORIG_HEIGHT + column_height_idx) * ORIG_WIDTH + column_width_idx;

// sum the overlapping values
output[img_idx] += input[data_idx];
}
}
}
}
}
}
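For validating the kernel, a host-side reference of the same scatter-add semantics can be handy. Below is a minimal single-batch C++ sketch; the names, the scalar stride/pad/dilation, and the layout assumptions are illustrative and not part of the PR:

    #include <cstddef>
    #include <vector>

    // Reference col2im for a single batch: cols is laid out as
    // [C * KH * KW, LH * LW], img as [C, OH, OW]. Overlapping positions
    // are accumulated, matching the kernel above.
    std::vector<float> col2im_ref(const std::vector<float>& cols,
                                  std::size_t C, std::size_t KH, std::size_t KW,  // channels, kernel
                                  std::size_t OH, std::size_t OW,                 // output image
                                  std::size_t LH, std::size_t LW,                 // blocks per dim
                                  std::size_t stride, std::size_t pad, std::size_t dilation) {
        std::vector<float> img(C * OH * OW, 0.0f);
        for (std::size_t c = 0; c < C; ++c)
            for (std::size_t kh = 0; kh < KH; ++kh)
                for (std::size_t kw = 0; kw < KW; ++kw)
                    for (std::size_t bh = 0; bh < LH; ++bh)
                        for (std::size_t bw = 0; bw < LW; ++bw) {
                            // Same index arithmetic as the kernel:
                            // block origin + dilated kernel offset - padding.
                            const long h = static_cast<long>(bh * stride + kh * dilation) - static_cast<long>(pad);
                            const long w = static_cast<long>(bw * stride + kw * dilation) - static_cast<long>(pad);
                            if (h < 0 || w < 0 || h >= static_cast<long>(OH) || w >= static_cast<long>(OW))
                                continue;
                            const std::size_t col = (c * KH + kh) * KW + kw;
                            img[(c * OH + static_cast<std::size_t>(h)) * OW + static_cast<std::size_t>(w)] +=
                                cols[(col * LH + bh) * LW + bw];
                        }
        return img;
    }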