From 2059cbeb2bd6b0f7807daaa4629bf17181fac4d7 Mon Sep 17 00:00:00 2001 From: "Min, Byungil" Date: Thu, 13 Mar 2025 16:46:12 +0900 Subject: [PATCH 1/8] [GPU] Implement col2im Signed-off-by: Min, Byungil --- .../intel_gpu/plugin/primitives_list.hpp | 2 + .../intel_gpu/primitives/col_to_im.hpp | 84 +++++++++++++++++ src/plugins/intel_gpu/src/graph/col_to_im.cpp | 88 ++++++++++++++++++ .../src/graph/include/col_to_im_inst.h | 45 ++++++++++ .../cl_kernels/col_to_im_gpu_ref.cl | 14 +++ .../src/kernel_selector/common_types.h | 3 +- .../col_to_im/col_to_im_kernel_base.cpp | 60 +++++++++++++ .../kernels/col_to_im/col_to_im_kernel_base.h | 44 +++++++++ .../col_to_im/col_to_im_kernel_ref.cpp | 89 +++++++++++++++++++ .../kernels/col_to_im/col_to_im_kernel_ref.h | 31 +++++++ .../col_to_im/col_to_im_kernel_selector.cpp | 17 ++++ .../col_to_im/col_to_im_kernel_selector.h | 23 +++++ .../intel_gpu/src/plugin/ops/col_to_im.cpp | 42 +++++++++ .../src/plugin/ops/depth_to_space.cpp | 6 +- 14 files changed, 545 insertions(+), 3 deletions(-) create mode 100644 src/plugins/intel_gpu/include/intel_gpu/primitives/col_to_im.hpp create mode 100644 src/plugins/intel_gpu/src/graph/col_to_im.cpp create mode 100644 src/plugins/intel_gpu/src/graph/include/col_to_im_inst.h create mode 100644 src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_gpu_ref.cl create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.cpp create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.h create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_ref.cpp create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_ref.h create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_selector.cpp create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_selector.h create mode 100644 src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp index bbfa4c9d0392dc..c5d05bd6292c4a 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp @@ -269,12 +269,14 @@ REGISTER_FACTORY(v13, BitwiseOr); REGISTER_FACTORY(v13, BitwiseXor); REGISTER_FACTORY(v13, FakeConvert); + // ------------------------------ Supported v15 ops ----------------------------- // REGISTER_FACTORY(v15, ROIAlignRotated); REGISTER_FACTORY(v15, BitwiseRightShift); REGISTER_FACTORY(v15, BitwiseLeftShift); REGISTER_FACTORY(v15, SearchSorted); REGISTER_FACTORY(v15, STFT); +REGISTER_FACTORY(v15, Col2Im); // --------------------------- Supported internal ops --------------------------- // REGISTER_FACTORY(internal, NonMaxSuppressionIEInternal); diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/col_to_im.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/col_to_im.hpp new file mode 100644 index 00000000000000..7e359962d29aa6 --- /dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/col_to_im.hpp @@ -0,0 +1,84 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include "primitive.hpp" + +namespace cldnn { + +/// @brief +/// @details +struct col_to_im : public primitive_base { + CLDNN_DECLARE_PRIMITIVE(col_to_im) + + col_to_im() : 
primitive_base("", {}) {} + + /// @brief Constructs col_to_im primitive. + /// @param id This primitive id. + /// @param input Input dictionary primitive id. + /// @param stride Defines shift in input buffer + /// @param dilation Defines gaps in the input + /// @param padding_begin Defines a padding added to input image on left (x axis) and top (y axis). + /// @param padding_end Defines a padding added to input image on right (x axis) and bottom (y axis). + col_to_im(const primitive_id& id, + const input_info& input, + ov::Strides stride, + ov::Strides dilation, + ov::CoordinateDiff padding_begin, + ov::CoordinateDiff padding_end) + : primitive_base(id, {input}) + , stride(stride) + , dilation(dilation) + , padding_begin(padding_begin) + , padding_end(padding_end) {} + + /// @brief Defines shift in input buffer + ov::Strides stride; + // @brief Defines gaps in the input + ov::Strides dilation; + /// @param padding_begin Defines a padding added to input image on left (x axis) and top (y axis). + ov::CoordinateDiff padding_begin; + /// @param padding_end Defines a padding added to input image on right (x axis) and bottom (y axis). + ov::CoordinateDiff padding_end; + + size_t hash() const override { + size_t seed = primitive::hash(); + seed = hash_range(seed, padding_end.begin(), padding_end.end()); + seed = hash_range(seed, padding_begin.begin(), padding_begin.end()); + seed = hash_range(seed, dilation.begin(), dilation.end()); + seed = hash_range(seed, stride.begin(), stride.end()); + return seed; + } + + bool operator==(const primitive& rhs) const override { + if (!compare_common_params(rhs)) + return false; + + auto rhs_casted = downcast(rhs); + + #define cmp_fields(name) name == rhs_casted.name + return cmp_fields(stride) && + cmp_fields(dilation) && + cmp_fields(padding_begin) && + cmp_fields(padding_end); + #undef cmp_fields + } + + void save(BinaryOutputBuffer& ob) const override { + primitive_base::save(ob); + ob << stride; + ob << dilation; + ob << padding_begin; + ob << padding_end; + } + + void load(BinaryInputBuffer& ib) override { + primitive_base::load(ib); + ib >> stride; + ib >> dilation; + ib >> padding_begin; + ib >> padding_end; + } +}; +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/col_to_im.cpp b/src/plugins/intel_gpu/src/graph/col_to_im.cpp new file mode 100644 index 00000000000000..1e1c1576c3fd11 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/col_to_im.cpp @@ -0,0 +1,88 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "col_to_im_inst.h" +#include "col2im_shape_inference.hpp" + +#include "primitive_type_base.h" +#include "intel_gpu/runtime/error_handler.hpp" +#include "json_object.h" +#include + +namespace cldnn { +GPU_DEFINE_PRIMITIVE_TYPE_ID(col_to_im) + +layout col_to_im_inst::calc_output_layout(col_to_im_node const& node, kernel_impl_params const& impl_param) { + auto desc = impl_param.typed_desc(); + + auto input_layout = impl_param.get_input_layout(); + auto input_format = input_layout.format; + + // TODO : do sth here for col2im.(Copied dummy from depth_to_space) + auto out_size = input_layout.get_tensor(); + if (format::spatial_num(input_layout.format) == 3) { + // const size_t feature = input_layout.feature() / block_size / block_size / block_size; + // const size_t z = input_layout.spatial(2) * block_size; + // const size_t y = input_layout.spatial(1) * block_size; + // const size_t x = input_layout.spatial(0) * block_size; + // out_size = tensor(TensorValue(input_layout.batch()), 
TensorValue(feature), TensorValue(x), TensorValue(y), TensorValue(z)); + } else { + // const size_t feature = input_layout.feature() / block_size / block_size; + // const size_t y = input_layout.spatial(1) * block_size; + // const size_t x = input_layout.spatial(0) * block_size; + // out_size = tensor(TensorValue(input_layout.batch()), TensorValue(feature), TensorValue(x), TensorValue(y)); + } + + if (impl_param.has_fused_primitives()) { + input_layout.data_type = impl_param.get_output_element_type(); + } + + return layout{input_layout.data_type, input_format, out_size}; +} + +template +std::vector col_to_im_inst::calc_output_layouts(col_to_im_node const& node, kernel_impl_params const& impl_param) { + auto desc = impl_param.typed_desc(); + auto input_layout = impl_param.get_input_layout(0); + auto output_type = desc->output_data_types[0].value_or(input_layout.data_type); + auto output_format = input_layout.format; + + ov::op::v15::Col2Im op; + + std::vector input_shapes = { + input_layout.get() + }; + std::vector output_shapes = ov::op::v15::shape_infer(&op, input_shapes); + + return { layout{output_shapes[0], output_type, output_format} }; +} + +template std::vector col_to_im_inst::calc_output_layouts(col_to_im_node const& node, const kernel_impl_params& impl_param); + +std::string col_to_im_inst::to_string(col_to_im_node const& node) { + auto desc = node.get_primitive(); + auto node_info = node.desc_to_json(); + auto& input = node.input(); + + auto strd = desc->stride; + + std::stringstream primitive_description; + + json_composite col_to_im_info; + col_to_im_info.add("input id", input.id()); + col_to_im_info.add("stride", cldnn::to_string(strd)); + col_to_im_info.add("dilation", cldnn::to_string(desc->dilation)); + col_to_im_info.add("padding begin", cldnn::to_string(desc->padding_begin)); + col_to_im_info.add("padding end", cldnn::to_string(desc->padding_end)); + + node_info->add("col_to_im info", col_to_im_info); + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +col_to_im_inst::typed_primitive_inst(network& network, col_to_im_node const& node) + : parent(network, node) {} + +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/include/col_to_im_inst.h b/src/plugins/intel_gpu/src/graph/include/col_to_im_inst.h new file mode 100644 index 00000000000000..c843e92605ee6d --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/include/col_to_im_inst.h @@ -0,0 +1,45 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include "intel_gpu/primitives/col_to_im.hpp" +#include "primitive_inst.h" + +#include +#include + +namespace cldnn { +template <> +struct typed_program_node : public typed_program_node_base { + using parent = typed_program_node_base; + +public: + using parent::parent; + + program_node& input(size_t index = 0) const { return get_dependency(index); } + std::shared_ptr get_fuse_params() const override { + return std::make_shared(col_to_im::type_id()); + } + std::vector get_shape_infer_dependencies() const override { return {}; } +}; + +using col_to_im_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base { + using parent = typed_primitive_inst_base; + using parent::parent; + +public: + template + static std::vector calc_output_layouts(col_to_im_node const& node, kernel_impl_params const& impl_param); + static layout calc_output_layout(col_to_im_node const& node, kernel_impl_params const& impl_param); + + static std::string 
to_string(col_to_im_node const& node); + + typed_primitive_inst(network& network, col_to_im_node const& desc); +}; + +using col_to_im_inst = typed_primitive_inst; +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_gpu_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_gpu_ref.cl new file mode 100644 index 00000000000000..42cdf2c559fb6d --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_gpu_ref.cl @@ -0,0 +1,14 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/batch_headers/fetch_data.cl" + +KERNEL(depth_to_space_ref)(const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output +#if HAS_FUSED_OPS_DECLS + , FUSED_OPS_DECLS +#endif +) +{ +} \ No newline at end of file diff --git a/src/plugins/intel_gpu/src/kernel_selector/common_types.h b/src/plugins/intel_gpu/src/kernel_selector/common_types.h index 2797212463f087..caca5b3e961b7b 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/common_types.h +++ b/src/plugins/intel_gpu/src/kernel_selector/common_types.h @@ -104,7 +104,8 @@ enum class KernelType { ROPE, DYNAMIC_QUANTIZE, SEARCH_SORTED, - STFT + STFT, + COL_TO_IM }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.cpp new file mode 100644 index 00000000000000..efef62a1a73ebe --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.cpp @@ -0,0 +1,60 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "col_to_im_kernel_base.h" +#include "kernel_selector_utils.h" +#include +#include + +namespace kernel_selector { + +bool ColToImKernelBase::Validate(const Params& p) const { + if (p.GetType() != KernelType::COL_TO_IM) { + return false; + } + + const col_to_im_params& params = static_cast(p); + for (auto& fused_op : params.fused_ops) { + if (!IsFusedPrimitiveSupported(fused_op)) + return false; + } + + if (params.inputs[0].Dimentions() > 5) + return false; + + return true; +} + +JitConstants ColToImKernelBase::GetJitConstants(const col_to_im_params& params) const { + JitConstants jit = MakeBaseParamsJitConstants(params); + + jit.AddConstant(MakeJitConstant("STRIDE", params.stride)); + jit.AddConstant(MakeJitConstant("DILATION", params.dilation)); + jit.AddConstant(MakeJitConstant("PAD_BEGIN", params.padding_begin)); + jit.AddConstant(MakeJitConstant("PAD_END", params.padding_end)); + + return jit; +} + +KernelsData ColToImKernelBase::GetCommonKernelsData(const Params& params) const { + KernelData kd = KernelData::Default(params); + col_to_im_params& newParams = *static_cast(kd.params.get()); + + if (!Validate(params)) { + return {}; + } + + auto dispatchData = SetDefault(newParams); + auto entry_point = GetEntryPoint(kernelName, newParams.layerID, params); + auto cldnn_jit = GetJitConstants(newParams); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); + + auto& kernel = kd.kernels[0]; + + FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point, + EXE_MODE_DEFAULT, false, false, 1, GetFusedPrimitiveInputsCount(params)); + + return { kd }; +} +} // namespace kernel_selector diff --git 
a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.h new file mode 100644 index 00000000000000..e128a35b13a401 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.h @@ -0,0 +1,44 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_base_opencl.h" +#include "kernel_selector_params.h" + +namespace kernel_selector { +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// col_to_im_params +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +struct col_to_im_params : public base_params { + col_to_im_params() + : base_params(KernelType::COL_TO_IM) {} + uSize stride; + uSize dilation; + uSize padding_begin; + uSize padding_end; +}; + +struct col_to_im_fuse_params : fuse_params { + col_to_im_fuse_params() : fuse_params(KernelType::COL_TO_IM) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// ColToImKernelBase +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +class ColToImKernelBase : public KernelBaseOpenCL { +public: + using KernelBaseOpenCL::KernelBaseOpenCL; + virtual ~ColToImKernelBase() {} + + struct DispatchData : public CommonDispatchData { + }; + +protected: + bool Validate(const Params&) const override; + virtual JitConstants GetJitConstants(const col_to_im_params& params) const; + virtual CommonDispatchData SetDefault(const col_to_im_params& params) const = 0; + KernelsData GetCommonKernelsData(const Params& params) const; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_ref.cpp new file mode 100644 index 00000000000000..45e457576ba9e9 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_ref.cpp @@ -0,0 +1,89 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "col_to_im_kernel_ref.h" +#include "kernel_selector_utils.h" +#include +#include + +namespace kernel_selector { + +ParamsKey ColToImKernelRef::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + k.EnableInputDataType(Datatype::UINT8); + k.EnableInputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::UINT8); + k.EnableOutputDataType(Datatype::INT8); + k.EnableDifferentTypes(); + k.EnableAllInputLayout(); + k.EnableAllOutputLayout(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + return k; +} + +CommonDispatchData ColToImKernelRef::SetDefault(const col_to_im_params& params) const { + CommonDispatchData dispatchData; + + // TODO : implement for col_to_im_gpu_ref + // auto in_layout = params.inputs[0].GetLayout(); + // auto out_layout = params.outputs[0].GetLayout(); + { + // std::vector> dims_by_gws = {{ Tensor::DataChannelName::BATCH }, + // { Tensor::DataChannelName::FEATURE }, + // { Tensor::DataChannelName::X, Tensor::DataChannelName::Y, 
Tensor::DataChannelName::Z }}; + + // dispatchData.gws = { params.outputs[0].Batch().v, + // params.outputs[0].Feature().v, + // params.outputs[0].Z().v * params.outputs[0].Y().v * params.outputs[0].X().v }; + + // // The reason why reverse input/output of GetOptimalLocalWorkGroupSizes(): + // // Large X*Y*Z lws size is better than large batch lws, but current GetOptimalLocalWorkGroupSizes not work like that. + // reverse(dims_by_gws.begin(), dims_by_gws.end()); + // reverse(dispatchData.gws.begin(), dispatchData.gws.end()); + // dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws); + // reverse(dispatchData.lws.begin(), dispatchData.lws.end()); + // reverse(dispatchData.gws.begin(), dispatchData.gws.end()); + + dispatchData.gws = {1, 1, 1}; + dispatchData.lws = {1, 1, 1}; + } + + return dispatchData; +} + +KernelsData ColToImKernelRef::GetKernelsData(const Params& params) const { + return GetCommonKernelsData(params); +} + +KernelsPriority ColToImKernelRef::GetKernelsPriority(const Params& /*params*/) const { + return FORCE_PRIORITY_9; +} + +JitConstants ColToImKernelRef::GetJitConstants(const col_to_im_params& params) const { + auto jit = Parent::GetJitConstants(params); + auto input = params.inputs[0]; + auto input_dt = input.GetDType(); + + // TODO : implement for col_to_im_gpu_ref + if (!params.fused_ops.empty()) { + std::vector idx_order; + if (input.Dimentions() == 5) { + idx_order = {"batch", "feature", "z", "y", "x"}; + } else if (input.Dimentions() == 4) { + idx_order = {"batch", "feature", "y", "x"}; + } + FusedOpsConfiguration conf = {"", idx_order, "in_val", input_dt, 1}; + jit.Merge(MakeFusedOpsJitConstants(params, {conf})); + } + + return jit; +} + +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_ref.h new file mode 100644 index 00000000000000..3d507f1be090d9 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_ref.h @@ -0,0 +1,31 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "col_to_im_kernel_base.h" + +namespace kernel_selector { +class ColToImKernelRef : public ColToImKernelBase { +public: + using Parent = ColToImKernelBase; + + ColToImKernelRef() : ColToImKernelBase("col_to_im_ref") {} + virtual ~ColToImKernelRef() {} + + CommonDispatchData SetDefault(const col_to_im_params& params) const override; + KernelsData GetKernelsData(const Params& params) const override; + KernelsPriority GetKernelsPriority(const Params& params) const override; + ParamsKey GetSupportedKey() const override; + +protected: + JitConstants GetJitConstants(const col_to_im_params& params) const override; + std::vector GetSupportedFusedOps() const override { + return { FusedOpType::ELTWISE, + FusedOpType::QUANTIZE, + FusedOpType::REORDER, + FusedOpType::ACTIVATION }; + } +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_selector.cpp new file mode 100644 index 00000000000000..3df5fdb8ff8f16 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_selector.cpp @@ -0,0 +1,17 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 
+// + +#include "col_to_im_kernel_selector.h" +#include "col_to_im_kernel_ref.h" + +namespace kernel_selector { + +col_to_im_kernel_selector::col_to_im_kernel_selector() { + Attach(); +} + +KernelsData col_to_im_kernel_selector::GetBestKernels(const Params& params) const { + return GetNaiveBestKernel(params, KernelType::COL_TO_IM); +} +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_selector.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_selector.h new file mode 100644 index 00000000000000..00e936f9c9195e --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_selector.h @@ -0,0 +1,23 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_selector.h" + +namespace kernel_selector { +class col_to_im_kernel_selector : public kernel_selector_base { +public: + static col_to_im_kernel_selector& Instance() { + static col_to_im_kernel_selector instance_; + return instance_; + } + + col_to_im_kernel_selector(); + + virtual ~col_to_im_kernel_selector() {} + + KernelsData GetBestKernels(const Params& params) const override; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp b/src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp new file mode 100644 index 00000000000000..8bae25b94c6327 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp @@ -0,0 +1,42 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "intel_gpu/plugin/program_builder.hpp" +#include "intel_gpu/plugin/common_utils.hpp" + +#include "openvino/op/col2im.hpp" + +#include "intel_gpu/primitives/col_to_im.hpp" + +namespace ov { +namespace intel_gpu { + +static void CreateCol2ImOp(ProgramBuilder& p, const std::shared_ptr& op) { + validate_inputs_count(op, {1}); + auto inputPrimitives = p.GetInputInfo(op); + std::string layerName = layer_type_name_ID(op); + + // The total number of blocks calculated(L) = product from d=1 to 2 of + // floor((output_size[d] + pads_begin[d] + pads_end[d] - dilation[d] * (kernel_size[d] - 1) - 1) / stride[d] + 1) + // d : all spatial dimension + auto strides = op->get_strides(); + auto dilations = op->get_dilations(); + auto pads_begin = op->get_pads_begin(); + auto pads_end = op->get_pads_end(); + + // Create col2im prim + auto CallToImPrim = cldnn::col_to_im(layerName, + inputPrimitives[0], + strides, + dilations, + pads_begin, + pads_end); + + p.add_primitive(*op, CallToImPrim); +} + +REGISTER_FACTORY_IMPL(v15, Col2Im); + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/ops/depth_to_space.cpp b/src/plugins/intel_gpu/src/plugin/ops/depth_to_space.cpp index eb8977c11ed7b1..f5f0853137260b 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/depth_to_space.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/depth_to_space.cpp @@ -9,7 +9,8 @@ #include "intel_gpu/primitives/depth_to_space.hpp" -namespace ov::intel_gpu { +namespace ov { +namespace intel_gpu { static cldnn::depth_to_space_mode GetDepthMode(ov::op::v0::DepthToSpace::DepthToSpaceMode mode) { switch (mode) { @@ -40,4 +41,5 @@ static void CreateDepthToSpaceOp(ProgramBuilder& p, const std::shared_ptr Date: Thu, 13 Mar 2025 05:25:14 +0900 Subject: [PATCH 2/8] fix for build --- .../include/col2im_shape_inference.hpp | 2 +- .../intel_gpu/src/graph/registry/registry.hpp | 1 + 
src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp | 13 ++++++++++--- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/core/shape_inference/include/col2im_shape_inference.hpp b/src/core/shape_inference/include/col2im_shape_inference.hpp index 835503010e24f2..7dfd57772439c7 100644 --- a/src/core/shape_inference/include/col2im_shape_inference.hpp +++ b/src/core/shape_inference/include/col2im_shape_inference.hpp @@ -16,7 +16,7 @@ template > std::vector shape_infer(const Col2Im* op, const std::vector& input_shapes, const ITensorAccessor& tensor_accessor = make_tensor_accessor()) { - NODE_VALIDATION_CHECK(op, input_shapes.size() == 3); + NODE_VALIDATION_CHECK(op, input_shapes.size() == 3 || input_shapes.size() == 1); // XXX: please check whether "3 or 1" is a correct condition const auto& data_shape = input_shapes[0]; const auto& output_size_shape = input_shapes[1]; const auto& kernel_shape = input_shapes[2]; diff --git a/src/plugins/intel_gpu/src/graph/registry/registry.hpp b/src/plugins/intel_gpu/src/graph/registry/registry.hpp index 1eec48b9eb9971..11cc58cebddf89 100644 --- a/src/plugins/intel_gpu/src/graph/registry/registry.hpp +++ b/src/plugins/intel_gpu/src/graph/registry/registry.hpp @@ -163,6 +163,7 @@ REGISTER_DEFAULT_IMPLS(adaptive_pooling, OCL_S); REGISTER_DEFAULT_IMPLS(batch_to_space, OCL_S); REGISTER_DEFAULT_IMPLS(border, OCL_S, OCL_D); REGISTER_DEFAULT_IMPLS(bucketize, OCL_S); +REGISTER_DEFAULT_IMPLS(col_to_im, OCL_S); REGISTER_DEFAULT_IMPLS(custom_gpu_primitive, OCL_S); REGISTER_DEFAULT_IMPLS(data, COMMON_S, COMMON_D); REGISTER_DEFAULT_IMPLS(depth_to_space, OCL_S); diff --git a/src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp b/src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp index 8bae25b94c6327..c3705cf68c270f 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp @@ -13,7 +13,7 @@ namespace ov { namespace intel_gpu { static void CreateCol2ImOp(ProgramBuilder& p, const std::shared_ptr& op) { - validate_inputs_count(op, {1}); + validate_inputs_count(op, {3}); // XXX Please check whether 3 is correct number auto inputPrimitives = p.GetInputInfo(op); std::string layerName = layer_type_name_ID(op); @@ -24,14 +24,21 @@ static void CreateCol2ImOp(ProgramBuilder& p, const std::shared_ptrget_dilations(); auto pads_begin = op->get_pads_begin(); auto pads_end = op->get_pads_end(); + ov::CoordinateDiff padding_begin; + ov::CoordinateDiff padding_end; + + for (auto p: op->get_pads_begin()) + padding_begin.push_back(p); + for (auto p: op->get_pads_end()) + padding_end.push_back(p); // Create col2im prim auto CallToImPrim = cldnn::col_to_im(layerName, inputPrimitives[0], strides, dilations, - pads_begin, - pads_end); + padding_begin, + padding_end); p.add_primitive(*op, CallToImPrim); } From d84f34c1c806462e6230ea998fcfc41591a6028b Mon Sep 17 00:00:00 2001 From: "Kim, Mingyu" Date: Thu, 13 Mar 2025 06:01:39 +0900 Subject: [PATCH 3/8] keep three inputs for shape inference --- src/core/shape_inference/include/col2im_shape_inference.hpp | 2 +- .../intel_gpu/include/intel_gpu/primitives/col_to_im.hpp | 6 +++++- src/plugins/intel_gpu/src/graph/col_to_im.cpp | 4 +++- src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp | 2 ++ 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/core/shape_inference/include/col2im_shape_inference.hpp b/src/core/shape_inference/include/col2im_shape_inference.hpp index 7dfd57772439c7..835503010e24f2 100644 --- 
a/src/core/shape_inference/include/col2im_shape_inference.hpp +++ b/src/core/shape_inference/include/col2im_shape_inference.hpp @@ -16,7 +16,7 @@ template > std::vector shape_infer(const Col2Im* op, const std::vector& input_shapes, const ITensorAccessor& tensor_accessor = make_tensor_accessor()) { - NODE_VALIDATION_CHECK(op, input_shapes.size() == 3 || input_shapes.size() == 1); // XXX: please check whether "3 or 1" is a correct condition + NODE_VALIDATION_CHECK(op, input_shapes.size() == 3); const auto& data_shape = input_shapes[0]; const auto& output_size_shape = input_shapes[1]; const auto& kernel_shape = input_shapes[2]; diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/col_to_im.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/col_to_im.hpp index 7e359962d29aa6..848347dfc2d2da 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/col_to_im.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/col_to_im.hpp @@ -17,17 +17,21 @@ struct col_to_im : public primitive_base { /// @brief Constructs col_to_im primitive. /// @param id This primitive id. /// @param input Input dictionary primitive id. + /// @param output_size Input + /// @param kernel_size Input /// @param stride Defines shift in input buffer /// @param dilation Defines gaps in the input /// @param padding_begin Defines a padding added to input image on left (x axis) and top (y axis). /// @param padding_end Defines a padding added to input image on right (x axis) and bottom (y axis). col_to_im(const primitive_id& id, const input_info& input, + const input_info& output_size, + const input_info& kernel_size, ov::Strides stride, ov::Strides dilation, ov::CoordinateDiff padding_begin, ov::CoordinateDiff padding_end) - : primitive_base(id, {input}) + : primitive_base(id, {input, output_size, kernel_size}) , stride(stride) , dilation(dilation) , padding_begin(padding_begin) diff --git a/src/plugins/intel_gpu/src/graph/col_to_im.cpp b/src/plugins/intel_gpu/src/graph/col_to_im.cpp index 1e1c1576c3fd11..c5f96513a73c81 100644 --- a/src/plugins/intel_gpu/src/graph/col_to_im.cpp +++ b/src/plugins/intel_gpu/src/graph/col_to_im.cpp @@ -51,7 +51,9 @@ std::vector col_to_im_inst::calc_output_layouts(col_to_im_node const& no ov::op::v15::Col2Im op; std::vector input_shapes = { - input_layout.get() + input_layout.get(), + impl_param.get_input_layout(1).get(), + impl_param.get_input_layout(2).get(), }; std::vector output_shapes = ov::op::v15::shape_infer(&op, input_shapes); diff --git a/src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp b/src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp index c3705cf68c270f..6ecdd691a5cd5e 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp @@ -35,6 +35,8 @@ static void CreateCol2ImOp(ProgramBuilder& p, const std::shared_ptr Date: Thu, 13 Mar 2025 07:14:46 +0900 Subject: [PATCH 4/8] col2im shape_infer succeeds. 
Not sure whether it is correct or not --- .../include/intel_gpu/primitives/col_to_im.hpp | 10 ++++++++-- src/plugins/intel_gpu/src/graph/col_to_im.cpp | 11 +++++++++++ src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp | 13 ++++++++++++- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/col_to_im.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/col_to_im.hpp index 848347dfc2d2da..8052a3ddedcc28 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/col_to_im.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/col_to_im.hpp @@ -30,12 +30,16 @@ struct col_to_im : public primitive_base { ov::Strides stride, ov::Strides dilation, ov::CoordinateDiff padding_begin, - ov::CoordinateDiff padding_end) + ov::CoordinateDiff padding_end, + ov::Shape output_shape, + ov::Shape kernel_shape) : primitive_base(id, {input, output_size, kernel_size}) , stride(stride) , dilation(dilation) , padding_begin(padding_begin) - , padding_end(padding_end) {} + , padding_end(padding_end) + , output_shape(output_shape) + , kernel_shape(kernel_shape) {} /// @brief Defines shift in input buffer ov::Strides stride; @@ -45,6 +49,8 @@ struct col_to_im : public primitive_base { ov::CoordinateDiff padding_begin; /// @param padding_end Defines a padding added to input image on right (x axis) and bottom (y axis). ov::CoordinateDiff padding_end; + ov::Shape output_shape; + ov::Shape kernel_shape; size_t hash() const override { size_t seed = primitive::hash(); diff --git a/src/plugins/intel_gpu/src/graph/col_to_im.cpp b/src/plugins/intel_gpu/src/graph/col_to_im.cpp index c5f96513a73c81..4d96813bd7561d 100644 --- a/src/plugins/intel_gpu/src/graph/col_to_im.cpp +++ b/src/plugins/intel_gpu/src/graph/col_to_im.cpp @@ -57,6 +57,17 @@ std::vector col_to_im_inst::calc_output_layouts(col_to_im_node const& no }; std::vector output_shapes = ov::op::v15::shape_infer(&op, input_shapes); + // XXX: quick and dirty implementation of output shape inference. 
It should have been fed into shape_infer function + output_shapes[0][-1] = node.get_primitive()->output_shape[1]; + output_shapes[0][-2] = node.get_primitive()->output_shape[0]; + size_t prod = 1; + for (auto t: node.get_primitive()->kernel_shape) { + prod *= t; + } + auto C = input_shapes[0][-2] / prod; + output_shapes[0][-3] = C; + + // std::cout << __FILE__ << ":" << __LINE__ << " " << node.id() << " " << output_shapes[0] << " " << input_shapes[1] << " x "<< std::endl; return { layout{output_shapes[0], output_type, output_format} }; } diff --git a/src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp b/src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp index 6ecdd691a5cd5e..756ce277498be5 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp @@ -8,6 +8,7 @@ #include "openvino/op/col2im.hpp" #include "intel_gpu/primitives/col_to_im.hpp" +#include "openvino/op/constant.hpp" namespace ov { namespace intel_gpu { @@ -32,6 +33,14 @@ static void CreateCol2ImOp(ProgramBuilder& p, const std::shared_ptrget_pads_end()) padding_end.push_back(p); + auto output_shape_const = ov::as_type_ptr(op->get_input_node_shared_ptr(1)); + auto vec_output_shape = output_shape_const->cast_vector(); + ov::Shape output_shape(vec_output_shape); + + auto kernel_size_const = ov::as_type_ptr(op->get_input_node_shared_ptr(2)); + auto kernel_size = kernel_size_const->cast_vector(); + ov::Shape kernel_shape(kernel_size); + // Create col2im prim auto CallToImPrim = cldnn::col_to_im(layerName, inputPrimitives[0], @@ -40,7 +49,9 @@ static void CreateCol2ImOp(ProgramBuilder& p, const std::shared_ptr Date: Thu, 13 Mar 2025 12:04:39 +0900 Subject: [PATCH 5/8] now ocl build fails --- .../prepare_primitive_fusing.cpp | 3 ++ .../src/graph/impls/ocl/col_to_im.cpp | 50 +++++++++++++++++++ .../src/graph/impls/ocl/register.cpp | 1 + .../src/graph/impls/ocl/register.hpp | 2 + ...{col_to_im_gpu_ref.cl => col_to_im_ref.cl} | 2 +- ...deformable_convolution_kernel_bfyx_ref.cpp | 10 ++-- 6 files changed, 63 insertions(+), 5 deletions(-) create mode 100644 src/plugins/intel_gpu/src/graph/impls/ocl/col_to_im.cpp rename src/plugins/intel_gpu/src/kernel_selector/cl_kernels/{col_to_im_gpu_ref.cl => col_to_im_ref.cl} (81%) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index 622b7cff4101ad..a8f7715c5a9574 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -371,6 +371,9 @@ void prepare_primitive_fusing::fuse_bias(program &p) { if (replace_candidate.is_type()) { auto& conv = replace_candidate.as(); auto desc = conv.get_primitive(); + // XXX: deformable convolution does not support bias fusing at this moment. It is just not tested and deformable_mode value is not properly handled below. + if (desc->deformable_mode) + continue; primitive_id biases = bias_name; // If the primitive has biases, then we try to combine the values, or do nothing and keep as fused sum. 
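For context, a minimal standalone sketch (hypothetical names; not part of the patch itself) of the Col2Im-15 shape relation that the quick-and-dirty calc_output_layouts() workaround above computes by hand: the column input (N, C * kh * kw, L) maps to an image output (N, C, H_out, W_out), where output_size comes from input 1, kernel_size from input 2, and L is constrained by the product formula quoted in CreateCol2ImOp.

// Illustrative sketch only; the arithmetic mirrors the formula in the comment of
// CreateCol2ImOp and the dimension assignments in calc_output_layouts() above.
#include <array>
#include <cstddef>

struct Col2ImShape { size_t n, c, h, w; };

// columns == C * kh * kw (the channel dimension of the column input)
inline Col2ImShape infer_col2im_output(size_t n, size_t columns,
                                       std::array<size_t, 2> output_size,
                                       std::array<size_t, 2> kernel_size) {
    const size_t kernel_product = kernel_size[0] * kernel_size[1];
    return {n, columns / kernel_product, output_size[0], output_size[1]};
}

// Expected number of blocks L (the last dimension of the column input):
// product over spatial dims d of
//   floor((output_size[d] + pads_begin[d] + pads_end[d]
//          - dilation[d] * (kernel_size[d] - 1) - 1) / stride[d]) + 1
inline size_t expected_num_blocks(std::array<size_t, 2> output_size,
                                  std::array<size_t, 2> kernel_size,
                                  std::array<size_t, 2> stride,
                                  std::array<size_t, 2> dilation,
                                  std::array<size_t, 2> pads_begin,
                                  std::array<size_t, 2> pads_end) {
    size_t blocks = 1;
    for (size_t d = 0; d < 2; ++d) {
        blocks *= (output_size[d] + pads_begin[d] + pads_end[d]
                   - dilation[d] * (kernel_size[d] - 1) - 1) / stride[d] + 1;
    }
    return blocks;
}

For example, with output_size = {4, 4}, kernel_size = {2, 2}, unit stride and dilation, and zero padding, expected_num_blocks() yields 3 * 3 = 9, i.e. a column input of shape (N, C * 4, 9) producing an (N, C, 4, 4) image.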
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/col_to_im.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/col_to_im.cpp new file mode 100644 index 00000000000000..040fb627548a5a --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/col_to_im.cpp @@ -0,0 +1,50 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "primitive_base.hpp" + +#include "col_to_im_inst.h" +#include "col_to_im/col_to_im_kernel_selector.h" +#include "col_to_im/col_to_im_kernel_ref.h" + +namespace cldnn { +namespace ocl { +struct col_to_im_impl : typed_primitive_impl_ocl { + using parent = typed_primitive_impl_ocl; + using parent::parent; + using kernel_selector_t = kernel_selector::col_to_im_kernel_selector; + using kernel_params_t = kernel_selector::col_to_im_params; + + DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::col_to_im_impl) + + std::unique_ptr clone() const override { + return make_deep_copy(*this); + } + + static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) { + const auto& primitive = impl_param.typed_desc(); + auto params = get_default_params(impl_param); + + return params; + } +}; + +namespace detail { + +attach_col_to_im_impl::attach_col_to_im_impl() { + std::vector dt = { + data_types::f16, + }; + std::vector fmt = { + format::bfyx, + }; + implementation_map::add(impl_types::ocl, typed_primitive_impl_ocl::create, dt, fmt); +} + +} // namespace detail +} // namespace ocl +} // namespace cldnn + +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::ocl::col_to_im_impl) +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::col_to_im) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp index cfb01239f7709e..3a2115ae92f296 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp @@ -15,6 +15,7 @@ void register_implementations() { REGISTER_OCL(border); REGISTER_OCL(broadcast); REGISTER_OCL(bucketize); + REGISTER_OCL(col_to_im); REGISTER_OCL(concatenation); REGISTER_OCL(crop); REGISTER_OCL(custom_gpu_primitive); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp index 4f975539a003fc..612b75e76a1dd9 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp @@ -10,6 +10,7 @@ #include "intel_gpu/primitives/border.hpp" #include "intel_gpu/primitives/broadcast.hpp" #include "intel_gpu/primitives/bucketize.hpp" +#include "intel_gpu/primitives/col_to_im.hpp" #include "intel_gpu/primitives/concatenation.hpp" #include "intel_gpu/primitives/convert_color.hpp" #include "intel_gpu/primitives/crop.hpp" @@ -88,6 +89,7 @@ REGISTER_OCL(batch_to_space); REGISTER_OCL(border); REGISTER_OCL(broadcast); REGISTER_OCL(bucketize); +REGISTER_OCL(col_to_im); REGISTER_OCL(concatenation); REGISTER_OCL(crop); REGISTER_OCL(custom_gpu_primitive); diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_gpu_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_ref.cl similarity index 81% rename from src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_gpu_ref.cl rename to src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_ref.cl index 42cdf2c559fb6d..eea34bc20d6e84 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_gpu_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_ref.cl @@ -4,7 +4,7 @@ 
#include "include/batch_headers/fetch_data.cl" -KERNEL(depth_to_space_ref)(const __global INPUT0_TYPE* input, +KERNEL(col_to_im_ref)(const __global INPUT0_TYPE* input, __global OUTPUT_TYPE* output #if HAS_FUSED_OPS_DECLS , FUSED_OPS_DECLS diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/deformable_convolution_kernel_bfyx_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/deformable_convolution_kernel_bfyx_ref.cpp index cf5581afda8e19..52c226f95562b2 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/deformable_convolution_kernel_bfyx_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/deformable_convolution_kernel_bfyx_ref.cpp @@ -17,10 +17,12 @@ ParamsKey DeformableConvolutionKernel_bfyx_Ref::GetSupportedKey() const { k.EnableInputWeightsType(WeightsType::F16); k.EnableInputWeightsType(WeightsType::F32); k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::bfyx); - k.EnableInputLayout(DataLayout::byxf); - k.EnableOutputLayout(DataLayout::byxf); + // k.EnableInputLayout(DataLayout::bfyx); + // k.EnableOutputLayout(DataLayout::bfyx); + // k.EnableInputLayout(DataLayout::byxf); + // k.EnableOutputLayout(DataLayout::byxf); + k.EnableAllInputLayout(); // XXX: this is hack to run e2e network. Not sure whether it is a correct fix or not. + k.EnableAllOutputLayout(); k.EnableTensorOffset(); k.EnableTensorPitches(); k.EnableDilation(); From b958e436f6dbd5353b75601d8929c404d7d6f95e Mon Sep 17 00:00:00 2001 From: "Min, Byungil" Date: Thu, 20 Mar 2025 02:10:33 +0900 Subject: [PATCH 6/8] [GPU] Implement col2im reference kernel Signed-off-by: Min, Byungil --- .../intel_gpu/primitives/col_to_im.hpp | 12 ++++- .../src/graph/impls/ocl/col_to_im.cpp | 29 +++++++++++- .../cl_kernels/col_to_im_ref.cl | 45 +++++++++++++++++++ .../col_to_im/col_to_im_kernel_base.cpp | 21 +++++++-- .../kernels/col_to_im/col_to_im_kernel_base.h | 4 ++ .../col_to_im/col_to_im_kernel_ref.cpp | 25 +---------- .../col_to_im/col_to_im_kernel_selector.cpp | 2 + 7 files changed, 108 insertions(+), 30 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/col_to_im.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/col_to_im.hpp index 8052a3ddedcc28..922457ea0ba1d3 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/col_to_im.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/col_to_im.hpp @@ -23,6 +23,8 @@ struct col_to_im : public primitive_base { /// @param dilation Defines gaps in the input /// @param padding_begin Defines a padding added to input image on left (x axis) and top (y axis). /// @param padding_end Defines a padding added to input image on right (x axis) and bottom (y axis). 
+ /// @param output_shape Defines the output tensor the output image + /// @param kernel_shape Defines size of the sliding blocks col_to_im(const primitive_id& id, const input_info& input, const input_info& output_size, @@ -58,6 +60,8 @@ struct col_to_im : public primitive_base { seed = hash_range(seed, padding_begin.begin(), padding_begin.end()); seed = hash_range(seed, dilation.begin(), dilation.end()); seed = hash_range(seed, stride.begin(), stride.end()); + seed = hash_range(seed, output_shape.begin(), output_shape.end()); + seed = hash_range(seed, kernel_shape.begin(), kernel_shape.end()); return seed; } @@ -71,7 +75,9 @@ struct col_to_im : public primitive_base { return cmp_fields(stride) && cmp_fields(dilation) && cmp_fields(padding_begin) && - cmp_fields(padding_end); + cmp_fields(padding_end) && + cmp_fields(output_shape) && + cmp_fields(kernel_shape); #undef cmp_fields } @@ -81,6 +87,8 @@ struct col_to_im : public primitive_base { ob << dilation; ob << padding_begin; ob << padding_end; + ob << output_shape; + ob << kernel_shape; } void load(BinaryInputBuffer& ib) override { @@ -89,6 +97,8 @@ struct col_to_im : public primitive_base { ib >> dilation; ib >> padding_begin; ib >> padding_end; + ib >> output_shape; + ib >> kernel_shape; } }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/col_to_im.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/col_to_im.cpp index 040fb627548a5a..ed4f862902d04c 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/col_to_im.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/col_to_im.cpp @@ -8,6 +8,8 @@ #include "col_to_im/col_to_im_kernel_selector.h" #include "col_to_im/col_to_im_kernel_ref.h" +#include "intel_gpu/plugin/common_utils.hpp" + namespace cldnn { namespace ocl { struct col_to_im_impl : typed_primitive_impl_ocl { @@ -24,9 +26,32 @@ struct col_to_im_impl : typed_primitive_impl_ocl { static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) { const auto& primitive = impl_param.typed_desc(); - auto params = get_default_params(impl_param); + auto col2im_params = get_default_params(impl_param); + + // Attributes + uint32_t stride_x, stride_y, stride_z; + uint32_t dilation_x, dilation_y, dilation_z; + std::tie(stride_x, stride_y, stride_z) = ov::intel_gpu::get_xyz(primitive->stride, 1); + col2im_params.stride = {stride_x, stride_y, stride_z}; + std::tie(dilation_x, dilation_y, dilation_z) = ov::intel_gpu::get_xyz(primitive->dilation, 1); + col2im_params.dilation = {dilation_x, dilation_y, dilation_z}; + + // padding being & end + uint32_t pad_begin_x, pad_begin_y, pad_begin_z; + std::tie(pad_begin_x, pad_begin_y, pad_begin_z) = ov::intel_gpu::get_xyz(primitive->padding_begin, 0); + col2im_params.padding_begin = {pad_begin_x, pad_begin_y, pad_begin_z}; + uint32_t pad_end_x, pad_end_y, pad_end_z; + std::tie(pad_end_x, pad_end_y, pad_end_z) = ov::intel_gpu::get_xyz(primitive->padding_end, 0); + col2im_params.padding_end = {pad_end_x, pad_end_y, pad_end_z}; + + // Col2Im-15 implementation : required + // output size is 1D tensor of two positive integer numbers (height and width) + std::vector output_size(primitive->output_shape.begin(), primitive->output_shape.end()); + std::vector kernel_size(primitive->kernel_shape.begin(), primitive->kernel_shape.end()); + col2im_params.output_size = {output_size[0], output_size[1], (uint32_t)1}; + col2im_params.kernel_size = {kernel_size[0], kernel_size[1], (uint32_t)1}; - return params; + return col2im_params; } }; diff --git 
a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_ref.cl index eea34bc20d6e84..4f34159af954cd 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_ref.cl @@ -11,4 +11,49 @@ KERNEL(col_to_im_ref)(const __global INPUT0_TYPE* input, #endif ) { + const uint output_size[2] = {OUT_SIZE_X, OUT_SIZE_Y}; + const uint kernel_size[2] = {KERNEL_SIZE_X, KERNEL_SIZE_Y}; + const uint strides[2] = {STRIDE_SIZE_X, STRIDE_SIZE_Y}; + const uint dilations[2] = {DILATION_SIZE_X, DILATION_SIZE_Y}; + const uint pads_begin[2] = {PAD_BEGIN_SIZE_X, PAD_BEGIN_SIZE_Y}; + const uint pads_end[2] = {PAD_END_SIZE_X, PAD_END_SIZE_Y}; + + const uint batch_count = INPUT0_BATCH_NUM; + const uint num_blocks = INPUT0_SIZE_Y; + const uint kernel_product = KERNEL_SIZE_X * KERNEL_SIZE_Y; + const uint channels_per_column = INPUT0_FEATURE_NUM; + const uint channel_count = channels_per_column / kernel_product; + + // calculate the original height and width + // uint get_image_dimension_index = [&](const uint column_dim_idx, const uint dim_offset, const uint idx) { + // return column_dim_idx * strides[idx] - pads_begin[idx] + dim_offset * dilations[idx]; + // }; + + for (uint batch = 0; batch < batch_count; ++batch) { + for (uint column = 0; column < channels_per_column; ++column) { + const uint width_offset = column % kernel_size[1]; + const uint height_offset = (column / kernel_size[1]) % kernel_size[0]; + const uint channel_idx = column / kernel_product; + + for (uint column_height_idx = 0; column_height_idx < ORIG_HEIGHT; ++column_height_idx) { + // get_image_dimension_index(column_height_idx, height_offset, 0); + const uint image_height_idx = column_height_idx * strides[0] - pads_begin[0] + height_offset * dilations[0]; + if (image_height_idx >= 0 && image_height_idx < output_size[0]) { + for (uint column_width_idx = 0; column_width_idx < ORIG_WIDTH; ++column_width_idx) { + // get_image_dimension_index(column_width_idx, width_offset, 1); + const uint image_width_idx = column_width_idx * strides[1] - pads_begin[1] + width_offset * dilations[1]; + if (image_width_idx >= 0 && image_width_idx < output_size[1]) { + const uint img_idx = + ((batch * channel_count + channel_idx) * output_size[0] + image_height_idx) * image_width_idx; + const uint data_idx = + ((batch * channels_per_column + column) * ORIG_HEIGHT + column_height_idx) * ORIG_WIDTH + column_width_idx; + + // sum the overlapping values + output[img_idx] += input[data_idx]; + } + } + } + } + } + } } \ No newline at end of file diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.cpp index efef62a1a73ebe..33fc53c0d0217c 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.cpp @@ -28,11 +28,24 @@ bool ColToImKernelBase::Validate(const Params& p) const { JitConstants ColToImKernelBase::GetJitConstants(const col_to_im_params& params) const { JitConstants jit = MakeBaseParamsJitConstants(params); + const auto& output_size = params.output_size; + const auto& stride = params.stride; + const auto& dilation = params.dilation; + const auto& pads_begin = params.padding_begin; + const auto& pads_end = params.padding_begin; - 
jit.AddConstant(MakeJitConstant("STRIDE", params.stride)); - jit.AddConstant(MakeJitConstant("DILATION", params.dilation)); - jit.AddConstant(MakeJitConstant("PAD_BEGIN", params.padding_begin)); - jit.AddConstant(MakeJitConstant("PAD_END", params.padding_end)); + // Get original dimension + const uint orig_height = (output_size.x + pads_begin.x + pads_end.x - (dilation.x * (params.kernel_size.x - 1) + 1)) / stride.x + 1; + const uint orig_width = (output_size.y + pads_begin.y + pads_end.y - (dilation.y * (params.kernel_size.y - 1) + 1)) / stride.y + 1; + jit.AddConstant(MakeJitConstant("ORIG_HEIGHT", orig_height)); + jit.AddConstant(MakeJitConstant("ORIG_WIDTH", orig_width)); + + jit.AddConstant(MakeJitConstant("OUT", params.output_size)); + jit.AddConstant(MakeJitConstant("KERNEL", params.kernel_size)); + jit.AddConstant(MakeJitConstant("STRIDE", stride)); + jit.AddConstant(MakeJitConstant("DILATION", dilation)); + jit.AddConstant(MakeJitConstant("PAD_BEGIN", pads_begin)); + jit.AddConstant(MakeJitConstant("PAD_END", pads_end)); return jit; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.h index e128a35b13a401..f3d3904c139cbf 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.h @@ -14,6 +14,10 @@ namespace kernel_selector { struct col_to_im_params : public base_params { col_to_im_params() : base_params(KernelType::COL_TO_IM) {} + // Required + uSize output_size; + uSize kernel_size; + // Optional uSize stride; uSize dilation; uSize padding_begin; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_ref.cpp index 45e457576ba9e9..7163f267d647d0 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_ref.cpp @@ -31,29 +31,8 @@ ParamsKey ColToImKernelRef::GetSupportedKey() const { CommonDispatchData ColToImKernelRef::SetDefault(const col_to_im_params& params) const { CommonDispatchData dispatchData; - // TODO : implement for col_to_im_gpu_ref - // auto in_layout = params.inputs[0].GetLayout(); - // auto out_layout = params.outputs[0].GetLayout(); - { - // std::vector> dims_by_gws = {{ Tensor::DataChannelName::BATCH }, - // { Tensor::DataChannelName::FEATURE }, - // { Tensor::DataChannelName::X, Tensor::DataChannelName::Y, Tensor::DataChannelName::Z }}; - - // dispatchData.gws = { params.outputs[0].Batch().v, - // params.outputs[0].Feature().v, - // params.outputs[0].Z().v * params.outputs[0].Y().v * params.outputs[0].X().v }; - - // // The reason why reverse input/output of GetOptimalLocalWorkGroupSizes(): - // // Large X*Y*Z lws size is better than large batch lws, but current GetOptimalLocalWorkGroupSizes not work like that. 
- // reverse(dims_by_gws.begin(), dims_by_gws.end()); - // reverse(dispatchData.gws.begin(), dispatchData.gws.end()); - // dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws); - // reverse(dispatchData.lws.begin(), dispatchData.lws.end()); - // reverse(dispatchData.gws.begin(), dispatchData.gws.end()); - - dispatchData.gws = {1, 1, 1}; - dispatchData.lws = {1, 1, 1}; - } + dispatchData.gws = {1, 1, 1}; + dispatchData.lws = {1, 1, 1}; return dispatchData; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_selector.cpp index 3df5fdb8ff8f16..788e668962828a 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_selector.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_selector.cpp @@ -4,11 +4,13 @@ #include "col_to_im_kernel_selector.h" #include "col_to_im_kernel_ref.h" +// #include "col_to_im_kernel_opt.h" namespace kernel_selector { col_to_im_kernel_selector::col_to_im_kernel_selector() { Attach(); + // Attach(); } KernelsData col_to_im_kernel_selector::GetBestKernels(const Params& params) const { From 124eedd5b062f39a32fe7dcfa20cba7d381f2c6a Mon Sep 17 00:00:00 2001 From: "Min, Byungil" Date: Thu, 20 Mar 2025 02:12:36 +0900 Subject: [PATCH 7/8] [GPU] Init draft for col2im opt kernel Signed-off-by: Min, Byungil --- .../cl_kernels/col_to_im_opt.cl | 60 ++++++++++++ .../col_to_im/col_to_im_kernel_opt.cpp | 93 +++++++++++++++++++ .../kernels/col_to_im/col_to_im_kernel_opt.h | 31 +++++++ .../col_to_im/col_to_im_kernel_selector.cpp | 4 +- 4 files changed, 186 insertions(+), 2 deletions(-) create mode 100644 src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_opt.cl create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_opt.cpp create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_opt.h diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_opt.cl new file mode 100644 index 00000000000000..cde769cf5a0314 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_opt.cl @@ -0,0 +1,60 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/batch_headers/fetch_data.cl" + +KERNEL(col_to_im_opt)(const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output +#if HAS_FUSED_OPS_DECLS + , FUSED_OPS_DECLS +#endif +) +{ + const uint output_size[2] = {OUT_SIZE_X, OUT_SIZE_Y}; + const uint kernel_size[2] = {KERNEL_SIZE_X, KERNEL_SIZE_Y}; + const uint strides[2] = {STRIDE_SIZE_X, STRIDE_SIZE_Y}; + const uint dilations[2] = {DILATION_SIZE_X, DILATION_SIZE_Y}; + const uint pads_begin[2] = {PAD_BEGIN_SIZE_X, PAD_BEGIN_SIZE_Y}; + const uint pads_end[2] = {PAD_END_SIZE_X, PAD_END_SIZE_Y}; + + const uint num_blocks = INPUT0_SIZE_Y; + const uint kernel_product = KERNEL_SIZE_X * KERNEL_SIZE_Y; + const uint channels_per_column = INPUT0_FEATURE_NUM; + const uint channel_count = channels_per_column / kernel_product; + + const uint batch_count = INPUT0_BATCH_NUM; + const uint batch = get_global_id(2); + + // printf("batch(%d) num_blocks(%u) output(%u, %u), channel(%u, %u) original_height(%u) original_width(%u) \n", + // batch, num_blocks, (uint)OUT_SIZE_X, (uint)OUT_SIZE_Y, 
(uint)KERNEL_SIZE_X, (uint)KERNEL_SIZE_Y, ORIG_HEIGHT, ORIG_WIDTH); + + // for (uint batch = 0; batch < batch_count; ++batch) { + for (uint column = 0; column < channels_per_column; ++column) { + const uint width_offset = column % kernel_size[1]; + const uint height_offset = (column / kernel_size[1]) % kernel_size[0]; + const uint channel_idx = column / kernel_product; + + const uint out_idx = (batch * channel_count + channel_idx) * output_size[0]; + const uint height_idx = (batch * channels_per_column + column) * ORIG_HEIGHT; + + for (uint column_height_idx = 0; column_height_idx < ORIG_HEIGHT; ++column_height_idx) { + // get_image_dimension_index(column_height_idx, height_offset, 0); + const uint image_height_idx = column_height_idx * strides[0] - pads_begin[0] + height_offset * dilations[0]; + if (image_height_idx >= 0 && image_height_idx < output_size[0]) { + for (uint column_width_idx = 0; column_width_idx < ORIG_WIDTH; ++column_width_idx) { + // get_image_dimension_index(column_width_idx, width_offset, 1); + const uint image_width_idx = column_width_idx * strides[1] - pads_begin[1] + width_offset * dilations[1]; + if (image_width_idx >= 0 && image_width_idx < output_size[1]) { + const uint img_idx = (out_idx + image_height_idx) * image_width_idx; + const uint data_idx = (height_idx + column_height_idx) * ORIG_WIDTH + column_width_idx; + + // sum the overlapping values + output[img_idx] += input[data_idx]; + } + } + } + } + } + // } +} \ No newline at end of file diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_opt.cpp new file mode 100644 index 00000000000000..1c91503dcaac85 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_opt.cpp @@ -0,0 +1,93 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "col_to_im_kernel_opt.h" +#include "kernel_selector_utils.h" +#include +#include + +namespace kernel_selector { + +ParamsKey ColToImKernelOpt::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + k.EnableInputDataType(Datatype::UINT8); + k.EnableInputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::UINT8); + k.EnableOutputDataType(Datatype::INT8); + k.EnableDifferentTypes(); + k.EnableAllInputLayout(); + k.EnableAllOutputLayout(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + return k; +} + +CommonDispatchData ColToImKernelOpt::SetDefault(const col_to_im_params& params) const { + CommonDispatchData dispatchData; + + auto in_layout = params.inputs[0].GetLayout(); + auto out_layout = params.outputs[0].GetLayout(); + { + std::vector> dims_by_gws = {{ Tensor::DataChannelName::BATCH }, + { Tensor::DataChannelName::FEATURE }, + { Tensor::DataChannelName::X, Tensor::DataChannelName::Y, Tensor::DataChannelName::Z }}; + + dispatchData.gws = { params.outputs[0].Batch().v, + 1, + 1}; // params.outputs[0].Feature().v * params.outputs[0].Z().v * params.outputs[0].Y().v * params.outputs[0].X().v }; + + // // The reason why reverse input/output of GetOptimalLocalWorkGroupSizes(): + // Large X*Y*Z lws size is better than large batch lws, but current GetOptimalLocalWorkGroupSizes not work like that. 
+ reverse(dims_by_gws.begin(), dims_by_gws.end()); + reverse(dispatchData.gws.begin(), dispatchData.gws.end()); + + dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws); + + // reverse(dispatchData.lws.begin(), dispatchData.lws.end()); + // reverse(dispatchData.gws.begin(), dispatchData.gws.end()); + } + + // dispatchData.gws = {1, 1, 1}; + dispatchData.lws = {1, 1, 1}; + + std::cout << " Select ColToImKernelOpt : gws(" << dispatchData.gws[0] << ", " << dispatchData.gws[1] << ", " << dispatchData.gws[2] << ")" << std::endl; + std::cout << " Select ColToImKernelOpt : lws(" << dispatchData.lws[0] << ", " << dispatchData.lws[1] << ", " << dispatchData.lws[2] << ")" << std::endl; + + return dispatchData; +} + +KernelsData ColToImKernelOpt::GetKernelsData(const Params& params) const { + return GetCommonKernelsData(params); +} + +KernelsPriority ColToImKernelOpt::GetKernelsPriority(const Params& /*params*/) const { + return FORCE_PRIORITY_2; +} + +JitConstants ColToImKernelOpt::GetJitConstants(const col_to_im_params& params) const { + auto jit = Parent::GetJitConstants(params); + auto input = params.inputs[0]; + auto input_dt = input.GetDType(); + + // TODO : implement for col_to_im_gpu_opt + if (!params.fused_ops.empty()) { + std::vector idx_order; + if (input.Dimentions() == 5) { + idx_order = {"batch", "feature", "z", "y", "x"}; + } else if (input.Dimentions() == 4) { + idx_order = {"batch", "feature", "y", "x"}; + } + FusedOpsConfiguration conf = {"", idx_order, "in_val", input_dt, 1}; + jit.Merge(MakeFusedOpsJitConstants(params, {conf})); + } + + return jit; +} + +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_opt.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_opt.h new file mode 100644 index 00000000000000..e05ec67b8db9e2 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_opt.h @@ -0,0 +1,31 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "col_to_im_kernel_base.h" + +namespace kernel_selector { +class ColToImKernelOpt : public ColToImKernelBase { +public: + using Parent = ColToImKernelBase; + + ColToImKernelOpt() : ColToImKernelBase("col_to_im_opt") {} + virtual ~ColToImKernelOpt() {} + + CommonDispatchData SetDefault(const col_to_im_params& params) const override; + KernelsData GetKernelsData(const Params& params) const override; + KernelsPriority GetKernelsPriority(const Params& params) const override; + ParamsKey GetSupportedKey() const override; + +protected: + JitConstants GetJitConstants(const col_to_im_params& params) const override; + std::vector GetSupportedFusedOps() const override { + return { FusedOpType::ELTWISE, + FusedOpType::QUANTIZE, + FusedOpType::REORDER, + FusedOpType::ACTIVATION }; + } +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_selector.cpp index 788e668962828a..a10a632f5d4c4e 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_selector.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_selector.cpp @@ -4,13 +4,13 @@ #include "col_to_im_kernel_selector.h" #include "col_to_im_kernel_ref.h" -// #include "col_to_im_kernel_opt.h" 
+#include "col_to_im_kernel_opt.h" namespace kernel_selector { col_to_im_kernel_selector::col_to_im_kernel_selector() { Attach(); - // Attach(); + Attach(); } KernelsData col_to_im_kernel_selector::GetBestKernels(const Params& params) const { From d10be366cc52924a05058c85c5c30284a4a7c7f5 Mon Sep 17 00:00:00 2001 From: "Min, Byungil" Date: Thu, 20 Mar 2025 11:18:50 +0900 Subject: [PATCH 8/8] Update col2im opt kernel Signed-off-by: Min, Byungil --- .../cl_kernels/col_to_im_opt.cl | 57 ++++++++----------- .../cl_kernels/col_to_im_ref.cl | 43 ++++++-------- .../col_to_im/col_to_im_kernel_base.cpp | 16 +++++- .../col_to_im/col_to_im_kernel_opt.cpp | 30 ++-------- .../col_to_im/col_to_im_kernel_ref.cpp | 3 +- .../intel_gpu/src/plugin/ops/col_to_im.cpp | 6 ++ 6 files changed, 64 insertions(+), 91 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_opt.cl index cde769cf5a0314..be3390317d0f84 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_opt.cl @@ -16,45 +16,34 @@ KERNEL(col_to_im_opt)(const __global INPUT0_TYPE* input, const uint strides[2] = {STRIDE_SIZE_X, STRIDE_SIZE_Y}; const uint dilations[2] = {DILATION_SIZE_X, DILATION_SIZE_Y}; const uint pads_begin[2] = {PAD_BEGIN_SIZE_X, PAD_BEGIN_SIZE_Y}; - const uint pads_end[2] = {PAD_END_SIZE_X, PAD_END_SIZE_Y}; - const uint num_blocks = INPUT0_SIZE_Y; - const uint kernel_product = KERNEL_SIZE_X * KERNEL_SIZE_Y; - const uint channels_per_column = INPUT0_FEATURE_NUM; - const uint channel_count = channels_per_column / kernel_product; - - const uint batch_count = INPUT0_BATCH_NUM; const uint batch = get_global_id(2); + const uint channel_idx = get_global_id(0); + + const int channel_offset = batch * NUM_CHANNELS + channel_idx; + + for (int idx = 0; idx < KERNEL_PRODUCT; ++idx) { + const int width_offset = idx % kernel_size[1]; + const int height_offset = (idx / kernel_size[1]) % kernel_size[0]; + const int column = channel_idx * KERNEL_PRODUCT + idx; + const int column_offset = batch * NUM_ELEMENTS_FOR_BLOCK + column; + + for (int column_height_idx = 0; column_height_idx < ORIG_HEIGHT; ++column_height_idx) { + const int image_height_idx = column_height_idx * strides[0] - pads_begin[0] + height_offset * dilations[0]; + + if (image_height_idx >= 0 && image_height_idx < output_size[0]) { + for (int column_width_idx = 0; column_width_idx < ORIG_WIDTH; ++column_width_idx) { + const int image_width_idx = column_width_idx * strides[1] - pads_begin[1] + width_offset * dilations[1]; + + if (image_width_idx >= 0 && image_width_idx < output_size[1]) { + const int img_idx = (channel_offset * output_size[0] + image_height_idx) * output_size[1] + image_width_idx; + const int data_idx = (column_offset * ORIG_HEIGHT + column_height_idx) * ORIG_WIDTH + column_width_idx; - // printf("batch(%d) num_blocks(%u) output(%u, %u), channel(%u, %u) original_height(%u) original_width(%u) \n", - // batch, num_blocks, (uint)OUT_SIZE_X, (uint)OUT_SIZE_Y, (uint)KERNEL_SIZE_X, (uint)KERNEL_SIZE_Y, ORIG_HEIGHT, ORIG_WIDTH); - - // for (uint batch = 0; batch < batch_count; ++batch) { - for (uint column = 0; column < channels_per_column; ++column) { - const uint width_offset = column % kernel_size[1]; - const uint height_offset = (column / kernel_size[1]) % kernel_size[0]; - const uint channel_idx = column / kernel_product; - - const uint out_idx = (batch * channel_count + 
channel_idx) * output_size[0]; - const uint height_idx = (batch * channels_per_column + column) * ORIG_HEIGHT; - - for (uint column_height_idx = 0; column_height_idx < ORIG_HEIGHT; ++column_height_idx) { - // get_image_dimension_index(column_height_idx, height_offset, 0); - const uint image_height_idx = column_height_idx * strides[0] - pads_begin[0] + height_offset * dilations[0]; - if (image_height_idx >= 0 && image_height_idx < output_size[0]) { - for (uint column_width_idx = 0; column_width_idx < ORIG_WIDTH; ++column_width_idx) { - // get_image_dimension_index(column_width_idx, width_offset, 1); - const uint image_width_idx = column_width_idx * strides[1] - pads_begin[1] + width_offset * dilations[1]; - if (image_width_idx >= 0 && image_width_idx < output_size[1]) { - const uint img_idx = (out_idx + image_height_idx) * image_width_idx; - const uint data_idx = (height_idx + column_height_idx) * ORIG_WIDTH + column_width_idx; - - // sum the overlapping values - output[img_idx] += input[data_idx]; - } + // sum the overlapping values + output[img_idx] += input[data_idx]; } } } } - // } + } } \ No newline at end of file diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_ref.cl index 4f34159af954cd..7a4f52060ca671 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/col_to_im_ref.cl @@ -16,41 +16,30 @@ KERNEL(col_to_im_ref)(const __global INPUT0_TYPE* input, const uint strides[2] = {STRIDE_SIZE_X, STRIDE_SIZE_Y}; const uint dilations[2] = {DILATION_SIZE_X, DILATION_SIZE_Y}; const uint pads_begin[2] = {PAD_BEGIN_SIZE_X, PAD_BEGIN_SIZE_Y}; - const uint pads_end[2] = {PAD_END_SIZE_X, PAD_END_SIZE_Y}; - const uint batch_count = INPUT0_BATCH_NUM; const uint num_blocks = INPUT0_SIZE_Y; const uint kernel_product = KERNEL_SIZE_X * KERNEL_SIZE_Y; const uint channels_per_column = INPUT0_FEATURE_NUM; - const uint channel_count = channels_per_column / kernel_product; + const uint channel_count = channels_per_column / KERNEL_PRODUCT; - // calculate the original height and width - // uint get_image_dimension_index = [&](const uint column_dim_idx, const uint dim_offset, const uint idx) { - // return column_dim_idx * strides[idx] - pads_begin[idx] + dim_offset * dilations[idx]; - // }; + const uint batch = get_global_id(2); - for (uint batch = 0; batch < batch_count; ++batch) { - for (uint column = 0; column < channels_per_column; ++column) { - const uint width_offset = column % kernel_size[1]; - const uint height_offset = (column / kernel_size[1]) % kernel_size[0]; - const uint channel_idx = column / kernel_product; + for (int column = 0; column < channels_per_column; ++column) { + const int width_offset = column % kernel_size[1]; + const int height_offset = (column / kernel_size[1]) % kernel_size[0]; + const int channel_idx = column / kernel_product; - for (uint column_height_idx = 0; column_height_idx < ORIG_HEIGHT; ++column_height_idx) { - // get_image_dimension_index(column_height_idx, height_offset, 0); - const uint image_height_idx = column_height_idx * strides[0] - pads_begin[0] + height_offset * dilations[0]; - if (image_height_idx >= 0 && image_height_idx < output_size[0]) { - for (uint column_width_idx = 0; column_width_idx < ORIG_WIDTH; ++column_width_idx) { - // get_image_dimension_index(column_width_idx, width_offset, 1); - const uint image_width_idx = column_width_idx * strides[1] - pads_begin[1] + width_offset 
* dilations[1]; - if (image_width_idx >= 0 && image_width_idx < output_size[1]) { - const uint img_idx = - ((batch * channel_count + channel_idx) * output_size[0] + image_height_idx) * image_width_idx; - const uint data_idx = - ((batch * channels_per_column + column) * ORIG_HEIGHT + column_height_idx) * ORIG_WIDTH + column_width_idx; + for (int column_height_idx = 0; column_height_idx < ORIG_HEIGHT; ++column_height_idx) { + const int image_height_idx = column_height_idx * strides[0] - pads_begin[0] + height_offset * dilations[0]; + if (image_height_idx >= 0 && image_height_idx < output_size[0]) { + for (int column_width_idx = 0; column_width_idx < ORIG_WIDTH; ++column_width_idx) { + const int image_width_idx = column_width_idx * strides[1] - pads_begin[1] + width_offset * dilations[1]; + if (image_width_idx >= 0 && image_width_idx < output_size[1]) { + const int img_idx = ((batch * channel_count + channel_idx) * output_size[0] + image_height_idx) * output_size[1] + image_width_idx; + const int data_idx = ((batch * channels_per_column + column) * ORIG_HEIGHT + column_height_idx) * ORIG_WIDTH + column_width_idx; - // sum the overlapping values - output[img_idx] += input[data_idx]; - } + // sum the overlapping values + output[img_idx] += input[data_idx]; } } } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.cpp index 33fc53c0d0217c..c39e47288dd157 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_base.cpp @@ -28,18 +28,28 @@ bool ColToImKernelBase::Validate(const Params& p) const { JitConstants ColToImKernelBase::GetJitConstants(const col_to_im_params& params) const { JitConstants jit = MakeBaseParamsJitConstants(params); + auto input = params.inputs[0]; const auto& output_size = params.output_size; const auto& stride = params.stride; const auto& dilation = params.dilation; const auto& pads_begin = params.padding_begin; const auto& pads_end = params.padding_begin; - // Get original dimension - const uint orig_height = (output_size.x + pads_begin.x + pads_end.x - (dilation.x * (params.kernel_size.x - 1) + 1)) / stride.x + 1; - const uint orig_width = (output_size.y + pads_begin.y + pads_end.y - (dilation.y * (params.kernel_size.y - 1) + 1)) / stride.y + 1; + const auto orig_height = (output_size.x + pads_begin.x + pads_end.x - (dilation.x * (params.kernel_size.x - 1) + 1)) / stride.x + 1; + const auto orig_width = (output_size.y + pads_begin.y + pads_end.y - (dilation.y * (params.kernel_size.y - 1) + 1)) / stride.y + 1; jit.AddConstant(MakeJitConstant("ORIG_HEIGHT", orig_height)); jit.AddConstant(MakeJitConstant("ORIG_WIDTH", orig_width)); + // Consider input tensor : (N, C * Product(kernel_size), L) + const auto num_elements_for_block = input.Feature().v; + const auto num_blocks = input.Y().v; + const auto kernel_product = params.kernel_size.x * params.kernel_size.y; + const auto num_channels = num_elements_for_block / kernel_product; + jit.AddConstant(MakeJitConstant("NUM_ELEMENTS_FOR_BLOCK", num_elements_for_block)); + jit.AddConstant(MakeJitConstant("KERNEL_PRODUCT", kernel_product)); + jit.AddConstant(MakeJitConstant("NUM_CHANNELS", num_channels)); + jit.AddConstant(MakeJitConstant("NUM_BLOCKS", num_blocks)); + jit.AddConstant(MakeJitConstant("OUT", params.output_size)); jit.AddConstant(MakeJitConstant("KERNEL", 
params.kernel_size)); jit.AddConstant(MakeJitConstant("STRIDE", stride)); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_opt.cpp index 1c91503dcaac85..9a9b3e3b996eee 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_opt.cpp @@ -31,34 +31,14 @@ ParamsKey ColToImKernelOpt::GetSupportedKey() const { CommonDispatchData ColToImKernelOpt::SetDefault(const col_to_im_params& params) const { CommonDispatchData dispatchData; - auto in_layout = params.inputs[0].GetLayout(); - auto out_layout = params.outputs[0].GetLayout(); - { - std::vector> dims_by_gws = {{ Tensor::DataChannelName::BATCH }, - { Tensor::DataChannelName::FEATURE }, - { Tensor::DataChannelName::X, Tensor::DataChannelName::Y, Tensor::DataChannelName::Z }}; - - dispatchData.gws = { params.outputs[0].Batch().v, - 1, - 1}; // params.outputs[0].Feature().v * params.outputs[0].Z().v * params.outputs[0].Y().v * params.outputs[0].X().v }; - - // // The reason why reverse input/output of GetOptimalLocalWorkGroupSizes(): - // Large X*Y*Z lws size is better than large batch lws, but current GetOptimalLocalWorkGroupSizes not work like that. - reverse(dims_by_gws.begin(), dims_by_gws.end()); - reverse(dispatchData.gws.begin(), dispatchData.gws.end()); - - dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws); - - // reverse(dispatchData.lws.begin(), dispatchData.lws.end()); - // reverse(dispatchData.gws.begin(), dispatchData.gws.end()); - } + auto input = params.inputs[0]; + const auto num_elements_for_block = input.Feature().v; + const auto kernel_product = params.kernel_size.x * params.kernel_size.y; + const auto num_channels = num_elements_for_block / kernel_product; - // dispatchData.gws = {1, 1, 1}; + dispatchData.gws = {num_channels, 1, params.outputs[0].Batch().v}; dispatchData.lws = {1, 1, 1}; - std::cout << " Select ColToImKernelOpt : gws(" << dispatchData.gws[0] << ", " << dispatchData.gws[1] << ", " << dispatchData.gws[2] << ")" << std::endl; - std::cout << " Select ColToImKernelOpt : lws(" << dispatchData.lws[0] << ", " << dispatchData.lws[1] << ", " << dispatchData.lws[2] << ")" << std::endl; - return dispatchData; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_ref.cpp index 7163f267d647d0..eb5208d24437fd 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/col_to_im/col_to_im_kernel_ref.cpp @@ -31,7 +31,7 @@ ParamsKey ColToImKernelRef::GetSupportedKey() const { CommonDispatchData ColToImKernelRef::SetDefault(const col_to_im_params& params) const { CommonDispatchData dispatchData; - dispatchData.gws = {1, 1, 1}; + dispatchData.gws = {1, 1, params.outputs[0].Batch().v}; dispatchData.lws = {1, 1, 1}; return dispatchData; @@ -50,7 +50,6 @@ JitConstants ColToImKernelRef::GetJitConstants(const col_to_im_params& params) c auto input = params.inputs[0]; auto input_dt = input.GetDType(); - // TODO : implement for col_to_im_gpu_ref if (!params.fused_ops.empty()) { std::vector idx_order; if (input.Dimentions() == 5) { diff --git a/src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp 
b/src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp index 756ce277498be5..616274bc48ce4f 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/col_to_im.cpp @@ -33,6 +33,8 @@ static void CreateCol2ImOp(ProgramBuilder& p, const std::shared_ptrget_pads_end()) padding_end.push_back(p); + // std::cout << ">> col2im : " << op->get_friendly_name() << std::endl; + auto output_shape_const = ov::as_type_ptr(op->get_input_node_shared_ptr(1)); auto vec_output_shape = output_shape_const->cast_vector(); ov::Shape output_shape(vec_output_shape); @@ -41,7 +43,11 @@ static void CreateCol2ImOp(ProgramBuilder& p, const std::shared_ptrcast_vector(); ov::Shape kernel_shape(kernel_size); + // std::cout << " -- output shape : " << vec_output_shape[0] << ", " << vec_output_shape[1] << std::endl; + // std::cout << " -- kernel size : " << kernel_shape.to_string() << std::endl; + + // Create col2im prim + // inputs : data, output size, kernel_size (required) auto CallToImPrim = cldnn::col_to_im(layerName, inputPrimitives[0], inputPrimitives[1],