From 9efd30228160ae1b94f1fe21ebb854044e05d423 Mon Sep 17 00:00:00 2001
From: Svetlozar Georgiev
Date: Wed, 4 Dec 2024 16:58:13 +0000
Subject: [PATCH] gpu: generic: implement SYCL reduction

---
 src/gpu/generic/sycl/reduction_kernels.hpp   | 163 +++++++++++++++++++++
 src/gpu/generic/sycl/ref_reduction.cpp       |  96 ++++++++++++
 src/gpu/generic/sycl/ref_reduction.hpp       |  83 +++++++++
 src/gpu/generic/sycl/sycl_primitive_conf.hpp |  12 ++
 src/gpu/gpu_reduction_list.cpp               |   5 +
 5 files changed, 359 insertions(+)
 create mode 100644 src/gpu/generic/sycl/reduction_kernels.hpp
 create mode 100644 src/gpu/generic/sycl/ref_reduction.cpp
 create mode 100644 src/gpu/generic/sycl/ref_reduction.hpp

diff --git a/src/gpu/generic/sycl/reduction_kernels.hpp b/src/gpu/generic/sycl/reduction_kernels.hpp
new file mode 100644
index 00000000000..f67dc26877b
--- /dev/null
+++ b/src/gpu/generic/sycl/reduction_kernels.hpp
@@ -0,0 +1,163 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef GPU_GENERIC_SYCL_REDUCTION_KERNELS_HPP
+#define GPU_GENERIC_SYCL_REDUCTION_KERNELS_HPP
+
+#include "common/c_types_map.hpp"
+#include "common/dnnl_thread.hpp"
+#include "common/primitive_exec_types.hpp"
+#include "common/utils.hpp"
+#include "gpu/generic/sycl/sycl_io_helper.hpp"
+#include "gpu/generic/sycl/sycl_math_utils.hpp"
+#include "gpu/generic/sycl/sycl_primitive_conf.hpp"
+#include "xpu/sycl/memory_storage_base.hpp"
+#include "xpu/sycl/types.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace generic {
+namespace sycl {
+
+// Device-side printf debugging; keep disabled in committed code.
+#define DEBUG_PRINT_KERNEL 0
+
+#if DEBUG_PRINT_KERNEL == 1
+#define DUMP(FMT, ...) \
+    ::sycl::ext::oneapi::experimental::printf(FMT, __VA_ARGS__)
+#else
+#define DUMP(FMT, ...)
+#endif
+
+// Maps a reduction algorithm onto an identity element, a binary
+// accumulation step, and a final transform.
+struct Reducer {
+    dnnl_alg_kind_t alg_;
+    float p_, eps_;
+
+    Reducer(dnnl_alg_kind_t alg, float p, float eps)
+        : alg_(alg), p_(p), eps_(eps) {}
+
+    float identity() const {
+        if (alg_ == dnnl_reduction_min) {
+            return std::numeric_limits<float>::max();
+        } else if (alg_ == dnnl_reduction_max) {
+            return std::numeric_limits<float>::lowest();
+        } else if (alg_ == dnnl_reduction_mul) {
+            return 1.f;
+        }
+
+        return 0.f;
+    }
+
+    float reduce(float lhs, float rhs) const {
+        if (alg_ == dnnl_reduction_sum || alg_ == dnnl_reduction_mean) {
+            return lhs + rhs;
+        } else if (alg_ == dnnl_reduction_min) {
+            return ::sycl::min(lhs, rhs);
+        } else if (alg_ == dnnl_reduction_max) {
+            return ::sycl::max(lhs, rhs);
+        } else if (alg_ == dnnl_reduction_mul) {
+            return lhs * rhs;
+        } else if (alg_ == dnnl_reduction_norm_lp_max
+                || alg_ == dnnl_reduction_norm_lp_sum
+                || alg_ == dnnl_reduction_norm_lp_power_p_max
+                || alg_ == dnnl_reduction_norm_lp_power_p_sum) {
+            return lhs + ::sycl::pow(::sycl::fabs(rhs), p_);
+        }
+
+        return ::sycl::nan(0U);
+    }
+
+    float finalize(float val, int size) const {
+        if (alg_ == dnnl_reduction_mean) {
+            return val / size;
+        } else if (alg_ == dnnl_reduction_norm_lp_max) {
+            return ::sycl::rootn(::sycl::max(val, eps_), p_);
+        } else if (alg_ == dnnl_reduction_norm_lp_sum) {
+            return ::sycl::rootn(val + eps_, p_);
+        } else if (alg_ == dnnl_reduction_norm_lp_power_p_max) {
+            return ::sycl::max(val, eps_);
+        } else if (alg_ == dnnl_reduction_norm_lp_power_p_sum) {
+            return val + eps_;
+        }
+
+        return val;
+    }
+};
+
+struct reduction_kernel_fwd_t {
+    sycl_reduction_conf_t conf_;
+    xpu::sycl::in_memory_arg_t src_;
+    xpu::sycl::out_memory_arg_t dst_;
+    post_op_input_args po_args_;
+
+    reduction_kernel_fwd_t(const sycl_reduction_conf_t &conf,
+            ::sycl::handler &cgh, const exec_ctx_t &ctx)
+        : conf_(conf)
+        , src_(CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_SRC))
+        , dst_(CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST))
+        , po_args_(cgh, ctx, conf_.post_ops) {}
+
+    void operator()(::sycl::item<1> item) const {
+        Reducer reducer(conf_.alg, conf_.p, conf_.eps);
+
+        memory_tensor_t<::sycl::access_mode::read> src(src_, conf_.src_md);
+        memory_tensor_t<::sycl::access_mode::write> dst(dst_, conf_.dst_md);
+        const int id = item.get_linear_id();
+
+        // Unravel the linear work-item id into a destination coordinate,
+        // innermost dimension first.
+        const auto &dst_md = conf_.dst_md;
+        dims_t pos {};
+        int l_offset = id;
+        for (int i = 0; i < dst_md.ndims(); i++) {
+            const int d = dst_md.ndims() - 1 - i;
+            const dim_t cur_dim = dst_md.dims()[d];
+            pos[d] = l_offset % cur_dim;
+            l_offset = l_offset / cur_dim;
+        }
+
+        // Accumulate over the reduced dimensions; non-reduced entries of
+        // reduce_dims are 1, so their loops collapse.
+        float acc = reducer.identity();
+        for (off_t d0 = 0; d0 < conf_.reduce_dims[0]; d0++)
+            for (off_t d1 = 0; d1 < conf_.reduce_dims[1]; d1++)
+                for (off_t d2 = 0; d2 < conf_.reduce_dims[2]; d2++)
+                    for (off_t d3 = 0; d3 < conf_.reduce_dims[3]; d3++)
+                        for (off_t d4 = 0; d4 < conf_.reduce_dims[4]; d4++)
+                            for (off_t d5 = 0; d5 < conf_.reduce_dims[5];
+                                    d5++) {
+                                dims_t src_off = {pos[0] + d0, pos[1] + d1,
+                                        pos[2] + d2, pos[3] + d3, pos[4] + d4,
+                                        pos[5] + d5};
+                                const float val = src.load_md(src_off);
+                                acc = reducer.reduce(acc, val);
+                            }
+
+        float result = reducer.finalize(acc, conf_.reduce_size);
+        result = conf_.post_ops.apply(result, dst.load_md(pos), po_args_, pos);
+        dst.store_md(result, pos);
+    }
+};
+
+} // namespace sycl
+} // namespace generic
+} // namespace gpu
+} // namespace impl
+} // namespace dnnl
+#endif
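Note on `Reducer`: it factors every supported algorithm into three pieces, an identity element, a binary accumulation step, and a final transform. For reference, here is a minimal host-side sketch of that decomposition for the `dnnl_reduction_norm_lp_sum` case; the function name is illustrative and the `::sycl::` math calls are swapped for their `<cmath>` equivalents:

```cpp
#include <cmath>
#include <vector>

// Host-side model of the kernel's three-phase reduction for
// dnnl_reduction_norm_lp_sum: acc = sum(|x|^p), result = (acc + eps)^(1/p).
float norm_lp_sum(const std::vector<float> &xs, float p, float eps) {
    float acc = 0.f; // identity() for the Lp-norm family
    for (float x : xs)
        acc += std::pow(std::fabs(x), p); // reduce(acc, x) per element
    return std::pow(acc + eps, 1.f / p); // finalize(acc, size)
}
```

Splitting the algorithm this way keeps the six nested accumulation loops in `operator()` generic: only `reduce()` and `finalize()` change per algorithm.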
diff --git a/src/gpu/generic/sycl/ref_reduction.cpp b/src/gpu/generic/sycl/ref_reduction.cpp
new file mode 100644
index 00000000000..8d0c21f5b9c
--- /dev/null
+++ b/src/gpu/generic/sycl/ref_reduction.cpp
@@ -0,0 +1,96 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "ref_reduction.hpp"
+
+#include "gpu/generic/sycl/engine.hpp"
+#include "gpu/generic/sycl/reduction_kernels.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace generic {
+namespace sycl {
+
+// Host-side debug dumps; keep disabled in committed code.
+#define DEBUG_PRINT 0
+
+#if DEBUG_PRINT == 1
+#define DUMP_VAR(X) std::cout << #X << ": " << (X) << "\n";
+
+#define DUMP_VEC(X) \
+    std::cout << #X << ": ["; \
+    for (const auto &e : (X)) \
+        std::cout << e << ", "; \
+    std::cout << "]\n";
+#else
+#define DUMP_VAR(X)
+#define DUMP_VEC(X)
+#endif
+
+status_t ref_reduction_t::pd_t::init_conf() {
+    conf_.alg = desc()->alg_kind;
+    conf_.src_md = xpu::sycl::md_t(src_md());
+    conf_.dst_md = xpu::sycl::md_t(dst_md());
+    conf_.p = desc()->p;
+    conf_.eps = desc()->eps;
+
+    auto src_wrap = memory_desc_wrapper(src_md());
+    auto dst_wrap = memory_desc_wrapper(dst_md());
+
+    // A dimension is reduced iff the src and dst extents differ; reduce_dims
+    // holds the reduction extent (1 for non-reduced dims) and reduce_size is
+    // the number of input elements folded into each output element.
+    const auto ndims = dst_wrap.ndims();
+    for (int d = 0; d < xpu::sycl::md_t::max_dims; d++) {
+        conf_.reduce_dims[d] = dim_t {1};
+        if (d < ndims) {
+            if (src_wrap.dims()[d] != dst_wrap.dims()[d]) {
+                conf_.reduce_dims[d] = src_wrap.dims()[d];
+                conf_.reduce_size *= conf_.reduce_dims[d];
+            }
+        }
+    }
+
+    conf_.post_ops = sycl_post_ops_t(attr(), dst_wrap);
+
+    DUMP_VAR(conf_.reduce_size);
+    DUMP_VEC(conf_.reduce_dims);
+
+    return status::success;
+}
+
+status_t ref_reduction_t::init(impl::engine_t *engine) {
+    const auto kid = ::sycl::get_kernel_id<reduction_kernel_fwd_t>();
+    CHECK(create_kernel(engine, kid, &kernel_));
+
+    return status::success;
+}
+
+status_t ref_reduction_t::execute(const exec_ctx_t &ctx) const {
+    auto dst_wrap = memory_desc_wrapper(pd()->dst_md());
+    // One work-item per output element; each item reduces its own slice.
+    return parallel_for(ctx, kernel_, [&](::sycl::handler &cgh) {
+        reduction_kernel_fwd_t reduction_kernel(pd()->conf_, cgh, ctx);
+        cgh.parallel_for(::sycl::range<1>(dst_wrap.nelems()), reduction_kernel);
+    });
+}
+
+} // namespace sycl
+} // namespace generic
+} // namespace gpu
+} // namespace impl
+} // namespace dnnl
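A quick worked example of what `init_conf()` produces. The shapes below are hypothetical, but the loop is the same one as in the patch: reducing dims 1 and 3 of a 4D tensor gives `reduce_dims = {1, 3, 1, 5, 1, 1}` and `reduce_size = 15`:

```cpp
#include <cstdint>
#include <cstdio>

using dim_t = std::int64_t;

int main() {
    // Hypothetical shapes: reduce dims 1 and 3, i.e. {2,3,4,5} -> {2,1,4,1}.
    const int ndims = 4;
    const dim_t src_dims[] = {2, 3, 4, 5};
    const dim_t dst_dims[] = {2, 1, 4, 1};

    dim_t reduce_dims[6];
    dim_t reduce_size = 1;
    for (int d = 0; d < 6; d++) {
        reduce_dims[d] = 1; // non-reduced dims contribute a loop of size 1
        if (d < ndims && src_dims[d] != dst_dims[d]) {
            reduce_dims[d] = src_dims[d];
            reduce_size *= reduce_dims[d];
        }
    }
    // Each of the 2*4 = 8 output elements accumulates 15 input elements.
    std::printf("reduce_size = %ld\n", (long)reduce_size);
    return 0;
}
```

Note the "extents differ" test means a dimension whose source extent is already 1 can never be marked as reduced, which is consistent with it contributing nothing to the accumulation.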
diff --git a/src/gpu/generic/sycl/ref_reduction.hpp b/src/gpu/generic/sycl/ref_reduction.hpp
new file mode 100644
index 00000000000..73a106598de
--- /dev/null
+++ b/src/gpu/generic/sycl/ref_reduction.hpp
@@ -0,0 +1,83 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef GPU_GENERIC_SYCL_REF_REDUCTION_HPP
+#define GPU_GENERIC_SYCL_REF_REDUCTION_HPP
+
+#include "common/primitive_desc_iterator.hpp"
+#include "common/reorder.hpp"
+#include "common/reorder_pd.hpp"
+#include "gpu/generic/sycl/sycl_gpu_primitive.hpp"
+#include "gpu/generic/sycl/sycl_io_helper.hpp"
+#include "gpu/generic/sycl/sycl_post_ops.hpp"
+#include "gpu/generic/sycl/sycl_primitive_conf.hpp"
+#include "gpu/generic/sycl/sycl_utils.hpp"
+#include "gpu/gpu_reduction_pd.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace generic {
+namespace sycl {
+
+struct ref_reduction_t : public gpu::generic::sycl::primitive_t {
+    using gpu::generic::sycl::primitive_t::primitive_t;
+
+    struct pd_t : public gpu_reduction_pd_t {
+        using gpu_reduction_pd_t::gpu_reduction_pd_t;
+
+        DECLARE_COMMON_PD_T("dpcpp:ref:any", ref_reduction_t);
+
+        status_t init(impl::engine_t *engine) {
+            using sm = primitive_attr_t::skip_mask_t;
+
+            memory_desc_wrapper src_wrap(src_md());
+            memory_desc_wrapper dst_wrap(dst_md());
+
+            bool ok = set_default_params() == status::success
+                    && attr()->has_default_values(sm::post_ops)
+                    && sycl_post_ops_t::post_ops_ok(attr())
+                    && attr_.set_default_formats(dst_md()) == status::success
+                    && src_wrap.is_plain() && dst_wrap.is_plain()
+                    && src_wrap.ndims() == dst_wrap.ndims()
+                    && md_dims_in_range(src_md()) && md_dims_in_range(dst_md());
+            if (!ok) return status::unimplemented;
+
+            return init_conf();
+        }
+
+        sycl_reduction_conf_t conf_;
+
+    private:
+        status_t init_conf();
+    };
+
+    status_t init(impl::engine_t *engine) override;
+    status_t execute(const exec_ctx_t &ctx) const override;
+
+private:
+    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
+    kernel_t kernel_;
+    std::shared_ptr<impl::primitive_t> reorder_p_;
+};
+
+} // namespace sycl
+} // namespace generic
+} // namespace gpu
+} // namespace impl
+} // namespace dnnl
+
+#endif
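To tie the pieces together: each work-item created by the `parallel_for` in `ref_reduction.cpp` converts its linear id back into a destination coordinate, peeling dimensions off innermost-first, exactly as the loop at the top of `reduction_kernel_fwd_t::operator()` does. A self-contained model of that unraveling, with names chosen for illustration:

```cpp
#include <array>
#include <cstdint>

using dim_t = std::int64_t;

// Convert a linear index into an n-dim coordinate, innermost dim first,
// mirroring the indexing loop in reduction_kernel_fwd_t::operator().
std::array<dim_t, 6> unravel(dim_t id, const dim_t *dims, int ndims) {
    std::array<dim_t, 6> pos {}; // zero-init so trailing dims stay 0
    dim_t l_offset = id;
    for (int i = 0; i < ndims; i++) {
        const int d = ndims - 1 - i;
        pos[d] = l_offset % dims[d];
        l_offset /= dims[d];
    }
    return pos;
}

// E.g. for dst dims {2, 1, 4, 1}, id 5 maps to pos {1, 0, 1, 0}.
```

Zero-initializing `pos` matters: coordinates past `ndims()` still feed the `src_off` computation in the kernel, so they must not hold garbage.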
diff --git a/src/gpu/generic/sycl/sycl_primitive_conf.hpp b/src/gpu/generic/sycl/sycl_primitive_conf.hpp
index ec4e812cebb..dd73597e255 100644
--- a/src/gpu/generic/sycl/sycl_primitive_conf.hpp
+++ b/src/gpu/generic/sycl/sycl_primitive_conf.hpp
@@ -415,6 +415,17 @@ struct sycl_pooling_bwd_conf_t : public sycl_pooling_base_conf_t {
     xpu::sycl::md_t diff_dst_md;
 };
 
+struct sycl_reduction_conf_t {
+    dnnl_alg_kind_t alg = dnnl_alg_kind_undef;
+    xpu::sycl::md_t src_md;
+    xpu::sycl::md_t dst_md;
+    float p = 1.0f;
+    float eps = 0.0f;
+    sycl_post_ops_t post_ops;
+    dim_t reduce_dims[xpu::sycl::md_t::max_dims];
+    int reduce_size = 1;
+};
+
 CHECK_SYCL_KERNEL_ARG_TYPE(sycl_binary_conf_t);
 CHECK_SYCL_KERNEL_ARG_TYPE(sycl_prelu_conf_t);
 CHECK_SYCL_KERNEL_ARG_TYPE(sycl_shuffle_conf_t);
@@ -431,6 +442,7 @@ CHECK_SYCL_KERNEL_ARG_TYPE(sycl_pooling_bwd_conf_t);
 CHECK_SYCL_KERNEL_ARG_TYPE(sycl_convolution_fwd_conf_t);
 CHECK_SYCL_KERNEL_ARG_TYPE(sycl_convolution_bwd_data_conf_t);
 CHECK_SYCL_KERNEL_ARG_TYPE(sycl_convolution_bwd_weights_conf_t);
+CHECK_SYCL_KERNEL_ARG_TYPE(sycl_reduction_conf_t);
 
 } // namespace sycl
 } // namespace generic
diff --git a/src/gpu/gpu_reduction_list.cpp b/src/gpu/gpu_reduction_list.cpp
index b29c238e04a..7d180c9af46 100644
--- a/src/gpu/gpu_reduction_list.cpp
+++ b/src/gpu/gpu_reduction_list.cpp
@@ -36,6 +36,10 @@
 #include "gpu/amd/miopen_reduction.hpp"
 #endif
 
+#ifdef GENERIC_SYCL_KERNELS_ENABLED
+#include "gpu/generic/sycl/ref_reduction.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {
@@ -44,13 +48,14 @@
 namespace {
 // clang-format off
 constexpr impl_list_item_t impl_list[] = REG_REDUCTION_P({
         GPU_INSTANCE_INTEL_DEVMODE(intel::jit::jit_reduction_t)
         GPU_INSTANCE_INTEL(intel::ocl::atomic_reduction_t)
         GPU_INSTANCE_INTEL(intel::ocl::combined_reduction_t)
         GPU_INSTANCE_INTEL(intel::ocl::ref_reduction_t)
         GPU_INSTANCE_INTEL(intel::ocl::reusable_ref_reduction_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_reduction_t)
         GPU_INSTANCE_AMD(amd::miopen_reduction_t)
+        GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_reduction_t)
         nullptr,
 });
 // clang-format on
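For context, a minimal sketch of how this implementation would be reached through the public API once the patch is in place. It assumes oneDNN's v3.x C++ API; the engine index, shapes, and choice of `reduction_mean` are illustrative:

```cpp
#include <unordered_map>
#include "dnnl.hpp"

int main() {
    using namespace dnnl;
    engine eng(engine::kind::gpu, 0); // a generic SYCL GPU device
    stream strm(eng);

    // Reduce dims 1 and 3: {2, 3, 4, 5} -> {2, 1, 4, 1}.
    memory::desc src_md({2, 3, 4, 5}, memory::data_type::f32,
            memory::format_tag::abcd);
    memory::desc dst_md({2, 1, 4, 1}, memory::data_type::f32,
            memory::format_tag::abcd);

    // p and eps are ignored for mean; they matter for the Lp-norm algs.
    auto pd = reduction::primitive_desc(
            eng, algorithm::reduction_mean, src_md, dst_md, 0.f, 0.f);
    auto prim = reduction(pd);

    memory src_mem(src_md, eng), dst_mem(dst_md, eng);
    prim.execute(strm, {{DNNL_ARG_SRC, src_mem}, {DNNL_ARG_DST, dst_mem}});
    strm.wait();
    return 0;
}
```

With the dispatch list above, a GPU engine built with `GENERIC_SYCL_KERNELS_ENABLED` and no vendor-specific implementation available falls through to `generic::sycl::ref_reduction_t` ("dpcpp:ref:any").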