From 9efd30228160ae1b94f1fe21ebb854044e05d423 Mon Sep 17 00:00:00 2001
From: Svetlozar Georgiev
Date: Wed, 4 Dec 2024 16:58:13 +0000
Subject: [PATCH] gpu: generic: implement SYCL reduction

---
 src/gpu/generic/sycl/reduction_kernels.hpp   | 163 +++++++++++++++++++++
 src/gpu/generic/sycl/ref_reduction.cpp       |  96 ++++++++++++
 src/gpu/generic/sycl/ref_reduction.hpp       |  83 +++++++++
 src/gpu/generic/sycl/sycl_primitive_conf.hpp |  12 ++
 src/gpu/gpu_reduction_list.cpp               |   5 +
 5 files changed, 359 insertions(+)
 create mode 100644 src/gpu/generic/sycl/reduction_kernels.hpp
 create mode 100644 src/gpu/generic/sycl/ref_reduction.cpp
 create mode 100644 src/gpu/generic/sycl/ref_reduction.hpp

diff --git a/src/gpu/generic/sycl/reduction_kernels.hpp b/src/gpu/generic/sycl/reduction_kernels.hpp
new file mode 100644
index 00000000000..f67dc26877b
--- /dev/null
+++ b/src/gpu/generic/sycl/reduction_kernels.hpp
@@ -0,0 +1,163 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef GPU_GENERIC_SYCL_REDUCTION_KERNELS_HPP
+#define GPU_GENERIC_SYCL_REDUCTION_KERNELS_HPP
+
+#include "common/c_types_map.hpp"
+#include "common/dnnl_thread.hpp"
+#include "common/primitive_exec_types.hpp"
+#include "common/utils.hpp"
+#include "gpu/generic/sycl/sycl_io_helper.hpp"
+#include "gpu/generic/sycl/sycl_math_utils.hpp"
+#include "gpu/generic/sycl/sycl_primitive_conf.hpp"
+#include "xpu/sycl/memory_storage_base.hpp"
+#include "xpu/sycl/types.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace generic {
+namespace sycl {
+
+// Device-side printf debugging; keep disabled in committed code.
+#define DEBUG_PRINT_KERNEL 0
+
+#if DEBUG_PRINT_KERNEL == 1
+#define DUMP(FMT, ...) \
+    ::sycl::ext::oneapi::experimental::printf(FMT, __VA_ARGS__)
+#else
+#define DUMP(FMT, ...)
+#endif
+
+// Maps a reduction algorithm onto an identity element, a binary
+// accumulation step, and a final transform.
+struct Reducer {
+    dnnl_alg_kind_t alg_;
+    float p_, eps_;
+
+    Reducer(dnnl_alg_kind_t alg, float p, float eps)
+        : alg_(alg), p_(p), eps_(eps) {}
+
+    float identity() const {
+        if (alg_ == dnnl_reduction_min) {
+            return std::numeric_limits<float>::max();
+        } else if (alg_ == dnnl_reduction_max) {
+            return std::numeric_limits<float>::lowest();
+        } else if (alg_ == dnnl_reduction_mul) {
+            return 1.f;
+        }
+
+        return 0.f;
+    }
+
+    float reduce(float lhs, float rhs) const {
+        if (alg_ == dnnl_reduction_sum || alg_ == dnnl_reduction_mean) {
+            return lhs + rhs;
+        } else if (alg_ == dnnl_reduction_min) {
+            return ::sycl::min(lhs, rhs);
+        } else if (alg_ == dnnl_reduction_max) {
+            return ::sycl::max(lhs, rhs);
+        } else if (alg_ == dnnl_reduction_mul) {
+            return lhs * rhs;
+        } else if (alg_ == dnnl_reduction_norm_lp_max
+                || alg_ == dnnl_reduction_norm_lp_sum
+                || alg_ == dnnl_reduction_norm_lp_power_p_max
+                || alg_ == dnnl_reduction_norm_lp_power_p_sum) {
+            return lhs + ::sycl::pow(::sycl::fabs(rhs), p_);
+        }
+
+        return ::sycl::nan(0U);
+    }
+
+    float finalize(float val, int size) const {
+        if (alg_ == dnnl_reduction_mean) {
+            return val / size;
+        } else if (alg_ == dnnl_reduction_norm_lp_max) {
+            return ::sycl::rootn(::sycl::max(val, eps_), p_);
+        } else if (alg_ == dnnl_reduction_norm_lp_sum) {
+            return ::sycl::rootn(val + eps_, p_);
+        } else if (alg_ == dnnl_reduction_norm_lp_power_p_max) {
+            return ::sycl::max(val, eps_);
+        } else if (alg_ == dnnl_reduction_norm_lp_power_p_sum) {
+            return val + eps_;
+        }
+
+        return val;
+    }
+};
+
+struct reduction_kernel_fwd_t {
+    sycl_reduction_conf_t conf_;
+    xpu::sycl::in_memory_arg_t src_;
+    xpu::sycl::out_memory_arg_t dst_;
+    post_op_input_args po_args_;
+
+    reduction_kernel_fwd_t(const sycl_reduction_conf_t &conf,
+            ::sycl::handler &cgh, const exec_ctx_t &ctx)
+        : conf_(conf)
+        , src_(CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_SRC))
+        , dst_(CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST))
+        , po_args_(cgh, ctx, conf_.post_ops) {}
+
+    void operator()(::sycl::item<1> item) const {
+        Reducer reducer(conf_.alg, conf_.p, conf_.eps);
+
+        memory_tensor_t<::sycl::access_mode::read> src(src_, conf_.src_md);
+        memory_tensor_t<::sycl::access_mode::write> dst(dst_, conf_.dst_md);
+        const int id = item.get_linear_id();
+
+        // Unravel the linear work-item id into a destination coordinate,
+        // innermost dimension first.
+        const auto &dst_md = conf_.dst_md;
+        dims_t pos {};
+        int l_offset = id;
+        for (int i = 0; i < dst_md.ndims(); i++) {
+            const int d = dst_md.ndims() - 1 - i;
+            const dim_t cur_dim = dst_md.dims()[d];
+            pos[d] = l_offset % cur_dim;
+            l_offset = l_offset / cur_dim;
+        }
+
+        // Accumulate over the reduced dimensions; non-reduced entries of
+        // reduce_dims are 1, so their loops collapse.
+        float acc = reducer.identity();
+        for (off_t d0 = 0; d0 < conf_.reduce_dims[0]; d0++)
+            for (off_t d1 = 0; d1 < conf_.reduce_dims[1]; d1++)
+                for (off_t d2 = 0; d2 < conf_.reduce_dims[2]; d2++)
+                    for (off_t d3 = 0; d3 < conf_.reduce_dims[3]; d3++)
+                        for (off_t d4 = 0; d4 < conf_.reduce_dims[4]; d4++)
+                            for (off_t d5 = 0; d5 < conf_.reduce_dims[5];
+                                    d5++) {
+                                dims_t src_off = {pos[0] + d0, pos[1] + d1,
+                                        pos[2] + d2, pos[3] + d3, pos[4] + d4,
+                                        pos[5] + d5};
+                                const float val = src.load_md(src_off);
+                                acc = reducer.reduce(acc, val);
+                            }
+
+        float result = reducer.finalize(acc, conf_.reduce_size);
+        result = conf_.post_ops.apply(result, dst.load_md(pos), po_args_, pos);
+        dst.store_md(result, pos);
+    }
+};
+
+} // namespace sycl
+} // namespace generic
+} // namespace gpu
+} // namespace impl
+} // namespace dnnl
+#endif
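Note on `Reducer`: it factors every supported algorithm into three pieces, an identity element, a binary accumulation step, and a final transform. For reference, here is a minimal host-side sketch of that decomposition for the `dnnl_reduction_norm_lp_sum` case; the function name is illustrative and the `::sycl::` math calls are swapped for their `<cmath>` equivalents:

```cpp
#include <cmath>
#include <vector>

// Host-side model of the kernel's three-phase reduction for
// dnnl_reduction_norm_lp_sum: acc = sum(|x|^p), result = (acc + eps)^(1/p).
float norm_lp_sum(const std::vector<float> &xs, float p, float eps) {
    float acc = 0.f; // identity() for the Lp-norm family
    for (float x : xs)
        acc += std::pow(std::fabs(x), p); // reduce(acc, x) per element
    return std::pow(acc + eps, 1.f / p); // finalize(acc, size)
}
```

Splitting the algorithm this way keeps the six nested accumulation loops in `operator()` generic: only `reduce()` and `finalize()` change per algorithm.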
diff --git a/src/gpu/generic/sycl/ref_reduction.cpp b/src/gpu/generic/sycl/ref_reduction.cpp
new file mode 100644
index 00000000000..8d0c21f5b9c
--- /dev/null
+++ b/src/gpu/generic/sycl/ref_reduction.cpp
@@ -0,0 +1,96 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "ref_reduction.hpp"
+
+#include "gpu/generic/sycl/engine.hpp"
+#include "gpu/generic/sycl/reduction_kernels.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace generic {
+namespace sycl {
+
+// Host-side debug dumps; keep disabled in committed code.
+#define DEBUG_PRINT 0
+
+#if DEBUG_PRINT == 1
+#define DUMP_VAR(X) std::cout << #X << ": " << (X) << "\n";
+
+#define DUMP_VEC(X) \
+    std::cout << #X << ": ["; \
+    for (const auto &e : (X)) \
+        std::cout << e << ", "; \
+    std::cout << "]\n";
+#else
+#define DUMP_VAR(X)
+#define DUMP_VEC(X)
+#endif
+
+status_t ref_reduction_t::pd_t::init_conf() {
+    conf_.alg = desc()->alg_kind;
+    conf_.src_md = xpu::sycl::md_t(src_md());
+    conf_.dst_md = xpu::sycl::md_t(dst_md());
+    conf_.p = desc()->p;
+    conf_.eps = desc()->eps;
+
+    auto src_wrap = memory_desc_wrapper(src_md());
+    auto dst_wrap = memory_desc_wrapper(dst_md());
+
+    // A dimension is reduced iff the src and dst extents differ; reduce_dims
+    // holds the reduction extent (1 for non-reduced dims) and reduce_size is
+    // the number of input elements folded into each output element.
+    const auto ndims = dst_wrap.ndims();
+    for (int d = 0; d < xpu::sycl::md_t::max_dims; d++) {
+        conf_.reduce_dims[d] = dim_t {1};
+        if (d < ndims) {
+            if (src_wrap.dims()[d] != dst_wrap.dims()[d]) {
+                conf_.reduce_dims[d] = src_wrap.dims()[d];
+                conf_.reduce_size *= conf_.reduce_dims[d];
+            }
+        }
+    }
+
+    conf_.post_ops = sycl_post_ops_t(attr(), dst_wrap);
+
+    DUMP_VAR(conf_.reduce_size);
+    DUMP_VEC(conf_.reduce_dims);
+
+    return status::success;
+}
+
+status_t ref_reduction_t::init(impl::engine_t *engine) {
+    const auto kid = ::sycl::get_kernel_id<reduction_kernel_fwd_t>();
+    CHECK(create_kernel(engine, kid, &kernel_));
+
+    return status::success;
+}
+
+status_t ref_reduction_t::execute(const exec_ctx_t &ctx) const {
+    auto dst_wrap = memory_desc_wrapper(pd()->dst_md());
+    // One work-item per output element; each item reduces its own slice.
+    return parallel_for(ctx, kernel_, [&](::sycl::handler &cgh) {
+        reduction_kernel_fwd_t reduction_kernel(pd()->conf_, cgh, ctx);
+        cgh.parallel_for(::sycl::range<1>(dst_wrap.nelems()), reduction_kernel);
+    });
+}
+
+} // namespace sycl
+} // namespace generic
+} // namespace gpu
+} // namespace impl
+} // namespace dnnl
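A quick worked example of what `init_conf()` produces. The shapes below are hypothetical, but the loop is the same one as in the patch: reducing dims 1 and 3 of a 4D tensor gives `reduce_dims = {1, 3, 1, 5, 1, 1}` and `reduce_size = 15`:

```cpp
#include <cstdint>
#include <cstdio>

using dim_t = std::int64_t;

int main() {
    // Hypothetical shapes: reduce dims 1 and 3, i.e. {2,3,4,5} -> {2,1,4,1}.
    const int ndims = 4;
    const dim_t src_dims[] = {2, 3, 4, 5};
    const dim_t dst_dims[] = {2, 1, 4, 1};

    dim_t reduce_dims[6];
    dim_t reduce_size = 1;
    for (int d = 0; d < 6; d++) {
        reduce_dims[d] = 1; // non-reduced dims contribute a loop of size 1
        if (d < ndims && src_dims[d] != dst_dims[d]) {
            reduce_dims[d] = src_dims[d];
            reduce_size *= reduce_dims[d];
        }
    }
    // Each of the 2*4 = 8 output elements accumulates 15 input elements.
    std::printf("reduce_size = %ld\n", (long)reduce_size);
    return 0;
}
```

Note the "extents differ" test means a dimension whose source extent is already 1 can never be marked as reduced, which is consistent with it contributing nothing to the accumulation.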
diff --git a/src/gpu/generic/sycl/ref_reduction.hpp b/src/gpu/generic/sycl/ref_reduction.hpp
new file mode 100644
index 00000000000..73a106598de
--- /dev/null
+++ b/src/gpu/generic/sycl/ref_reduction.hpp
@@ -0,0 +1,83 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef GPU_GENERIC_SYCL_REF_REDUCTION_HPP
+#define GPU_GENERIC_SYCL_REF_REDUCTION_HPP
+
+#include "common/primitive_desc_iterator.hpp"
+#include "common/reorder.hpp"
+#include "common/reorder_pd.hpp"
+#include "gpu/generic/sycl/sycl_gpu_primitive.hpp"
+#include "gpu/generic/sycl/sycl_io_helper.hpp"
+#include "gpu/generic/sycl/sycl_post_ops.hpp"
+#include "gpu/generic/sycl/sycl_primitive_conf.hpp"
+#include "gpu/generic/sycl/sycl_utils.hpp"
+#include "gpu/gpu_reduction_pd.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace generic {
+namespace sycl {
+
+struct ref_reduction_t : public gpu::generic::sycl::primitive_t {
+    using gpu::generic::sycl::primitive_t::primitive_t;
+
+    struct pd_t : public gpu_reduction_pd_t {
+        using gpu_reduction_pd_t::gpu_reduction_pd_t;
+
+        DECLARE_COMMON_PD_T("dpcpp:ref:any", ref_reduction_t);
+
+        status_t init(impl::engine_t *engine) {
+            using sm = primitive_attr_t::skip_mask_t;
+
+            memory_desc_wrapper src_wrap(src_md());
+            memory_desc_wrapper dst_wrap(dst_md());
+
+            bool ok = set_default_params() == status::success
+                    && attr()->has_default_values(sm::post_ops)
+                    && sycl_post_ops_t::post_ops_ok(attr())
+                    && attr_.set_default_formats(dst_md()) == status::success
+                    && src_wrap.is_plain() && dst_wrap.is_plain()
+                    && src_wrap.ndims() == dst_wrap.ndims()
+                    && md_dims_in_range(src_md()) && md_dims_in_range(dst_md());
+            if (!ok) return status::unimplemented;
+
+            return init_conf();
+        }
+
+        sycl_reduction_conf_t conf_;
+
+    private:
+        status_t init_conf();
+    };
+
+    status_t init(impl::engine_t *engine) override;
+    status_t execute(const exec_ctx_t &ctx) const override;
+
+private:
+    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
+    kernel_t kernel_;
+    std::shared_ptr<impl::primitive_t> reorder_p_;
+};
+
+} // namespace sycl
+} // namespace generic
+} // namespace gpu
+} // namespace impl
+} // namespace dnnl
+
+#endif
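To tie the pieces together: each work-item created by the `parallel_for` in `ref_reduction.cpp` converts its linear id back into a destination coordinate, peeling dimensions off innermost-first, exactly as the loop at the top of `reduction_kernel_fwd_t::operator()` does. A self-contained model of that unraveling, with names chosen for illustration:

```cpp
#include <array>
#include <cstdint>

using dim_t = std::int64_t;

// Convert a linear index into an n-dim coordinate, innermost dim first,
// mirroring the indexing loop in reduction_kernel_fwd_t::operator().
std::array<dim_t, 6> unravel(dim_t id, const dim_t *dims, int ndims) {
    std::array<dim_t, 6> pos {}; // zero-init so trailing dims stay 0
    dim_t l_offset = id;
    for (int i = 0; i < ndims; i++) {
        const int d = ndims - 1 - i;
        pos[d] = l_offset % dims[d];
        l_offset /= dims[d];
    }
    return pos;
}

// E.g. for dst dims {2, 1, 4, 1}, id 5 maps to pos {1, 0, 1, 0}.
```

Zero-initializing `pos` matters: coordinates past `ndims()` still feed the `src_off` computation in the kernel, so they must not hold garbage.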
diff --git a/src/gpu/generic/sycl/sycl_primitive_conf.hpp b/src/gpu/generic/sycl/sycl_primitive_conf.hpp
index ec4e812cebb..dd73597e255 100644
--- a/src/gpu/generic/sycl/sycl_primitive_conf.hpp
+++ b/src/gpu/generic/sycl/sycl_primitive_conf.hpp
@@ -415,6 +415,17 @@ struct sycl_pooling_bwd_conf_t : public sycl_pooling_base_conf_t {
     xpu::sycl::md_t diff_dst_md;
 };
 
+struct sycl_reduction_conf_t {
+    dnnl_alg_kind_t alg = dnnl_alg_kind_undef;
+    xpu::sycl::md_t src_md;
+    xpu::sycl::md_t dst_md;
+    float p = 1.0f;
+    float eps = 0.0f;
+    sycl_post_ops_t post_ops;
+    dim_t reduce_dims[xpu::sycl::md_t::max_dims];
+    int reduce_size = 1;
+};
+
 CHECK_SYCL_KERNEL_ARG_TYPE(sycl_binary_conf_t);
 CHECK_SYCL_KERNEL_ARG_TYPE(sycl_prelu_conf_t);
 CHECK_SYCL_KERNEL_ARG_TYPE(sycl_shuffle_conf_t);
@@ -431,6 +442,7 @@ CHECK_SYCL_KERNEL_ARG_TYPE(sycl_pooling_bwd_conf_t);
 CHECK_SYCL_KERNEL_ARG_TYPE(sycl_convolution_fwd_conf_t);
 CHECK_SYCL_KERNEL_ARG_TYPE(sycl_convolution_bwd_data_conf_t);
 CHECK_SYCL_KERNEL_ARG_TYPE(sycl_convolution_bwd_weights_conf_t);
+CHECK_SYCL_KERNEL_ARG_TYPE(sycl_reduction_conf_t);
 
 } // namespace sycl
 } // namespace generic
diff --git a/src/gpu/gpu_reduction_list.cpp b/src/gpu/gpu_reduction_list.cpp
index b29c238e04a..7d180c9af46 100644
--- a/src/gpu/gpu_reduction_list.cpp
+++ b/src/gpu/gpu_reduction_list.cpp
@@ -36,6 +36,10 @@
 #include "gpu/amd/miopen_reduction.hpp"
 #endif
 
+#ifdef GENERIC_SYCL_KERNELS_ENABLED
+#include "gpu/generic/sycl/ref_reduction.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {
@@ -44,13 +48,14 @@
 namespace {
 // clang-format off
 constexpr impl_list_item_t impl_list[] = REG_REDUCTION_P({
         GPU_INSTANCE_INTEL_DEVMODE(intel::jit::jit_reduction_t)
         GPU_INSTANCE_INTEL(intel::ocl::atomic_reduction_t)
         GPU_INSTANCE_INTEL(intel::ocl::combined_reduction_t)
         GPU_INSTANCE_INTEL(intel::ocl::ref_reduction_t)
         GPU_INSTANCE_INTEL(intel::ocl::reusable_ref_reduction_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_reduction_t)
         GPU_INSTANCE_AMD(amd::miopen_reduction_t)
+        GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_reduction_t)
         nullptr,
 });
 // clang-format on
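For context, a minimal sketch of how this implementation would be reached through the public API once the patch is in place. It assumes oneDNN's v3.x C++ API; the engine index, shapes, and choice of `reduction_mean` are illustrative:

```cpp
#include <unordered_map>
#include "dnnl.hpp"

int main() {
    using namespace dnnl;
    engine eng(engine::kind::gpu, 0); // a generic SYCL GPU device
    stream strm(eng);

    // Reduce dims 1 and 3: {2, 3, 4, 5} -> {2, 1, 4, 1}.
    memory::desc src_md({2, 3, 4, 5}, memory::data_type::f32,
            memory::format_tag::abcd);
    memory::desc dst_md({2, 1, 4, 1}, memory::data_type::f32,
            memory::format_tag::abcd);

    // p and eps are ignored for mean; they matter for the Lp-norm algs.
    auto pd = reduction::primitive_desc(
            eng, algorithm::reduction_mean, src_md, dst_md, 0.f, 0.f);
    auto prim = reduction(pd);

    memory src_mem(src_md, eng), dst_mem(dst_md, eng);
    prim.execute(strm, {{DNNL_ARG_SRC, src_mem}, {DNNL_ARG_DST, dst_mem}});
    strm.wait();
    return 0;
}
```

With the dispatch list above, a GPU engine built with `GENERIC_SYCL_KERNELS_ENABLED` and no vendor-specific implementation available falls through to `generic::sycl::ref_reduction_t` ("dpcpp:ref:any").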