diff --git a/src/gpu/generic/sycl/README.md b/src/gpu/generic/sycl/README.md
index e7ff444462d..f1c64ed91d0 100644
--- a/src/gpu/generic/sycl/README.md
+++ b/src/gpu/generic/sycl/README.md
@@ -94,6 +94,14 @@ The implementation supports both forward and backward directions.
 * Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`, `N`
 * Supported data types: `f32`, `bf16`, `f16`, `s32`, `s8`, `u8`
 
+## Inner Product
+
+The implementation supports the forward direction only.
+
+* Supported formats: All plain memory formats are supported.
+* Supported data types: All data type combinations listed in the oneDNN specification are supported.
+* Supported post-ops: All post-ops listed in the oneDNN specification are supported.
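+
+A minimal usage sketch via the oneDNN C++ API (the dimensions and variable
+names below are illustrative only):
+
+```cpp
+using namespace dnnl;
+engine eng(engine::kind::gpu, 0);
+// src: N x IC, weights: OC x IC, dst: N x OC.
+memory::desc src_md({128, 256}, memory::data_type::f32, memory::format_tag::nc);
+memory::desc wei_md({512, 256}, memory::data_type::f32, memory::format_tag::oi);
+memory::desc dst_md({128, 512}, memory::data_type::f32, memory::format_tag::nc);
+auto ip_pd = inner_product_forward::primitive_desc(
+        eng, prop_kind::forward_inference, src_md, wei_md, dst_md);
+auto ip = inner_product_forward(ip_pd);
+```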
+
 ## Layer Normalization
 
 The implementation supports both forward and backward directions.
diff --git a/src/gpu/generic/sycl/ref_inner_product.cpp b/src/gpu/generic/sycl/ref_inner_product.cpp
new file mode 100644
index 00000000000..eca131ef670
--- /dev/null
+++ b/src/gpu/generic/sycl/ref_inner_product.cpp
@@ -0,0 +1,55 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+* Copyright 2024 Codeplay Software Limited
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "gpu/generic/sycl/ref_inner_product.hpp"
+#include "common/primitive_desc_iterator.hpp"
+
+namespace dnnl::impl::gpu::generic::sycl {
+
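+// Creates the primitive descriptor of the nested matmul that performs the
+// actual computation: iterate over the matmul implementations available on
+// this engine and keep the first one that accepts the descriptor and
+// attributes.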
+status_t ref_inner_product_fwd_t::pd_t::init_matmul(impl::engine_t *engine) {
+    matmul_desc_t matmul_desc;
+    CHECK(matmul_desc_init(&matmul_desc, &src_md_reshaped, &weights_md_reshaped,
+            &bias_md_reshaped, arg_md(DNNL_ARG_DST)));
+    primitive_attr_t matmul_attr(*attr());
+
+    primitive_desc_iterator_t it(engine,
+            reinterpret_cast<op_desc_t *>(&matmul_desc), &matmul_attr, nullptr);
+    if (!it.is_initialized()) return status::out_of_memory;
+    while (++it != it.end()) {
+        matmul_pd = *it;
+        if (matmul_pd) { break; }
+    }
+    if (!matmul_pd) { return status::invalid_arguments; }
+    return status::success;
+}
+
+status_t ref_inner_product_fwd_t::init(impl::engine_t *engine) {
+    std::pair<std::shared_ptr<impl::primitive_t>, cache_state_t> p;
+    CHECK(pd()->matmul_pd->create_primitive_nested(p, engine));
+    matmul_primitive = p.first;
+    return status::success;
+}
+
+status_t ref_inner_product_fwd_t::execute(const exec_ctx_t &ctx) const {
+    nested_scratchpad_t nested_scratchpad(
+            ctx, memory_tracking::names::key_nested, matmul_primitive);
+    exec_ctx_t copied_ctx(ctx);
+    copied_ctx.set_scratchpad_grantor(nested_scratchpad.grantor());
+    return matmul_primitive->execute(copied_ctx);
+}
+
+} // namespace dnnl::impl::gpu::generic::sycl
diff --git a/src/gpu/generic/sycl/ref_inner_product.hpp b/src/gpu/generic/sycl/ref_inner_product.hpp
new file mode 100644
index 00000000000..648d17bca49
--- /dev/null
+++ b/src/gpu/generic/sycl/ref_inner_product.hpp
@@ -0,0 +1,175 @@
+/*******************************************************************************
+* Copyright 2023-2024 Intel Corporation
+* Copyright 2024-2025 Codeplay Software Limited
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef GPU_GENERIC_SYCL_REF_INNER_PRODUCT_HPP
+#define GPU_GENERIC_SYCL_REF_INNER_PRODUCT_HPP
+
+#include "gpu/generic/sycl/ref_matmul.hpp"
+#include "gpu/generic/sycl/sycl_gpu_primitive.hpp"
+#include "gpu/generic/sycl/sycl_post_ops.hpp"
+#include "gpu/generic/sycl/sycl_primitive_conf.hpp"
+#include "gpu/generic/sycl/sycl_utils.hpp"
+#include "gpu/gpu_inner_product_pd.hpp"
+#include "gpu/gpu_primitive.hpp"
+
+namespace dnnl::impl::gpu::generic::sycl {
+struct ref_inner_product_fwd_t : public gpu::generic::sycl::primitive_t {
+    using gpu::generic::sycl::primitive_t::primitive_t;
+
+    struct pd_t : public gpu_inner_product_fwd_pd_t {
+        using gpu_inner_product_fwd_pd_t::gpu_inner_product_fwd_pd_t;
+        using sm = primitive_attr_t::skip_mask_t;
+
+        DECLARE_COMMON_PD_T("dpcpp:ref:any", ref_inner_product_fwd_t);
+
+        status_t init(impl::engine_t *engine) {
+            auto src_dt = arg_md(DNNL_ARG_SRC)->data_type;
+            auto weights_dt = arg_md(DNNL_ARG_WEIGHTS)->data_type;
+            auto dst_dt = arg_md(DNNL_ARG_DST)->data_type;
+            auto bias_dt = with_bias() ? arg_md(DNNL_ARG_BIAS)->data_type
+                                       : data_type::undef;
+
+            const bool ok = (set_default_params() == status::success)
+                    && is_fwd()
+                    && check_if_dtypes_valid(
+                            src_dt, dst_dt, bias_dt, weights_dt)
+                    && sycl_post_ops_t::post_ops_ok(attr())
+                    && (attr_.set_default_formats(dst_md()) == status::success)
+                    // Blocked memory formats are not supported.
+                    && memory_desc_wrapper(src_md()).is_plain()
+                    && memory_desc_wrapper(dst_md()).is_plain()
+                    && memory_desc_wrapper(weights_md()).is_plain();
+
+            if (!ok) { return status::unimplemented; }
+            CHECK(create_ip_mds());
+            CHECK(init_matmul(engine));
+
+            // Book scratchpad for the nested matmul.
+            auto scratchpad = scratchpad_registry().registrar();
+            scratchpad.book(memory_tracking::names::key_nested,
+                    matmul_pd->scratchpad_registry());
+            return status::success;
+        }
+
+        std::shared_ptr<primitive_desc_t> matmul_pd;
+
+    private:
+        bool check_if_dtypes_valid(const data_type_t &src_dt,
+                const data_type_t &dst_dt, const data_type_t &bias_dt,
+                const data_type_t &weight_dt) const {
+            using namespace data_type;
+            return (utils::one_of(src_dt, f32) && utils::one_of(weight_dt, f32)
+                           && utils::one_of(dst_dt, f32)
+                           && utils::one_of(bias_dt, f32, undef))
+                    || (utils::one_of(src_dt, f16)
+                            && utils::one_of(weight_dt, f16)
+                            && utils::one_of(dst_dt, f16, f32, s8, u8)
+                            && utils::one_of(bias_dt, f16, f32, undef))
+                    || (utils::one_of(src_dt, u8, s8)
+                            && utils::one_of(weight_dt, s8)
+                            && utils::one_of(dst_dt, u8, s8, s32, bf16, f32)
+                            && utils::one_of(
+                                    bias_dt, u8, s8, s32, bf16, f32, undef))
+                    || (utils::one_of(src_dt, bf16)
+                            && utils::one_of(weight_dt, bf16)
+                            && utils::one_of(dst_dt, f32, bf16)
+                            && utils::one_of(bias_dt, f32, bf16, undef));
+        }
+
+        std::vector<size_t> get_dim_order(int ndims, const dims_t strides) {
+            std::vector<size_t> order(ndims);
+            for (int i = 0; i < ndims; ++i) {
+                order[i] = i;
+            }
+
+            std::sort(order.begin(), order.end(),
+                    [&strides](size_t i, size_t j) {
+                        return strides[i] < strides[j];
+                    });
+
+            return order;
+        }
+
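+        // The inner product is lowered to a 2D matmul:
+        //   dst[MB, OC] = src[MB, K] x weights[K, OC] (+ bias[1, OC]),
+        // where K is the product of all non-batch src dimensions. The helper
+        // below builds the reshaped 2D memory descriptors for that matmul.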
+        status_t create_ip_mds() {
+            auto accumulate_dimensions = [](const dims_t dimensions, int start,
+                                                 int end) -> int64_t {
+                int64_t accum = 1;
+                for (int i = start; i < end; i++) {
+                    accum *= dimensions[i];
+                }
+                return accum;
+            };
+
+            const auto src_md_ = arg_md(DNNL_ARG_SRC);
+            const auto weights_md_ = arg_md(DNNL_ARG_WEIGHTS);
+            const auto bias_md_ = arg_md(DNNL_ARG_BIAS);
+            auto src_wrap = memory_desc_wrapper(src_md_);
+            auto w_wrap = memory_desc_wrapper(weights_md_);
+
+            // src and weights dims need to be in the same order.
+            if (get_dim_order(src_wrap.ndims(), src_wrap.strides())
+                    != get_dim_order(w_wrap.ndims(), w_wrap.strides())) {
+                return status::unimplemented;
+            }
+
+            // Reshape src into MB x K, where K is the product of all
+            // non-batch dimensions.
+            if (src_md_->ndims == 2) {
+                src_md_reshaped = *src_md_;
+            } else {
+                int64_t src_flattened_dimension = accumulate_dimensions(
+                        src_md_->dims, 1, src_md_->ndims);
+                dims_t src_reshaped_dims {
+                        src_md_->dims[0], src_flattened_dimension};
+                CHECK(memory_desc_init_by_tag(src_md_reshaped, 2,
+                        src_reshaped_dims, src_md_->data_type, format_tag::ab));
+            }
+
+            // Reshape weights into (OC x K) transposed, i.e. K x OC with
+            // format_tag::ba.
+            int64_t weights_flattened_dimensions = accumulate_dimensions(
+                    weights_md_->dims, 1, weights_md_->ndims);
+            dims_t weights_reshaped_dims {
+                    weights_flattened_dimensions, weights_md_->dims[0]};
+            CHECK(memory_desc_init_by_tag(weights_md_reshaped, 2,
+                    weights_reshaped_dims, weights_md_->data_type,
+                    format_tag::ba));
+            if (with_bias()) {
+                dims_t bias_reshaped_dims {1, bias_md_->dims[0]};
+                CHECK(memory_desc_init_by_tag(bias_md_reshaped, 2,
+                        bias_reshaped_dims, bias_md_->data_type,
+                        format_tag::ab));
+            }
+            return status::success;
+        }
+
+        status_t init_matmul(impl::engine_t *engine);
+
+        // Memory descriptors holding the IP tensors reshaped from nD to 2D.
+        memory_desc_t src_md_reshaped;
+        memory_desc_t weights_md_reshaped;
+        memory_desc_t bias_md_reshaped;
+    };
+
+    status_t init(impl::engine_t *engine) override;
+    status_t execute(const exec_ctx_t &ctx) const override;
+
+private:
+    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
+    kernel_t kernel_;
+    std::shared_ptr<impl::primitive_t> matmul_primitive;
+};
+} // namespace dnnl::impl::gpu::generic::sycl
+
+#endif
diff --git a/src/gpu/gpu_inner_product_list.cpp b/src/gpu/gpu_inner_product_list.cpp
index b13f990a9a5..dccaedc1681 100644
--- a/src/gpu/gpu_inner_product_list.cpp
+++ b/src/gpu/gpu_inner_product_list.cpp
@@ -32,6 +32,10 @@
 #include "gpu/amd/miopen_gemm_inner_product.hpp"
 #endif
 
+#ifdef GENERIC_SYCL_KERNELS_ENABLED
+#include "gpu/generic/sycl/ref_inner_product.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {
@@ -49,6 +53,7 @@ const std::map<pk_impl_key_t, std::vector<impl_list_item_t>>
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_gemm_inner_product_fwd_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_conv_inner_product_fwd_t)
         GPU_INSTANCE_AMD(amd::miopen_gemm_inner_product_fwd_t)
+        GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_inner_product_fwd_t)
         nullptr,
     }},
     {{backward}, REG_BWD_PK({
diff --git a/tests/gtests/test_inner_product_forward.cpp b/tests/gtests/test_inner_product_forward.cpp
index a92be9571ba..c5672163926 100644
--- a/tests/gtests/test_inner_product_forward.cpp
+++ b/tests/gtests/test_inner_product_forward.cpp
@@ -88,16 +88,18 @@ class inner_product_test_t
 protected:
     void SetUp() override {
         auto p = ::testing::TestWithParam<inprod_test_params_t>::GetParam();
-        SKIP_IF_CUDA(!cuda_check_format_tags(p.src_format, p.weights_format,
-                             p.bias_format, p.dst_format),
+        SKIP_IF_CUDA(!cuda_generic_check_format_tags(p.src_format,
+                             p.weights_format, p.bias_format, p.dst_format),
+                "Unsupported format tag");
+        SKIP_IF_GENERIC(!cuda_generic_check_format_tags(p.src_format,
+                                p.weights_format, p.bias_format, p.dst_format),
                 "Unsupported format tag");
         SKIP_IF_CUDA(p.ndims > 5, "Unsupported number of dimensions");
-        SKIP_IF_GENERIC(true, "Primitive not implemented");
         catch_expected_failures(
                 [&]() { Test(); }, p.expect_to_fail, p.expected_status);
     }
 
-    bool cuda_check_format_tags(memory::format_tag src_format,
+    bool cuda_generic_check_format_tags(memory::format_tag src_format,
             memory::format_tag wei_format, memory::format_tag bia_format,
             memory::format_tag dst_format) {
         bool src_ok = src_format == memory::format_tag::ncdhw
@@ -130,6 +132,20 @@ class inner_product_test_t
         return src_ok && wei_ok && bia_ok && dst_ok;
     }
 
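+    // Returns the dimension indices sorted by stride, i.e. the logical order
+    // of the tensor's dimensions in memory. The generic SYCL implementation
+    // only supports src and weights with matching orders; other cases are
+    // skipped in Test().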
+    std::vector<size_t> get_dim_order(const memory::dims &strides) {
+        size_t ndims = strides.size();
+        std::vector<size_t> order(ndims);
+        for (size_t i = 0; i < ndims; ++i) {
+            order[i] = i;
+        }
+
+        std::sort(order.begin(), order.end(), [&strides](size_t i, size_t j) {
+            return strides[i] < strides[j];
+        });
+
+        return order;
+    }
+
     void Test() {
         auto p = ::testing::TestWithParam<inprod_test_params_t>::GetParam();
         test_inner_product_descr_t ipd = p.test_ipd;
@@ -169,6 +185,10 @@ class inner_product_test_t
                 : create_md({}, data_type, p.bias_format);
         auto ip_dst_desc = create_md({ipd.mb, ipd.oc}, data_type, p.dst_format);
 
+        SKIP_IF_GENERIC(get_dim_order(ip_src_desc.get_strides())
+                        != get_dim_order(ip_weights_desc.get_strides()),
+                "Unsupported case for generic");
+
         auto ip_primitive_desc = with_bias
                 ? pd_t(eng, p.aprop_kind, ip_src_desc, ip_weights_desc,
                         ip_bias_desc, ip_dst_desc)
@@ -176,11 +196,15 @@ class inner_product_test_t
                         ip_dst_desc);
 
         auto aa = allows_attr_t {false};
-        aa.po_binary = !is_nvidia_gpu(eng) && !is_amd_gpu(eng);
         aa.po_eltwise = true;
-        aa.po_prelu = !is_nvidia_gpu(eng) && !is_amd_gpu(eng);
         aa.po_sum = true;
-
+#ifdef DNNL_SYCL_GENERIC
+        aa.po_binary = true;
+        aa.po_prelu = true;
+#else
+        aa.po_binary = !is_nvidia_gpu(eng) && !is_amd_gpu(eng);
+        aa.po_prelu = !is_nvidia_gpu(eng) && !is_amd_gpu(eng);
+#endif
 
         test_fwd_pd_constructors<pd_t>(ip_primitive_desc, aa, p.aprop_kind,
                 ip_src_desc, ip_weights_desc, ip_bias_desc, ip_dst_desc);