gpu: nvidia: ip: respect acc_mode for sum post-op (#2479)

sgeor255 · web-flow · commit bbf83992b562 · 2025-02-25T10:34:25.000Z
diff --git a/src/gpu/nvidia/README.md b/src/gpu/nvidia/README.md
@@ -215,6 +215,14 @@ limitations when using Nvidia backend for eltwise primitive:
 The inner product primitives is an implementation of matrix multiplication plus
 bias activation. There are two implementation of inner product in cuDNN backend.
 
+With `sum` post-op, the accumulation mode attribute affects the behavior as
+follows:
+- `relaxed`: Uses GEMM’s beta parameter for a fused and optimized sum post-op
+  but may reduce output precision for large `f16` inputs.
+- `strict` (default): Converts GEMM output to `f32`, performs `sum` post-op as a
+  separate operation, then converts it back to the original type. This attribute
+  provides better output precision but reduced performance.
+
 #### Using GEMM
 
 The default backend for inner product is the gemm backend using `cublasGemmEx`
diff --git a/src/gpu/nvidia/cudnn_conv_inner_product.hpp b/src/gpu/nvidia/cudnn_conv_inner_product.hpp
@@ -91,7 +91,8 @@ struct cudnn_conv_inner_product_fwd_t : public cudnn_inner_product_fwd_t {
                     new cudnn_conv_inner_product_fwd_impl_t());
 
             auto st = inner_product_impl_->init(engine, this, with_relu(),
-                    with_eltwise(), with_sum(), use_fused_path_for_blocking);
+                    with_eltwise(), with_sum(), use_fused_path_for_blocking,
+                    false);
             return st;
         }
         bool with_eltwise() const {
@@ -250,7 +251,7 @@ struct cudnn_conv_inner_product_bwd_data_t
                     new cudnn_conv_inner_product_bwd_data_impl_t());
 
             return inner_product_impl_->init(
-                    engine, this, false, false, false, false);
+                    engine, this, false, false, false, false, false);
         }
 
         status_t set_default_params() {
@@ -341,7 +342,7 @@ struct cudnn_conv_inner_product_bwd_weights_t
                     new cudnn_conv_inner_product_bwd_weights_impl_t());
 
             return inner_product_impl_->init(
-                    engine, this, false, false, false, false);
+                    engine, this, false, false, false, false, false);
         }
 
         status_t set_default_params() {
diff --git a/src/gpu/nvidia/cudnn_conv_inner_product_impl.hpp b/src/gpu/nvidia/cudnn_conv_inner_product_impl.hpp
@@ -117,7 +117,7 @@ struct cudnn_conv_inner_product_fwd_impl_t
     }
     virtual status_t init(impl::engine_t *engine, inner_product_pd_t *pd,
             bool with_relu, bool with_eltwise, bool with_sum,
-            bool use_fuse_path_for_blocking) override {
+            bool use_fuse_path_for_blocking, bool /* use_f32_sum */) override {
         with_bias_ = pd->with_bias();
         with_relu_ = with_relu;
         with_eltwise_ = with_eltwise;
@@ -424,7 +424,8 @@ struct cudnn_conv_inner_product_bwd_data_impl_t
     cudnnTensorFormat_t diff_source_format_;
     virtual status_t init(impl::engine_t *engine, inner_product_pd_t *pd,
             bool /*with_relu*/, bool /*with_eltwise*/, bool /*with_sum */,
-            bool /*using_fused_path_for_blocking*/) override {
+            bool /*using_fused_path_for_blocking*/,
+            bool /* use_f32_sum */) override {
         // Pad out the dimensions to 4
         if (pd->ndims() > CUDNN_DIM_MAX || pd->ndims() < 2) {
             return status::invalid_arguments;
@@ -575,7 +576,8 @@ struct cudnn_conv_inner_product_bwd_weights_impl_t
 
     virtual status_t init(impl::engine_t *engine, inner_product_pd_t *pd,
             bool /*with_relu*/, bool /*with_eltwise*/, bool /*with_sum */,
-            bool /*using_fused_path_for_blocking*/) override {
+            bool /*using_fused_path_for_blocking*/,
+            bool /* use_f32_sum */) override {
         // If any of the dimensions are 0 we should not continue with creating
         // cudnn descriptors
         with_bias_ = pd->with_bias();
diff --git a/src/gpu/nvidia/cudnn_gemm_inner_product.hpp b/src/gpu/nvidia/cudnn_gemm_inner_product.hpp
@@ -222,10 +222,14 @@ struct cudnn_gemm_inner_product_fwd_t : public cudnn_inner_product_fwd_t {
                     && (gemm_compatible || need_reorder);
             if (!ok) return status::unimplemented;
 
+            const bool is_relaxed_acc_mode
+                    = attr()->acc_mode_ == dnnl_accumulation_mode_relaxed;
+            const bool use_f32_sum = with_sum && !is_relaxed_acc_mode;
+
             inner_product_impl_.reset(
                     new cudnn_gemm_inner_product_fwd_impl_t());
             return inner_product_impl_->init(engine, this, with_eltwise,
-                    with_eltwise, with_sum, need_reorder);
+                    with_eltwise, with_sum, need_reorder, use_f32_sum);
         }
 
         status_t set_default_params() {
@@ -289,7 +293,7 @@ struct cudnn_gemm_inner_product_bwd_data_t
                     new cudnn_gemm_inner_product_bwd_data_impl_t());
 
             return inner_product_impl_->init(
-                    engine, this, false, false, false, need_reorder);
+                    engine, this, false, false, false, need_reorder, false);
         }
 
         status_t set_default_params() {
@@ -345,7 +349,7 @@ struct cudnn_gemm_inner_product_bwd_weights_t
             inner_product_impl_.reset(
                     new cudnn_gemm_inner_product_bwd_weights_impl_t());
             return inner_product_impl_->init(
-                    engine, this, false, false, false, need_reorder);
+                    engine, this, false, false, false, need_reorder, false);
         }
 
         status_t set_default_params() {
diff --git a/src/gpu/nvidia/cudnn_gemm_inner_product_impl.hpp b/src/gpu/nvidia/cudnn_gemm_inner_product_impl.hpp
@@ -77,8 +77,8 @@ struct cudnn_gemm_inner_product_fwd_impl_t
     bool need_reorder_;
 
     virtual status_t init(impl::engine_t *, inner_product_pd_t *pd,
-            bool with_relu, bool with_eltwise, bool with_sum,
-            bool need_reorder) override {
+            bool with_relu, bool with_eltwise, bool with_sum, bool need_reorder,
+            bool use_f32_sum) override {
         need_reorder_ = need_reorder;
         // GEMM is column major, here the data is row major.
         // By switching the weight and source we convert the row major to
@@ -121,8 +121,10 @@ struct cudnn_gemm_inner_product_fwd_impl_t
         use_acc_dst_ = ((pd->dst_md()->data_type == data_type::s8)
                 || (with_bias_
                         && pd->weights_md(1)->data_type
-                                != pd->dst_md()->data_type));
+                                != pd->dst_md()->data_type)
+                || use_f32_sum);
         with_sum_ = with_sum;
+        with_f32_sum_ = use_f32_sum;
         // scaling factor to add the previous destination value to the current
         // computation. This is equivalent of
         sum_scale_ = sum_scale(pd);
@@ -154,12 +156,23 @@ struct cudnn_gemm_inner_product_fwd_impl_t
 
         if (with_bias_) {
             CHECK(convert_data_type(pd->weights_md(1), &data_types_[io::bia]));
+
             // format is always nchw
             set_bias_dims(CUDNN_TENSOR_NCHW, ndims_, pd->OC());
 
             CHECK(create_and_set_tensor_descriptor(&tensor_descs_[io::bia],
                     data_types_[io::bia], ndims_, dims_[io::bia],
                     strides_[io::bia]));
+
+            if (with_f32_sum_) {
+                pd->scratchpad_registry().registrar().book(
+                        memory_tracking::names::key_iprod_bias_bf16_convert_wsp,
+                        memory_desc_wrapper(pd->weights_md(1)).nelems(),
+                        types::data_type_size(data_type::f32));
+                CHECK(create_and_set_tensor_descriptor(&bias_f32_desc_,
+                        CUDNN_DATA_FLOAT, ndims_, dims_[io::bia],
+                        strides_[io::bia]));
+            }
         }
         if (use_acc_dst_) {
             pd->scratchpad_registry().registrar().book(
@@ -178,10 +191,10 @@ struct cudnn_gemm_inner_product_fwd_impl_t
 
     void execute(cudnnHandle_t cudnn_handle, cublasHandle_t cublas_handle,
             const std::vector<void *> &args) const override {
-        assert(args.size() == 9);
+        assert(args.size() == 10);
         auto x = args[0], w = args[1], b = args[2], y = args[3],
              workspace = args[4], src_scale = args[6], wei_scale = args[7],
-             dst_scale = args[8];
+             dst_scale = args[8], bias_f32 = args[9];
         auto w_arg = w;
         if (need_reorder_) {
             void *transformed_w = args[5];
@@ -222,8 +235,18 @@ struct cudnn_gemm_inner_product_fwd_impl_t
 
         if (with_bias_) {
             float alpha = 1.0f;
-            CUDNN_EXECUTE_FUNC(cudnnAddTensor, cudnn_handle, &alpha,
-                    tensor_descs_[io::bia], b, &alpha, y_acc_desc_, y_dst);
+            float beta = 0.f;
+            auto bias = b;
+            auto bias_desc = tensor_descs_[io::bia];
+            if (with_f32_sum_) {
+                cudnnTransformTensor(cudnn_handle, &alpha,
+                        tensor_descs_[io::bia], b, &beta, bias_f32_desc_,
+                        bias_f32);
+                bias = bias_f32;
+                bias_desc = bias_f32_desc_;
+            }
+            CUDNN_EXECUTE_FUNC(cudnnAddTensor, cudnn_handle, &alpha, bias_desc,
+                    bias, &alpha, y_acc_desc_, y_dst);
         }
         if (with_eltwise_) {
             CUDNN_EXECUTE_FUNC(cudnnActivationForward, cudnn_handle, act_desc_,
@@ -281,7 +304,7 @@ struct cudnn_gemm_inner_product_bwd_data_impl_t
 
     virtual status_t init(impl::engine_t *, inner_product_pd_t *pd,
             bool /*with_relu*/, bool /*with_eltwise*/, bool /*with_sum */,
-            bool need_reorder) override {
+            bool need_reorder, bool /* use_f32_sum */) override {
         need_reorder_ = need_reorder;
 
         // GEMM is column major, here the data is row major.
@@ -365,7 +388,7 @@ struct cudnn_gemm_inner_product_bwd_weights_impl_t
     }
     virtual status_t init(impl::engine_t *engine, inner_product_pd_t *pd,
             bool /*with_relu*/, bool /*with_eltwise*/, bool /*with_sum */,
-            bool need_reorder) override {
+            bool need_reorder, bool /* use_f32_sum */) override {
         need_reorder_ = need_reorder;
         with_bias_ = pd->with_bias();
 
diff --git a/src/gpu/nvidia/cudnn_inner_product.cpp b/src/gpu/nvidia/cudnn_inner_product.cpp
@@ -49,6 +49,8 @@ status_t cudnn_inner_product_fwd_t::execute(const exec_ctx_t &ctx) const {
                 memory_tracking::names::key_iprod_int_dat_in_acc_dt);
         auto arg_spacial_scratch
                 = CTX_SCRATCH_SYCL_MEMORY(memory_tracking::names::key_none);
+        auto arg_f32_bias_scratch = CTX_SCRATCH_SYCL_MEMORY(
+                memory_tracking::names::key_iprod_bias_bf16_convert_wsp);
         compat::host_task(cgh, [=, this](const compat::interop_handle &ih) {
             auto &sycl_engine = *utils::downcast<nvidia::engine_t *>(
                     cuda_stream->engine());
@@ -72,6 +74,7 @@ status_t cudnn_inner_product_fwd_t::execute(const exec_ctx_t &ctx) const {
             args.push_back(arg_src_scale.get_native_pointer(ih));
             args.push_back(arg_wei_scale.get_native_pointer(ih));
             args.push_back(arg_dst_scale.get_native_pointer(ih));
+            args.push_back(arg_f32_bias_scratch.get_native_pointer(ih));
 
             pd()->inner_product_impl_->execute(
                     cudnn_handle, cublas_handle, args);
diff --git a/src/gpu/nvidia/cudnn_inner_product_impl.hpp b/src/gpu/nvidia/cudnn_inner_product_impl.hpp
@@ -125,6 +125,9 @@ struct cudnn_inner_product_impl_base_t {
     bool with_relu_ = false, with_eltwise_ = false, with_sum_ = false;
     bool filter_using_spatial_format_ = false;
 
+    cudnnTensorDescriptor_t bias_f32_desc_;
+    bool with_f32_sum_ = false;
+
     virtual bool need_to_transform_filter() const {
         return filter_using_spatial_format_;
     }
@@ -146,12 +149,19 @@ struct cudnn_inner_product_impl_base_t {
     virtual status_t init(impl::engine_t * /*engine*/,
             inner_product_pd_t * /*pd*/, bool /*with_relu*/,
             bool /*with_eltwise*/, bool /*with_sum */,
-            bool /*using_fused_path_for_blocking*/)
+            bool /*using_fused_path_for_blocking*/, bool /* use_f32_sum */)
             = 0;
 
     virtual void execute(cudnnHandle_t /*handle*/,
             cublasHandle_t /*cublas_handle*/,
             const std::vector<void *> & /*args*/) const = 0;
+
+    virtual ~cudnn_inner_product_impl_base_t() {
+        for (int i = 0; i < NUM_IO; ++i) {
+            cudnnDestroyTensorDescriptor(tensor_descs_[i]);
+        }
+        if (with_f32_sum_) { cudnnDestroyTensorDescriptor(bias_f32_desc_); }
+    }
 };
 
 struct cudnn_inner_product_fwd_base_t : public cudnn_inner_product_impl_base_t {
diff --git a/tests/benchdnn/ip/ip.cpp b/tests/benchdnn/ip/ip.cpp
@@ -262,7 +262,17 @@ void skip_invalid_prb(const prb_t *prb, res_t *res) {}
 
 void setup_cmp(compare::compare_t &cmp, const prb_t *prb, data_kind_t kind,
         const args_t &ref_args) {
-    cmp.set_threshold(0.f);
+    // The nvidia implementation has different precision guarantees in some cases
+    // for large problems with post-op sum
+    if (is_nvidia_gpu()
+            && prb->attr.post_ops.find(attr_t::post_ops_t::kind_t::SUM) != -1
+            && prb->dst_dt() == dnnl_f16 && (prb->dir & FLAG_FWD)
+            && prb->attr.acc_mode == dnnl_accumulation_mode_relaxed) {
+        const float trh = epsilon_dt(prb->dt[2]);
+        cmp.set_threshold(trh);
+    } else {
+        cmp.set_threshold(0.f);
+    }
 }
 
 std::vector<int> supported_exec_args(dir_t dir) {