gpu: nvidia: ip: respect acc_mode for sum post-op #2479

Merged · 1 commit · Feb 25, 2025
8 changes: 8 additions & 0 deletions src/gpu/nvidia/README.md
@@ -215,6 +215,14 @@ limitations when using Nvidia backend for eltwise primitive:
The inner product primitive is an implementation of matrix multiplication plus
bias activation. There are two implementations of inner product in the cuDNN
backend.

With the `sum` post-op, the accumulation mode attribute affects the behavior as
follows:
- `relaxed`: Uses GEMM's beta parameter for a fused and optimized sum post-op,
but may reduce output precision for large `f16` inputs.
- `strict` (default): Converts the GEMM output to `f32`, performs the `sum`
post-op as a separate operation, then converts the result back to the original
data type. This mode provides better output precision but reduced performance.
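
For context, here is an editorial sketch (not part of this PR's diff) of how a
user could opt into the relaxed path through oneDNN's C++ API. The engine and
memory descriptors are placeholders; omitting `set_accumulation_mode()` (or
passing `accumulation_mode::strict`) keeps the default, more precise behavior.

```cpp
#include "oneapi/dnnl/dnnl.hpp"

using namespace dnnl;

// Sketch only: build an inner product primitive descriptor with a sum
// post-op and the relaxed accumulation mode.
inner_product_forward::primitive_desc make_ip_pd(const engine &eng,
        const memory::desc &src_md, const memory::desc &wei_md,
        const memory::desc &bia_md, const memory::desc &dst_md) {
    post_ops po;
    po.append_sum(1.f); // dst := ip(src, wei) + bias + 1.f * original dst

    primitive_attr attr;
    attr.set_post_ops(po);
    attr.set_accumulation_mode(accumulation_mode::relaxed);

    return inner_product_forward::primitive_desc(eng,
            prop_kind::forward_inference, src_md, wei_md, bia_md, dst_md,
            attr);
}
```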

#### Using GEMM

The default backend for inner product is the gemm backend using `cublasGemmEx`
7 changes: 4 additions & 3 deletions src/gpu/nvidia/cudnn_conv_inner_product.hpp
@@ -91,7 +91,8 @@ struct cudnn_conv_inner_product_fwd_t : public cudnn_inner_product_fwd_t {
new cudnn_conv_inner_product_fwd_impl_t());

auto st = inner_product_impl_->init(engine, this, with_relu(),
with_eltwise(), with_sum(), use_fused_path_for_blocking);
with_eltwise(), with_sum(), use_fused_path_for_blocking,
false);
return st;
}
bool with_eltwise() const {
@@ -250,7 +251,7 @@ struct cudnn_conv_inner_product_bwd_data_t
new cudnn_conv_inner_product_bwd_data_impl_t());

return inner_product_impl_->init(
engine, this, false, false, false, false);
engine, this, false, false, false, false, false);
}

status_t set_default_params() {
@@ -341,7 +342,7 @@ struct cudnn_conv_inner_product_bwd_weights_t
new cudnn_conv_inner_product_bwd_weights_impl_t());

return inner_product_impl_->init(
engine, this, false, false, false, false);
engine, this, false, false, false, false, false);
}

status_t set_default_params() {
8 changes: 5 additions & 3 deletions src/gpu/nvidia/cudnn_conv_inner_product_impl.hpp
@@ -117,7 +117,7 @@ struct cudnn_conv_inner_product_fwd_impl_t
}
virtual status_t init(impl::engine_t *engine, inner_product_pd_t *pd,
bool with_relu, bool with_eltwise, bool with_sum,
bool use_fuse_path_for_blocking) override {
bool use_fuse_path_for_blocking, bool /* use_f32_sum */) override {
with_bias_ = pd->with_bias();
with_relu_ = with_relu;
with_eltwise_ = with_eltwise;
@@ -424,7 +424,8 @@ struct cudnn_conv_inner_product_bwd_data_impl_t
cudnnTensorFormat_t diff_source_format_;
virtual status_t init(impl::engine_t *engine, inner_product_pd_t *pd,
bool /*with_relu*/, bool /*with_eltwise*/, bool /*with_sum */,
bool /*using_fused_path_for_blocking*/) override {
bool /*using_fused_path_for_blocking*/,
bool /* use_f32_sum */) override {
// Pad out the dimensions to 4
if (pd->ndims() > CUDNN_DIM_MAX || pd->ndims() < 2) {
return status::invalid_arguments;
@@ -575,7 +576,8 @@ struct cudnn_conv_inner_product_bwd_weights_impl_t

virtual status_t init(impl::engine_t *engine, inner_product_pd_t *pd,
bool /*with_relu*/, bool /*with_eltwise*/, bool /*with_sum */,
bool /*using_fused_path_for_blocking*/) override {
bool /*using_fused_path_for_blocking*/,
bool /* use_f32_sum */) override {
// If any of the dimensions are 0 we should not continue with creating
// cudnn descriptors
with_bias_ = pd->with_bias();
10 changes: 7 additions & 3 deletions src/gpu/nvidia/cudnn_gemm_inner_product.hpp
@@ -222,10 +222,14 @@ struct cudnn_gemm_inner_product_fwd_t : public cudnn_inner_product_fwd_t {
&& (gemm_compatible || need_reorder);
if (!ok) return status::unimplemented;

const bool is_relaxed_acc_mode
= attr()->acc_mode_ == dnnl_accumulation_mode_relaxed;
const bool use_f32_sum = with_sum && !is_relaxed_acc_mode;

inner_product_impl_.reset(
new cudnn_gemm_inner_product_fwd_impl_t());
return inner_product_impl_->init(engine, this, with_eltwise,
with_eltwise, with_sum, need_reorder);
with_eltwise, with_sum, need_reorder, use_f32_sum);
}

status_t set_default_params() {
@@ -289,7 +293,7 @@ struct cudnn_gemm_inner_product_bwd_data_t
new cudnn_gemm_inner_product_bwd_data_impl_t());

return inner_product_impl_->init(
engine, this, false, false, false, need_reorder);
engine, this, false, false, false, need_reorder, false);
}

status_t set_default_params() {
@@ -345,7 +349,7 @@ struct cudnn_gemm_inner_product_bwd_weights_t
inner_product_impl_.reset(
new cudnn_gemm_inner_product_bwd_weights_impl_t());
return inner_product_impl_->init(
engine, this, false, false, false, need_reorder);
engine, this, false, false, false, need_reorder, false);
}

status_t set_default_params() {
41 changes: 32 additions & 9 deletions src/gpu/nvidia/cudnn_gemm_inner_product_impl.hpp
@@ -77,8 +77,8 @@ struct cudnn_gemm_inner_product_fwd_impl_t
bool need_reorder_;

virtual status_t init(impl::engine_t *, inner_product_pd_t *pd,
bool with_relu, bool with_eltwise, bool with_sum,
bool need_reorder) override {
bool with_relu, bool with_eltwise, bool with_sum, bool need_reorder,
bool use_f32_sum) override {
need_reorder_ = need_reorder;
// GEMM is column major, here the data is row major.
// By switching the weight and source we convert the row major to
@@ -121,8 +121,10 @@ struct cudnn_gemm_inner_product_fwd_impl_t
use_acc_dst_ = ((pd->dst_md()->data_type == data_type::s8)
|| (with_bias_
&& pd->weights_md(1)->data_type
!= pd->dst_md()->data_type));
!= pd->dst_md()->data_type)
|| use_f32_sum);
with_sum_ = with_sum;
with_f32_sum_ = use_f32_sum;
// scaling factor to add the previous destination value to the current
// computation. This is equivalent of
sum_scale_ = sum_scale(pd);
@@ -154,12 +156,23 @@ struct cudnn_gemm_inner_product_fwd_impl_t

if (with_bias_) {
CHECK(convert_data_type(pd->weights_md(1), &data_types_[io::bia]));

// format is always nchw
set_bias_dims(CUDNN_TENSOR_NCHW, ndims_, pd->OC());

CHECK(create_and_set_tensor_descriptor(&tensor_descs_[io::bia],
data_types_[io::bia], ndims_, dims_[io::bia],
strides_[io::bia]));

if (with_f32_sum_) {
pd->scratchpad_registry().registrar().book(
memory_tracking::names::key_iprod_bias_bf16_convert_wsp,
memory_desc_wrapper(pd->weights_md(1)).nelems(),
types::data_type_size(data_type::f32));
CHECK(create_and_set_tensor_descriptor(&bias_f32_desc_,
CUDNN_DATA_FLOAT, ndims_, dims_[io::bia],
strides_[io::bia]));
}
}
if (use_acc_dst_) {
pd->scratchpad_registry().registrar().book(
@@ -178,10 +191,10 @@ struct cudnn_gemm_inner_product_fwd_impl_t

void execute(cudnnHandle_t cudnn_handle, cublasHandle_t cublas_handle,
const std::vector<void *> &args) const override {
assert(args.size() == 9);
assert(args.size() == 10);
auto x = args[0], w = args[1], b = args[2], y = args[3],
workspace = args[4], src_scale = args[6], wei_scale = args[7],
dst_scale = args[8];
dst_scale = args[8], bias_f32 = args[9];
auto w_arg = w;
if (need_reorder_) {
void *transformed_w = args[5];
@@ -222,8 +235,18 @@ struct cudnn_gemm_inner_product_fwd_impl_t

if (with_bias_) {
float alpha = 1.0f;
CUDNN_EXECUTE_FUNC(cudnnAddTensor, cudnn_handle, &alpha,
tensor_descs_[io::bia], b, &alpha, y_acc_desc_, y_dst);
float beta = 0.f;
auto bias = b;
auto bias_desc = tensor_descs_[io::bia];
if (with_f32_sum_) {
cudnnTransformTensor(cudnn_handle, &alpha,
tensor_descs_[io::bia], b, &beta, bias_f32_desc_,
bias_f32);
bias = bias_f32;
bias_desc = bias_f32_desc_;
}
CUDNN_EXECUTE_FUNC(cudnnAddTensor, cudnn_handle, &alpha, bias_desc,
bias, &alpha, y_acc_desc_, y_dst);
}
if (with_eltwise_) {
CUDNN_EXECUTE_FUNC(cudnnActivationForward, cudnn_handle, act_desc_,
@@ -281,7 +304,7 @@ struct cudnn_gemm_inner_product_bwd_data_impl_t

virtual status_t init(impl::engine_t *, inner_product_pd_t *pd,
bool /*with_relu*/, bool /*with_eltwise*/, bool /*with_sum */,
bool need_reorder) override {
bool need_reorder, bool /* use_f32_sum */) override {
need_reorder_ = need_reorder;

// GEMM is column major, here the data is row major.
@@ -365,7 +388,7 @@ struct cudnn_gemm_inner_product_bwd_weights_impl_t
}
virtual status_t init(impl::engine_t *engine, inner_product_pd_t *pd,
bool /*with_relu*/, bool /*with_eltwise*/, bool /*with_sum */,
bool need_reorder) override {
bool need_reorder, bool /* use_f32_sum */) override {
need_reorder_ = need_reorder;
with_bias_ = pd->with_bias();

3 changes: 3 additions & 0 deletions src/gpu/nvidia/cudnn_inner_product.cpp
@@ -49,6 +49,8 @@ status_t cudnn_inner_product_fwd_t::execute(const exec_ctx_t &ctx) const {
memory_tracking::names::key_iprod_int_dat_in_acc_dt);
auto arg_spacial_scratch
= CTX_SCRATCH_SYCL_MEMORY(memory_tracking::names::key_none);
auto arg_f32_bias_scratch = CTX_SCRATCH_SYCL_MEMORY(
memory_tracking::names::key_iprod_bias_bf16_convert_wsp);
compat::host_task(cgh, [=, this](const compat::interop_handle &ih) {
auto &sycl_engine = *utils::downcast<nvidia::engine_t *>(
cuda_stream->engine());
@@ -72,6 +74,7 @@ status_t cudnn_inner_product_fwd_t::execute(const exec_ctx_t &ctx) const {
args.push_back(arg_src_scale.get_native_pointer(ih));
args.push_back(arg_wei_scale.get_native_pointer(ih));
args.push_back(arg_dst_scale.get_native_pointer(ih));
args.push_back(arg_f32_bias_scratch.get_native_pointer(ih));

pd()->inner_product_impl_->execute(
cudnn_handle, cublas_handle, args);
12 changes: 11 additions & 1 deletion src/gpu/nvidia/cudnn_inner_product_impl.hpp
@@ -125,6 +125,9 @@ struct cudnn_inner_product_impl_base_t {
bool with_relu_ = false, with_eltwise_ = false, with_sum_ = false;
bool filter_using_spatial_format_ = false;

cudnnTensorDescriptor_t bias_f32_desc_;
bool with_f32_sum_ = false;

virtual bool need_to_transform_filter() const {
return filter_using_spatial_format_;
}
@@ -146,12 +149,19 @@ virtual status_t init(impl::engine_t * /*engine*/,
virtual status_t init(impl::engine_t * /*engine*/,
inner_product_pd_t * /*pd*/, bool /*with_relu*/,
bool /*with_eltwise*/, bool /*with_sum */,
bool /*using_fused_path_for_blocking*/)
bool /*using_fused_path_for_blocking*/, bool /* use_f32_sum */)
= 0;

virtual void execute(cudnnHandle_t /*handle*/,
cublasHandle_t /*cublas_handle*/,
const std::vector<void *> & /*args*/) const = 0;

virtual ~cudnn_inner_product_impl_base_t() {
for (int i = 0; i < NUM_IO; ++i) {
cudnnDestroyTensorDescriptor(tensor_descs_[i]);
}
if (with_f32_sum_) { cudnnDestroyTensorDescriptor(bias_f32_desc_); }
}
};

struct cudnn_inner_product_fwd_base_t : public cudnn_inner_product_impl_base_t {
12 changes: 11 additions & 1 deletion tests/benchdnn/ip/ip.cpp
@@ -278,7 +278,17 @@ void skip_invalid_prb(const prb_t *prb, res_t *res) {}

void setup_cmp(compare::compare_t &cmp, const prb_t *prb, data_kind_t kind,
const args_t &ref_args) {
cmp.set_threshold(0.f);
// The nvidia implementation has different precision guarantees in some cases
// for large problems with post-op sum
if (is_nvidia_gpu()
&& prb->attr.post_ops.find(attr_t::post_ops_t::kind_t::SUM) != -1
&& prb->dst_dt() == dnnl_f16 && (prb->dir & FLAG_FWD)
&& prb->attr.acc_mode == dnnl_accumulation_mode_relaxed) {
const float trh = epsilon_dt(prb->dt[2]);
cmp.set_threshold(trh);
} else {
cmp.set_threshold(0.f);
}
}

std::vector<int> supported_exec_args(dir_t dir) {
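
Editorial note on the benchdnn change above: setting the threshold to the `f16`
machine epsilon (2^-10, about 9.8e-4) accepts roughly one ulp of relative
error, which is what the fused beta path can lose when it accumulates into
large `f16` values (in `f16`, for example, 2048 + 1 rounds back to 2048 because
the spacing of representable values above 2048 is 2). A minimal sketch of an
acceptance check of that shape follows; benchdnn's actual comparator handles
many more cases (NaNs, zeros, mistrusted points, per-kind thresholds).

```cpp
#include <algorithm>
#include <cmath>

// Sketch only: accept a result if either its absolute or its relative error
// is within the f16 machine epsilon, mirroring the relaxed threshold above.
bool ok_relaxed_f16(float ref, float got) {
    const float trh = 0x1.0p-10f; // f16 machine epsilon, ~9.77e-4
    const float abs_diff = std::fabs(ref - got);
    const float rel_diff = abs_diff / std::max(std::fabs(ref), 1.0f);
    return std::min(abs_diff, rel_diff) <= trh;
}
```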