cpu: x64: pooling: support acc mode in max pooling backprop for bf16

asimonov1 · asimonov1 · commit c2755aed0748 · 2025-02-10T14:56:43.000-08:00
Do not use an f32 accumulator in jit_uni_pooling for max pooling backward
propagation with bf16 if the 'relaxed' or 'any' accumulation mode
is specified.
Use a zero error threshold in tests for max pooling only if the 'strict' or
'f32' accumulation mode is specified.
diff --git a/src/cpu/x64/jit_uni_pool_kernel.cpp b/src/cpu/x64/jit_uni_pool_kernel.cpp
@@ -367,7 +367,9 @@ status_t jit_uni_pool_kernel<isa>::init_conf(
     }
     assert(jpp.ur > 0);
 
-    jpp.needs_f32_accum_for_bf16 = jpp.is_bf16
+    const bool is_relaxed_acc = utils::one_of(
+            attr.acc_mode_, accumulation_mode::relaxed, accumulation_mode::any);
+    jpp.needs_f32_accum_for_bf16 = !is_relaxed_acc && jpp.is_bf16
             && jpp.alg == alg_kind::pooling_max && jpp.is_backward
             && (jpp.stride_d < jpp.kd || jpp.stride_h < jpp.kh
                     || jpp.stride_w < jpp.kw);
diff --git a/tests/benchdnn/inputs/pool/test_pool_bfloat16 b/tests/benchdnn/inputs/pool/test_pool_bfloat16
@@ -22,3 +22,14 @@
 
 --attr-post-ops=add:bf16,linear:0.5:-1
 --batch=set_all_small
+
+# Backward propagation without f32 accumulator
+--attr-post-ops=
+
+--alg=max
+--tag=axb,aBx8b,aBx16b
+
+--dir=BWD_D
+--attr-acc-mode=relaxed
+--batch=set_all
+--batch=set_topologies
diff --git a/tests/benchdnn/pool/pool.cpp b/tests/benchdnn/pool/pool.cpp
@@ -192,9 +192,13 @@ bool cuda_check_correctness(const prb_t *prb,
 
 void setup_cmp(compare::compare_t &cmp, const prb_t *prb, data_kind_t kind,
         const args_t &ref_args) {
+    const bool is_strict_acc
+            = prb->attr.acc_mode == dnnl_accumulation_mode_strict
+            || prb->attr.acc_mode == dnnl_accumulation_mode_f32;
     // Threshold to compensate division error. CPU could live with 6.f coeff.
-    const float trh
-            = prb->alg == alg_t::max ? 0.f : 10.f * epsilon_dt(prb->dt[1]);
+    const float trh = (prb->alg == alg_t::max && is_strict_acc)
+            ? 0.f
+            : 10.f * epsilon_dt(prb->dt[1]);
     cmp.set_threshold(trh);
     // Backward may have most zeroes for ker_in_pad with huge kernels problems.
     const float zero_percent = (prb->dir & FLAG_FWD) ? 99.f : 100.f;