[SYCL][Reduction] Fix identityless reductions with unwritten reducers (#8709)

steffenlarsen · web-flow · commit cb15f33e861a · 2023-04-03T20:17:33.000+01:00
This commit changes reducers to use a new object for holding results,
namely ReducerElement. This new object either holds a value directly or
holds an optional value. The latter is used for identityless reductions
to allow cases where the user functors did not write a value into the
given reducer and as such the reducer's value should be discounted.

This new element is used inside reductions that support identityless
operations so that potentially missing values are propagated correctly.

---------

Signed-off-by: Larsen, Steffen <steffen.larsen@intel.com>
diff --git a/sycl/include/sycl/reduction.hpp b/sycl/include/sycl/reduction.hpp
diff --git a/sycl/test-e2e/Reduction/reduction_big_data.cpp b/sycl/test-e2e/Reduction/reduction_big_data.cpp
@@ -49,8 +49,9 @@ int test(queue &Q, T Identity) {
 
   // Initialize.
   BinaryOperation BOp;
-  T CorrectOut;
-  initInputData(InBuf, CorrectOut, BOp, NWorkItems);
+  std::optional<T> CorrectOutOpt;
+  initInputData(InBuf, CorrectOutOpt, BOp, NWorkItems);
+  T CorrectOut = *CorrectOutOpt;
 
   // Compute.
   Q.submit([&](handler &CGH) {
diff --git a/sycl/test-e2e/Reduction/reduction_ctor.cpp b/sycl/test-e2e/Reduction/reduction_ctor.cpp
@@ -25,7 +25,7 @@ void test_reducer(Reduction &Redu, T A, T B) {
 
   typename Reduction::binary_operation BOp;
   T ExpectedValue = BOp(A, B);
-  assert(ExpectedValue == detail::ReducerAccess{Reducer}.getElement(0) &&
+  assert(ExpectedValue == *detail::ReducerAccess{Reducer}.getElement(0) &&
          "Wrong result of binary operation.");
   assert(
       toBool(Reducer.identity() == Redu.getIdentityContainer().getIdentity()) &&
@@ -40,7 +40,7 @@ void test_reducer(Reduction &Redu, T Identity, BinaryOperation BOp, T A, T B) {
 
   T ExpectedValue = BOp(A, B);
   assert(
-      toBool(ExpectedValue == detail::ReducerAccess{Reducer}.getElement(0)) &&
+      toBool(ExpectedValue == *detail::ReducerAccess{Reducer}.getElement(0)) &&
       "Wrong result of binary operation.");
   assert(
       toBool(Reducer.identity() == Redu.getIdentityContainer().getIdentity()) &&
diff --git a/sycl/test-e2e/Reduction/reduction_nd_N_vars.cpp b/sycl/test-e2e/Reduction/reduction_nd_N_vars.cpp
@@ -48,7 +48,9 @@ struct Red {
   }
 
   void init() {
-    initInputData(InBuf, CorrectOut, BOp, NWorkItems);
+    std::optional<T> CorrectOutOpt;
+    initInputData(InBuf, CorrectOutOpt, BOp, NWorkItems);
+    CorrectOut = *CorrectOutOpt;
     if (!PropList.template has_property<
             property::reduction::initialize_to_identity>())
       CorrectOut = BOp(CorrectOut, InitVal);
diff --git a/sycl/test-e2e/Reduction/reduction_nd_reducer_skip.cpp b/sycl/test-e2e/Reduction/reduction_nd_reducer_skip.cpp
@@ -0,0 +1,66 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+//
+// Group algorithms are not supported on Nvidia.
+// XFAIL: hip_nvidia
+
+// This test performs basic checks of parallel_for(nd_range, reduction, func)
+// with reductions initialized with a one element buffer. Additionally, some
+// reducers will not be written to.
+
+#include "reduction_utils.hpp"
+
+using namespace sycl;
+
+int NumErrors = 0;
+
+template <typename T> class SkipEvenName;
+template <typename T> class SkipOddName;
+template <typename T> class SkipAllName;
+
+template <typename Name, typename T, class BinaryOperation>
+void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t WGSize,
+           size_t NWItems) {
+  nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize});
+  NumErrors += test<SkipEvenName<Name>, T>(Q, Identity, Init, BOp, NDRange,
+                                           property_list{}, SkipEvenOp{});
+  NumErrors += test<SkipOddName<Name>, T>(Q, Identity, Init, BOp, NDRange,
+                                          property_list{}, SkipOddOp{});
+  NumErrors += test<SkipAllName<Name>, T>(Q, Identity, Init, BOp, NDRange,
+                                          property_list{}, SkipAllOp{});
+}
+
+int main() {
+  queue Q;
+  printDeviceInfo(Q);
+
+  // Check some non power-of-two work-group sizes.
+  tests<class A1, int>(Q, 0, 99, std::plus<int>{}, 1, 7);
+  tests<class A2, int>(Q, 0, 99, std::plus<int>{}, 49, 49 * 5);
+
+  // Try some power-of-two work-group sizes.
+  tests<class B1, int>(Q, 0, 99, std::plus<>{}, 1, 32);
+  tests<class B2, int>(Q, 1, 99, std::multiplies<>{}, 4, 32);
+  tests<class B3, int>(Q, 0, 99, std::bit_or<>{}, 8, 128);
+  tests<class B4, int>(Q, 0, 99, std::bit_xor<>{}, 16, 256);
+  tests<class B5, int>(Q, ~0, 99, std::bit_and<>{}, 32, 256);
+  tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
+                       ext::oneapi::minimum<>{}, 64, 256);
+  tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
+                       ext::oneapi::maximum<>{}, 128, 256);
+  tests<class B8, int>(Q, 0, 99, std::plus<>{}, 256, 256);
+
+  // Check with various types.
+  tests<class C1, float>(Q, 1, 99, std::multiplies<>{}, 8, 24);
+  tests<class C2, short>(Q, 0x7fff, -99, ext::oneapi::minimum<>{}, 8, 256);
+  tests<class C3, unsigned char>(Q, 0, 99, ext::oneapi::maximum<>{}, 8, 256);
+
+  // Check with CUSTOM type.
+  using CV = CustomVec<long long>;
+  tests<class D1, CV>(Q, CV(0), CV(99), CustomVecPlus<long long>{}, 8, 256);
+
+  printFinalStatus(NumErrors);
+  return NumErrors;
+}
diff --git a/sycl/test-e2e/Reduction/reduction_range_1d_reducer_skip.cpp b/sycl/test-e2e/Reduction/reduction_range_1d_reducer_skip.cpp
@@ -0,0 +1,65 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(range<1>, reduction, func)
+// with reductions initialized with a one element buffer. Additionally, some
+// reducers will not be written to.
+
+#include "reduction_utils.hpp"
+
+using namespace sycl;
+
+int NumErrors = 0;
+
+template <typename T> class SkipEvenName;
+template <typename T> class SkipOddName;
+template <typename T> class SkipAllName;
+
+template <typename Name, typename T, typename... ArgTys>
+void tests(ArgTys &&...Args) {
+  NumErrors += test<SkipEvenName<Name>, T>(std::forward<ArgTys>(Args)...,
+                                           property_list{}, SkipEvenOp{});
+  NumErrors += test<SkipOddName<Name>, T>(std::forward<ArgTys>(Args)...,
+                                          property_list{}, SkipOddOp{});
+  NumErrors += test<SkipAllName<Name>, T>(std::forward<ArgTys>(Args)...,
+                                          property_list{}, SkipAllOp{});
+}
+
+int main() {
+  queue Q;
+  printDeviceInfo(Q);
+  size_t MaxWGSize =
+      Q.get_device().get_info<info::device::max_work_group_size>();
+
+  constexpr access::mode RW = access::mode::read_write;
+  // Fast-reduce and Fast-atomics. Try various range types/sizes.
+  tests<class A1, int>(Q, 0, 99, std::plus<int>{}, range<1>(1));
+  tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<1>(2));
+  tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<1>(7));
+  tests<class A4, int>(Q, 0, 99, std::plus<>{}, range<1>(64));
+  tests<class A5, int>(Q, 0, 99, std::plus<>{}, range<1>(MaxWGSize * 2));
+  tests<class A6, int>(Q, 0, 99, std::plus<>{}, range<1>(MaxWGSize * 2 + 5));
+
+  // Check with CUSTOM type.
+  tests<class B1, CustomVec<long long>>(Q, 0, 99, CustomVecPlus<long long>{},
+                                        range<1>(256));
+  tests<class B2, CustomVec<long long>>(Q, 0, 99, CustomVecPlus<long long>{},
+                                        range<1>(MaxWGSize * 3));
+  tests<class B3, CustomVec<long long>>(Q, 99, CustomVecPlus<long long>{},
+                                        range<1>(72));
+
+  // Check with identityless operations.
+  tests<class C1, int>(Q, 99, PlusWithoutIdentity<int>{}, range<1>(1));
+  tests<class C2, int>(Q, 99, PlusWithoutIdentity<int>{}, range<1>(2));
+  tests<class C3, int>(Q, 99, PlusWithoutIdentity<int>{}, range<1>(7));
+  tests<class C4, int>(Q, 99, PlusWithoutIdentity<int>{}, range<1>(64));
+  tests<class C5, int>(Q, 99, PlusWithoutIdentity<int>{},
+                       range<1>(MaxWGSize * 2));
+  tests<class C6, int>(Q, 99, PlusWithoutIdentity<int>{},
+                       range<1>(MaxWGSize * 2 + 5));
+
+  printFinalStatus(NumErrors);
+  return NumErrors;
+}
diff --git a/sycl/test-e2e/Reduction/reduction_range_2d_dw_reducer_skip.cpp b/sycl/test-e2e/Reduction/reduction_range_2d_dw_reducer_skip.cpp
@@ -0,0 +1,69 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// TODO: accelerator may not support atomics required by the current
+// implementation. Enable testing when implementation is fixed.
+// RUNx: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(range<2>, reduction, func)
+// with reductions initialized with a one element buffer. Additionally, some
+// reducers will not be written to.
+
+#include "reduction_utils.hpp"
+
+using namespace sycl;
+
+int NumErrors = 0;
+
+template <typename T> class SkipEvenName;
+template <typename T> class SkipOddName;
+template <typename T> class SkipAllName;
+
+template <typename Name, typename T, typename... ArgTys>
+void tests(ArgTys &&...Args) {
+  NumErrors += test<SkipEvenName<Name>, T>(std::forward<ArgTys>(Args)...,
+                                           property_list{}, SkipEvenOp{});
+  NumErrors += test<SkipOddName<Name>, T>(std::forward<ArgTys>(Args)...,
+                                          property_list{}, SkipOddOp{});
+  NumErrors += test<SkipAllName<Name>, T>(std::forward<ArgTys>(Args)...,
+                                          property_list{}, SkipAllOp{});
+}
+
+int main() {
+  queue Q;
+  printDeviceInfo(Q);
+  size_t MaxWGSize =
+      Q.get_device().get_info<info::device::max_work_group_size>();
+
+  tests<class A1, int>(Q, 0, 99, std::plus<>{}, range<2>{1, 1});
+  tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<2>{2, 2});
+  tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<2>{2, 3});
+  tests<class A4, int>(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize, 1});
+  tests<class A5, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{1, MaxWGSize});
+  tests<class A6, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{2, MaxWGSize * 2});
+  tests<class A7, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize * 3, 7});
+  tests<class A8, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{3, MaxWGSize * 3});
+
+  tests<class B1, CustomVec<long long>>(Q, 0, 99, CustomVecPlus<long long>{},
+                                        range<2>{33, MaxWGSize});
+  tests<class B2, CustomVec<long long>>(Q, 99, CustomVecPlus<long long>{},
+                                        range<2>{33, MaxWGSize});
+
+  tests<class C1, int>(Q, 99, PlusWithoutIdentity<int>{}, range<2>{1, 1});
+  tests<class C2, int>(Q, 99, PlusWithoutIdentity<int>{}, range<2>{2, 2});
+  tests<class C3, int>(Q, 99, PlusWithoutIdentity<int>{}, range<2>{2, 3});
+  tests<class C4, int>(Q, 99, PlusWithoutIdentity<int>{},
+                       range<2>{MaxWGSize, 1});
+  tests<class C5, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
+                           range<2>{1, MaxWGSize});
+  tests<class C6, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
+                           range<2>{2, MaxWGSize * 2});
+  tests<class C7, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
+                           range<2>{MaxWGSize * 3, 7});
+  tests<class C8, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
+                           range<2>{3, MaxWGSize * 3});
+
+  printFinalStatus(NumErrors);
+  return NumErrors;
+}
diff --git a/sycl/test-e2e/Reduction/reduction_range_3d_rw.cpp b/sycl/test-e2e/Reduction/reduction_range_3d_rw.cpp
@@ -67,14 +67,12 @@ int main() {
   tests<class D2, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{2, 2, 2});
   tests<class D3, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{2, 3, 4});
 
-  /* Temporarily disabled
   tests<class D4, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
                            range<3>{1, 1, MaxWGSize + 1});
   tests<class D5, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
                            range<3>{1, MaxWGSize + 1, 1});
   tests<class D6, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
                            range<3>{MaxWGSize + 1, 1, 1});
-  */
 
   tests<class D7, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
                            range<3>{2, 5, MaxWGSize * 2});
@@ -83,6 +81,27 @@ int main() {
   tests<class D9, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
                            range<3>{MaxWGSize * 3, 8, 4});
 
+  tests<class E1, int>(Q, 99, MultipliesWithoutIdentity<int>{},
+                       range<3>{1, 1, 1});
+  tests<class E2, int>(Q, 99, MultipliesWithoutIdentity<int>{},
+                       range<3>{2, 2, 2});
+  tests<class E3, int>(Q, 99, MultipliesWithoutIdentity<int>{},
+                       range<3>{2, 3, 4});
+
+  tests<class E4, int64_t>(Q, 99, MultipliesWithoutIdentity<int64_t>{},
+                           range<3>{1, 1, MaxWGSize + 1});
+  tests<class E5, int64_t>(Q, 99, MultipliesWithoutIdentity<int64_t>{},
+                           range<3>{1, MaxWGSize + 1, 1});
+  tests<class E6, int64_t>(Q, 99, MultipliesWithoutIdentity<int64_t>{},
+                           range<3>{MaxWGSize + 1, 1, 1});
+
+  tests<class E7, int64_t>(Q, 99, MultipliesWithoutIdentity<int64_t>{},
+                           range<3>{2, 5, MaxWGSize * 2});
+  tests<class E8, int64_t>(Q, 99, MultipliesWithoutIdentity<int64_t>{},
+                           range<3>{3, MaxWGSize * 3, 2});
+  tests<class E9, int64_t>(Q, 99, MultipliesWithoutIdentity<int64_t>{},
+                           range<3>{MaxWGSize * 3, 8, 4});
+
   printFinalStatus(NumErrors);
   return NumErrors;
 }
diff --git a/sycl/test-e2e/Reduction/reduction_range_3d_rw_reducer_skip.cpp b/sycl/test-e2e/Reduction/reduction_range_3d_rw_reducer_skip.cpp
@@ -0,0 +1,82 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// TODO: accelerator may not support atomics required by the current
+// implementation. Enable testing when implementation is fixed.
+// RUNx: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(range<3>, reduction, func)
+// with reductions initialized with a one element buffer. Additionally, some
+// reducers will not be written to.
+
+#include "reduction_utils.hpp"
+
+using namespace sycl;
+
+int NumErrors = 0;
+
+template <typename T> class SkipEvenName;
+template <typename T> class SkipOddName;
+template <typename T> class SkipAllName;
+
+template <typename Name, typename T, typename... ArgTys>
+void tests(ArgTys &&...Args) {
+  NumErrors += test<SkipEvenName<Name>, T>(std::forward<ArgTys>(Args)...,
+                                           property_list{}, SkipEvenOp{});
+  NumErrors += test<SkipOddName<Name>, T>(std::forward<ArgTys>(Args)...,
+                                          property_list{}, SkipOddOp{});
+  NumErrors += test<SkipAllName<Name>, T>(std::forward<ArgTys>(Args)...,
+                                          property_list{}, SkipAllOp{});
+}
+
+int main() {
+  queue Q;
+  printDeviceInfo(Q);
+  size_t MaxWGSize =
+      Q.get_device().get_info<info::device::max_work_group_size>();
+
+  tests<class A1, int>(Q, 0, 99, std::plus<>{}, range<3>{1, 1, 1});
+  tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<3>{2, 2, 2});
+  tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<3>{2, 3, 4});
+
+  tests<class A4, int64_t>(Q, 0, 99, std::plus<>{},
+                           range<3>{1, 1, MaxWGSize + 1});
+  tests<class A5, int64_t>(Q, 0, 99, std::plus<>{},
+                           range<3>{1, MaxWGSize + 1, 1});
+  tests<class A6, int64_t>(Q, 0, 99, std::plus<>{},
+                           range<3>{MaxWGSize + 1, 1, 1});
+
+  tests<class A7, int64_t>(Q, 0, 99, std::plus<>{},
+                           range<3>{2, 5, MaxWGSize * 2});
+  tests<class A8, int64_t>(Q, 0, 99, std::plus<>{},
+                           range<3>{3, MaxWGSize * 3, 2});
+  tests<class A9, int64_t>(Q, 0, 99, std::plus<>{},
+                           range<3>{MaxWGSize * 3, 8, 4});
+
+  tests<class B1, CustomVec<long long>>(Q, 0, 99, CustomVecPlus<long long>{},
+                                        range<3>{2, 33, MaxWGSize});
+  tests<class B2, CustomVec<long long>>(Q, 99, CustomVecPlus<long long>{},
+                                        range<3>{2, 33, MaxWGSize});
+
+  tests<class C1, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{1, 1, 1});
+  tests<class C2, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{2, 2, 2});
+  tests<class C3, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{2, 3, 4});
+
+  tests<class C4, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
+                           range<3>{1, 1, MaxWGSize + 1});
+  tests<class C5, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
+                           range<3>{1, MaxWGSize + 1, 1});
+  tests<class C6, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
+                           range<3>{MaxWGSize + 1, 1, 1});
+
+  tests<class C7, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
+                           range<3>{2, 5, MaxWGSize * 2});
+  tests<class C8, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
+                           range<3>{3, MaxWGSize * 3, 2});
+  tests<class C9, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
+                           range<3>{MaxWGSize * 3, 8, 4});
+
+  printFinalStatus(NumErrors);
+  return NumErrors;
+}
diff --git a/sycl/test-e2e/Reduction/reduction_range_N_vars.cpp b/sycl/test-e2e/Reduction/reduction_range_N_vars.cpp
@@ -48,7 +48,9 @@ struct Red {
   }
 
   void init() {
-    initInputData(InBuf, CorrectOut, BOp, NWorkItems);
+    std::optional<T> CorrectOutOpt;
+    initInputData(InBuf, CorrectOutOpt, BOp, NWorkItems);
+    CorrectOut = *CorrectOutOpt;
     if (!PropList.template has_property<
             property::reduction::initialize_to_identity>())
       CorrectOut = BOp(CorrectOut, InitVal);
diff --git a/sycl/test-e2e/Reduction/reduction_usm.cpp b/sycl/test-e2e/Reduction/reduction_usm.cpp
@@ -38,12 +38,12 @@ int test(queue &Q, OptionalIdentity<T, HasIdentity> Identity, T Init,
   }
 
   // Initialize.
-  T CorrectOut;
+  std::optional<T> CorrectOutOpt;
   BinaryOperation BOp;
 
   buffer<T, 1> InBuf(NWItems);
-  initInputData(InBuf, CorrectOut, BOp, NWItems);
-  CorrectOut = BOp(CorrectOut, Init);
+  initInputData(InBuf, CorrectOutOpt, BOp, NWItems);
+  T CorrectOut = BOp(*CorrectOutOpt, Init);
 
   // Compute.
   Q.submit([&](handler &CGH) {
diff --git a/sycl/test-e2e/Reduction/reduction_usm_dw.cpp b/sycl/test-e2e/Reduction/reduction_usm_dw.cpp
diff --git a/sycl/test-e2e/Reduction/reduction_utils.hpp b/sycl/test-e2e/Reduction/reduction_utils.hpp