Skip to content

Commit cb15f33

Browse files
[SYCL][Reduction] Fix identityless reductions with unwritten reducers (#8709)
This commit changes reducers to use a new object for holding results, namely ReducerElement. This new object either holds a value directly or holds an optional value. The latter is used for identityless reductions to allow cases where the user functors did not write a value into the given reducer and as such the reducer's value should be discounted. This new element is used inside reductions that support identityless to correctly propagate potentially missing values throughout. --------- Signed-off-by: Larsen, Steffen <[email protected]>
1 parent 82ac98f commit cb15f33

13 files changed

+953
-378
lines changed

sycl/include/sycl/reduction.hpp

+553-323
Large diffs are not rendered by default.

sycl/test-e2e/Reduction/reduction_big_data.cpp

+3-2
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,9 @@ int test(queue &Q, T Identity) {
4949

5050
// Initialize.
5151
BinaryOperation BOp;
52-
T CorrectOut;
53-
initInputData(InBuf, CorrectOut, BOp, NWorkItems);
52+
std::optional<T> CorrectOutOpt;
53+
initInputData(InBuf, CorrectOutOpt, BOp, NWorkItems);
54+
T CorrectOut = *CorrectOutOpt;
5455

5556
// Compute.
5657
Q.submit([&](handler &CGH) {

sycl/test-e2e/Reduction/reduction_ctor.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ void test_reducer(Reduction &Redu, T A, T B) {
2525

2626
typename Reduction::binary_operation BOp;
2727
T ExpectedValue = BOp(A, B);
28-
assert(ExpectedValue == detail::ReducerAccess{Reducer}.getElement(0) &&
28+
assert(ExpectedValue == *detail::ReducerAccess{Reducer}.getElement(0) &&
2929
"Wrong result of binary operation.");
3030
assert(
3131
toBool(Reducer.identity() == Redu.getIdentityContainer().getIdentity()) &&
@@ -40,7 +40,7 @@ void test_reducer(Reduction &Redu, T Identity, BinaryOperation BOp, T A, T B) {
4040

4141
T ExpectedValue = BOp(A, B);
4242
assert(
43-
toBool(ExpectedValue == detail::ReducerAccess{Reducer}.getElement(0)) &&
43+
toBool(ExpectedValue == *detail::ReducerAccess{Reducer}.getElement(0)) &&
4444
"Wrong result of binary operation.");
4545
assert(
4646
toBool(Reducer.identity() == Redu.getIdentityContainer().getIdentity()) &&

sycl/test-e2e/Reduction/reduction_nd_N_vars.cpp

+3-1
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,9 @@ struct Red {
4848
}
4949

5050
void init() {
51-
initInputData(InBuf, CorrectOut, BOp, NWorkItems);
51+
std::optional<T> CorrectOutOpt;
52+
initInputData(InBuf, CorrectOutOpt, BOp, NWorkItems);
53+
CorrectOut = *CorrectOutOpt;
5254
if (!PropList.template has_property<
5355
property::reduction::initialize_to_identity>())
5456
CorrectOut = BOp(CorrectOut, InitVal);
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
3+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
4+
// RUN: %ACC_RUN_PLACEHOLDER %t.out
5+
//
6+
// Group algorithms are not supported on Nvidia.
7+
// XFAIL: hip_nvidia
8+
9+
// This test performs basic checks of parallel_for(nd_range, reduction, func)
10+
// with reductions initialized with a one element buffer. Additionally, some
11+
// reducers will not be written to.
12+
13+
#include "reduction_utils.hpp"
14+
15+
using namespace sycl;
16+
17+
int NumErrors = 0;
18+
19+
template <typename T> class SkipEvenName;
20+
template <typename T> class SkipOddName;
21+
template <typename T> class SkipAllName;
22+
23+
template <typename Name, typename T, class BinaryOperation>
24+
void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t WGSize,
25+
size_t NWItems) {
26+
nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize});
27+
NumErrors += test<SkipEvenName<Name>, T>(Q, Identity, Init, BOp, NDRange,
28+
property_list{}, SkipEvenOp{});
29+
NumErrors += test<SkipOddName<Name>, T>(Q, Identity, Init, BOp, NDRange,
30+
property_list{}, SkipOddOp{});
31+
NumErrors += test<SkipAllName<Name>, T>(Q, Identity, Init, BOp, NDRange,
32+
property_list{}, SkipAllOp{});
33+
}
34+
35+
int main() {
36+
queue Q;
37+
printDeviceInfo(Q);
38+
39+
// Check some non power-of-two work-group sizes.
40+
tests<class A1, int>(Q, 0, 99, std::plus<int>{}, 1, 7);
41+
tests<class A2, int>(Q, 0, 99, std::plus<int>{}, 49, 49 * 5);
42+
43+
// Try some power-of-two work-group sizes.
44+
tests<class B1, int>(Q, 0, 99, std::plus<>{}, 1, 32);
45+
tests<class B2, int>(Q, 1, 99, std::multiplies<>{}, 4, 32);
46+
tests<class B3, int>(Q, 0, 99, std::bit_or<>{}, 8, 128);
47+
tests<class B4, int>(Q, 0, 99, std::bit_xor<>{}, 16, 256);
48+
tests<class B5, int>(Q, ~0, 99, std::bit_and<>{}, 32, 256);
49+
tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
50+
ext::oneapi::minimum<>{}, 64, 256);
51+
tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
52+
ext::oneapi::maximum<>{}, 128, 256);
53+
tests<class B8, int>(Q, 0, 99, std::plus<>{}, 256, 256);
54+
55+
// Check with various types.
56+
tests<class C1, float>(Q, 1, 99, std::multiplies<>{}, 8, 24);
57+
tests<class C2, short>(Q, 0x7fff, -99, ext::oneapi::minimum<>{}, 8, 256);
58+
tests<class C3, unsigned char>(Q, 0, 99, ext::oneapi::maximum<>{}, 8, 256);
59+
60+
// Check with CUSTOM type.
61+
using CV = CustomVec<long long>;
62+
tests<class D1, CV>(Q, CV(0), CV(99), CustomVecPlus<long long>{}, 8, 256);
63+
64+
printFinalStatus(NumErrors);
65+
return NumErrors;
66+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60
2+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
3+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
4+
// RUN: %ACC_RUN_PLACEHOLDER %t.out
5+
6+
// This test performs basic checks of parallel_for(range<1>, reduction, func)
7+
// with reductions initialized with a one element buffer. Additionally, some
8+
// reducers will not be written to.
9+
10+
#include "reduction_utils.hpp"
11+
12+
using namespace sycl;
13+
14+
int NumErrors = 0;
15+
16+
template <typename T> class SkipEvenName;
17+
template <typename T> class SkipOddName;
18+
template <typename T> class SkipAllName;
19+
20+
template <typename Name, typename T, typename... ArgTys>
21+
void tests(ArgTys &&...Args) {
22+
NumErrors += test<SkipEvenName<Name>, T>(std::forward<ArgTys>(Args)...,
23+
property_list{}, SkipEvenOp{});
24+
NumErrors += test<SkipOddName<Name>, T>(std::forward<ArgTys>(Args)...,
25+
property_list{}, SkipOddOp{});
26+
NumErrors += test<SkipAllName<Name>, T>(std::forward<ArgTys>(Args)...,
27+
property_list{}, SkipAllOp{});
28+
}
29+
30+
int main() {
31+
queue Q;
32+
printDeviceInfo(Q);
33+
size_t MaxWGSize =
34+
Q.get_device().get_info<info::device::max_work_group_size>();
35+
36+
constexpr access::mode RW = access::mode::read_write;
37+
// Fast-reduce and Fast-atomics. Try various range types/sizes.
38+
tests<class A1, int>(Q, 0, 99, std::plus<int>{}, range<1>(1));
39+
tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<1>(2));
40+
tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<1>(7));
41+
tests<class A4, int>(Q, 0, 99, std::plus<>{}, range<1>(64));
42+
tests<class A5, int>(Q, 0, 99, std::plus<>{}, range<1>(MaxWGSize * 2));
43+
tests<class A6, int>(Q, 0, 99, std::plus<>{}, range<1>(MaxWGSize * 2 + 5));
44+
45+
// Check with CUSTOM type.
46+
tests<class B1, CustomVec<long long>>(Q, 0, 99, CustomVecPlus<long long>{},
47+
range<1>(256));
48+
tests<class B2, CustomVec<long long>>(Q, 0, 99, CustomVecPlus<long long>{},
49+
range<1>(MaxWGSize * 3));
50+
tests<class B3, CustomVec<long long>>(Q, 99, CustomVecPlus<long long>{},
51+
range<1>(72));
52+
53+
// Check with identityless operations.
54+
tests<class C1, int>(Q, 99, PlusWithoutIdentity<int>{}, range<1>(1));
55+
tests<class C2, int>(Q, 99, PlusWithoutIdentity<int>{}, range<1>(2));
56+
tests<class C3, int>(Q, 99, PlusWithoutIdentity<int>{}, range<1>(7));
57+
tests<class C4, int>(Q, 99, PlusWithoutIdentity<int>{}, range<1>(64));
58+
tests<class C5, int>(Q, 99, PlusWithoutIdentity<int>{},
59+
range<1>(MaxWGSize * 2));
60+
tests<class C6, int>(Q, 99, PlusWithoutIdentity<int>{},
61+
range<1>(MaxWGSize * 2 + 5));
62+
63+
printFinalStatus(NumErrors);
64+
return NumErrors;
65+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60
2+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
3+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
4+
5+
// TODO: accelerator may not suport atomics required by the current
6+
// implementation. Enable testing when implementation is fixed.
7+
// RUNx: %ACC_RUN_PLACEHOLDER %t.out
8+
9+
// This test performs basic checks of parallel_for(range<2>, reduction, func)
10+
// with reductions initialized with a one element buffer. Additionally, some
11+
// reducers will not be written to.
12+
13+
#include "reduction_utils.hpp"
14+
15+
using namespace sycl;
16+
17+
int NumErrors = 0;
18+
19+
template <typename T> class SkipEvenName;
20+
template <typename T> class SkipOddName;
21+
template <typename T> class SkipAllName;
22+
23+
template <typename Name, typename T, typename... ArgTys>
24+
void tests(ArgTys &&...Args) {
25+
NumErrors += test<SkipEvenName<Name>, T>(std::forward<ArgTys>(Args)...,
26+
property_list{}, SkipEvenOp{});
27+
NumErrors += test<SkipOddName<Name>, T>(std::forward<ArgTys>(Args)...,
28+
property_list{}, SkipOddOp{});
29+
NumErrors += test<SkipAllName<Name>, T>(std::forward<ArgTys>(Args)...,
30+
property_list{}, SkipAllOp{});
31+
}
32+
33+
int main() {
34+
queue Q;
35+
printDeviceInfo(Q);
36+
size_t MaxWGSize =
37+
Q.get_device().get_info<info::device::max_work_group_size>();
38+
39+
tests<class A1, int>(Q, 0, 99, std::plus<>{}, range<2>{1, 1});
40+
tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<2>{2, 2});
41+
tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<2>{2, 3});
42+
tests<class A4, int>(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize, 1});
43+
tests<class A5, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{1, MaxWGSize});
44+
tests<class A6, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{2, MaxWGSize * 2});
45+
tests<class A7, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize * 3, 7});
46+
tests<class A8, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{3, MaxWGSize * 3});
47+
48+
tests<class B1, CustomVec<long long>>(Q, 0, 99, CustomVecPlus<long long>{},
49+
range<2>{33, MaxWGSize});
50+
tests<class B2, CustomVec<long long>>(Q, 99, CustomVecPlus<long long>{},
51+
range<2>{33, MaxWGSize});
52+
53+
tests<class C1, int>(Q, 99, PlusWithoutIdentity<int>{}, range<2>{1, 1});
54+
tests<class C2, int>(Q, 99, PlusWithoutIdentity<int>{}, range<2>{2, 2});
55+
tests<class C3, int>(Q, 99, PlusWithoutIdentity<int>{}, range<2>{2, 3});
56+
tests<class C4, int>(Q, 99, PlusWithoutIdentity<int>{},
57+
range<2>{MaxWGSize, 1});
58+
tests<class C5, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
59+
range<2>{1, MaxWGSize});
60+
tests<class C6, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
61+
range<2>{2, MaxWGSize * 2});
62+
tests<class C7, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
63+
range<2>{MaxWGSize * 3, 7});
64+
tests<class C8, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
65+
range<2>{3, MaxWGSize * 3});
66+
67+
printFinalStatus(NumErrors);
68+
return NumErrors;
69+
}

sycl/test-e2e/Reduction/reduction_range_3d_rw.cpp

+21-2
Original file line numberDiff line numberDiff line change
@@ -67,14 +67,12 @@ int main() {
6767
tests<class D2, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{2, 2, 2});
6868
tests<class D3, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{2, 3, 4});
6969

70-
/* Temporarily disabled
7170
tests<class D4, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
7271
range<3>{1, 1, MaxWGSize + 1});
7372
tests<class D5, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
7473
range<3>{1, MaxWGSize + 1, 1});
7574
tests<class D6, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
7675
range<3>{MaxWGSize + 1, 1, 1});
77-
*/
7876

7977
tests<class D7, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
8078
range<3>{2, 5, MaxWGSize * 2});
@@ -83,6 +81,27 @@ int main() {
8381
tests<class D9, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
8482
range<3>{MaxWGSize * 3, 8, 4});
8583

84+
tests<class E1, int>(Q, 99, MultipliesWithoutIdentity<int>{},
85+
range<3>{1, 1, 1});
86+
tests<class E2, int>(Q, 99, MultipliesWithoutIdentity<int>{},
87+
range<3>{2, 2, 2});
88+
tests<class E3, int>(Q, 99, MultipliesWithoutIdentity<int>{},
89+
range<3>{2, 3, 4});
90+
91+
tests<class E4, int64_t>(Q, 99, MultipliesWithoutIdentity<int64_t>{},
92+
range<3>{1, 1, MaxWGSize + 1});
93+
tests<class E5, int64_t>(Q, 99, MultipliesWithoutIdentity<int64_t>{},
94+
range<3>{1, MaxWGSize + 1, 1});
95+
tests<class E6, int64_t>(Q, 99, MultipliesWithoutIdentity<int64_t>{},
96+
range<3>{MaxWGSize + 1, 1, 1});
97+
98+
tests<class E7, int64_t>(Q, 99, MultipliesWithoutIdentity<int64_t>{},
99+
range<3>{2, 5, MaxWGSize * 2});
100+
tests<class E8, int64_t>(Q, 99, MultipliesWithoutIdentity<int64_t>{},
101+
range<3>{3, MaxWGSize * 3, 2});
102+
tests<class E9, int64_t>(Q, 99, MultipliesWithoutIdentity<int64_t>{},
103+
range<3>{MaxWGSize * 3, 8, 4});
104+
86105
printFinalStatus(NumErrors);
87106
return NumErrors;
88107
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60
2+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
3+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
4+
5+
// TODO: accelerator may not suport atomics required by the current
6+
// implementation. Enable testing when implementation is fixed.
7+
// RUNx: %ACC_RUN_PLACEHOLDER %t.out
8+
9+
// This test performs basic checks of parallel_for(range<3>, reduction, func)
10+
// with reductions initialized with a one element buffer. Additionally, some
11+
// reducers will not be written to.
12+
13+
#include "reduction_utils.hpp"
14+
15+
using namespace sycl;
16+
17+
int NumErrors = 0;
18+
19+
template <typename T> class SkipEvenName;
20+
template <typename T> class SkipOddName;
21+
template <typename T> class SkipAllName;
22+
23+
template <typename Name, typename T, typename... ArgTys>
24+
void tests(ArgTys &&...Args) {
25+
NumErrors += test<SkipEvenName<Name>, T>(std::forward<ArgTys>(Args)...,
26+
property_list{}, SkipEvenOp{});
27+
NumErrors += test<SkipOddName<Name>, T>(std::forward<ArgTys>(Args)...,
28+
property_list{}, SkipOddOp{});
29+
NumErrors += test<SkipAllName<Name>, T>(std::forward<ArgTys>(Args)...,
30+
property_list{}, SkipAllOp{});
31+
}
32+
33+
int main() {
34+
queue Q;
35+
printDeviceInfo(Q);
36+
size_t MaxWGSize =
37+
Q.get_device().get_info<info::device::max_work_group_size>();
38+
39+
tests<class A1, int>(Q, 0, 99, std::plus<>{}, range<3>{1, 1, 1});
40+
tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<3>{2, 2, 2});
41+
tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<3>{2, 3, 4});
42+
43+
tests<class A4, int64_t>(Q, 0, 99, std::plus<>{},
44+
range<3>{1, 1, MaxWGSize + 1});
45+
tests<class A5, int64_t>(Q, 0, 99, std::plus<>{},
46+
range<3>{1, MaxWGSize + 1, 1});
47+
tests<class A6, int64_t>(Q, 0, 99, std::plus<>{},
48+
range<3>{MaxWGSize + 1, 1, 1});
49+
50+
tests<class A7, int64_t>(Q, 0, 99, std::plus<>{},
51+
range<3>{2, 5, MaxWGSize * 2});
52+
tests<class A8, int64_t>(Q, 0, 99, std::plus<>{},
53+
range<3>{3, MaxWGSize * 3, 2});
54+
tests<class A9, int64_t>(Q, 0, 99, std::plus<>{},
55+
range<3>{MaxWGSize * 3, 8, 4});
56+
57+
tests<class B1, CustomVec<long long>>(Q, 0, 99, CustomVecPlus<long long>{},
58+
range<3>{2, 33, MaxWGSize});
59+
tests<class B2, CustomVec<long long>>(Q, 99, CustomVecPlus<long long>{},
60+
range<3>{2, 33, MaxWGSize});
61+
62+
tests<class C1, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{1, 1, 1});
63+
tests<class C2, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{2, 2, 2});
64+
tests<class C3, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{2, 3, 4});
65+
66+
tests<class C4, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
67+
range<3>{1, 1, MaxWGSize + 1});
68+
tests<class C5, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
69+
range<3>{1, MaxWGSize + 1, 1});
70+
tests<class C6, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
71+
range<3>{MaxWGSize + 1, 1, 1});
72+
73+
tests<class C7, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
74+
range<3>{2, 5, MaxWGSize * 2});
75+
tests<class C8, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
76+
range<3>{3, MaxWGSize * 3, 2});
77+
tests<class C9, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
78+
range<3>{MaxWGSize * 3, 8, 4});
79+
80+
printFinalStatus(NumErrors);
81+
return NumErrors;
82+
}

sycl/test-e2e/Reduction/reduction_range_N_vars.cpp

+3-1
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,9 @@ struct Red {
4848
}
4949

5050
void init() {
51-
initInputData(InBuf, CorrectOut, BOp, NWorkItems);
51+
std::optional<T> CorrectOutOpt;
52+
initInputData(InBuf, CorrectOutOpt, BOp, NWorkItems);
53+
CorrectOut = *CorrectOutOpt;
5254
if (!PropList.template has_property<
5355
property::reduction::initialize_to_identity>())
5456
CorrectOut = BOp(CorrectOut, InitVal);

sycl/test-e2e/Reduction/reduction_usm.cpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -38,12 +38,12 @@ int test(queue &Q, OptionalIdentity<T, HasIdentity> Identity, T Init,
3838
}
3939

4040
// Initialize.
41-
T CorrectOut;
41+
std::optional<T> CorrectOutOpt;
4242
BinaryOperation BOp;
4343

4444
buffer<T, 1> InBuf(NWItems);
45-
initInputData(InBuf, CorrectOut, BOp, NWItems);
46-
CorrectOut = BOp(CorrectOut, Init);
45+
initInputData(InBuf, CorrectOutOpt, BOp, NWItems);
46+
T CorrectOut = BOp(*CorrectOutOpt, Init);
4747

4848
// Compute.
4949
Q.submit([&](handler &CGH) {

0 commit comments

Comments
 (0)