diff --git a/sycl/include/sycl/reduction.hpp b/sycl/include/sycl/reduction.hpp index 1bd9465c57f0c..77d052275e247 100644 --- a/sycl/include/sycl/reduction.hpp +++ b/sycl/include/sycl/reduction.hpp @@ -1079,17 +1079,25 @@ using __sycl_reduction_kernel = std::conditional_t::value, auto_name, MainOrAux>; +// Enum for specifying work size guarantees in tree-reduction. +enum class WorkSizeGuarantees { None, Equal, LessOrEqual }; + // Implementations. template struct NDRangeReduction; template <> struct NDRangeReduction { - template + template static void run(handler &CGH, std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, - Reduction &Redu, KernelType &KernelFunc) { + nd_range NDRange, size_t, size_t, + PropertiesT &Properties, Reduction &Redu, + KernelType &KernelFunc) { + static_assert(Reduction::has_identity, + "Identityless reductions are not supported by the " + "local_atomic_and_atomic_cross_wg strategy."); + std::ignore = Queue; using Name = __sycl_reduction_kernel< reduction::MainKrn, KernelName, @@ -1132,11 +1140,12 @@ struct NDRangeReduction { template <> struct NDRangeReduction< reduction::strategy::group_reduce_and_last_wg_detection> { - template + template static void run(handler &CGH, std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, - Reduction &Redu, KernelType &KernelFunc) { + nd_range NDRange, size_t, size_t, + PropertiesT &Properties, Reduction &Redu, + KernelType &KernelFunc) { static_assert(Reduction::has_identity, "Identityless reductions are not supported by the " "group_reduce_and_last_wg_detection strategy."); @@ -1276,13 +1285,10 @@ void doTreeReductionHelper(size_t WorkSize, size_t LID, FuncTy Func) { } } -// Enum for specifying work size guarantees in tree-reduction. -enum class WorkSizeGuarantees { None, Equal, LessOrEqual }; - template void doTreeReduction(size_t WorkSize, nd_item NDIt, LocalRedsTy &LocalReds, - BinOpTy &BOp, AccessFuncTy AccessFunc) { + const BinOpTy &BOp, AccessFuncTy AccessFunc) { size_t LID = NDIt.get_local_linear_id(); size_t AdjustedWorkSize; if constexpr (WSGuarantee == WorkSizeGuarantees::LessOrEqual || @@ -1329,16 +1335,25 @@ void doTreeReductionOnTuple(size_t WorkSize, size_t LID, } template <> struct NDRangeReduction { - template + template static void run(handler &CGH, std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, + nd_range NDRange, size_t ActiveItemsPerWG, + size_t ActiveItemsLastWG, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { std::ignore = Queue; size_t NElements = Reduction::num_elements; size_t WGSize = NDRange.get_local_range().size(); size_t NWorkGroups = NDRange.get_group_range().size(); + assert((WSG != WorkSizeGuarantees::Equal || + (ActiveItemsPerWG == WGSize && ActiveItemsLastWG == WGSize)) && + "Work size guarantee violation."); + + // Ignore active items to avoid warnings when uncaptured. + std::ignore = ActiveItemsPerWG; + std::ignore = ActiveItemsLastWG; + bool IsUpdateOfUserVar = !Reduction::is_usm && !Redu.initializeToIdentity(); auto PartialSums = Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH); @@ -1362,14 +1377,31 @@ template <> struct NDRangeReduction { typename Reduction::reducer_type(IdentityContainer, BOp); KernelFunc(NDId, Reducer); + size_t NWorkGroups = NDId.get_group_range().size(); + + // Compute the number of active items for this work-group. 
+ size_t ActiveItems = [&]() { + if constexpr (WSG == WorkSizeGuarantees::Equal) { + // We are guaranteed exactly one element for each work-item, so we + // can assume that ActiveItemsPerWG is the same as the work-group + // size. + return NDId.get_local_range().size(); + } else { + // Otherwise every work-group has ActiveItemsPerWG active items, + // except the last group, which may have fewer. + size_t GrID = NDId.get_group_linear_id(); + return GrID == NWorkGroups - 1 ? ActiveItemsLastWG : ActiveItemsPerWG; + } + }(); + // If there are multiple values, reduce each separately // This prevents local memory from scaling with elements size_t LID = NDId.get_local_linear_id(); for (int E = 0; E < NElements; ++E) { - doTreeReduction( - WGSize, NDId, LocalReds, BOp, - [&](size_t) { return getReducerAccess(Reducer).getElement(E); }); + doTreeReduction(ActiveItems, NDId, LocalReds, BOp, [&](size_t) { + return getReducerAccess(Reducer).getElement(E); + }); if (LID == 0) { auto V = LocalReds[0]; @@ -1417,11 +1449,16 @@ template <> struct NDRangeReduction { - template + template static void run(handler &CGH, std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, - Reduction &Redu, KernelType &KernelFunc) { + nd_range NDRange, size_t, size_t, + PropertiesT &Properties, Reduction &Redu, + KernelType &KernelFunc) { + static_assert(Reduction::has_identity, + "Identityless reductions are not supported by the " + "group_reduce_and_atomic_cross_wg strategy."); + std::ignore = Queue; using Name = __sycl_reduction_kernel< reduction::MainKrn, KernelName, @@ -1449,10 +1486,11 @@ struct NDRangeReduction { template <> struct NDRangeReduction< reduction::strategy::local_mem_tree_and_atomic_cross_wg> { - template + template static void run(handler &CGH, std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, + nd_range NDRange, size_t ActiveItemsPerWG, + size_t ActiveItemsLastWG, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { std::ignore = Queue; using Name = __sycl_reduction_kernel< @@ -1462,6 +1500,14 @@ struct NDRangeReduction< size_t NElements = Reduction::num_elements; size_t WGSize = NDRange.get_local_range().size(); + assert((WSG != WorkSizeGuarantees::Equal || + (ActiveItemsPerWG == WGSize && ActiveItemsLastWG == WGSize)) && + "Work size guarantee violation."); + + // Ignore active items to avoid warnings when uncaptured. + std::ignore = ActiveItemsPerWG; + std::ignore = ActiveItemsLastWG; + // Use local memory to reduce elements in work-groups into zero-th // element. local_accessor LocalReds{WGSize, CGH}; @@ -1471,17 +1517,33 @@ struct NDRangeReduction< typename Reduction::reducer_type Reducer; KernelFunc(NDIt, Reducer); - size_t WGSize = NDIt.get_local_range().size(); size_t LID = NDIt.get_local_linear_id(); + // Compute the number of active items for this work-group. + size_t ActiveItems = [&]() { + if constexpr (WSG == WorkSizeGuarantees::Equal) { + // We are guaranteed exactly one element for each work-item, so we + // can assume that ActiveItemsPerWG is the same as the work-group + // size. + return NDIt.get_local_range().size(); + } else { + // Otherwise every work-group has ActiveItemsPerWG active items, + // except the last group, which may have fewer. + size_t GrID = NDIt.get_group_linear_id(); + size_t NWorkGroups = NDIt.get_group_range().size(); + return GrID == NWorkGroups - 1 ?
ActiveItemsLastWG + : ActiveItemsPerWG; + } + }(); + // If there are multiple values, reduce each separately // This prevents local memory from scaling with elements for (int E = 0; E < NElements; ++E) { typename Reduction::binary_operation BOp; - doTreeReduction( - WGSize, NDIt, LocalReds, BOp, - [&](size_t) { return getReducerAccess(Reducer).getElement(E); }); + doTreeReduction(ActiveItems, NDIt, LocalReds, BOp, [&](size_t) { + return getReducerAccess(Reducer).getElement(E); + }); if (LID == 0) getReducerAccess(Reducer).getElement(E) = LocalReds[0]; @@ -1503,11 +1565,16 @@ struct NDRangeReduction< template <> struct NDRangeReduction< reduction::strategy::group_reduce_and_multiple_kernels> { - template + template static void run(handler &CGH, std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, - Reduction &Redu, KernelType &KernelFunc) { + nd_range NDRange, size_t, size_t, + PropertiesT &Properties, Reduction &Redu, + KernelType &KernelFunc) { + static_assert(Reduction::has_identity, + "Identityless reductions are not supported by the " + "group_reduce_and_multiple_kernels strategy."); + // Before running the kernels, check that device has enough local memory // to hold local arrays that may be required for the reduction algorithm. // TODO: If the work-group-size is limited by the local memory, then @@ -1637,10 +1704,11 @@ struct NDRangeReduction< }; template <> struct NDRangeReduction { - template + template static void run(handler &CGH, std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, + nd_range NDRange, size_t ActiveItemsPerWG, + size_t ActiveItemsLastWG, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { constexpr bool HFR = Reduction::has_fast_reduce; size_t OneElemSize = HFR ? 0 : sizeof(typename Reduction::result_type); @@ -1660,6 +1728,14 @@ template <> struct NDRangeReduction { size_t NWorkGroups = NDRange.get_group_range().size(); auto Out = Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH); + assert((WSG != WorkSizeGuarantees::Equal || + (ActiveItemsPerWG == WGSize && ActiveItemsLastWG == WGSize)) && + "Work size guarantee violation."); + + // Ignore active items to avoid warnings when uncaptured. + std::ignore = ActiveItemsPerWG; + std::ignore = ActiveItemsLastWG; + bool IsUpdateOfUserVar = !Reduction::is_usm && !Redu.initializeToIdentity() && NWorkGroups == 1; @@ -1678,20 +1754,34 @@ template <> struct NDRangeReduction { typename Reduction::reducer_type(IdentityContainer, BOp); KernelFunc(NDIt, Reducer); - size_t WGSize = NDIt.get_local_range().size(); size_t LID = NDIt.get_local_linear_id(); + size_t GrID = NDIt.get_group_linear_id(); + + // Compute the number of active items for this work-group. + size_t ActiveItems = [&]() { + if constexpr (WSG == WorkSizeGuarantees::Equal) { + // We are guaranteed exactly one element for each work-item, so we + // can assume that ActiveItemsPerWG is the same as the work-group + // size. + return NDIt.get_local_range().size(); + } else { + // Otherwise every work-group has ActiveItemsPerWG active items, + // except the last group, which may have fewer. + size_t NWorkGroups = NDIt.get_group_range().size(); + return GrID == NWorkGroups - 1 ?
ActiveItemsLastWG : ActiveItemsPerWG; + } + }(); // If there are multiple values, reduce each separately // This prevents local memory from scaling with elements for (int E = 0; E < NElements; ++E) { - doTreeReduction( - WGSize, NDIt, LocalReds, BOp, - [&](size_t) { return getReducerAccess(Reducer).getElement(E); }); + doTreeReduction(ActiveItems, NDIt, LocalReds, BOp, [&](size_t) { + return getReducerAccess(Reducer).getElement(E); + }); // Compute the partial sum/reduction for the work-group. if (LID == 0) { - size_t GrID = NDIt.get_group_linear_id(); typename Reduction::result_type PSum = LocalReds[0]; if (IsUpdateOfUserVar) PSum = BOp(Out[0], PSum); @@ -1929,23 +2019,29 @@ struct IsArrayReduction { /// All scalar reductions are processed together; there is one loop of log2(N) /// steps, and each reduction uses its own storage. -template +template void reduCGFuncImplScalar( - bool IsOneWG, nd_item NDIt, ReduTupleT LocalAccsTuple, + bool IsOneWG, nd_item NDIt, size_t ActiveItems, + ReduTupleT LocalAccsTuple, ReduTupleT OutAccsTuple, std::tuple &ReducersTuple, ReduTupleT IdentitiesTuple, ReduTupleT BOPsTuple, std::array InitToIdentityProps, std::index_sequence ReduIndices) { - size_t WGSize = NDIt.get_local_range().size(); + static_assert(WSG != WorkSizeGuarantees::None, + "reduCGFuncImplScalar requires either equal or less-or-equal " + "work size guarantee"); + size_t LID = NDIt.get_local_linear_id(); - ((std::get(LocalAccsTuple)[LID] = - getReducerAccess(std::get(ReducersTuple)).getElement(0)), - ...); + if (WSG == WorkSizeGuarantees::Equal || LID < ActiveItems) + ((std::get(LocalAccsTuple)[LID] = + getReducerAccess(std::get(ReducersTuple)).getElement(0)), + ...); - doTreeReductionOnTuple(WGSize, LID, LocalAccsTuple, BOPsTuple, ReduIndices); + doTreeReductionOnTuple(ActiveItems, LID, LocalAccsTuple, BOPsTuple, + ReduIndices); // Compute the partial sum/reduction for the work-group. if (LID == 0) { @@ -1957,22 +2053,22 @@ void reduCGFuncImplScalar( } /// Each array reduction is processed separately. -template +template void reduCGFuncImplArrayHelper(bool IsOneWG, nd_item NDIt, - LocalAccT LocalReds, OutAccT Out, - ReducerT &Reducer, BOPT BOp, + size_t ActiveItems, LocalAccT LocalReds, + OutAccT Out, ReducerT &Reducer, BOPT BOp, bool IsInitializeToIdentity) { - size_t WGSize = NDIt.get_local_range().size(); size_t LID = NDIt.get_local_linear_id(); // If there are multiple values, reduce each separately // This prevents local memory from scaling with elements auto NElements = Reduction::num_elements; for (size_t E = 0; E < NElements; ++E) { - doTreeReduction( - WGSize, NDIt, LocalReds, BOp, - [&](size_t) { return getReducerAccess(Reducer).getElement(E); }); + doTreeReduction(ActiveItems, NDIt, LocalReds, BOp, [&](size_t) { + return getReducerAccess(Reducer).getElement(E); + }); // Add the initial value of user's variable to the final result. 
if (LID == 0) { @@ -1994,34 +2090,40 @@ void reduCGFuncImplArrayHelper(bool IsOneWG, nd_item NDIt, } } -template +template void reduCGFuncImplArray( - bool IsOneWG, nd_item NDIt, ReduTupleT LocalAccsTuple, + bool IsOneWG, nd_item NDIt, size_t ActiveItems, + ReduTupleT LocalAccsTuple, ReduTupleT OutAccsTuple, std::tuple &ReducersTuple, ReduTupleT BOPsTuple, std::array InitToIdentityProps, std::index_sequence) { using ReductionPack = std::tuple; - (reduCGFuncImplArrayHelper>( - IsOneWG, NDIt, std::get(LocalAccsTuple), std::get(OutAccsTuple), - std::get(ReducersTuple), std::get(BOPsTuple), - InitToIdentityProps[Is]), + (reduCGFuncImplArrayHelper>( + IsOneWG, NDIt, ActiveItems, std::get(LocalAccsTuple), + std::get(OutAccsTuple), std::get(ReducersTuple), + std::get(BOPsTuple), InitToIdentityProps[Is]), ...); } namespace reduction::main_krn { template struct NDRangeMulti; } // namespace reduction::main_krn -template +template void reduCGFuncMulti(handler &CGH, KernelType KernelFunc, - const nd_range &Range, PropertiesT Properties, + const nd_range &Range, size_t ActiveItemsPerWG, + size_t ActiveItemsLastWG, PropertiesT Properties, std::tuple &ReduTuple, std::index_sequence ReduIndices) { size_t WGSize = Range.get_local_range().size(); + // Ignore active items to avoid warnings when uncaptured. + std::ignore = ActiveItemsPerWG; + std::ignore = ActiveItemsLastWG; + // Split reduction sequence into two: // 1) Scalar reductions // 2) Array reductions @@ -2066,21 +2168,38 @@ void reduCGFuncMulti(handler &CGH, KernelType KernelFunc, std::apply([&](auto &...Reducers) { KernelFunc(NDIt, Reducers...); }, ReducersTuple); + // Compute the number of active items for this work-group. + size_t ActiveItems = [&]() { + if constexpr (WSG == WorkSizeGuarantees::Equal) { + // We are guaranteed exactly one element for each work-item, so we + // can assume that ActiveItemsPerWG is the same as the work-group + // size. + return NDIt.get_local_range().size(); + } else { + // Otherwise every work-group has ActiveItemsPerWG active items, + // except the last group, which may have fewer. + size_t GrID = NDIt.get_group_linear_id(); + size_t NWorkGroups = NDIt.get_group_range().size(); + return GrID == NWorkGroups - 1 ? ActiveItemsLastWG : ActiveItemsPerWG; + } + }(); + // Combine and write-back the results of any scalar reductions // reduCGFuncImplScalar(NDIt, LocalAccsTuple, OutAccsTuple, // ReducersTuple, IdentitiesTuple, BOPsTuple, InitToIdentityProps, // ReduIndices); - reduCGFuncImplScalar( - IsOneWG, NDIt, LocalAccsTuple, OutAccsTuple, ReducersTuple, - IdentitiesTuple, BOPsTuple, InitToIdentityProps, ScalarIs); + reduCGFuncImplScalar( + IsOneWG, NDIt, ActiveItems, LocalAccsTuple, OutAccsTuple, + ReducersTuple, IdentitiesTuple, BOPsTuple, InitToIdentityProps, + ScalarIs); // Combine and write-back the results of any array reductions // These are handled separately to minimize temporary storage and account // for the fact that each array reduction may have a different number of // elements to reduce (i.e. a different extent).
- reduCGFuncImplArray(IsOneWG, NDIt, LocalAccsTuple, - OutAccsTuple, ReducersTuple, BOPsTuple, - InitToIdentityProps, ArrayIs); + reduCGFuncImplArray( + IsOneWG, NDIt, ActiveItems, LocalAccsTuple, OutAccsTuple, + ReducersTuple, BOPsTuple, InitToIdentityProps, ArrayIs); }); }; @@ -2282,11 +2401,17 @@ tuple_select_elements(TupleT Tuple, std::index_sequence) { } template <> struct NDRangeReduction { - template + template static void run(handler &CGH, std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, + nd_range NDRange, size_t ActiveItemsPerWG, + size_t ActiveItemsLastWG, PropertiesT &Properties, RestT... Rest) { + assert((WSG != WorkSizeGuarantees::Equal || + (ActiveItemsPerWG == NDRange.get_local_range().size() && + ActiveItemsLastWG == NDRange.get_local_range().size())) && + "Work size guarantee violation."); + std::tuple ArgsTuple(Rest...); constexpr size_t NumArgs = sizeof...(RestT); auto KernelFunc = std::get(ArgsTuple); @@ -2305,8 +2430,9 @@ template <> struct NDRangeReduction { std::to_string(MaxWGSize), PI_ERROR_INVALID_WORK_GROUP_SIZE); - reduCGFuncMulti(CGH, KernelFunc, NDRange, Properties, ReduTuple, - ReduIndices); + reduCGFuncMulti(CGH, KernelFunc, NDRange, ActiveItemsPerWG, + ActiveItemsLastWG, Properties, ReduTuple, + ReduIndices); reduction::finalizeHandler(CGH); size_t NWorkItems = NDRange.get_group_range().size(); @@ -2326,14 +2452,16 @@ template <> struct NDRangeReduction { using Impl = NDRangeReduction; using Strat = reduction::strategy; - template + template static void run(handler &CGH, std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, + nd_range NDRange, size_t ActiveItemsPerWG, + size_t ActiveItemsLastWG, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { auto Delegate = [&](auto Impl) { - Impl.template run(CGH, Queue, NDRange, Properties, Redu, - KernelFunc); + Impl.template run(CGH, Queue, NDRange, ActiveItemsPerWG, + ActiveItemsLastWG, Properties, Redu, + KernelFunc); }; if constexpr (Reduction::has_float64_atomics) { @@ -2359,22 +2487,39 @@ template <> struct NDRangeReduction { assert(false && "Must be unreachable!"); } - template + template static void run(handler &CGH, std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, + nd_range NDRange, size_t ActiveItemsPerWG, + size_t ActiveItemsLastWG, PropertiesT &Properties, RestT... Rest) { - return Impl::run(CGH, Queue, NDRange, Properties, - Rest...); + return Impl::run( + CGH, Queue, NDRange, ActiveItemsPerWG, ActiveItemsLastWG, Properties, + Rest...); } }; +template +void reduction_parallel_for_impl(handler &CGH, + std::shared_ptr &Queue, + nd_range NDRange, + size_t ActiveItemsPerWG, + size_t ActiveItemsLastWG, + PropertiesT Properties, RestT... Rest) { + NDRangeReduction::template run( + CGH, Queue, NDRange, ActiveItemsPerWG, ActiveItemsLastWG, Properties, + Rest...); +} + template void reduction_parallel_for(handler &CGH, nd_range NDRange, PropertiesT Properties, RestT... 
Rest) { - NDRangeReduction::template run(CGH, CGH.MQueue, NDRange, - Properties, Rest...); + size_t WGSize = NDRange.get_local_range().size(); + reduction_parallel_for_impl( + CGH, CGH.MQueue, NDRange, WGSize, WGSize, Properties, Rest...); } __SYCL_EXPORT uint32_t @@ -2398,7 +2543,8 @@ void reduction_parallel_for(handler &CGH, range Range, if constexpr (sizeof...(RestT) == 2) { using Reduction = std::tuple_element_t<0, decltype(ReduTuple)>; constexpr bool IsTreeReduction = - !Reduction::has_fast_reduce && !Reduction::has_fast_atomics; + !(Reduction::has_fast_reduce && Reduction::has_identity) && + !Reduction::has_fast_atomics; return IsTreeReduction ? sizeof(typename Reduction::result_type) : 0; } else { return reduGetMemPerWorkItem(ReduTuple, ReduIndices); @@ -2428,6 +2574,9 @@ void reduction_parallel_for(handler &CGH, range Range, nd_range<1> NDRange{range<1>{NDRItems}, range<1>{WGSize}}; size_t PerGroup = Range.size() / NWorkGroups; + size_t ActiveItemsPerWG = std::min(PerGroup, WGSize); + size_t ActiveItemsLastWG = + std::min(NWorkItems - PerGroup * (NWorkGroups - 1), WGSize); // Iterate through the index space by assigning contiguous chunks to each // work-group, then iterating through each chunk using a stride equal to the // work-group's local range, which gives much better performance than using @@ -2484,13 +2633,17 @@ void reduction_parallel_for(handler &CGH, range Range, return reduction::strategy::range_basic; }(); - reduction_parallel_for(CGH, NDRange, Properties, - Redu, UpdatedKernelFunc); + reduction_parallel_for_impl( + CGH, CGH.MQueue, NDRange, ActiveItemsPerWG, ActiveItemsLastWG, + Properties, Redu, UpdatedKernelFunc); } else { return std::apply( [&](auto &...Reds) { - return reduction_parallel_for( - CGH, NDRange, Properties, Reds..., UpdatedKernelFunc); + return reduction_parallel_for_impl( + CGH, CGH.MQueue, NDRange, ActiveItemsPerWG, ActiveItemsLastWG, + Properties, Reds..., UpdatedKernelFunc); }, ReduTuple); }
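For reference, the per-work-group active-item selection that each strategy above repeats inside its kernel can be modelled in isolation as a plain host-side function. This is a minimal sketch; the helper name pickActiveItems and the values checked in main are illustrative only and are not part of this patch.

// Standalone model of the active-item selection added by this patch.
// Only the WorkSizeGuarantees enum mirrors the patch; everything else is
// hypothetical.
#include <cassert>
#include <cstddef>

enum class WorkSizeGuarantees { None, Equal, LessOrEqual };

// For a given work-group, return how many of its work-items actually carry a
// value that must take part in the tree reduction.
template <WorkSizeGuarantees WSG>
std::size_t pickActiveItems(std::size_t GroupId, std::size_t NumGroups,
                            std::size_t WGSize, std::size_t ActiveItemsPerWG,
                            std::size_t ActiveItemsLastWG) {
  if constexpr (WSG == WorkSizeGuarantees::Equal) {
    // Every work-item is guaranteed to contribute exactly one value, so the
    // whole work-group participates.
    return WGSize;
  } else {
    // Every work-group has ActiveItemsPerWG active items, except the last
    // one, which may have fewer.
    return GroupId == NumGroups - 1 ? ActiveItemsLastWG : ActiveItemsPerWG;
  }
}

int main() {
  // Example: 10 values spread over 4 groups of 4 work-items, 2 per group up
  // front, 4 in the last group.
  assert(pickActiveItems<WorkSizeGuarantees::LessOrEqual>(0, 4, 4, 2, 4) == 2);
  assert(pickActiveItems<WorkSizeGuarantees::LessOrEqual>(3, 4, 4, 2, 4) == 4);
  assert(pickActiveItems<WorkSizeGuarantees::Equal>(0, 4, 4, 4, 4) == 4);
  return 0;
}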
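The guarantee matters because the in-group tree reduction only ever reads the first ActiveItems slots of the local buffer, so inactive work-items never have to publish a value. A simplified host-side model of such a bounded pairwise tree reduction is shown below; treeReduce is a hypothetical stand-in and does not reproduce the exact loop structure of doTreeReductionHelper.

// Host-side model: reduce only the first ActiveItems slots of a buffer.
// Slots at index >= ActiveItems are never read.
#include <cassert>
#include <cstddef>
#include <functional>
#include <vector>

template <typename T, typename BinOp>
T treeReduce(std::vector<T> Local, std::size_t ActiveItems, BinOp Op) {
  // Pairwise halving: in each step, slot I is combined with slot I + Half,
  // where Half = ceil(Size / 2); an odd leftover element stays in place.
  for (std::size_t Size = ActiveItems; Size > 1; Size = (Size + 1) / 2) {
    std::size_t Half = (Size + 1) / 2;
    for (std::size_t I = 0; I + Half < Size; ++I)
      Local[I] = Op(Local[I], Local[I + Half]);
  }
  return Local[0];
}

int main() {
  // Eight slots of "local memory", but only the first five are active.
  std::vector<int> Slots = {1, 2, 3, 4, 5, 0, 0, 0};
  assert(treeReduce(Slots, 5, std::plus<int>()) == 15);
  return 0;
}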
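Finally, the range<> entry point derives the two counts with the std::min expressions shown above. A worked example, assuming NWorkItems is the total number of input elements (Range.size()); the concrete sizes are made up for illustration.

// Worked example of the ActiveItemsPerWG / ActiveItemsLastWG arithmetic.
#include <algorithm>
#include <cassert>
#include <cstddef>

int main() {
  std::size_t NWorkItems = 10;  // total elements to reduce (assumed meaning)
  std::size_t NWorkGroups = 4;
  std::size_t WGSize = 4;

  std::size_t PerGroup = NWorkItems / NWorkGroups;            // 10 / 4 = 2
  std::size_t ActiveItemsPerWG = std::min(PerGroup, WGSize);  // min(2, 4) = 2
  std::size_t ActiveItemsLastWG =
      std::min(NWorkItems - PerGroup * (NWorkGroups - 1), WGSize); // min(4, 4)

  assert(ActiveItemsPerWG == 2);
  assert(ActiveItemsLastWG == 4);
  // The first three groups feed 2 slots each into the tree reduction, the
  // last group feeds 4, covering all 10 elements.
  return 0;
}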