diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index df1cd55cab0a2..4054b9fa1428a 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -960,6 +960,68 @@ __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT int __spirv_GroupNonUniformBallotFindLSB(__spv::Scope::Flag, __ocl_vec_t) noexcept; +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT + __spirv_GroupNonUniformBroadcast(__spv::Scope::Flag, ValueT, IdT); + +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT bool +__spirv_GroupNonUniformAll(__spv::Scope::Flag, bool); + +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT bool +__spirv_GroupNonUniformAny(__spv::Scope::Flag, bool); + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformSMin(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformUMin(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformFMin(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformSMax(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformUMax(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformFMax(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformIAdd(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformFAdd(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformIMul(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformFMul(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformBitwiseOr(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformBitwiseXor(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformBitwiseAnd(__spv::Scope::Flag, unsigned int, ValueT); + extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT void __clc_BarrierInitialize(int64_t *state, int32_t expected_count) noexcept; diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 2da8ee1a7db90..9534cc5a4f9e9 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -23,6 +24,9 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) { namespace ext { namespace oneapi { struct sub_group; +namespace experimental { +template class ballot_group; +} // namespace experimental } // namespace oneapi } // namespace ext @@ -56,6 +60,11 @@ template <> struct group_scope<::sycl::ext::oneapi::sub_group> { static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Subgroup; }; +template +struct group_scope> { + static constexpr __spv::Scope::Flag value = group_scope::value; +}; + // Generic shuffles and broadcasts may require multiple calls to // intrinsics, and should use the fewest broadcasts possible // - Loop over chunks until remaining bytes < chunk size @@ -94,13 +103,37 @@ void GenericCall(const Functor &ApplyToBytes) { } } -template bool GroupAll(bool pred) { +template bool GroupAll(Group, bool pred) { return __spirv_GroupAll(group_scope::value, pred); } +template +bool GroupAll(ext::oneapi::experimental::ballot_group g, + bool pred) { + // ballot_group partitions its parent into two groups (0 and 1) + // We have to force each group down different control flow + // Work-items in the "false" group (0) may still be active + if (g.get_group_id() == 1) { + return __spirv_GroupNonUniformAll(group_scope::value, pred); + } else { + return __spirv_GroupNonUniformAll(group_scope::value, pred); + } +} -template bool GroupAny(bool pred) { +template bool GroupAny(Group, bool pred) { return __spirv_GroupAny(group_scope::value, pred); } +template +bool GroupAny(ext::oneapi::experimental::ballot_group g, + bool pred) { + // ballot_group partitions its parent into two groups (0 and 1) + // We have to force each group down different control flow + // Work-items in the "false" group (0) may still be active + if (g.get_group_id() == 1) { + return __spirv_GroupNonUniformAny(group_scope::value, pred); + } else { + return __spirv_GroupNonUniformAny(group_scope::value, pred); + } +} // Native broadcasts map directly to a SPIR-V GroupBroadcast intrinsic // FIXME: Do not special-case for half once all backends support all data types. @@ -157,7 +190,7 @@ template <> struct GroupId<::sycl::ext::oneapi::sub_group> { using type = uint32_t; }; template -EnableIfNativeBroadcast GroupBroadcast(T x, IdT local_id) { +EnableIfNativeBroadcast GroupBroadcast(Group, T x, IdT local_id) { using GroupIdT = typename GroupId::type; GroupIdT GroupLocalId = static_cast(local_id); using OCLT = detail::ConvertToOpenCLType_t; @@ -167,15 +200,43 @@ EnableIfNativeBroadcast GroupBroadcast(T x, IdT local_id) { OCLIdT OCLId = detail::convertDataToType(GroupLocalId); return __spirv_GroupBroadcast(group_scope::value, OCLX, OCLId); } +template +EnableIfNativeBroadcast +GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, + T x, IdT local_id) { + // Remap local_id to its original numbering in ParentGroup. + auto LocalId = detail::IdToMaskPosition(g, local_id); + + // TODO: Refactor to avoid duplication after design settles. + using GroupIdT = typename GroupId::type; + GroupIdT GroupLocalId = static_cast(LocalId); + using OCLT = detail::ConvertToOpenCLType_t; + using WidenedT = WidenOpenCLTypeTo32_t; + using OCLIdT = detail::ConvertToOpenCLType_t; + WidenedT OCLX = detail::convertDataToType(x); + OCLIdT OCLId = detail::convertDataToType(GroupLocalId); + + // ballot_group partitions its parent into two groups (0 and 1) + // We have to force each group down different control flow + // Work-items in the "false" group (0) may still be active + if (g.get_group_id() == 1) { + return __spirv_GroupNonUniformBroadcast(group_scope::value, + OCLX, OCLId); + } else { + return __spirv_GroupNonUniformBroadcast(group_scope::value, + OCLX, OCLId); + } +} + template -EnableIfBitcastBroadcast GroupBroadcast(T x, IdT local_id) { +EnableIfBitcastBroadcast GroupBroadcast(Group g, T x, IdT local_id) { using BroadcastT = ConvertToNativeBroadcastType_t; auto BroadcastX = bit_cast(x); - BroadcastT Result = GroupBroadcast(BroadcastX, local_id); + BroadcastT Result = GroupBroadcast(g, BroadcastX, local_id); return bit_cast(Result); } template -EnableIfGenericBroadcast GroupBroadcast(T x, IdT local_id) { +EnableIfGenericBroadcast GroupBroadcast(Group g, T x, IdT local_id) { // Initialize with x to support type T without default constructor T Result = x; char *XBytes = reinterpret_cast(&x); @@ -183,7 +244,7 @@ EnableIfGenericBroadcast GroupBroadcast(T x, IdT local_id) { auto BroadcastBytes = [=](size_t Offset, size_t Size) { uint64_t BroadcastX, BroadcastResult; std::memcpy(&BroadcastX, XBytes + Offset, Size); - BroadcastResult = GroupBroadcast(BroadcastX, local_id); + BroadcastResult = GroupBroadcast(g, BroadcastX, local_id); std::memcpy(ResultBytes + Offset, &BroadcastResult, Size); }; GenericCall(BroadcastBytes); @@ -192,9 +253,10 @@ EnableIfGenericBroadcast GroupBroadcast(T x, IdT local_id) { // Broadcast with vector local index template -EnableIfNativeBroadcast GroupBroadcast(T x, id local_id) { +EnableIfNativeBroadcast GroupBroadcast(Group g, T x, + id local_id) { if (Dimensions == 1) { - return GroupBroadcast(x, local_id[0]); + return GroupBroadcast(g, x, local_id[0]); } using IdT = vec; using OCLT = detail::ConvertToOpenCLType_t; @@ -208,17 +270,26 @@ EnableIfNativeBroadcast GroupBroadcast(T x, id local_id) { OCLIdT OCLId = detail::convertDataToType(VecId); return __spirv_GroupBroadcast(group_scope::value, OCLX, OCLId); } +template +EnableIfNativeBroadcast +GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, + T x, id<1> local_id) { + // Limited to 1D indices for now because ParentGroup must be sub-group. + return GroupBroadcast(g, x, local_id[0]); +} template -EnableIfBitcastBroadcast GroupBroadcast(T x, id local_id) { +EnableIfBitcastBroadcast GroupBroadcast(Group g, T x, + id local_id) { using BroadcastT = ConvertToNativeBroadcastType_t; auto BroadcastX = bit_cast(x); - BroadcastT Result = GroupBroadcast(BroadcastX, local_id); + BroadcastT Result = GroupBroadcast(g, BroadcastX, local_id); return bit_cast(Result); } template -EnableIfGenericBroadcast GroupBroadcast(T x, id local_id) { +EnableIfGenericBroadcast GroupBroadcast(Group g, T x, + id local_id) { if (Dimensions == 1) { - return GroupBroadcast(x, local_id[0]); + return GroupBroadcast(g, x, local_id[0]); } // Initialize with x to support type T without default constructor T Result = x; @@ -227,7 +298,7 @@ EnableIfGenericBroadcast GroupBroadcast(T x, id local_id) { auto BroadcastBytes = [=](size_t Offset, size_t Size) { uint64_t BroadcastX, BroadcastResult; std::memcpy(&BroadcastX, XBytes + Offset, Size); - BroadcastResult = GroupBroadcast(BroadcastX, local_id); + BroadcastResult = GroupBroadcast(g, BroadcastX, local_id); std::memcpy(ResultBytes + Offset, &BroadcastResult, Size); }; GenericCall(BroadcastBytes); @@ -801,6 +872,101 @@ EnableIfGenericShuffle SubgroupShuffleUp(T x, uint32_t delta) { return Result; } +template +typename std::enable_if_t< + ext::oneapi::experimental::is_fixed_topology_group_v> +ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { + __spirv_ControlBarrier(group_scope::value, getScope(FenceScope), + getMemorySemanticsMask(Order) | + __spv::MemorySemanticsMask::SubgroupMemory | + __spv::MemorySemanticsMask::WorkgroupMemory | + __spv::MemorySemanticsMask::CrossWorkgroupMemory); +} + +template +typename std::enable_if_t< + ext::oneapi::experimental::is_user_constructed_group_v> +ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { +#if defined(__SPIR__) + // SPIR-V does not define an instruction to synchronize partial groups. + // However, most (possibly all?) of the current SPIR-V targets execute + // work-items in lockstep, so we can probably get away with a MemoryBarrier. + // TODO: Replace this if SPIR-V defines a NonUniformControlBarrier + __spirv_MemoryBarrier(getScope(FenceScope), + getMemorySemanticsMask(Order) | + __spv::MemorySemanticsMask::SubgroupMemory | + __spv::MemorySemanticsMask::WorkgroupMemory | + __spv::MemorySemanticsMask::CrossWorkgroupMemory); +#elif defined(__NVPTX__) + // TODO: Call syncwarp with appropriate mask extracted from the group +#endif +} + +// TODO: Refactor to avoid duplication after design settles +#define __SYCL_GROUP_COLLECTIVE_OVERLOAD(Instruction) \ + template <__spv::GroupOperation Op, typename Group, typename T> \ + inline typename std::enable_if_t< \ + ext::oneapi::experimental::is_fixed_topology_group_v, T> \ + Group##Instruction(Group G, T x) { \ + using ConvertedT = detail::ConvertToOpenCLType_t; \ + \ + using OCLT = \ + conditional_t() || \ + std::is_same(), \ + cl_int, \ + conditional_t() || \ + std::is_same(), \ + cl_uint, ConvertedT>>; \ + OCLT Arg = x; \ + OCLT Ret = __spirv_Group##Instruction(group_scope::value, \ + static_cast(Op), Arg); \ + return Ret; \ + } \ + \ + template <__spv::GroupOperation Op, typename ParentGroup, typename T> \ + inline T Group##Instruction( \ + ext::oneapi::experimental::ballot_group g, T x) { \ + using ConvertedT = detail::ConvertToOpenCLType_t; \ + \ + using OCLT = \ + conditional_t() || \ + std::is_same(), \ + cl_int, \ + conditional_t() || \ + std::is_same(), \ + cl_uint, ConvertedT>>; \ + OCLT Arg = x; \ + /* ballot_group partitions its parent into two groups (0 and 1) */ \ + /* We have to force each group down different control flow */ \ + /* Work-items in the "false" group (0) may still be active */ \ + constexpr auto Scope = group_scope::value; \ + constexpr auto OpInt = static_cast(Op); \ + if (g.get_group_id() == 1) { \ + return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg); \ + } else { \ + return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg); \ + } \ + } + +__SYCL_GROUP_COLLECTIVE_OVERLOAD(SMin) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(UMin) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(FMin) + +__SYCL_GROUP_COLLECTIVE_OVERLOAD(SMax) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(UMax) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(FMax) + +__SYCL_GROUP_COLLECTIVE_OVERLOAD(IAdd) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(FAdd) + +__SYCL_GROUP_COLLECTIVE_OVERLOAD(IMulKHR) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(FMulKHR) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(CMulINTEL) + +__SYCL_GROUP_COLLECTIVE_OVERLOAD(BitwiseOrKHR) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(BitwiseXorKHR) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(BitwiseAndKHR) + } // namespace spirv } // namespace detail } // __SYCL_INLINE_VER_NAMESPACE(_V1) diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp index 474ced1a5fe91..33039cf7b389e 100644 --- a/sycl/include/sycl/detail/type_traits.hpp +++ b/sycl/include/sycl/detail/type_traits.hpp @@ -27,6 +27,29 @@ struct sub_group; namespace experimental { template class group_with_scratchpad; +template struct is_fixed_topology_group : std::false_type {}; + +template +inline constexpr bool is_fixed_topology_group_v = + is_fixed_topology_group::value; + +#ifdef SYCL_EXT_ONEAPI_ROOT_GROUP +template <> struct is_fixed_topology_group : std::true_type {}; +#endif + +template +struct is_fixed_topology_group> : std::true_type {}; + +template <> +struct is_fixed_topology_group : std::true_type { +}; + +template struct is_user_constructed_group : std::false_type {}; + +template +inline constexpr bool is_user_constructed_group_v = + is_user_constructed_group::value; + namespace detail { template struct is_group_helper : std::false_type {}; diff --git a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp index 04adcca79fcff..fcdce42652075 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp @@ -41,7 +41,7 @@ template class ballot_group { id_type get_local_id() const { #ifdef __SYCL_DEVICE_ONLY__ - return detail::CallerPositionInMask(Mask); + return sycl::detail::CallerPositionInMask(Mask); #else throw runtime_error("Non-uniform groups are not supported on host device.", PI_ERROR_INVALID_DEVICE); @@ -112,15 +112,17 @@ template class ballot_group { #endif } -private: - sub_group_mask Mask; - bool Predicate; - protected: + const sub_group_mask Mask; + const bool Predicate; + ballot_group(sub_group_mask m, bool p) : Mask(m), Predicate(p) {} friend ballot_group get_ballot_group(ParentGroup g, bool predicate); + + friend uint32_t sycl::detail::IdToMaskPosition>( + ballot_group Group, uint32_t Id); }; template @@ -149,5 +151,10 @@ template struct is_user_constructed_group> : std::true_type {}; } // namespace ext::oneapi::experimental + +template +struct is_group> + : std::true_type {}; + } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp index 8899ae74ae985..c7101fd198c83 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp @@ -10,52 +10,62 @@ #include #include #include +#include namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { -namespace ext::oneapi::experimental { - -template struct is_fixed_topology_group : std::false_type {}; - -template -inline constexpr bool is_fixed_topology_group_v = - is_fixed_topology_group::value; - -#ifdef SYCL_EXT_ONEAPI_ROOT_GROUP -template <> struct is_fixed_topology_group : std::true_type {}; -#endif - -template -struct is_fixed_topology_group> : std::true_type {}; - -template <> struct is_fixed_topology_group : std::true_type {}; -template struct is_user_constructed_group : std::false_type {}; - -template -inline constexpr bool is_user_constructed_group_v = - is_user_constructed_group::value; - -#ifdef __SYCL_DEVICE_ONLY__ -// TODO: This may need to be generalized beyond uint32_t for big masks namespace detail { -uint32_t CallerPositionInMask(sub_group_mask Mask) { - // FIXME: It would be nice to be able to jump straight to an __ocl_vec_t + +inline sycl::vec ExtractMask(ext::oneapi::sub_group_mask Mask) { sycl::marray TmpMArray; Mask.extract_bits(TmpMArray); sycl::vec MemberMask; for (int i = 0; i < 4; ++i) { MemberMask[i] = TmpMArray[i]; } + return MemberMask; +} + +#ifdef __SYCL_DEVICE_ONLY__ +// TODO: This may need to be generalized beyond uint32_t for big masks +inline uint32_t CallerPositionInMask(ext::oneapi::sub_group_mask Mask) { + sycl::vec MemberMask = ExtractMask(Mask); auto OCLMask = sycl::detail::ConvertToOpenCLType_t>(MemberMask); return __spirv_GroupNonUniformBallotBitCount( __spv::Scope::Subgroup, (int)__spv::GroupOperation::ExclusiveScan, OCLMask); } -} // namespace detail #endif +template +inline uint32_t IdToMaskPosition(NonUniformGroup Group, uint32_t Id) { + // TODO: This will need to be optimized + sycl::vec MemberMask = ExtractMask(Group.Mask); + uint32_t Count = 0; + for (int i = 0; i < 4; ++i) { + for (int b = 0; b < 32; ++b) { + if (MemberMask[i] & (1 << b)) { + if (Count == Id) { + return i * 32 + b; + } + Count++; + } + } + } + __builtin_unreachable(); + return Count; +} + +} // namespace detail + +namespace ext::oneapi::experimental { + +// Forward declarations of non-uniform group types for algorithm definitions +template class ballot_group; + } // namespace ext::oneapi::experimental + } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp index 1e20719aa472f..74fd03608a1cd 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp @@ -40,7 +40,7 @@ class opportunistic_group { id_type get_local_id() const { #ifdef __SYCL_DEVICE_ONLY__ - return detail::CallerPositionInMask(Mask); + return sycl::detail::CallerPositionInMask(Mask); #else throw runtime_error("Non-uniform groups are not supported on host device.", PI_ERROR_INVALID_DEVICE); diff --git a/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp index d832b644ef6d9..518cb1f118736 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp @@ -41,7 +41,7 @@ template class tangle_group { id_type get_local_id() const { #ifdef __SYCL_DEVICE_ONLY__ - return detail::CallerPositionInMask(Mask); + return sycl::detail::CallerPositionInMask(Mask); #else throw runtime_error("Non-uniform groups are not supported on host device.", PI_ERROR_INVALID_DEVICE); diff --git a/sycl/include/sycl/ext/oneapi/functional.hpp b/sycl/include/sycl/ext/oneapi/functional.hpp index b8be0c6573620..7d8d269fde0fb 100644 --- a/sycl/include/sycl/ext/oneapi/functional.hpp +++ b/sycl/include/sycl/ext/oneapi/functional.hpp @@ -61,21 +61,9 @@ struct GroupOpTag< }; #define __SYCL_CALC_OVERLOAD(GroupTag, SPIRVOperation, BinaryOperation) \ - template \ - static T calc(GroupTag, T x, BinaryOperation) { \ - using ConvertedT = detail::ConvertToOpenCLType_t; \ - \ - using OCLT = \ - conditional_t() || \ - std::is_same(), \ - cl_int, \ - conditional_t() || \ - std::is_same(), \ - cl_uint, ConvertedT>>; \ - OCLT Arg = x; \ - OCLT Ret = \ - __spirv_Group##SPIRVOperation(S, static_cast(O), Arg); \ - return Ret; \ + template <__spv::GroupOperation O, typename Group, typename T> \ + static T calc(Group g, GroupTag, T x, BinaryOperation) { \ + return sycl::detail::spirv::Group##SPIRVOperation(g, x); \ } // calc for sycl function objects @@ -105,10 +93,11 @@ __SYCL_CALC_OVERLOAD(GroupOpIUnsigned, BitwiseAndKHR, sycl::bit_and) #undef __SYCL_CALC_OVERLOAD -template class BinaryOperation> -static T calc(typename GroupOpTag::type, T x, BinaryOperation) { - return calc(typename GroupOpTag::type(), x, BinaryOperation()); +static T calc(Group g, typename GroupOpTag::type, T x, + BinaryOperation) { + return calc(g, typename GroupOpTag::type(), x, BinaryOperation()); } } // namespace detail diff --git a/sycl/include/sycl/ext/oneapi/group_algorithm.hpp b/sycl/include/sycl/ext/oneapi/group_algorithm.hpp index 610dc6a14bdd4..6c29b65482067 100644 --- a/sycl/include/sycl/ext/oneapi/group_algorithm.hpp +++ b/sycl/include/sycl/ext/oneapi/group_algorithm.hpp @@ -146,9 +146,10 @@ __SYCL2020_DEPRECATED( detail::enable_if_t<(detail::is_generic_group::value && std::is_trivially_copyable::value && !detail::is_vector_arithmetic::value), - T> broadcast(Group, T x, typename Group::id_type local_id) { + T> broadcast(Group g, T x, + typename Group::id_type local_id) { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::GroupBroadcast(x, local_id); + return sycl::detail::spirv::GroupBroadcast(g, x, local_id); #else (void)x; (void)local_id; diff --git a/sycl/include/sycl/ext/oneapi/sub_group.hpp b/sycl/include/sycl/ext/oneapi/sub_group.hpp index 643a9245cb799..52de964cbbc1e 100644 --- a/sycl/include/sycl/ext/oneapi/sub_group.hpp +++ b/sycl/include/sycl/ext/oneapi/sub_group.hpp @@ -645,9 +645,8 @@ struct sub_group { "sycl::ext::oneapi::reduce instead.") EnableIfIsScalarArithmetic reduce(T x, BinaryOperation op) const { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::calc( - typename sycl::detail::GroupOpTag::type(), x, op); + return sycl::detail::calc<__spv::GroupOperation::Reduce>( + typename sycl::detail::GroupOpTag::type(), *this, x, op); #else (void)x; (void)op; @@ -676,9 +675,8 @@ struct sub_group { "sycl::ext::oneapi::exclusive_scan instead.") EnableIfIsScalarArithmetic exclusive_scan(T x, BinaryOperation op) const { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::calc( - typename sycl::detail::GroupOpTag::type(), x, op); + return sycl::detail::calc<__spv::GroupOperation::ExclusiveScan>( + typename sycl::detail::GroupOpTag::type(), *this, x, op); #else (void)x; (void)op; @@ -715,9 +713,8 @@ struct sub_group { "sycl::ext::oneapi::inclusive_scan instead.") EnableIfIsScalarArithmetic inclusive_scan(T x, BinaryOperation op) const { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::calc( - typename sycl::detail::GroupOpTag::type(), x, op); + return sycl::detail::calc<__spv::GroupOperation::InclusiveScan>( + typename sycl::detail::GroupOpTag::type(), *this, x, op); #else (void)x; (void)op; diff --git a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp index 206cc148e0857..44cc1ab2cfc8d 100644 --- a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp +++ b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp @@ -18,6 +18,13 @@ namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { namespace detail { class Builder; + +namespace spirv { + +template struct group_scope; + +} // namespace spirv + } // namespace detail namespace ext::oneapi { @@ -33,12 +40,12 @@ namespace ext::oneapi { // need to forward declare sub_group_mask first struct sub_group_mask; template -detail::enable_if_t, sub_group>::value, - sub_group_mask> +sycl::detail::enable_if_t, sub_group>::value, + sub_group_mask> group_ballot(Group g, bool predicate = true); struct sub_group_mask { - friend class detail::Builder; + friend class sycl::detail::Builder; using BitsType = BITS_TYPE; static constexpr size_t max_bits = @@ -231,7 +238,7 @@ struct sub_group_mask { : Bits(rhs.Bits), bits_num(rhs.bits_num) {} template - friend detail::enable_if_t< + friend sycl::detail::enable_if_t< std::is_same, sub_group>::value, sub_group_mask> group_ballot(Group g, bool predicate); @@ -274,17 +281,17 @@ struct sub_group_mask { }; template -detail::enable_if_t, sub_group>::value, - sub_group_mask> +sycl::detail::enable_if_t, sub_group>::value, + sub_group_mask> group_ballot(Group g, bool predicate) { (void)g; #ifdef __SYCL_DEVICE_ONLY__ auto res = __spirv_GroupNonUniformBallot( - detail::spirv::group_scope::value, predicate); + sycl::detail::spirv::group_scope::value, predicate); BITS_TYPE val = res[0]; if constexpr (sizeof(BITS_TYPE) == 8) val |= ((BITS_TYPE)res[1]) << 32; - return detail::Builder::createSubGroupMask( + return sycl::detail::Builder::createSubGroupMask( val, g.get_max_local_range()[0]); #else (void)predicate; diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp index 1fa39d5ba3b5c..e55cd6d0d11e3 100644 --- a/sycl/include/sycl/group_algorithm.hpp +++ b/sycl/include/sycl/group_algorithm.hpp @@ -197,7 +197,7 @@ detail::enable_if_t<(is_group_v> && detail::is_multiplies::value)) && detail::is_native_op::value), T> -reduce_over_group(Group, T x, BinaryOperation binary_op) { +reduce_over_group(Group g, T x, BinaryOperation binary_op) { // FIXME: Do not special-case for half precision static_assert( std::is_same::value || @@ -205,9 +205,8 @@ reduce_over_group(Group, T x, BinaryOperation binary_op) { std::is_same::value), "Result type of binary_op must match reduction accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::calc::value>( - typename sycl::detail::GroupOpTag::type(), x, binary_op); + return sycl::detail::calc<__spv::GroupOperation::Reduce>( + g, typename sycl::detail::GroupOpTag::type(), x, binary_op); #else throw runtime_error("Group algorithms are not supported on host.", PI_ERROR_INVALID_DEVICE); @@ -376,9 +375,9 @@ joint_reduce(Group g, Ptr first, Ptr last, T init, BinaryOperation binary_op) { // ---- any_of_group template detail::enable_if_t>, bool> -any_of_group(Group, bool pred) { +any_of_group(Group g, bool pred) { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::GroupAny(pred); + return sycl::detail::spirv::GroupAny(g, pred); #else (void)pred; throw runtime_error("Group algorithms are not supported on host.", @@ -415,9 +414,9 @@ joint_any_of(Group g, Ptr first, Ptr last, Predicate pred) { // ---- all_of_group template detail::enable_if_t>, bool> -all_of_group(Group, bool pred) { +all_of_group(Group g, bool pred) { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::GroupAll(pred); + return sycl::detail::spirv::GroupAll(g, pred); #else (void)pred; throw runtime_error("Group algorithms are not supported on host.", @@ -454,9 +453,9 @@ joint_all_of(Group g, Ptr first, Ptr last, Predicate pred) { // ---- none_of_group template detail::enable_if_t>, bool> -none_of_group(Group, bool pred) { +none_of_group(Group g, bool pred) { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::GroupAll(!pred); + return sycl::detail::spirv::GroupAll(g, !pred); #else (void)pred; throw runtime_error("Group algorithms are not supported on host.", @@ -571,9 +570,9 @@ detail::enable_if_t<(is_group_v> && (std::is_trivially_copyable::value || detail::is_vec::value)), T> -group_broadcast(Group, T x, typename Group::id_type local_id) { +group_broadcast(Group g, T x, typename Group::id_type local_id) { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::GroupBroadcast(x, local_id); + return sycl::detail::spirv::GroupBroadcast(g, x, local_id); #else (void)x; (void)local_id; @@ -628,16 +627,15 @@ detail::enable_if_t<(is_group_v> && detail::is_multiplies::value)) && detail::is_native_op::value), T> -exclusive_scan_over_group(Group, T x, BinaryOperation binary_op) { +exclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) { // FIXME: Do not special-case for half precision static_assert(std::is_same::value || (std::is_same::value && std::is_same::value), "Result type of binary_op must match scan accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::calc::value>( - typename sycl::detail::GroupOpTag::type(), x, binary_op); + return sycl::detail::calc<__spv::GroupOperation::ExclusiveScan>( + g, typename sycl::detail::GroupOpTag::type(), x, binary_op); #else throw runtime_error("Group algorithms are not supported on host.", PI_ERROR_INVALID_DEVICE); @@ -862,16 +860,15 @@ detail::enable_if_t<(is_group_v> && detail::is_multiplies::value)) && detail::is_native_op::value), T> -inclusive_scan_over_group(Group, T x, BinaryOperation binary_op) { +inclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) { // FIXME: Do not special-case for half precision static_assert(std::is_same::value || (std::is_same::value && std::is_same::value), "Result type of binary_op must match scan accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::calc::value>( - typename sycl::detail::GroupOpTag::type(), x, binary_op); + return sycl::detail::calc<__spv::GroupOperation::InclusiveScan>( + g, typename sycl::detail::GroupOpTag::type(), x, binary_op); #else throw runtime_error("Group algorithms are not supported on host.", PI_ERROR_INVALID_DEVICE); diff --git a/sycl/include/sycl/group_barrier.hpp b/sycl/include/sycl/group_barrier.hpp index 218f1909348c6..af1cb49e1e68e 100644 --- a/sycl/include/sycl/group_barrier.hpp +++ b/sycl/include/sycl/group_barrier.hpp @@ -20,31 +20,14 @@ namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { -namespace detail { -template struct group_barrier_scope {}; -template <> struct group_barrier_scope { - constexpr static auto Scope = __spv::Scope::Subgroup; -}; -template struct group_barrier_scope> { - constexpr static auto Scope = __spv::Scope::Workgroup; -}; -} // namespace detail - template typename std::enable_if>::type -group_barrier(Group, memory_scope FenceScope = Group::fence_scope) { - (void)FenceScope; -#ifdef __SYCL_DEVICE_ONLY__ +group_barrier(Group G, memory_scope FenceScope = Group::fence_scope) { // Per SYCL spec, group_barrier must perform both control barrier and memory // fence operations. All work-items execute a release fence prior to - // barrier and acquire fence afterwards. The rest of semantics flags specify - // which type of memory this behavior is applied to. - __spirv_ControlBarrier(detail::group_barrier_scope::Scope, - sycl::detail::spirv::getScope(FenceScope), - __spv::MemorySemanticsMask::SequentiallyConsistent | - __spv::MemorySemanticsMask::SubgroupMemory | - __spv::MemorySemanticsMask::WorkgroupMemory | - __spv::MemorySemanticsMask::CrossWorkgroupMemory); + // barrier and acquire fence afterwards. +#ifdef __SYCL_DEVICE_ONLY__ + detail::spirv::ControlBarrier(G, FenceScope, memory_order::seq_cst); #else throw sycl::runtime_error("Barriers are not supported on host device", PI_ERROR_INVALID_DEVICE); diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group.cpp new file mode 100644 index 0000000000000..98fd7174208e5 --- /dev/null +++ b/sycl/test-e2e/NonUniformGroups/ballot_group.cpp @@ -0,0 +1,61 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// UNSUPPORTED: cpu || cuda || hip + +#include +#include +namespace syclex = sycl::ext::oneapi::experimental; + +class TestKernel; + +int main() { + sycl::queue Q; + + auto SGSizes = Q.get_device().get_info(); + if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { + std::cout << "Test skipped due to missing support for sub-group size 32." + << std::endl; + return 0; + } + + sycl::buffer MatchBuf{sycl::range{32}}; + sycl::buffer LeaderBuf{sycl::range{32}}; + + const auto NDR = sycl::nd_range<1>{32, 32}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; + sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + + // Split into odd and even work-items. + bool Predicate = WI % 2 == 0; + auto BallotGroup = syclex::get_ballot_group(SG, Predicate); + + // Check function return values match Predicate. + // NB: Test currently uses exactly one sub-group, but we use SG + // below in case this changes in future. + bool Match = true; + auto GroupID = (Predicate) ? 1 : 0; + auto LocalID = SG.get_local_id() / 2; + Match &= (BallotGroup.get_group_id() == GroupID); + Match &= (BallotGroup.get_local_id() == LocalID); + Match &= (BallotGroup.get_group_range() == 2); + Match &= (BallotGroup.get_local_range() == 16); + MatchAcc[WI] = Match; + LeaderAcc[WI] = BallotGroup.leader(); + }; + CGH.parallel_for(NDR, KernelFunc); + }); + + sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; + sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; + for (int WI = 0; WI < 32; ++WI) { + assert(MatchAcc[WI] == true); + assert(LeaderAcc[WI] == (WI < 2)); + } + return 0; +} diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp new file mode 100644 index 0000000000000..1667445ada44b --- /dev/null +++ b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp @@ -0,0 +1,127 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// UNSUPPORTED: cpu || cuda || hip + +#include +#include +namespace syclex = sycl::ext::oneapi::experimental; + +class TestKernel; + +int main() { + sycl::queue Q; + + auto SGSizes = Q.get_device().get_info(); + if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { + std::cout << "Test skipped due to missing support for sub-group size 32." + << std::endl; + return 0; + } + + sycl::buffer TmpBuf{sycl::range{32}}; + sycl::buffer BarrierBuf{sycl::range{32}}; + sycl::buffer BroadcastBuf{sycl::range{32}}; + sycl::buffer AnyBuf{sycl::range{32}}; + sycl::buffer AllBuf{sycl::range{32}}; + sycl::buffer NoneBuf{sycl::range{32}}; + sycl::buffer ReduceBuf{sycl::range{32}}; + sycl::buffer ExScanBuf{sycl::range{32}}; + sycl::buffer IncScanBuf{sycl::range{32}}; + + const auto NDR = sycl::nd_range<1>{32, 32}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor TmpAcc{TmpBuf, CGH, sycl::write_only}; + sycl::accessor BarrierAcc{BarrierBuf, CGH, sycl::write_only}; + sycl::accessor BroadcastAcc{BroadcastBuf, CGH, sycl::write_only}; + sycl::accessor AnyAcc{AnyBuf, CGH, sycl::write_only}; + sycl::accessor AllAcc{AllBuf, CGH, sycl::write_only}; + sycl::accessor NoneAcc{NoneBuf, CGH, sycl::write_only}; + sycl::accessor ReduceAcc{ReduceBuf, CGH, sycl::write_only}; + sycl::accessor ExScanAcc{ExScanBuf, CGH, sycl::write_only}; + sycl::accessor IncScanAcc{IncScanBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + + // Split into odd and even work-items. + bool Predicate = WI % 2 == 0; + auto BallotGroup = syclex::get_ballot_group(SG, Predicate); + + // Check all other members' writes are visible after a barrier. + TmpAcc[WI] = 1; + sycl::group_barrier(BallotGroup); + size_t Visible = 0; + for (size_t Other = 0; Other < 32; ++Other) { + if (WI % 2 == Other % 2) { + Visible += TmpAcc[Other]; + } + } + BarrierAcc[WI] = (Visible == BallotGroup.get_local_linear_range()); + + // Simple check of group algorithms. + uint32_t OriginalLID = SG.get_local_linear_id(); + uint32_t LID = BallotGroup.get_local_linear_id(); + + uint32_t BroadcastResult = + sycl::group_broadcast(BallotGroup, OriginalLID, 0); + if (Predicate) { + BroadcastAcc[WI] = (BroadcastResult == 0); + } else { + BroadcastAcc[WI] = (BroadcastResult == 1); + } + + bool AnyResult = sycl::any_of_group(BallotGroup, (LID == 0)); + AnyAcc[WI] = (AnyResult == true); + + bool AllResult = sycl::all_of_group(BallotGroup, Predicate); + if (Predicate) { + AllAcc[WI] = (AllResult == true); + } else { + AllAcc[WI] = (AllResult == false); + } + + bool NoneResult = sycl::none_of_group(BallotGroup, Predicate); + if (Predicate) { + NoneAcc[WI] = (NoneResult == false); + } else { + NoneAcc[WI] = (NoneResult == true); + } + + uint32_t ReduceResult = + sycl::reduce_over_group(BallotGroup, 1, sycl::plus<>()); + ReduceAcc[WI] = + (ReduceResult == BallotGroup.get_local_linear_range()); + + uint32_t ExScanResult = + sycl::exclusive_scan_over_group(BallotGroup, 1, sycl::plus<>()); + ExScanAcc[WI] = (ExScanResult == LID); + + uint32_t IncScanResult = + sycl::inclusive_scan_over_group(BallotGroup, 1, sycl::plus<>()); + IncScanAcc[WI] = (IncScanResult == LID + 1); + }; + CGH.parallel_for(NDR, KernelFunc); + }); + + sycl::host_accessor BarrierAcc{BarrierBuf, sycl::read_only}; + sycl::host_accessor BroadcastAcc{BroadcastBuf, sycl::read_only}; + sycl::host_accessor AnyAcc{AnyBuf, sycl::read_only}; + sycl::host_accessor AllAcc{AllBuf, sycl::read_only}; + sycl::host_accessor NoneAcc{NoneBuf, sycl::read_only}; + sycl::host_accessor ReduceAcc{ReduceBuf, sycl::read_only}; + sycl::host_accessor ExScanAcc{ExScanBuf, sycl::read_only}; + sycl::host_accessor IncScanAcc{IncScanBuf, sycl::read_only}; + for (int WI = 0; WI < 32; ++WI) { + assert(BarrierAcc[WI] == true); + assert(BroadcastAcc[WI] == true); + assert(AnyAcc[WI] == true); + assert(AllAcc[WI] == true); + assert(NoneAcc[WI] == true); + assert(ReduceAcc[WI] == true); + assert(ExScanAcc[WI] == true); + assert(IncScanAcc[WI] == true); + } + return 0; +} diff --git a/sycl/test-e2e/NonUniformGroups/cluster_group.cpp b/sycl/test-e2e/NonUniformGroups/cluster_group.cpp new file mode 100644 index 0000000000000..e1d7634191df3 --- /dev/null +++ b/sycl/test-e2e/NonUniformGroups/cluster_group.cpp @@ -0,0 +1,62 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// UNSUPPORTED: cpu || cuda || hip + +#include +#include +namespace syclex = sycl::ext::oneapi::experimental; + +template class TestKernel; + +template void test() { + sycl::queue Q; + + auto SGSizes = Q.get_device().get_info(); + if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { + std::cout << "Test skipped due to missing support for sub-group size 32." + << std::endl; + } + + sycl::buffer MatchBuf{sycl::range{32}}; + sycl::buffer LeaderBuf{sycl::range{32}}; + + const auto NDR = sycl::nd_range<1>{32, 32}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; + sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + + auto ClusterGroup = syclex::get_cluster_group(SG); + + bool Match = true; + Match &= (ClusterGroup.get_group_id() == (WI / ClusterSize)); + Match &= (ClusterGroup.get_local_id() == (WI % ClusterSize)); + Match &= (ClusterGroup.get_group_range() == (32 / ClusterSize)); + Match &= (ClusterGroup.get_local_range() == ClusterSize); + MatchAcc[WI] = Match; + LeaderAcc[WI] = ClusterGroup.leader(); + }; + CGH.parallel_for>(NDR, KernelFunc); + }); + + sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; + sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; + for (int WI = 0; WI < 32; ++WI) { + assert(MatchAcc[WI] == true); + assert(LeaderAcc[WI] == ((WI % ClusterSize) == 0)); + } +} + +int main() { + test<1>(); + test<2>(); + test<4>(); + test<8>(); + test<16>(); + test<32>(); + return 0; +} diff --git a/sycl/test-e2e/NonUniformGroups/is_fixed_topology.cpp b/sycl/test-e2e/NonUniformGroups/is_fixed_topology.cpp new file mode 100644 index 0000000000000..b3b6cd5ba4adf --- /dev/null +++ b/sycl/test-e2e/NonUniformGroups/is_fixed_topology.cpp @@ -0,0 +1,12 @@ +// RUN: %clangxx -fsycl -fsyntax-only -fsycl-targets=%sycl_triple %s -o %t.out + +#include +namespace syclex = sycl::ext::oneapi::experimental; + +#ifdef SYCL_EXT_ONEAPI_ROOT_GROUP +static_assert(syclex::is_fixed_topology_group_v); +#endif +static_assert(syclex::is_fixed_topology_group_v>); +static_assert(syclex::is_fixed_topology_group_v>); +static_assert(syclex::is_fixed_topology_group_v>); +static_assert(syclex::is_fixed_topology_group_v); diff --git a/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp b/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp new file mode 100644 index 0000000000000..a3f0085d8eaa0 --- /dev/null +++ b/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp @@ -0,0 +1,14 @@ +// RUN: %clangxx -fsycl -fsyntax-only -fsycl-targets=%sycl_triple %s -o %t.out + +#include +namespace syclex = sycl::ext::oneapi::experimental; + +static_assert( + syclex::is_user_constructed_group_v>); +static_assert(syclex::is_user_constructed_group_v< + syclex::cluster_group<1, sycl::sub_group>>); +static_assert(syclex::is_user_constructed_group_v< + syclex::cluster_group<2, sycl::sub_group>>); +static_assert( + syclex::is_user_constructed_group_v>); +static_assert(syclex::is_user_constructed_group_v); diff --git a/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp b/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp new file mode 100644 index 0000000000000..925340cee1c6d --- /dev/null +++ b/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp @@ -0,0 +1,68 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// UNSUPPORTED: cpu || cuda || hip + +#include +#include +namespace syclex = sycl::ext::oneapi::experimental; + +class TestKernel; + +int main() { + sycl::queue Q; + + auto SGSizes = Q.get_device().get_info(); + if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { + std::cout << "Test skipped due to missing support for sub-group size 32." + << std::endl; + return 0; + } + + sycl::buffer MatchBuf{sycl::range{32}}; + sycl::buffer LeaderBuf{sycl::range{32}}; + + const auto NDR = sycl::nd_range<1>{32, 32}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; + sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + + // Due to the unpredictable runtime behavior of opportunistic groups, + // some values may change from run to run. Check they're in expected + // ranges and consistent with other groups. + if (item.get_global_id() % 2 == 0) { + auto OpportunisticGroup = + syclex::this_kernel::get_opportunistic_group(); + + bool Match = true; + Match &= (OpportunisticGroup.get_group_id() == 0); + Match &= (OpportunisticGroup.get_local_id() < + OpportunisticGroup.get_local_range()); + Match &= (OpportunisticGroup.get_group_range() == 1); + Match &= (OpportunisticGroup.get_local_linear_range() <= + SG.get_local_linear_range()); + MatchAcc[WI] = Match; + LeaderAcc[WI] = OpportunisticGroup.leader(); + } + }; + CGH.parallel_for(NDR, KernelFunc); + }); + + sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; + sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; + uint32_t NumLeaders = 0; + for (int WI = 0; WI < 32; ++WI) { + if (WI % 2 == 0) { + assert(MatchAcc[WI] == true); + if (LeaderAcc[WI]) { + NumLeaders++; + } + } + } + assert(NumLeaders > 0); + return 0; +} diff --git a/sycl/test-e2e/NonUniformGroups/tangle_group.cpp b/sycl/test-e2e/NonUniformGroups/tangle_group.cpp new file mode 100644 index 0000000000000..9e57a48633e0a --- /dev/null +++ b/sycl/test-e2e/NonUniformGroups/tangle_group.cpp @@ -0,0 +1,69 @@ +// RUN: %clangxx -fsycl -fno-sycl-early-optimizations -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// UNSUPPORTED: cpu || cuda || hip + +#include +#include +namespace syclex = sycl::ext::oneapi::experimental; + +class TestKernel; + +int main() { + sycl::queue Q; + + auto SGSizes = Q.get_device().get_info(); + if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { + std::cout << "Test skipped due to missing support for sub-group size 32." + << std::endl; + return 0; + } + + sycl::buffer MatchBuf{sycl::range{32}}; + sycl::buffer LeaderBuf{sycl::range{32}}; + + const auto NDR = sycl::nd_range<1>{32, 32}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; + sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + + // Split into odd and even work-items via control flow. + // Branches deliberately duplicated to test impact of optimizations. + // This only reliably works with optimizations disabled right now. + if (item.get_global_id() % 2 == 0) { + auto TangleGroup = syclex::get_tangle_group(SG); + + bool Match = true; + Match &= (TangleGroup.get_group_id() == 0); + Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2); + Match &= (TangleGroup.get_group_range() == 1); + Match &= (TangleGroup.get_local_range() == 16); + MatchAcc[WI] = Match; + LeaderAcc[WI] = TangleGroup.leader(); + } else { + auto TangleGroup = syclex::get_tangle_group(SG); + + bool Match = true; + Match &= (TangleGroup.get_group_id() == 0); + Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2); + Match &= (TangleGroup.get_group_range() == 1); + Match &= (TangleGroup.get_local_range() == 16); + MatchAcc[WI] = Match; + LeaderAcc[WI] = TangleGroup.leader(); + } + }; + CGH.parallel_for(NDR, KernelFunc); + }); + + sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; + sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; + for (int WI = 0; WI < 32; ++WI) { + assert(MatchAcc[WI] == true); + assert(LeaderAcc[WI] == (WI < 2)); + } + return 0; +}