diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp
index df1cd55cab0a2..4054b9fa1428a 100644
--- a/sycl/include/CL/__spirv/spirv_ops.hpp
+++ b/sycl/include/CL/__spirv/spirv_ops.hpp
@@ -960,6 +960,68 @@ __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT int
     __spirv_GroupNonUniformBallotFindLSB(__spv::Scope::Flag,
                                          __ocl_vec_t<uint32_t, 4>) noexcept;
 
+template <typename ValueT, typename IdT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+    __spirv_GroupNonUniformBroadcast(__spv::Scope::Flag, ValueT, IdT);
+
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT bool
+__spirv_GroupNonUniformAll(__spv::Scope::Flag, bool);
+
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT bool
+__spirv_GroupNonUniformAny(__spv::Scope::Flag, bool);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformSMin(__spv::Scope::Flag, unsigned int, ValueT);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformUMin(__spv::Scope::Flag, unsigned int, ValueT);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformFMin(__spv::Scope::Flag, unsigned int, ValueT);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformSMax(__spv::Scope::Flag, unsigned int, ValueT);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformUMax(__spv::Scope::Flag, unsigned int, ValueT);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformFMax(__spv::Scope::Flag, unsigned int, ValueT);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformIAdd(__spv::Scope::Flag, unsigned int, ValueT);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformFAdd(__spv::Scope::Flag, unsigned int, ValueT);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformIMul(__spv::Scope::Flag, unsigned int, ValueT);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformFMul(__spv::Scope::Flag, unsigned int, ValueT);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformBitwiseOr(__spv::Scope::Flag, unsigned int, ValueT);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformBitwiseXor(__spv::Scope::Flag, unsigned int, ValueT);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformBitwiseAnd(__spv::Scope::Flag, unsigned int, ValueT);
+
 extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT void
 __clc_BarrierInitialize(int64_t *state, int32_t expected_count) noexcept;
 
diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp
index 2da8ee1a7db90..9534cc5a4f9e9 100644
--- a/sycl/include/sycl/detail/spirv.hpp
+++ b/sycl/include/sycl/detail/spirv.hpp
@@ -14,6 +14,7 @@
 #include <sycl/detail/generic_type_traits.hpp>
 #include <sycl/detail/helpers.hpp>
 #include <sycl/detail/type_traits.hpp>
+#include <sycl/ext/oneapi/experimental/non_uniform_groups.hpp>
 #include <sycl/id.hpp>
 #include <sycl/memory_enums.hpp>
 
@@ -23,6 +24,9 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) {
 namespace ext {
 namespace oneapi {
 struct sub_group;
+namespace experimental {
+template <typename ParentGroup> class ballot_group;
+} // namespace experimental
 } // namespace oneapi
 } // namespace ext
 
@@ -56,6 +60,11 @@ template <> struct group_scope<::sycl::ext::oneapi::sub_group> {
   static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Subgroup;
 };
 
+template <typename ParentGroup>
+struct group_scope<sycl::ext::oneapi::experimental::ballot_group<ParentGroup>> {
+  static constexpr __spv::Scope::Flag value = group_scope<ParentGroup>::value;
+};
+
 // Generic shuffles and broadcasts may require multiple calls to
 // intrinsics, and should use the fewest broadcasts possible
 // - Loop over chunks until remaining bytes < chunk size
@@ -94,13 +103,37 @@ void GenericCall(const Functor &ApplyToBytes) {
   }
 }
 
-template <typename Group> bool GroupAll(bool pred) {
+template <typename Group> bool GroupAll(Group, bool pred) {
   return __spirv_GroupAll(group_scope<Group>::value, pred);
 }
+template <typename ParentGroup>
+bool GroupAll(ext::oneapi::experimental::ballot_group<ParentGroup> g,
+              bool pred) {
+  // ballot_group partitions its parent into two groups (0 and 1)
+  // We have to force each group down different control flow
+  // Work-items in the "false" group (0) may still be active
+  if (g.get_group_id() == 1) {
+    return __spirv_GroupNonUniformAll(group_scope<ParentGroup>::value, pred);
+  } else {
+    return __spirv_GroupNonUniformAll(group_scope<ParentGroup>::value, pred);
+  }
+}
 
-template <typename Group> bool GroupAny(bool pred) {
+template <typename Group> bool GroupAny(Group, bool pred) {
   return __spirv_GroupAny(group_scope<Group>::value, pred);
 }
+template <typename ParentGroup>
+bool GroupAny(ext::oneapi::experimental::ballot_group<ParentGroup> g,
+              bool pred) {
+  // ballot_group partitions its parent into two groups (0 and 1)
+  // We have to force each group down different control flow
+  // Work-items in the "false" group (0) may still be active
+  if (g.get_group_id() == 1) {
+    return __spirv_GroupNonUniformAny(group_scope<ParentGroup>::value, pred);
+  } else {
+    return __spirv_GroupNonUniformAny(group_scope<ParentGroup>::value, pred);
+  }
+}
 
 // Native broadcasts map directly to a SPIR-V GroupBroadcast intrinsic
 // FIXME: Do not special-case for half once all backends support all data types.
@@ -157,7 +190,7 @@ template <> struct GroupId<::sycl::ext::oneapi::sub_group> {
   using type = uint32_t;
 };
 template <typename Group, typename T, typename IdT>
-EnableIfNativeBroadcast<T, IdT> GroupBroadcast(T x, IdT local_id) {
+EnableIfNativeBroadcast<T, IdT> GroupBroadcast(Group, T x, IdT local_id) {
   using GroupIdT = typename GroupId<Group>::type;
   GroupIdT GroupLocalId = static_cast<GroupIdT>(local_id);
   using OCLT = detail::ConvertToOpenCLType_t<T>;
@@ -167,15 +200,43 @@ EnableIfNativeBroadcast<T, IdT> GroupBroadcast(T x, IdT local_id) {
   OCLIdT OCLId = detail::convertDataToType<GroupIdT, OCLIdT>(GroupLocalId);
   return __spirv_GroupBroadcast(group_scope<Group>::value, OCLX, OCLId);
 }
+template <typename ParentGroup, typename T, typename IdT>
+EnableIfNativeBroadcast<T, IdT>
+GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group<ParentGroup> g,
+               T x, IdT local_id) {
+  // Remap local_id to its original numbering in ParentGroup.
+  auto LocalId = detail::IdToMaskPosition(g, local_id);
+
+  // TODO: Refactor to avoid duplication after design settles.
+  using GroupIdT = typename GroupId<ParentGroup>::type;
+  GroupIdT GroupLocalId = static_cast<GroupIdT>(LocalId);
+  using OCLT = detail::ConvertToOpenCLType_t<T>;
+  using WidenedT = WidenOpenCLTypeTo32_t<OCLT>;
+  using OCLIdT = detail::ConvertToOpenCLType_t<GroupIdT>;
+  WidenedT OCLX = detail::convertDataToType<T, OCLT>(x);
+  OCLIdT OCLId = detail::convertDataToType<GroupIdT, OCLIdT>(GroupLocalId);
+
+  // ballot_group partitions its parent into two groups (0 and 1)
+  // We have to force each group down different control flow
+  // Work-items in the "false" group (0) may still be active
+  if (g.get_group_id() == 1) {
+    return __spirv_GroupNonUniformBroadcast(group_scope<ParentGroup>::value,
+                                            OCLX, OCLId);
+  } else {
+    return __spirv_GroupNonUniformBroadcast(group_scope<ParentGroup>::value,
+                                            OCLX, OCLId);
+  }
+}
+
 template <typename Group, typename T, typename IdT>
-EnableIfBitcastBroadcast<T, IdT> GroupBroadcast(T x, IdT local_id) {
+EnableIfBitcastBroadcast<T, IdT> GroupBroadcast(Group g, T x, IdT local_id) {
   using BroadcastT = ConvertToNativeBroadcastType_t<T>;
   auto BroadcastX = bit_cast<BroadcastT>(x);
-  BroadcastT Result = GroupBroadcast<Group>(BroadcastX, local_id);
+  BroadcastT Result = GroupBroadcast(g, BroadcastX, local_id);
   return bit_cast<T>(Result);
 }
 template <typename Group, typename T, typename IdT>
-EnableIfGenericBroadcast<T, IdT> GroupBroadcast(T x, IdT local_id) {
+EnableIfGenericBroadcast<T, IdT> GroupBroadcast(Group g, T x, IdT local_id) {
   // Initialize with x to support type T without default constructor
   T Result = x;
   char *XBytes = reinterpret_cast<char *>(&x);
@@ -183,7 +244,7 @@ EnableIfGenericBroadcast<T, IdT> GroupBroadcast(T x, IdT local_id) {
   auto BroadcastBytes = [=](size_t Offset, size_t Size) {
     uint64_t BroadcastX, BroadcastResult;
     std::memcpy(&BroadcastX, XBytes + Offset, Size);
-    BroadcastResult = GroupBroadcast<Group>(BroadcastX, local_id);
+    BroadcastResult = GroupBroadcast(g, BroadcastX, local_id);
     std::memcpy(ResultBytes + Offset, &BroadcastResult, Size);
   };
   GenericCall<T>(BroadcastBytes);
@@ -192,9 +253,10 @@ EnableIfGenericBroadcast<T, IdT> GroupBroadcast(T x, IdT local_id) {
 
 // Broadcast with vector local index
 template <typename Group, typename T, int Dimensions>
-EnableIfNativeBroadcast<T> GroupBroadcast(T x, id<Dimensions> local_id) {
+EnableIfNativeBroadcast<T> GroupBroadcast(Group g, T x,
+                                          id<Dimensions> local_id) {
   if (Dimensions == 1) {
-    return GroupBroadcast<Group>(x, local_id[0]);
+    return GroupBroadcast(g, x, local_id[0]);
   }
   using IdT = vec<size_t, Dimensions>;
   using OCLT = detail::ConvertToOpenCLType_t<T>;
@@ -208,17 +270,26 @@ EnableIfNativeBroadcast<T> GroupBroadcast(T x, id<Dimensions> local_id) {
   OCLIdT OCLId = detail::convertDataToType<IdT, OCLIdT>(VecId);
   return __spirv_GroupBroadcast(group_scope<Group>::value, OCLX, OCLId);
 }
+template <typename ParentGroup, typename T>
+EnableIfNativeBroadcast<T>
+GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group<ParentGroup> g,
+               T x, id<1> local_id) {
+  // Limited to 1D indices for now because ParentGroup must be sub-group.
+  return GroupBroadcast(g, x, local_id[0]);
+}
 template <typename Group, typename T, int Dimensions>
-EnableIfBitcastBroadcast<T> GroupBroadcast(T x, id<Dimensions> local_id) {
+EnableIfBitcastBroadcast<T> GroupBroadcast(Group g, T x,
+                                           id<Dimensions> local_id) {
   using BroadcastT = ConvertToNativeBroadcastType_t<T>;
   auto BroadcastX = bit_cast<BroadcastT>(x);
-  BroadcastT Result = GroupBroadcast<Group>(BroadcastX, local_id);
+  BroadcastT Result = GroupBroadcast(g, BroadcastX, local_id);
   return bit_cast<T>(Result);
 }
 template <typename Group, typename T, int Dimensions>
-EnableIfGenericBroadcast<T> GroupBroadcast(T x, id<Dimensions> local_id) {
+EnableIfGenericBroadcast<T> GroupBroadcast(Group g, T x,
+                                           id<Dimensions> local_id) {
   if (Dimensions == 1) {
-    return GroupBroadcast<Group>(x, local_id[0]);
+    return GroupBroadcast(g, x, local_id[0]);
   }
   // Initialize with x to support type T without default constructor
   T Result = x;
@@ -227,7 +298,7 @@ EnableIfGenericBroadcast<T> GroupBroadcast(T x, id<Dimensions> local_id) {
   auto BroadcastBytes = [=](size_t Offset, size_t Size) {
     uint64_t BroadcastX, BroadcastResult;
     std::memcpy(&BroadcastX, XBytes + Offset, Size);
-    BroadcastResult = GroupBroadcast<Group>(BroadcastX, local_id);
+    BroadcastResult = GroupBroadcast(g, BroadcastX, local_id);
     std::memcpy(ResultBytes + Offset, &BroadcastResult, Size);
   };
   GenericCall<T>(BroadcastBytes);
@@ -801,6 +872,101 @@ EnableIfGenericShuffle<T> SubgroupShuffleUp(T x, uint32_t delta) {
   return Result;
 }
 
+template <typename Group>
+typename std::enable_if_t<
+    ext::oneapi::experimental::is_fixed_topology_group_v<Group>>
+ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
+  __spirv_ControlBarrier(group_scope<Group>::value, getScope(FenceScope),
+                         getMemorySemanticsMask(Order) |
+                             __spv::MemorySemanticsMask::SubgroupMemory |
+                             __spv::MemorySemanticsMask::WorkgroupMemory |
+                             __spv::MemorySemanticsMask::CrossWorkgroupMemory);
+}
+
+template <typename Group>
+typename std::enable_if_t<
+    ext::oneapi::experimental::is_user_constructed_group_v<Group>>
+ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
+#if defined(__SPIR__)
+  // SPIR-V does not define an instruction to synchronize partial groups.
+  // However, most (possibly all?) of the current SPIR-V targets execute
+  // work-items in lockstep, so we can probably get away with a MemoryBarrier.
+  // TODO: Replace this if SPIR-V defines a NonUniformControlBarrier
+  __spirv_MemoryBarrier(getScope(FenceScope),
+                        getMemorySemanticsMask(Order) |
+                            __spv::MemorySemanticsMask::SubgroupMemory |
+                            __spv::MemorySemanticsMask::WorkgroupMemory |
+                            __spv::MemorySemanticsMask::CrossWorkgroupMemory);
+#elif defined(__NVPTX__)
+  // TODO: Call syncwarp with appropriate mask extracted from the group
+#endif
+}
+
+// TODO: Refactor to avoid duplication after design settles
+#define __SYCL_GROUP_COLLECTIVE_OVERLOAD(Instruction)                          \
+  template <__spv::GroupOperation Op, typename Group, typename T>              \
+  inline typename std::enable_if_t<                                            \
+      ext::oneapi::experimental::is_fixed_topology_group_v<Group>, T>          \
+      Group##Instruction(Group G, T x) {                                       \
+    using ConvertedT = detail::ConvertToOpenCLType_t<T>;                       \
+                                                                               \
+    using OCLT =                                                               \
+        conditional_t<std::is_same<ConvertedT, cl_char>() ||                   \
+                          std::is_same<ConvertedT, cl_short>(),                \
+                      cl_int,                                                  \
+                      conditional_t<std::is_same<ConvertedT, cl_uchar>() ||    \
+                                        std::is_same<ConvertedT, cl_ushort>(), \
+                                    cl_uint, ConvertedT>>;                     \
+    OCLT Arg = x;                                                              \
+    OCLT Ret = __spirv_Group##Instruction(group_scope<Group>::value,           \
+                                          static_cast<unsigned int>(Op), Arg); \
+    return Ret;                                                                \
+  }                                                                            \
+                                                                               \
+  template <__spv::GroupOperation Op, typename ParentGroup, typename T>        \
+  inline T Group##Instruction(                                                 \
+      ext::oneapi::experimental::ballot_group<ParentGroup> g, T x) {           \
+    using ConvertedT = detail::ConvertToOpenCLType_t<T>;                       \
+                                                                               \
+    using OCLT =                                                               \
+        conditional_t<std::is_same<ConvertedT, cl_char>() ||                   \
+                          std::is_same<ConvertedT, cl_short>(),                \
+                      cl_int,                                                  \
+                      conditional_t<std::is_same<ConvertedT, cl_uchar>() ||    \
+                                        std::is_same<ConvertedT, cl_ushort>(), \
+                                    cl_uint, ConvertedT>>;                     \
+    OCLT Arg = x;                                                              \
+    /* ballot_group partitions its parent into two groups (0 and 1) */         \
+    /* We have to force each group down different control flow */              \
+    /* Work-items in the "false" group (0) may still be active */              \
+    constexpr auto Scope = group_scope<ParentGroup>::value;                    \
+    constexpr auto OpInt = static_cast<unsigned int>(Op);                      \
+    if (g.get_group_id() == 1) {                                               \
+      return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg);          \
+    } else {                                                                   \
+      return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg);          \
+    }                                                                          \
+  }
+
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(SMin)
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(UMin)
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(FMin)
+
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(SMax)
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(UMax)
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(FMax)
+
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(IAdd)
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(FAdd)
+
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(IMulKHR)
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(FMulKHR)
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(CMulINTEL)
+
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(BitwiseOrKHR)
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(BitwiseXorKHR)
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(BitwiseAndKHR)
+
 } // namespace spirv
 } // namespace detail
 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp
index 474ced1a5fe91..33039cf7b389e 100644
--- a/sycl/include/sycl/detail/type_traits.hpp
+++ b/sycl/include/sycl/detail/type_traits.hpp
@@ -27,6 +27,29 @@ struct sub_group;
 namespace experimental {
 template <typename Group, std::size_t Extent> class group_with_scratchpad;
 
+template <class T> struct is_fixed_topology_group : std::false_type {};
+
+template <class T>
+inline constexpr bool is_fixed_topology_group_v =
+    is_fixed_topology_group<T>::value;
+
+#ifdef SYCL_EXT_ONEAPI_ROOT_GROUP
+template <> struct is_fixed_topology_group<root_group> : std::true_type {};
+#endif
+
+template <int Dimensions>
+struct is_fixed_topology_group<sycl::group<Dimensions>> : std::true_type {};
+
+template <>
+struct is_fixed_topology_group<sycl::ext::oneapi::sub_group> : std::true_type {
+};
+
+template <class T> struct is_user_constructed_group : std::false_type {};
+
+template <class T>
+inline constexpr bool is_user_constructed_group_v =
+    is_user_constructed_group<T>::value;
+
 namespace detail {
 template <typename T> struct is_group_helper : std::false_type {};
 
diff --git a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp
index 04adcca79fcff..fcdce42652075 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp
@@ -41,7 +41,7 @@ template <typename ParentGroup> class ballot_group {
 
   id_type get_local_id() const {
 #ifdef __SYCL_DEVICE_ONLY__
-    return detail::CallerPositionInMask(Mask);
+    return sycl::detail::CallerPositionInMask(Mask);
 #else
     throw runtime_error("Non-uniform groups are not supported on host device.",
                         PI_ERROR_INVALID_DEVICE);
@@ -112,15 +112,17 @@ template <typename ParentGroup> class ballot_group {
 #endif
   }
 
-private:
-  sub_group_mask Mask;
-  bool Predicate;
-
 protected:
+  const sub_group_mask Mask;
+  const bool Predicate;
+
   ballot_group(sub_group_mask m, bool p) : Mask(m), Predicate(p) {}
 
   friend ballot_group<ParentGroup>
   get_ballot_group<ParentGroup>(ParentGroup g, bool predicate);
+
+  friend uint32_t sycl::detail::IdToMaskPosition<ballot_group<ParentGroup>>(
+      ballot_group<ParentGroup> Group, uint32_t Id);
 };
 
 template <typename Group>
@@ -149,5 +151,10 @@ template <typename ParentGroup>
 struct is_user_constructed_group<ballot_group<ParentGroup>> : std::true_type {};
 
 } // namespace ext::oneapi::experimental
+
+template <typename ParentGroup>
+struct is_group<ext::oneapi::experimental::ballot_group<ParentGroup>>
+    : std::true_type {};
+
 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
 } // namespace sycl
diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
index 8899ae74ae985..c7101fd198c83 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
@@ -10,52 +10,62 @@
 #include <CL/__spirv/spirv_ops.hpp>
 #include <CL/__spirv/spirv_vars.hpp>
 #include <sycl/ext/oneapi/sub_group_mask.hpp>
+#include <sycl/types.hpp>
 
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
-namespace ext::oneapi::experimental {
-
-template <class T> struct is_fixed_topology_group : std::false_type {};
-
-template <class T>
-inline constexpr bool is_fixed_topology_group_v =
-    is_fixed_topology_group<T>::value;
-
-#ifdef SYCL_EXT_ONEAPI_ROOT_GROUP
-template <> struct is_fixed_topology_group<root_group> : std::true_type {};
-#endif
-
-template <int Dimensions>
-struct is_fixed_topology_group<sycl::group<Dimensions>> : std::true_type {};
-
-template <> struct is_fixed_topology_group<sycl::sub_group> : std::true_type {};
 
-template <class T> struct is_user_constructed_group : std::false_type {};
-
-template <class T>
-inline constexpr bool is_user_constructed_group_v =
-    is_user_constructed_group<T>::value;
-
-#ifdef __SYCL_DEVICE_ONLY__
-// TODO: This may need to be generalized beyond uint32_t for big masks
 namespace detail {
-uint32_t CallerPositionInMask(sub_group_mask Mask) {
-  // FIXME: It would be nice to be able to jump straight to an __ocl_vec_t
+
+inline sycl::vec<unsigned, 4> ExtractMask(ext::oneapi::sub_group_mask Mask) {
   sycl::marray<unsigned, 4> TmpMArray;
   Mask.extract_bits(TmpMArray);
   sycl::vec<unsigned, 4> MemberMask;
   for (int i = 0; i < 4; ++i) {
     MemberMask[i] = TmpMArray[i];
   }
+  return MemberMask;
+}
+
+#ifdef __SYCL_DEVICE_ONLY__
+// TODO: This may need to be generalized beyond uint32_t for big masks
+inline uint32_t CallerPositionInMask(ext::oneapi::sub_group_mask Mask) {
+  sycl::vec<unsigned, 4> MemberMask = ExtractMask(Mask);
   auto OCLMask =
       sycl::detail::ConvertToOpenCLType_t<sycl::vec<unsigned, 4>>(MemberMask);
   return __spirv_GroupNonUniformBallotBitCount(
       __spv::Scope::Subgroup, (int)__spv::GroupOperation::ExclusiveScan,
       OCLMask);
 }
-} // namespace detail
 #endif
 
+template <typename NonUniformGroup>
+inline uint32_t IdToMaskPosition(NonUniformGroup Group, uint32_t Id) {
+  // TODO: This will need to be optimized
+  sycl::vec<unsigned, 4> MemberMask = ExtractMask(Group.Mask);
+  uint32_t Count = 0;
+  for (int i = 0; i < 4; ++i) {
+    for (int b = 0; b < 32; ++b) {
+      if (MemberMask[i] & (1 << b)) {
+        if (Count == Id) {
+          return i * 32 + b;
+        }
+        Count++;
+      }
+    }
+  }
+  __builtin_unreachable();
+  return Count;
+}
+
+} // namespace detail
+
+namespace ext::oneapi::experimental {
+
+// Forward declarations of non-uniform group types for algorithm definitions
+template <typename ParentGroup> class ballot_group;
+
 } // namespace ext::oneapi::experimental
+
 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
 } // namespace sycl
diff --git a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp
index 1e20719aa472f..74fd03608a1cd 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp
@@ -40,7 +40,7 @@ class opportunistic_group {
 
   id_type get_local_id() const {
 #ifdef __SYCL_DEVICE_ONLY__
-    return detail::CallerPositionInMask(Mask);
+    return sycl::detail::CallerPositionInMask(Mask);
 #else
     throw runtime_error("Non-uniform groups are not supported on host device.",
                         PI_ERROR_INVALID_DEVICE);
diff --git a/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp
index d832b644ef6d9..518cb1f118736 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp
@@ -41,7 +41,7 @@ template <typename ParentGroup> class tangle_group {
 
   id_type get_local_id() const {
 #ifdef __SYCL_DEVICE_ONLY__
-    return detail::CallerPositionInMask(Mask);
+    return sycl::detail::CallerPositionInMask(Mask);
 #else
     throw runtime_error("Non-uniform groups are not supported on host device.",
                         PI_ERROR_INVALID_DEVICE);
diff --git a/sycl/include/sycl/ext/oneapi/functional.hpp b/sycl/include/sycl/ext/oneapi/functional.hpp
index b8be0c6573620..7d8d269fde0fb 100644
--- a/sycl/include/sycl/ext/oneapi/functional.hpp
+++ b/sycl/include/sycl/ext/oneapi/functional.hpp
@@ -61,21 +61,9 @@ struct GroupOpTag<
 };
 
 #define __SYCL_CALC_OVERLOAD(GroupTag, SPIRVOperation, BinaryOperation)        \
-  template <typename T, __spv::GroupOperation O, __spv::Scope::Flag S>         \
-  static T calc(GroupTag, T x, BinaryOperation) {                              \
-    using ConvertedT = detail::ConvertToOpenCLType_t<T>;                       \
-                                                                               \
-    using OCLT =                                                               \
-        conditional_t<std::is_same<ConvertedT, cl_char>() ||                   \
-                          std::is_same<ConvertedT, cl_short>(),                \
-                      cl_int,                                                  \
-                      conditional_t<std::is_same<ConvertedT, cl_uchar>() ||    \
-                                        std::is_same<ConvertedT, cl_ushort>(), \
-                                    cl_uint, ConvertedT>>;                     \
-    OCLT Arg = x;                                                              \
-    OCLT Ret =                                                                 \
-        __spirv_Group##SPIRVOperation(S, static_cast<unsigned int>(O), Arg);   \
-    return Ret;                                                                \
+  template <__spv::GroupOperation O, typename Group, typename T>               \
+  static T calc(Group g, GroupTag, T x, BinaryOperation) {                     \
+    return sycl::detail::spirv::Group##SPIRVOperation<O>(g, x);                \
   }
 
 // calc for sycl function objects
@@ -105,10 +93,11 @@ __SYCL_CALC_OVERLOAD(GroupOpIUnsigned, BitwiseAndKHR, sycl::bit_and<T>)
 
 #undef __SYCL_CALC_OVERLOAD
 
-template <typename T, __spv::GroupOperation O, __spv::Scope::Flag S,
+template <__spv::GroupOperation O, typename Group, typename T,
           template <typename> class BinaryOperation>
-static T calc(typename GroupOpTag<T>::type, T x, BinaryOperation<void>) {
-  return calc<T, O, S>(typename GroupOpTag<T>::type(), x, BinaryOperation<T>());
+static T calc(Group g, typename GroupOpTag<T>::type, T x,
+              BinaryOperation<void>) {
+  return calc<O>(g, typename GroupOpTag<T>::type(), x, BinaryOperation<T>());
 }
 
 } // namespace detail
diff --git a/sycl/include/sycl/ext/oneapi/group_algorithm.hpp b/sycl/include/sycl/ext/oneapi/group_algorithm.hpp
index 610dc6a14bdd4..6c29b65482067 100644
--- a/sycl/include/sycl/ext/oneapi/group_algorithm.hpp
+++ b/sycl/include/sycl/ext/oneapi/group_algorithm.hpp
@@ -146,9 +146,10 @@ __SYCL2020_DEPRECATED(
 detail::enable_if_t<(detail::is_generic_group<Group>::value &&
                      std::is_trivially_copyable<T>::value &&
                      !detail::is_vector_arithmetic<T>::value),
-                    T> broadcast(Group, T x, typename Group::id_type local_id) {
+                    T> broadcast(Group g, T x,
+                                 typename Group::id_type local_id) {
 #ifdef __SYCL_DEVICE_ONLY__
-  return sycl::detail::spirv::GroupBroadcast<Group>(x, local_id);
+  return sycl::detail::spirv::GroupBroadcast(g, x, local_id);
 #else
   (void)x;
   (void)local_id;
diff --git a/sycl/include/sycl/ext/oneapi/sub_group.hpp b/sycl/include/sycl/ext/oneapi/sub_group.hpp
index 643a9245cb799..52de964cbbc1e 100644
--- a/sycl/include/sycl/ext/oneapi/sub_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/sub_group.hpp
@@ -645,9 +645,8 @@ struct sub_group {
                     "sycl::ext::oneapi::reduce instead.")
   EnableIfIsScalarArithmetic<T> reduce(T x, BinaryOperation op) const {
 #ifdef __SYCL_DEVICE_ONLY__
-    return sycl::detail::calc<T, __spv::GroupOperation::Reduce,
-                              __spv::Scope::Subgroup>(
-        typename sycl::detail::GroupOpTag<T>::type(), x, op);
+    return sycl::detail::calc<__spv::GroupOperation::Reduce>(
+        typename sycl::detail::GroupOpTag<T>::type(), *this, x, op);
 #else
     (void)x;
     (void)op;
@@ -676,9 +675,8 @@ struct sub_group {
                     "sycl::ext::oneapi::exclusive_scan instead.")
   EnableIfIsScalarArithmetic<T> exclusive_scan(T x, BinaryOperation op) const {
 #ifdef __SYCL_DEVICE_ONLY__
-    return sycl::detail::calc<T, __spv::GroupOperation::ExclusiveScan,
-                              __spv::Scope::Subgroup>(
-        typename sycl::detail::GroupOpTag<T>::type(), x, op);
+    return sycl::detail::calc<__spv::GroupOperation::ExclusiveScan>(
+        typename sycl::detail::GroupOpTag<T>::type(), *this, x, op);
 #else
     (void)x;
     (void)op;
@@ -715,9 +713,8 @@ struct sub_group {
                     "sycl::ext::oneapi::inclusive_scan instead.")
   EnableIfIsScalarArithmetic<T> inclusive_scan(T x, BinaryOperation op) const {
 #ifdef __SYCL_DEVICE_ONLY__
-    return sycl::detail::calc<T, __spv::GroupOperation::InclusiveScan,
-                              __spv::Scope::Subgroup>(
-        typename sycl::detail::GroupOpTag<T>::type(), x, op);
+    return sycl::detail::calc<__spv::GroupOperation::InclusiveScan>(
+        typename sycl::detail::GroupOpTag<T>::type(), *this, x, op);
 #else
     (void)x;
     (void)op;
diff --git a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
index 206cc148e0857..44cc1ab2cfc8d 100644
--- a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
+++ b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
@@ -18,6 +18,13 @@ namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
 namespace detail {
 class Builder;
+
+namespace spirv {
+
+template <typename Group> struct group_scope;
+
+} // namespace spirv
+
 } // namespace detail
 
 namespace ext::oneapi {
@@ -33,12 +40,12 @@ namespace ext::oneapi {
 // need to forward declare sub_group_mask first
 struct sub_group_mask;
 template <typename Group>
-detail::enable_if_t<std::is_same<std::decay_t<Group>, sub_group>::value,
-                    sub_group_mask>
+sycl::detail::enable_if_t<std::is_same<std::decay_t<Group>, sub_group>::value,
+                          sub_group_mask>
 group_ballot(Group g, bool predicate = true);
 
 struct sub_group_mask {
-  friend class detail::Builder;
+  friend class sycl::detail::Builder;
   using BitsType = BITS_TYPE;
 
   static constexpr size_t max_bits =
@@ -231,7 +238,7 @@ struct sub_group_mask {
       : Bits(rhs.Bits), bits_num(rhs.bits_num) {}
 
   template <typename Group>
-  friend detail::enable_if_t<
+  friend sycl::detail::enable_if_t<
       std::is_same<std::decay_t<Group>, sub_group>::value, sub_group_mask>
   group_ballot(Group g, bool predicate);
 
@@ -274,17 +281,17 @@ struct sub_group_mask {
 };
 
 template <typename Group>
-detail::enable_if_t<std::is_same<std::decay_t<Group>, sub_group>::value,
-                    sub_group_mask>
+sycl::detail::enable_if_t<std::is_same<std::decay_t<Group>, sub_group>::value,
+                          sub_group_mask>
 group_ballot(Group g, bool predicate) {
   (void)g;
 #ifdef __SYCL_DEVICE_ONLY__
   auto res = __spirv_GroupNonUniformBallot(
-      detail::spirv::group_scope<Group>::value, predicate);
+      sycl::detail::spirv::group_scope<Group>::value, predicate);
   BITS_TYPE val = res[0];
   if constexpr (sizeof(BITS_TYPE) == 8)
     val |= ((BITS_TYPE)res[1]) << 32;
-  return detail::Builder::createSubGroupMask<sub_group_mask>(
+  return sycl::detail::Builder::createSubGroupMask<sub_group_mask>(
       val, g.get_max_local_range()[0]);
 #else
   (void)predicate;
diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp
index 1fa39d5ba3b5c..e55cd6d0d11e3 100644
--- a/sycl/include/sycl/group_algorithm.hpp
+++ b/sycl/include/sycl/group_algorithm.hpp
@@ -197,7 +197,7 @@ detail::enable_if_t<(is_group_v<std::decay_t<Group>> &&
                        detail::is_multiplies<T, BinaryOperation>::value)) &&
                      detail::is_native_op<T, BinaryOperation>::value),
                     T>
-reduce_over_group(Group, T x, BinaryOperation binary_op) {
+reduce_over_group(Group g, T x, BinaryOperation binary_op) {
   // FIXME: Do not special-case for half precision
   static_assert(
       std::is_same<decltype(binary_op(x, x)), T>::value ||
@@ -205,9 +205,8 @@ reduce_over_group(Group, T x, BinaryOperation binary_op) {
            std::is_same<decltype(binary_op(x, x)), float>::value),
       "Result type of binary_op must match reduction accumulation type.");
 #ifdef __SYCL_DEVICE_ONLY__
-  return sycl::detail::calc<T, __spv::GroupOperation::Reduce,
-                            sycl::detail::spirv::group_scope<Group>::value>(
-      typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
+  return sycl::detail::calc<__spv::GroupOperation::Reduce>(
+      g, typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
 #else
   throw runtime_error("Group algorithms are not supported on host.",
                       PI_ERROR_INVALID_DEVICE);
@@ -376,9 +375,9 @@ joint_reduce(Group g, Ptr first, Ptr last, T init, BinaryOperation binary_op) {
 // ---- any_of_group
 template <typename Group>
 detail::enable_if_t<is_group_v<std::decay_t<Group>>, bool>
-any_of_group(Group, bool pred) {
+any_of_group(Group g, bool pred) {
 #ifdef __SYCL_DEVICE_ONLY__
-  return sycl::detail::spirv::GroupAny<Group>(pred);
+  return sycl::detail::spirv::GroupAny(g, pred);
 #else
   (void)pred;
   throw runtime_error("Group algorithms are not supported on host.",
@@ -415,9 +414,9 @@ joint_any_of(Group g, Ptr first, Ptr last, Predicate pred) {
 // ---- all_of_group
 template <typename Group>
 detail::enable_if_t<is_group_v<std::decay_t<Group>>, bool>
-all_of_group(Group, bool pred) {
+all_of_group(Group g, bool pred) {
 #ifdef __SYCL_DEVICE_ONLY__
-  return sycl::detail::spirv::GroupAll<Group>(pred);
+  return sycl::detail::spirv::GroupAll(g, pred);
 #else
   (void)pred;
   throw runtime_error("Group algorithms are not supported on host.",
@@ -454,9 +453,9 @@ joint_all_of(Group g, Ptr first, Ptr last, Predicate pred) {
 // ---- none_of_group
 template <typename Group>
 detail::enable_if_t<is_group_v<std::decay_t<Group>>, bool>
-none_of_group(Group, bool pred) {
+none_of_group(Group g, bool pred) {
 #ifdef __SYCL_DEVICE_ONLY__
-  return sycl::detail::spirv::GroupAll<Group>(!pred);
+  return sycl::detail::spirv::GroupAll(g, !pred);
 #else
   (void)pred;
   throw runtime_error("Group algorithms are not supported on host.",
@@ -571,9 +570,9 @@ detail::enable_if_t<(is_group_v<std::decay_t<Group>> &&
                      (std::is_trivially_copyable<T>::value ||
                       detail::is_vec<T>::value)),
                     T>
-group_broadcast(Group, T x, typename Group::id_type local_id) {
+group_broadcast(Group g, T x, typename Group::id_type local_id) {
 #ifdef __SYCL_DEVICE_ONLY__
-  return sycl::detail::spirv::GroupBroadcast<Group>(x, local_id);
+  return sycl::detail::spirv::GroupBroadcast(g, x, local_id);
 #else
   (void)x;
   (void)local_id;
@@ -628,16 +627,15 @@ detail::enable_if_t<(is_group_v<std::decay_t<Group>> &&
                        detail::is_multiplies<T, BinaryOperation>::value)) &&
                      detail::is_native_op<T, BinaryOperation>::value),
                     T>
-exclusive_scan_over_group(Group, T x, BinaryOperation binary_op) {
+exclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) {
   // FIXME: Do not special-case for half precision
   static_assert(std::is_same<decltype(binary_op(x, x)), T>::value ||
                     (std::is_same<T, half>::value &&
                      std::is_same<decltype(binary_op(x, x)), float>::value),
                 "Result type of binary_op must match scan accumulation type.");
 #ifdef __SYCL_DEVICE_ONLY__
-  return sycl::detail::calc<T, __spv::GroupOperation::ExclusiveScan,
-                            sycl::detail::spirv::group_scope<Group>::value>(
-      typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
+  return sycl::detail::calc<__spv::GroupOperation::ExclusiveScan>(
+      g, typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
 #else
   throw runtime_error("Group algorithms are not supported on host.",
                       PI_ERROR_INVALID_DEVICE);
@@ -862,16 +860,15 @@ detail::enable_if_t<(is_group_v<std::decay_t<Group>> &&
                        detail::is_multiplies<T, BinaryOperation>::value)) &&
                      detail::is_native_op<T, BinaryOperation>::value),
                     T>
-inclusive_scan_over_group(Group, T x, BinaryOperation binary_op) {
+inclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) {
   // FIXME: Do not special-case for half precision
   static_assert(std::is_same<decltype(binary_op(x, x)), T>::value ||
                     (std::is_same<T, half>::value &&
                      std::is_same<decltype(binary_op(x, x)), float>::value),
                 "Result type of binary_op must match scan accumulation type.");
 #ifdef __SYCL_DEVICE_ONLY__
-  return sycl::detail::calc<T, __spv::GroupOperation::InclusiveScan,
-                            sycl::detail::spirv::group_scope<Group>::value>(
-      typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
+  return sycl::detail::calc<__spv::GroupOperation::InclusiveScan>(
+      g, typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
 #else
   throw runtime_error("Group algorithms are not supported on host.",
                       PI_ERROR_INVALID_DEVICE);
diff --git a/sycl/include/sycl/group_barrier.hpp b/sycl/include/sycl/group_barrier.hpp
index 218f1909348c6..af1cb49e1e68e 100644
--- a/sycl/include/sycl/group_barrier.hpp
+++ b/sycl/include/sycl/group_barrier.hpp
@@ -20,31 +20,14 @@
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
 
-namespace detail {
-template <typename G> struct group_barrier_scope {};
-template <> struct group_barrier_scope<sycl::sub_group> {
-  constexpr static auto Scope = __spv::Scope::Subgroup;
-};
-template <int D> struct group_barrier_scope<sycl::group<D>> {
-  constexpr static auto Scope = __spv::Scope::Workgroup;
-};
-} // namespace detail
-
 template <typename Group>
 typename std::enable_if<is_group_v<Group>>::type
-group_barrier(Group, memory_scope FenceScope = Group::fence_scope) {
-  (void)FenceScope;
-#ifdef __SYCL_DEVICE_ONLY__
+group_barrier(Group G, memory_scope FenceScope = Group::fence_scope) {
   // Per SYCL spec, group_barrier must perform both control barrier and memory
   // fence operations. All work-items execute a release fence prior to
-  // barrier and acquire fence afterwards. The rest of semantics flags specify
-  // which type of memory this behavior is applied to.
-  __spirv_ControlBarrier(detail::group_barrier_scope<Group>::Scope,
-                         sycl::detail::spirv::getScope(FenceScope),
-                         __spv::MemorySemanticsMask::SequentiallyConsistent |
-                             __spv::MemorySemanticsMask::SubgroupMemory |
-                             __spv::MemorySemanticsMask::WorkgroupMemory |
-                             __spv::MemorySemanticsMask::CrossWorkgroupMemory);
+  // barrier and acquire fence afterwards.
+#ifdef __SYCL_DEVICE_ONLY__
+  detail::spirv::ControlBarrier(G, FenceScope, memory_order::seq_cst);
 #else
   throw sycl::runtime_error("Barriers are not supported on host device",
                             PI_ERROR_INVALID_DEVICE);
diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group.cpp
new file mode 100644
index 0000000000000..98fd7174208e5
--- /dev/null
+++ b/sycl/test-e2e/NonUniformGroups/ballot_group.cpp
@@ -0,0 +1,61 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+//
+// UNSUPPORTED: cpu || cuda || hip
+
+#include <sycl/sycl.hpp>
+#include <vector>
+namespace syclex = sycl::ext::oneapi::experimental;
+
+class TestKernel;
+
+int main() {
+  sycl::queue Q;
+
+  auto SGSizes = Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
+  if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) {
+    std::cout << "Test skipped due to missing support for sub-group size 32."
+              << std::endl;
+    return 0;
+  }
+
+  sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};
+
+  const auto NDR = sycl::nd_range<1>{32, 32};
+  Q.submit([&](sycl::handler &CGH) {
+    sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
+    sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
+    const auto KernelFunc =
+        [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
+          auto WI = item.get_global_id();
+          auto SG = item.get_sub_group();
+
+          // Split into odd and even work-items.
+          bool Predicate = WI % 2 == 0;
+          auto BallotGroup = syclex::get_ballot_group(SG, Predicate);
+
+          // Check function return values match Predicate.
+          // NB: Test currently uses exactly one sub-group, but we use SG
+          //     below in case this changes in future.
+          bool Match = true;
+          auto GroupID = (Predicate) ? 1 : 0;
+          auto LocalID = SG.get_local_id() / 2;
+          Match &= (BallotGroup.get_group_id() == GroupID);
+          Match &= (BallotGroup.get_local_id() == LocalID);
+          Match &= (BallotGroup.get_group_range() == 2);
+          Match &= (BallotGroup.get_local_range() == 16);
+          MatchAcc[WI] = Match;
+          LeaderAcc[WI] = BallotGroup.leader();
+        };
+    CGH.parallel_for<TestKernel>(NDR, KernelFunc);
+  });
+
+  sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
+  sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
+  for (int WI = 0; WI < 32; ++WI) {
+    assert(MatchAcc[WI] == true);
+    assert(LeaderAcc[WI] == (WI < 2));
+  }
+  return 0;
+}
diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp
new file mode 100644
index 0000000000000..1667445ada44b
--- /dev/null
+++ b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp
@@ -0,0 +1,127 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+//
+// UNSUPPORTED: cpu || cuda || hip
+
+#include <sycl/sycl.hpp>
+#include <vector>
+namespace syclex = sycl::ext::oneapi::experimental;
+
+class TestKernel;
+
+int main() {
+  sycl::queue Q;
+
+  auto SGSizes = Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
+  if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) {
+    std::cout << "Test skipped due to missing support for sub-group size 32."
+              << std::endl;
+    return 0;
+  }
+
+  sycl::buffer<size_t, 1> TmpBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> BarrierBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> BroadcastBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> AnyBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> AllBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> NoneBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> ReduceBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> ExScanBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> IncScanBuf{sycl::range{32}};
+
+  const auto NDR = sycl::nd_range<1>{32, 32};
+  Q.submit([&](sycl::handler &CGH) {
+    sycl::accessor TmpAcc{TmpBuf, CGH, sycl::write_only};
+    sycl::accessor BarrierAcc{BarrierBuf, CGH, sycl::write_only};
+    sycl::accessor BroadcastAcc{BroadcastBuf, CGH, sycl::write_only};
+    sycl::accessor AnyAcc{AnyBuf, CGH, sycl::write_only};
+    sycl::accessor AllAcc{AllBuf, CGH, sycl::write_only};
+    sycl::accessor NoneAcc{NoneBuf, CGH, sycl::write_only};
+    sycl::accessor ReduceAcc{ReduceBuf, CGH, sycl::write_only};
+    sycl::accessor ExScanAcc{ExScanBuf, CGH, sycl::write_only};
+    sycl::accessor IncScanAcc{IncScanBuf, CGH, sycl::write_only};
+    const auto KernelFunc =
+        [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
+          auto WI = item.get_global_id();
+          auto SG = item.get_sub_group();
+
+          // Split into odd and even work-items.
+          bool Predicate = WI % 2 == 0;
+          auto BallotGroup = syclex::get_ballot_group(SG, Predicate);
+
+          // Check all other members' writes are visible after a barrier.
+          TmpAcc[WI] = 1;
+          sycl::group_barrier(BallotGroup);
+          size_t Visible = 0;
+          for (size_t Other = 0; Other < 32; ++Other) {
+            if (WI % 2 == Other % 2) {
+              Visible += TmpAcc[Other];
+            }
+          }
+          BarrierAcc[WI] = (Visible == BallotGroup.get_local_linear_range());
+
+          // Simple check of group algorithms.
+          uint32_t OriginalLID = SG.get_local_linear_id();
+          uint32_t LID = BallotGroup.get_local_linear_id();
+
+          uint32_t BroadcastResult =
+              sycl::group_broadcast(BallotGroup, OriginalLID, 0);
+          if (Predicate) {
+            BroadcastAcc[WI] = (BroadcastResult == 0);
+          } else {
+            BroadcastAcc[WI] = (BroadcastResult == 1);
+          }
+
+          bool AnyResult = sycl::any_of_group(BallotGroup, (LID == 0));
+          AnyAcc[WI] = (AnyResult == true);
+
+          bool AllResult = sycl::all_of_group(BallotGroup, Predicate);
+          if (Predicate) {
+            AllAcc[WI] = (AllResult == true);
+          } else {
+            AllAcc[WI] = (AllResult == false);
+          }
+
+          bool NoneResult = sycl::none_of_group(BallotGroup, Predicate);
+          if (Predicate) {
+            NoneAcc[WI] = (NoneResult == false);
+          } else {
+            NoneAcc[WI] = (NoneResult == true);
+          }
+
+          uint32_t ReduceResult =
+              sycl::reduce_over_group(BallotGroup, 1, sycl::plus<>());
+          ReduceAcc[WI] =
+              (ReduceResult == BallotGroup.get_local_linear_range());
+
+          uint32_t ExScanResult =
+              sycl::exclusive_scan_over_group(BallotGroup, 1, sycl::plus<>());
+          ExScanAcc[WI] = (ExScanResult == LID);
+
+          uint32_t IncScanResult =
+              sycl::inclusive_scan_over_group(BallotGroup, 1, sycl::plus<>());
+          IncScanAcc[WI] = (IncScanResult == LID + 1);
+        };
+    CGH.parallel_for<TestKernel>(NDR, KernelFunc);
+  });
+
+  sycl::host_accessor BarrierAcc{BarrierBuf, sycl::read_only};
+  sycl::host_accessor BroadcastAcc{BroadcastBuf, sycl::read_only};
+  sycl::host_accessor AnyAcc{AnyBuf, sycl::read_only};
+  sycl::host_accessor AllAcc{AllBuf, sycl::read_only};
+  sycl::host_accessor NoneAcc{NoneBuf, sycl::read_only};
+  sycl::host_accessor ReduceAcc{ReduceBuf, sycl::read_only};
+  sycl::host_accessor ExScanAcc{ExScanBuf, sycl::read_only};
+  sycl::host_accessor IncScanAcc{IncScanBuf, sycl::read_only};
+  for (int WI = 0; WI < 32; ++WI) {
+    assert(BarrierAcc[WI] == true);
+    assert(BroadcastAcc[WI] == true);
+    assert(AnyAcc[WI] == true);
+    assert(AllAcc[WI] == true);
+    assert(NoneAcc[WI] == true);
+    assert(ReduceAcc[WI] == true);
+    assert(ExScanAcc[WI] == true);
+    assert(IncScanAcc[WI] == true);
+  }
+  return 0;
+}
diff --git a/sycl/test-e2e/NonUniformGroups/cluster_group.cpp b/sycl/test-e2e/NonUniformGroups/cluster_group.cpp
new file mode 100644
index 0000000000000..e1d7634191df3
--- /dev/null
+++ b/sycl/test-e2e/NonUniformGroups/cluster_group.cpp
@@ -0,0 +1,62 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+//
+// UNSUPPORTED: cpu || cuda || hip
+
+#include <sycl/sycl.hpp>
+#include <vector>
+namespace syclex = sycl::ext::oneapi::experimental;
+
+template <size_t ClusterSize> class TestKernel;
+
+template <size_t ClusterSize> void test() {
+  sycl::queue Q;
+
+  auto SGSizes = Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
+  if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) {
+    std::cout << "Test skipped due to missing support for sub-group size 32."
+              << std::endl;
+  }
+
+  sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};
+
+  const auto NDR = sycl::nd_range<1>{32, 32};
+  Q.submit([&](sycl::handler &CGH) {
+    sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
+    sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
+    const auto KernelFunc =
+        [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
+          auto WI = item.get_global_id();
+          auto SG = item.get_sub_group();
+
+          auto ClusterGroup = syclex::get_cluster_group<ClusterSize>(SG);
+
+          bool Match = true;
+          Match &= (ClusterGroup.get_group_id() == (WI / ClusterSize));
+          Match &= (ClusterGroup.get_local_id() == (WI % ClusterSize));
+          Match &= (ClusterGroup.get_group_range() == (32 / ClusterSize));
+          Match &= (ClusterGroup.get_local_range() == ClusterSize);
+          MatchAcc[WI] = Match;
+          LeaderAcc[WI] = ClusterGroup.leader();
+        };
+    CGH.parallel_for<TestKernel<ClusterSize>>(NDR, KernelFunc);
+  });
+
+  sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
+  sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
+  for (int WI = 0; WI < 32; ++WI) {
+    assert(MatchAcc[WI] == true);
+    assert(LeaderAcc[WI] == ((WI % ClusterSize) == 0));
+  }
+}
+
+int main() {
+  test<1>();
+  test<2>();
+  test<4>();
+  test<8>();
+  test<16>();
+  test<32>();
+  return 0;
+}
diff --git a/sycl/test-e2e/NonUniformGroups/is_fixed_topology.cpp b/sycl/test-e2e/NonUniformGroups/is_fixed_topology.cpp
new file mode 100644
index 0000000000000..b3b6cd5ba4adf
--- /dev/null
+++ b/sycl/test-e2e/NonUniformGroups/is_fixed_topology.cpp
@@ -0,0 +1,12 @@
+// RUN: %clangxx -fsycl -fsyntax-only -fsycl-targets=%sycl_triple %s -o %t.out
+
+#include <sycl/sycl.hpp>
+namespace syclex = sycl::ext::oneapi::experimental;
+
+#ifdef SYCL_EXT_ONEAPI_ROOT_GROUP
+static_assert(syclex::is_fixed_topology_group_v<syclex::root_group>);
+#endif
+static_assert(syclex::is_fixed_topology_group_v<sycl::group<1>>);
+static_assert(syclex::is_fixed_topology_group_v<sycl::group<2>>);
+static_assert(syclex::is_fixed_topology_group_v<sycl::group<3>>);
+static_assert(syclex::is_fixed_topology_group_v<sycl::sub_group>);
diff --git a/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp b/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp
new file mode 100644
index 0000000000000..a3f0085d8eaa0
--- /dev/null
+++ b/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp
@@ -0,0 +1,14 @@
+// RUN: %clangxx -fsycl -fsyntax-only -fsycl-targets=%sycl_triple %s -o %t.out
+
+#include <sycl/sycl.hpp>
+namespace syclex = sycl::ext::oneapi::experimental;
+
+static_assert(
+    syclex::is_user_constructed_group_v<syclex::ballot_group<sycl::sub_group>>);
+static_assert(syclex::is_user_constructed_group_v<
+              syclex::cluster_group<1, sycl::sub_group>>);
+static_assert(syclex::is_user_constructed_group_v<
+              syclex::cluster_group<2, sycl::sub_group>>);
+static_assert(
+    syclex::is_user_constructed_group_v<syclex::tangle_group<sycl::sub_group>>);
+static_assert(syclex::is_user_constructed_group_v<syclex::opportunistic_group>);
diff --git a/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp b/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp
new file mode 100644
index 0000000000000..925340cee1c6d
--- /dev/null
+++ b/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp
@@ -0,0 +1,68 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+//
+// UNSUPPORTED: cpu || cuda || hip
+
+#include <sycl/sycl.hpp>
+#include <vector>
+namespace syclex = sycl::ext::oneapi::experimental;
+
+class TestKernel;
+
+int main() {
+  sycl::queue Q;
+
+  auto SGSizes = Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
+  if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) {
+    std::cout << "Test skipped due to missing support for sub-group size 32."
+              << std::endl;
+    return 0;
+  }
+
+  sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};
+
+  const auto NDR = sycl::nd_range<1>{32, 32};
+  Q.submit([&](sycl::handler &CGH) {
+    sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
+    sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
+    const auto KernelFunc =
+        [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
+          auto WI = item.get_global_id();
+          auto SG = item.get_sub_group();
+
+          // Due to the unpredictable runtime behavior of opportunistic groups,
+          // some values may change from run to run. Check they're in expected
+          // ranges and consistent with other groups.
+          if (item.get_global_id() % 2 == 0) {
+            auto OpportunisticGroup =
+                syclex::this_kernel::get_opportunistic_group();
+
+            bool Match = true;
+            Match &= (OpportunisticGroup.get_group_id() == 0);
+            Match &= (OpportunisticGroup.get_local_id() <
+                      OpportunisticGroup.get_local_range());
+            Match &= (OpportunisticGroup.get_group_range() == 1);
+            Match &= (OpportunisticGroup.get_local_linear_range() <=
+                      SG.get_local_linear_range());
+            MatchAcc[WI] = Match;
+            LeaderAcc[WI] = OpportunisticGroup.leader();
+          }
+        };
+    CGH.parallel_for<TestKernel>(NDR, KernelFunc);
+  });
+
+  sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
+  sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
+  uint32_t NumLeaders = 0;
+  for (int WI = 0; WI < 32; ++WI) {
+    if (WI % 2 == 0) {
+      assert(MatchAcc[WI] == true);
+      if (LeaderAcc[WI]) {
+        NumLeaders++;
+      }
+    }
+  }
+  assert(NumLeaders > 0);
+  return 0;
+}
diff --git a/sycl/test-e2e/NonUniformGroups/tangle_group.cpp b/sycl/test-e2e/NonUniformGroups/tangle_group.cpp
new file mode 100644
index 0000000000000..9e57a48633e0a
--- /dev/null
+++ b/sycl/test-e2e/NonUniformGroups/tangle_group.cpp
@@ -0,0 +1,69 @@
+// RUN: %clangxx -fsycl -fno-sycl-early-optimizations -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+//
+// UNSUPPORTED: cpu || cuda || hip
+
+#include <sycl/sycl.hpp>
+#include <vector>
+namespace syclex = sycl::ext::oneapi::experimental;
+
+class TestKernel;
+
+int main() {
+  sycl::queue Q;
+
+  auto SGSizes = Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
+  if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) {
+    std::cout << "Test skipped due to missing support for sub-group size 32."
+              << std::endl;
+    return 0;
+  }
+
+  sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};
+
+  const auto NDR = sycl::nd_range<1>{32, 32};
+  Q.submit([&](sycl::handler &CGH) {
+    sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
+    sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
+    const auto KernelFunc =
+        [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
+          auto WI = item.get_global_id();
+          auto SG = item.get_sub_group();
+
+          // Split into odd and even work-items via control flow.
+          // Branches deliberately duplicated to test impact of optimizations.
+          // This only reliably works with optimizations disabled right now.
+          if (item.get_global_id() % 2 == 0) {
+            auto TangleGroup = syclex::get_tangle_group(SG);
+
+            bool Match = true;
+            Match &= (TangleGroup.get_group_id() == 0);
+            Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2);
+            Match &= (TangleGroup.get_group_range() == 1);
+            Match &= (TangleGroup.get_local_range() == 16);
+            MatchAcc[WI] = Match;
+            LeaderAcc[WI] = TangleGroup.leader();
+          } else {
+            auto TangleGroup = syclex::get_tangle_group(SG);
+
+            bool Match = true;
+            Match &= (TangleGroup.get_group_id() == 0);
+            Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2);
+            Match &= (TangleGroup.get_group_range() == 1);
+            Match &= (TangleGroup.get_local_range() == 16);
+            MatchAcc[WI] = Match;
+            LeaderAcc[WI] = TangleGroup.leader();
+          }
+        };
+    CGH.parallel_for<TestKernel>(NDR, KernelFunc);
+  });
+
+  sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
+  sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
+  for (int WI = 0; WI < 32; ++WI) {
+    assert(MatchAcc[WI] == true);
+    assert(LeaderAcc[WI] == (WI < 2));
+  }
+  return 0;
+}