diff --git a/sycl/include/sycl/reduction.hpp b/sycl/include/sycl/reduction.hpp index 1bd9465c57f0c..77d052275e247 100644 --- a/sycl/include/sycl/reduction.hpp +++ b/sycl/include/sycl/reduction.hpp @@ -1079,17 +1079,25 @@ using __sycl_reduction_kernel = std::conditional_t::value, auto_name, MainOrAux>; +// Enum for specifying work size guarantees in tree-reduction. +enum class WorkSizeGuarantees { None, Equal, LessOrEqual }; + // Implementations. template struct NDRangeReduction; template <> struct NDRangeReduction { - template + template static void run(handler &CGH, std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, - Reduction &Redu, KernelType &KernelFunc) { + nd_range NDRange, size_t, size_t, + PropertiesT &Properties, Reduction &Redu, + KernelType &KernelFunc) { + static_assert(Reduction::has_identity, + "Identityless reductions are not supported by the " + "local_atomic_and_atomic_cross_wg strategy."); + std::ignore = Queue; using Name = __sycl_reduction_kernel< reduction::MainKrn, KernelName, @@ -1132,11 +1140,12 @@ struct NDRangeReduction { template <> struct NDRangeReduction< reduction::strategy::group_reduce_and_last_wg_detection> { - template + template static void run(handler &CGH, std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, - Reduction &Redu, KernelType &KernelFunc) { + nd_range NDRange, size_t, size_t, + PropertiesT &Properties, Reduction &Redu, + KernelType &KernelFunc) { static_assert(Reduction::has_identity, "Identityless reductions are not supported by the " "group_reduce_and_last_wg_detection strategy."); @@ -1276,13 +1285,10 @@ void doTreeReductionHelper(size_t WorkSize, size_t LID, FuncTy Func) { } } -// Enum for specifying work size guarantees in tree-reduction. -enum class WorkSizeGuarantees { None, Equal, LessOrEqual }; - template void doTreeReduction(size_t WorkSize, nd_item NDIt, LocalRedsTy &LocalReds, - BinOpTy &BOp, AccessFuncTy AccessFunc) { + const BinOpTy &BOp, AccessFuncTy AccessFunc) { size_t LID = NDIt.get_local_linear_id(); size_t AdjustedWorkSize; if constexpr (WSGuarantee == WorkSizeGuarantees::LessOrEqual || @@ -1329,16 +1335,25 @@ void doTreeReductionOnTuple(size_t WorkSize, size_t LID, } template <> struct NDRangeReduction { - template + template static void run(handler &CGH, std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, + nd_range NDRange, size_t ActiveItemsPerWG, + size_t ActiveItemsLastWG, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { std::ignore = Queue; size_t NElements = Reduction::num_elements; size_t WGSize = NDRange.get_local_range().size(); size_t NWorkGroups = NDRange.get_group_range().size(); + assert((WSG != WorkSizeGuarantees::Equal || + (ActiveItemsPerWG == WGSize && ActiveItemsLastWG == WGSize)) && + "Work size guarantee violation."); + + // Ignore active items to avoid warnings when uncaptured. + std::ignore = ActiveItemsPerWG; + std::ignore = ActiveItemsLastWG; + bool IsUpdateOfUserVar = !Reduction::is_usm && !Redu.initializeToIdentity(); auto PartialSums = Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH); @@ -1362,14 +1377,31 @@ template <> struct NDRangeReduction { typename Reduction::reducer_type(IdentityContainer, BOp); KernelFunc(NDId, Reducer); + size_t NWorkGroups = NDId.get_group_range().size(); + + // Compute the number of active items for this work-group. 
+ size_t ActiveItems = [&]() { + if constexpr (WSG == WorkSizeGuarantees::Equal) { + // We are guaranteed exactly one element for each work-item, so we + // can assume that ActiveItemsPerWG is the same as the work-group + // size. + return NDId.get_local_range().size(); + } else { + // Otherwise every work-group has ActiveItemsPerWG active items, + // except the last group, which may have fewer. + size_t GrID = NDId.get_group_linear_id(); + return GrID == NWorkGroups - 1 ? ActiveItemsLastWG : ActiveItemsPerWG; + } + }(); + // If there are multiple values, reduce each separately // This prevents local memory from scaling with elements size_t LID = NDId.get_local_linear_id(); for (int E = 0; E < NElements; ++E) { - doTreeReduction( - WGSize, NDId, LocalReds, BOp, - [&](size_t) { return getReducerAccess(Reducer).getElement(E); }); + doTreeReduction(ActiveItems, NDId, LocalReds, BOp, [&](size_t) { + return getReducerAccess(Reducer).getElement(E); + }); if (LID == 0) { auto V = LocalReds[0]; @@ -1417,11 +1449,16 @@ template <> struct NDRangeReduction { - template + template static void run(handler &CGH, std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, - Reduction &Redu, KernelType &KernelFunc) { + nd_range NDRange, size_t, size_t, + PropertiesT &Properties, Reduction &Redu, + KernelType &KernelFunc) { + static_assert(Reduction::has_identity, + "Identityless reductions are not supported by the " + "group_reduce_and_atomic_cross_wg strategy."); + std::ignore = Queue; using Name = __sycl_reduction_kernel< reduction::MainKrn, KernelName, @@ -1449,10 +1486,11 @@ struct NDRangeReduction { template <> struct NDRangeReduction< reduction::strategy::local_mem_tree_and_atomic_cross_wg> { - template + template static void run(handler &CGH, std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, + nd_range NDRange, size_t ActiveItemsPerWG, + size_t ActiveItemsLastWG, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { std::ignore = Queue; using Name = __sycl_reduction_kernel< @@ -1462,6 +1500,14 @@ struct NDRangeReduction< size_t NElements = Reduction::num_elements; size_t WGSize = NDRange.get_local_range().size(); + assert((WSG != WorkSizeGuarantees::Equal || + (ActiveItemsPerWG == WGSize && ActiveItemsLastWG == WGSize)) && + "Work size guarantee violation."); + + // Ignore active items to avoid warnings when uncaptured. + std::ignore = ActiveItemsPerWG; + std::ignore = ActiveItemsLastWG; + // Use local memory to reduce elements in work-groups into zero-th // element. local_accessor LocalReds{WGSize, CGH}; @@ -1471,17 +1517,33 @@ struct NDRangeReduction< typename Reduction::reducer_type Reducer; KernelFunc(NDIt, Reducer); - size_t WGSize = NDIt.get_local_range().size(); size_t LID = NDIt.get_local_linear_id(); + // Compute the number of active items for this work-group. + size_t ActiveItems = [&]() { + if constexpr (WSG == WorkSizeGuarantees::Equal) { + // We are guaranteed exactly one element for each work-item, so we + // can assume that ActiveItemsPerWG is the same as the work-group + // size. + return NDIt.get_local_range().size(); + } else { + // Otherwise every work-group has ActiveItemsPerWG active items, + // except the last group, which may have fewer. + size_t GrID = NDIt.get_group_linear_id(); + size_t NWorkGroups = NDIt.get_group_range().size(); + return GrID == NWorkGroups - 1 ?
ActiveItemsLastWG + : ActiveItemsPerWG; + } + }(); + // If there are multiple values, reduce each separately // This prevents local memory from scaling with elements for (int E = 0; E < NElements; ++E) { typename Reduction::binary_operation BOp; - doTreeReduction( - WGSize, NDIt, LocalReds, BOp, - [&](size_t) { return getReducerAccess(Reducer).getElement(E); }); + doTreeReduction(ActiveItems, NDIt, LocalReds, BOp, [&](size_t) { + return getReducerAccess(Reducer).getElement(E); + }); if (LID == 0) getReducerAccess(Reducer).getElement(E) = LocalReds[0]; @@ -1503,11 +1565,16 @@ struct NDRangeReduction< template <> struct NDRangeReduction< reduction::strategy::group_reduce_and_multiple_kernels> { - template + template static void run(handler &CGH, std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, - Reduction &Redu, KernelType &KernelFunc) { + nd_range NDRange, size_t, size_t, + PropertiesT &Properties, Reduction &Redu, + KernelType &KernelFunc) { + static_assert(Reduction::has_identity, + "Identityless reductions are not supported by the " + "group_reduce_and_multiple_kernels strategy."); + // Before running the kernels, check that device has enough local memory // to hold local arrays that may be required for the reduction algorithm. // TODO: If the work-group-size is limited by the local memory, then @@ -1637,10 +1704,11 @@ struct NDRangeReduction< }; template <> struct NDRangeReduction { - template + template static void run(handler &CGH, std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, + nd_range NDRange, size_t ActiveItemsPerWG, + size_t ActiveItemsLastWG, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { constexpr bool HFR = Reduction::has_fast_reduce; size_t OneElemSize = HFR ? 0 : sizeof(typename Reduction::result_type); @@ -1660,6 +1728,14 @@ template <> struct NDRangeReduction { size_t NWorkGroups = NDRange.get_group_range().size(); auto Out = Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH); + assert((WSG != WorkSizeGuarantees::Equal || + (ActiveItemsPerWG == WGSize && ActiveItemsLastWG == WGSize)) && + "Work size guarantee violation."); + + // Ignore active items to avoid warnings when uncaptured. + std::ignore = ActiveItemsPerWG; + std::ignore = ActiveItemsLastWG; + bool IsUpdateOfUserVar = !Reduction::is_usm && !Redu.initializeToIdentity() && NWorkGroups == 1; @@ -1678,20 +1754,34 @@ template <> struct NDRangeReduction { typename Reduction::reducer_type(IdentityContainer, BOp); KernelFunc(NDIt, Reducer); - size_t WGSize = NDIt.get_local_range().size(); size_t LID = NDIt.get_local_linear_id(); + size_t GrID = NDIt.get_group_linear_id(); + + // Compute the number of active items for this work-group. + size_t ActiveItems = [&]() { + if constexpr (WSG == WorkSizeGuarantees::Equal) { + // We are guaranteed exactly one element for each work-item, so we + // can assume that ActiveItemsPerWG is the same as the work-group + // size. + return NDIt.get_local_range().size(); + } else { + // Otherwise every work-group has ActiveItemsPerWG active items, + // except the last group, which may have fewer. + size_t NWorkGroups = NDIt.get_group_range().size(); + return GrID == NWorkGroups - 1 ?
ActiveItemsLastWG : ActiveItemsPerWG; + } + }(); // If there are multiple values, reduce each separately // This prevents local memory from scaling with elements for (int E = 0; E < NElements; ++E) { - doTreeReduction( - WGSize, NDIt, LocalReds, BOp, - [&](size_t) { return getReducerAccess(Reducer).getElement(E); }); + doTreeReduction(ActiveItems, NDIt, LocalReds, BOp, [&](size_t) { + return getReducerAccess(Reducer).getElement(E); + }); // Compute the partial sum/reduction for the work-group. if (LID == 0) { - size_t GrID = NDIt.get_group_linear_id(); typename Reduction::result_type PSum = LocalReds[0]; if (IsUpdateOfUserVar) PSum = BOp(Out[0], PSum); @@ -1929,23 +2019,29 @@ struct IsArrayReduction { /// All scalar reductions are processed together; there is one loop of log2(N) /// steps, and each reduction uses its own storage. -template +template void reduCGFuncImplScalar( - bool IsOneWG, nd_item NDIt, ReduTupleT LocalAccsTuple, + bool IsOneWG, nd_item NDIt, size_t ActiveItems, + ReduTupleT LocalAccsTuple, ReduTupleT OutAccsTuple, std::tuple &ReducersTuple, ReduTupleT IdentitiesTuple, ReduTupleT BOPsTuple, std::array InitToIdentityProps, std::index_sequence ReduIndices) { - size_t WGSize = NDIt.get_local_range().size(); + static_assert(WSG != WorkSizeGuarantees::None, + "reduCGFuncImplScalar requires either equal or less-or-equal " + "work size guarantee"); + size_t LID = NDIt.get_local_linear_id(); - ((std::get(LocalAccsTuple)[LID] = - getReducerAccess(std::get(ReducersTuple)).getElement(0)), - ...); + if (WSG == WorkSizeGuarantees::Equal || LID < ActiveItems) + ((std::get(LocalAccsTuple)[LID] = + getReducerAccess(std::get(ReducersTuple)).getElement(0)), + ...); - doTreeReductionOnTuple(WGSize, LID, LocalAccsTuple, BOPsTuple, ReduIndices); + doTreeReductionOnTuple(ActiveItems, LID, LocalAccsTuple, BOPsTuple, + ReduIndices); // Compute the partial sum/reduction for the work-group. if (LID == 0) { @@ -1957,22 +2053,22 @@ void reduCGFuncImplScalar( } /// Each array reduction is processed separately. -template +template void reduCGFuncImplArrayHelper(bool IsOneWG, nd_item NDIt, - LocalAccT LocalReds, OutAccT Out, - ReducerT &Reducer, BOPT BOp, + size_t ActiveItems, LocalAccT LocalReds, + OutAccT Out, ReducerT &Reducer, BOPT BOp, bool IsInitializeToIdentity) { - size_t WGSize = NDIt.get_local_range().size(); size_t LID = NDIt.get_local_linear_id(); // If there are multiple values, reduce each separately // This prevents local memory from scaling with elements auto NElements = Reduction::num_elements; for (size_t E = 0; E < NElements; ++E) { - doTreeReduction( - WGSize, NDIt, LocalReds, BOp, - [&](size_t) { return getReducerAccess(Reducer).getElement(E); }); + doTreeReduction(ActiveItems, NDIt, LocalReds, BOp, [&](size_t) { + return getReducerAccess(Reducer).getElement(E); + }); // Add the initial value of user's variable to the final result. 
if (LID == 0) { @@ -1994,34 +2090,40 @@ void reduCGFuncImplArrayHelper(bool IsOneWG, nd_item NDIt, } } -template +template void reduCGFuncImplArray( - bool IsOneWG, nd_item NDIt, ReduTupleT LocalAccsTuple, + bool IsOneWG, nd_item NDIt, size_t ActiveItems, + ReduTupleT LocalAccsTuple, ReduTupleT OutAccsTuple, std::tuple &ReducersTuple, ReduTupleT BOPsTuple, std::array InitToIdentityProps, std::index_sequence) { using ReductionPack = std::tuple; - (reduCGFuncImplArrayHelper>( - IsOneWG, NDIt, std::get(LocalAccsTuple), std::get(OutAccsTuple), - std::get(ReducersTuple), std::get(BOPsTuple), - InitToIdentityProps[Is]), + (reduCGFuncImplArrayHelper>( + IsOneWG, NDIt, ActiveItems, std::get(LocalAccsTuple), + std::get(OutAccsTuple), std::get(ReducersTuple), + std::get(BOPsTuple), InitToIdentityProps[Is]), ...); } namespace reduction::main_krn { template struct NDRangeMulti; } // namespace reduction::main_krn -template +template void reduCGFuncMulti(handler &CGH, KernelType KernelFunc, - const nd_range &Range, PropertiesT Properties, + const nd_range &Range, size_t ActiveItemsPerWG, + size_t ActiveItemsLastWG, PropertiesT Properties, std::tuple &ReduTuple, std::index_sequence ReduIndices) { size_t WGSize = Range.get_local_range().size(); + // Ignore active items to avoid warnings when uncaptured. + std::ignore = ActiveItemsPerWG; + std::ignore = ActiveItemsLastWG; + // Split reduction sequence into two: // 1) Scalar reductions // 2) Array reductions @@ -2066,21 +2168,38 @@ void reduCGFuncMulti(handler &CGH, KernelType KernelFunc, std::apply([&](auto &...Reducers) { KernelFunc(NDIt, Reducers...); }, ReducersTuple); + // Compute the number of active items for this work-group. + size_t ActiveItems = [&]() { + if constexpr (WSG == WorkSizeGuarantees::Equal) { + // We are guaranteed exactly one element for each work-item, so we + // can assume that ActiveItemsPerWG is the same as the work-group + // size. + return NDIt.get_local_range().size(); + } else { + // Otherwise every work-group has ActiveItemsPerWG active items, + // except the last group, which may have fewer. + size_t GrID = NDIt.get_group_linear_id(); + size_t NWorkGroups = NDIt.get_group_range().size(); + return GrID == NWorkGroups - 1 ? ActiveItemsLastWG : ActiveItemsPerWG; + } + }(); + // Combine and write-back the results of any scalar reductions // reduCGFuncImplScalar(NDIt, LocalAccsTuple, OutAccsTuple, // ReducersTuple, IdentitiesTuple, BOPsTuple, InitToIdentityProps, // ReduIndices); - reduCGFuncImplScalar( - IsOneWG, NDIt, LocalAccsTuple, OutAccsTuple, ReducersTuple, - IdentitiesTuple, BOPsTuple, InitToIdentityProps, ScalarIs); + reduCGFuncImplScalar( + IsOneWG, NDIt, ActiveItems, LocalAccsTuple, OutAccsTuple, + ReducersTuple, IdentitiesTuple, BOPsTuple, InitToIdentityProps, + ScalarIs); // Combine and write-back the results of any array reductions // These are handled separately to minimize temporary storage and account // for the fact that each array reduction may have a different number of // elements to reduce (i.e. a different extent).
- reduCGFuncImplArray(IsOneWG, NDIt, LocalAccsTuple, - OutAccsTuple, ReducersTuple, BOPsTuple, - InitToIdentityProps, ArrayIs); + reduCGFuncImplArray( + IsOneWG, NDIt, ActiveItems, LocalAccsTuple, OutAccsTuple, + ReducersTuple, BOPsTuple, InitToIdentityProps, ArrayIs); }); }; @@ -2282,11 +2401,17 @@ tuple_select_elements(TupleT Tuple, std::index_sequence) { } template <> struct NDRangeReduction { - template + template static void run(handler &CGH, std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, + nd_range NDRange, size_t ActiveItemsPerWG, + size_t ActiveItemsLastWG, PropertiesT &Properties, RestT... Rest) { + assert((WSG != WorkSizeGuarantees::Equal || + (ActiveItemsPerWG == NDRange.get_local_range().size() && + ActiveItemsLastWG == NDRange.get_local_range().size())) && + "Work size guarantee violation."); + std::tuple ArgsTuple(Rest...); constexpr size_t NumArgs = sizeof...(RestT); auto KernelFunc = std::get(ArgsTuple); @@ -2305,8 +2430,9 @@ template <> struct NDRangeReduction { std::to_string(MaxWGSize), PI_ERROR_INVALID_WORK_GROUP_SIZE); - reduCGFuncMulti(CGH, KernelFunc, NDRange, Properties, ReduTuple, - ReduIndices); + reduCGFuncMulti(CGH, KernelFunc, NDRange, ActiveItemsPerWG, + ActiveItemsLastWG, Properties, ReduTuple, + ReduIndices); reduction::finalizeHandler(CGH); size_t NWorkItems = NDRange.get_group_range().size(); @@ -2326,14 +2452,16 @@ template <> struct NDRangeReduction { using Impl = NDRangeReduction; using Strat = reduction::strategy; - template + template static void run(handler &CGH, std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, + nd_range NDRange, size_t ActiveItemsPerWG, + size_t ActiveItemsLastWG, PropertiesT &Properties, Reduction &Redu, KernelType &KernelFunc) { auto Delegate = [&](auto Impl) { - Impl.template run(CGH, Queue, NDRange, Properties, Redu, - KernelFunc); + Impl.template run(CGH, Queue, NDRange, ActiveItemsPerWG, + ActiveItemsLastWG, Properties, Redu, + KernelFunc); }; if constexpr (Reduction::has_float64_atomics) { @@ -2359,22 +2487,39 @@ template <> struct NDRangeReduction { assert(false && "Must be unreachable!"); } - template + template static void run(handler &CGH, std::shared_ptr &Queue, - nd_range NDRange, PropertiesT &Properties, + nd_range NDRange, size_t ActiveItemsPerWG, + size_t ActiveItemsLastWG, PropertiesT &Properties, RestT... Rest) { - return Impl::run(CGH, Queue, NDRange, Properties, - Rest...); + return Impl::run( + CGH, Queue, NDRange, ActiveItemsPerWG, ActiveItemsLastWG, Properties, + Rest...); } }; +template +void reduction_parallel_for_impl(handler &CGH, + std::shared_ptr &Queue, + nd_range NDRange, + size_t ActiveItemsPerWG, + size_t ActiveItemsLastWG, + PropertiesT Properties, RestT... Rest) { + NDRangeReduction::template run( + CGH, Queue, NDRange, ActiveItemsPerWG, ActiveItemsLastWG, Properties, + Rest...); +} + template void reduction_parallel_for(handler &CGH, nd_range NDRange, PropertiesT Properties, RestT... 
Rest) { - NDRangeReduction::template run(CGH, CGH.MQueue, NDRange, - Properties, Rest...); + size_t WGSize = NDRange.get_local_range().size(); + reduction_parallel_for_impl( + CGH, CGH.MQueue, NDRange, WGSize, WGSize, Properties, Rest...); } __SYCL_EXPORT uint32_t @@ -2398,7 +2543,8 @@ void reduction_parallel_for(handler &CGH, range Range, if constexpr (sizeof...(RestT) == 2) { using Reduction = std::tuple_element_t<0, decltype(ReduTuple)>; constexpr bool IsTreeReduction = - !Reduction::has_fast_reduce && !Reduction::has_fast_atomics; + !(Reduction::has_fast_reduce && Reduction::has_identity) && + !Reduction::has_fast_atomics; return IsTreeReduction ? sizeof(typename Reduction::result_type) : 0; } else { return reduGetMemPerWorkItem(ReduTuple, ReduIndices); @@ -2428,6 +2574,9 @@ void reduction_parallel_for(handler &CGH, range Range, nd_range<1> NDRange{range<1>{NDRItems}, range<1>{WGSize}}; size_t PerGroup = Range.size() / NWorkGroups; + size_t ActiveItemsPerWG = std::min(PerGroup, WGSize); + size_t ActiveItemsLastWG = + std::min(NWorkItems - PerGroup * (NWorkGroups - 1), WGSize); // Iterate through the index space by assigning contiguous chunks to each // work-group, then iterating through each chunk using a stride equal to the // work-group's local range, which gives much better performance than using @@ -2484,13 +2633,17 @@ void reduction_parallel_for(handler &CGH, range Range, return reduction::strategy::range_basic; }(); - reduction_parallel_for(CGH, NDRange, Properties, - Redu, UpdatedKernelFunc); + reduction_parallel_for_impl( + CGH, CGH.MQueue, NDRange, ActiveItemsPerWG, ActiveItemsLastWG, + Properties, Redu, UpdatedKernelFunc); } else { return std::apply( [&](auto &...Reds) { - return reduction_parallel_for( - CGH, NDRange, Properties, Reds..., UpdatedKernelFunc); + return reduction_parallel_for_impl( + CGH, CGH.MQueue, NDRange, ActiveItemsPerWG, ActiveItemsLastWG, + Properties, Reds..., UpdatedKernelFunc); }, ReduTuple); }
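For reference, the per-work-group active-item selection that each strategy above repeats inside its kernel can be modelled in isolation as a plain host-side function. This is a minimal sketch; the helper name pickActiveItems and the values checked in main are illustrative only and are not part of this patch.

// Standalone model of the active-item selection added by this patch.
// Only the WorkSizeGuarantees enum mirrors the patch; everything else is
// hypothetical.
#include <cassert>
#include <cstddef>

enum class WorkSizeGuarantees { None, Equal, LessOrEqual };

// For a given work-group, return how many of its work-items actually carry a
// value that must take part in the tree reduction.
template <WorkSizeGuarantees WSG>
std::size_t pickActiveItems(std::size_t GroupId, std::size_t NumGroups,
                            std::size_t WGSize, std::size_t ActiveItemsPerWG,
                            std::size_t ActiveItemsLastWG) {
  if constexpr (WSG == WorkSizeGuarantees::Equal) {
    // Every work-item is guaranteed to contribute exactly one value, so the
    // whole work-group participates.
    return WGSize;
  } else {
    // Every work-group has ActiveItemsPerWG active items, except the last
    // one, which may have fewer.
    return GroupId == NumGroups - 1 ? ActiveItemsLastWG : ActiveItemsPerWG;
  }
}

int main() {
  // Example: 10 values spread over 4 groups of 4 work-items, 2 per group up
  // front, 4 in the last group.
  assert(pickActiveItems<WorkSizeGuarantees::LessOrEqual>(0, 4, 4, 2, 4) == 2);
  assert(pickActiveItems<WorkSizeGuarantees::LessOrEqual>(3, 4, 4, 2, 4) == 4);
  assert(pickActiveItems<WorkSizeGuarantees::Equal>(0, 4, 4, 4, 4) == 4);
  return 0;
}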
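The guarantee matters because the in-group tree reduction only ever reads the first ActiveItems slots of the local buffer, so inactive work-items never have to publish a value. A simplified host-side model of such a bounded pairwise tree reduction is shown below; treeReduce is a hypothetical stand-in and does not reproduce the exact loop structure of doTreeReductionHelper.

// Host-side model: reduce only the first ActiveItems slots of a buffer.
// Slots at index >= ActiveItems are never read.
#include <cassert>
#include <cstddef>
#include <functional>
#include <vector>

template <typename T, typename BinOp>
T treeReduce(std::vector<T> Local, std::size_t ActiveItems, BinOp Op) {
  // Pairwise halving: in each step, slot I is combined with slot I + Half,
  // where Half = ceil(Size / 2); an odd leftover element stays in place.
  for (std::size_t Size = ActiveItems; Size > 1; Size = (Size + 1) / 2) {
    std::size_t Half = (Size + 1) / 2;
    for (std::size_t I = 0; I + Half < Size; ++I)
      Local[I] = Op(Local[I], Local[I + Half]);
  }
  return Local[0];
}

int main() {
  // Eight slots of "local memory", but only the first five are active.
  std::vector<int> Slots = {1, 2, 3, 4, 5, 0, 0, 0};
  assert(treeReduce(Slots, 5, std::plus<int>()) == 15);
  return 0;
}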
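Finally, the range<> entry point derives the two counts with the std::min expressions shown above. A worked example, assuming NWorkItems is the total number of input elements (Range.size()); the concrete sizes are made up for illustration.

// Worked example of the ActiveItemsPerWG / ActiveItemsLastWG arithmetic.
#include <algorithm>
#include <cassert>
#include <cstddef>

int main() {
  std::size_t NWorkItems = 10;  // total elements to reduce (assumed meaning)
  std::size_t NWorkGroups = 4;
  std::size_t WGSize = 4;

  std::size_t PerGroup = NWorkItems / NWorkGroups;            // 10 / 4 = 2
  std::size_t ActiveItemsPerWG = std::min(PerGroup, WGSize);  // min(2, 4) = 2
  std::size_t ActiveItemsLastWG =
      std::min(NWorkItems - PerGroup * (NWorkGroups - 1), WGSize); // min(4, 4)

  assert(ActiveItemsPerWG == 2);
  assert(ActiveItemsLastWG == 4);
  // The first three groups feed 2 slots each into the tree reduction, the
  // last group feeds 4, covering all 10 elements.
  return 0;
}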