diff --git a/src/alpaka/AlpakaCore/alpakaConfigAcc.h b/src/alpaka/AlpakaCore/alpakaConfigAcc.h
index e0da1cf1b..65629e9ef 100644
--- a/src/alpaka/AlpakaCore/alpakaConfigAcc.h
+++ b/src/alpaka/AlpakaCore/alpakaConfigAcc.h
@@ -9,6 +9,7 @@ namespace alpaka_cuda_async {
   using namespace alpaka_common;
   using Acc1D = alpaka::AccGpuCudaRt<Dim1D, Idx>;
   using Acc2D = alpaka::AccGpuCudaRt<Dim2D, Idx>;
+  using Acc3D = alpaka::AccGpuCudaRt<Dim3D, Idx>;
   using Queue = alpaka::QueueCudaRtNonBlocking;
 }  // namespace alpaka_cuda_async
 
@@ -25,6 +26,7 @@ namespace alpaka_serial_sync {
   using namespace alpaka_common;
   using Acc1D = alpaka::AccCpuSerial<Dim1D, Idx>;
   using Acc2D = alpaka::AccCpuSerial<Dim2D, Idx>;
+  using Acc3D = alpaka::AccCpuSerial<Dim3D, Idx>;
   using Queue = alpaka::QueueCpuBlocking;
 }  // namespace alpaka_serial_sync
 
@@ -41,6 +43,7 @@ namespace alpaka_tbb_async {
   using namespace alpaka_common;
   using Acc1D = alpaka::AccCpuTbbBlocks<Dim1D, Idx>;
   using Acc2D = alpaka::AccCpuTbbBlocks<Dim2D, Idx>;
+  using Acc3D = alpaka::AccCpuTbbBlocks<Dim3D, Idx>;
   using Queue = alpaka::QueueCpuNonBlocking;
 }  // namespace alpaka_tbb_async
 
@@ -57,6 +60,7 @@ namespace alpaka_omp2_async {
   using namespace alpaka_common;
   using Acc1D = alpaka::AccCpuOmp2Blocks<Dim1D, Idx>;
   using Acc2D = alpaka::AccCpuOmp2Blocks<Dim2D, Idx>;
+  using Acc3D = alpaka::AccCpuOmp2Blocks<Dim3D, Idx>;
   using Queue = alpaka::QueueCpuNonBlocking;
 }  // namespace alpaka_omp2_async
 
@@ -73,6 +77,7 @@ namespace alpaka_omp4_async {
   using namespace alpaka_common;
   using Acc1D = alpaka::AccCpuOmp4<Dim1D, Idx>;
   using Acc2D = alpaka::AccCpuOmp4<Dim2D, Idx>;
+  using Acc3D = alpaka::AccCpuOmp4<Dim3D, Idx>;
   using Queue = alpaka::QueueCpuNonBlocking;
 }  // namespace alpaka_omp4_async
 
@@ -92,11 +97,17 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
   using Device = alpaka::Dev<Acc1D>;
   using Platform = alpaka::Pltf<Device>;
   static_assert(std::is_same_v<Device, alpaka::Dev<Acc2D>>,
-                STRINGIFY(alpaka::Dev<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>) " and " STRINGIFY(
-                    alpaka::Dev<::ALPAKA_ACCELERATOR_NAMESPACE::Acc2D>) " are different types.");
+                STRINGIFY(alpaka::Dev<Acc1D>) " and " STRINGIFY(
+                    alpaka::Dev<Acc2D>) " are different types.");
+  static_assert(std::is_same_v<Device, alpaka::Dev<Acc3D>>,
+                STRINGIFY(alpaka::Dev<Acc1D>) " and " STRINGIFY(
+                    alpaka::Dev<Acc3D>) " are different types.");
   static_assert(std::is_same_v<Platform, alpaka::Pltf<alpaka::Dev<Acc2D>>>,
-                STRINGIFY(alpaka::Pltf<alpaka::Dev<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>>) " and " STRINGIFY(
-                    alpaka::Pltf<alpaka::Dev<::ALPAKA_ACCELERATOR_NAMESPACE::Acc2D>>) " are different types.");
+                STRINGIFY(alpaka::Pltf<alpaka::Dev<Acc1D>>) " and " STRINGIFY(
+                    alpaka::Pltf<alpaka::Dev<Acc2D>>) " are different types.");
+  static_assert(std::is_same_v<Platform, alpaka::Pltf<alpaka::Dev<Acc3D>>>,
+                STRINGIFY(alpaka::Pltf<alpaka::Dev<Acc1D>>) " and " STRINGIFY(
+                    alpaka::Pltf<alpaka::Dev<Acc3D>>) " are different types.");
 
   using Event = alpaka::Event<Queue>;
 
diff --git a/src/alpaka/AlpakaCore/alpakaConfigCommon.h b/src/alpaka/AlpakaCore/alpakaConfigCommon.h
index c160ef243..47d9f7e92 100644
--- a/src/alpaka/AlpakaCore/alpakaConfigCommon.h
+++ b/src/alpaka/AlpakaCore/alpakaConfigCommon.h
@@ -12,16 +12,19 @@ namespace alpaka_common {
 
   using Dim1D = alpaka::DimInt<1u>;
   using Dim2D = alpaka::DimInt<2u>;
+  using Dim3D = alpaka::DimInt<3u>;
 
   template <typename TDim>
   using Vec = alpaka::Vec<TDim, Idx>;
   using Vec1D = Vec<Dim1D>;
   using Vec2D = Vec<Dim2D>;
+  using Vec3D = Vec<Dim3D>;
 
   template <typename TDim>
   using WorkDiv = alpaka::WorkDivMembers<TDim, Idx>;
   using WorkDiv1D = WorkDiv<Dim1D>;
   using WorkDiv2D = WorkDiv<Dim2D>;
+  using WorkDiv3D = WorkDiv<Dim3D>;
 
   template <typename TData>
   using AlpakaHostBuf = alpaka::Buf<DevHost, TData, Dim1D, Idx>;
 
diff --git a/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h b/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h
index 1bcec521a..472d07ee6 100644
--- a/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h
+++ b/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h
@@ -8,12 +8,12 @@ using namespace alpaka_common;
 namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
 
   /*********************************************
-   * WORKDIV CREATION
-   ********************************************/
+   * WORKDIV CREATION
+   ********************************************/
 
   /*
-   * Creates the accelerator-dependent workdiv.
-   */
+   * Creates the accelerator-dependent workdiv.
+   */
   template <typename TDim>
   WorkDiv<TDim> make_workdiv(const Vec<TDim>& blocksPerGrid, const Vec<TDim>& threadsPerBlockOrElementsPerThread) {
     // On the GPU:
@@ -32,25 +32,25 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
   }
 
   /*********************************************
-   * RANGE COMPUTATION
-   ********************************************/
+   * RANGE COMPUTATION
+   ********************************************/
 
   /*
-   * Computes the range of the elements indexes, local to the block.
-   * Warning: the max index is not truncated by the max number of elements of interest.
-   */
+   * Computes the range of the elements indexes, local to the block.
+   * Warning: the max index is not truncated by the max number of elements of interest.
+   */
   template <typename TAcc>
   ALPAKA_FN_ACC std::pair<Idx, Idx> element_index_range_in_block(const TAcc& acc,
                                                                  const Idx elementIdxShift,
                                                                  const unsigned int dimIndex = 0u) {
     // Take into account the thread index in block.
-    const Idx threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[dimIndex]);
+    const Idx threadIndex(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[dimIndex]);
     const Idx threadDimension(alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[dimIndex]);
 
     // Compute the elements indexes in block.
     // Obviously relevant for CPU only.
     // For GPU, threadDimension == 1, and elementIdx == firstElementIdx == threadIdx + elementIdxShift.
-    const Idx firstElementIdxLocal = threadIdxLocal * threadDimension;
+    const Idx firstElementIdxLocal = threadIndex * threadDimension;
     const Idx firstElementIdx = firstElementIdxLocal + elementIdxShift;  // Add the shift!
     const Idx endElementIdxUncut = firstElementIdx + threadDimension;
 
@@ -59,9 +59,9 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
   }
 
   /*
-   * Computes the range of the elements indexes, local to the block.
-   * Truncated by the max number of elements of interest.
-   */
+   * Computes the range of the elements indexes, local to the block.
+   * Truncated by the max number of elements of interest.
+   */
   template <typename TAcc>
   ALPAKA_FN_ACC std::pair<Idx, Idx> element_index_range_in_block_truncated(const TAcc& acc,
                                                                            const Idx maxNumberOfElements,
@@ -80,28 +80,28 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
   }
 
   /*
-   * Computes the range of the elements indexes in grid.
-   * Warning: the max index is not truncated by the max number of elements of interest.
-   */
+   * Computes the range of the elements indexes in grid.
+   * Warning: the max index is not truncated by the max number of elements of interest.
+   */
   template <typename TAcc>
   ALPAKA_FN_ACC std::pair<Idx, Idx> element_index_range_in_grid(const TAcc& acc,
                                                                 Idx elementIdxShift,
                                                                 const unsigned int dimIndex = 0u) {
     // Take into account the block index in grid.
-    const Idx blockIdxInGrid(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[dimIndex]);
+    const Idx blockIndex(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[dimIndex]);
     const Idx blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[dimIndex]);
 
     // Shift to get global indices in grid (instead of local to the block)
-    elementIdxShift += blockIdxInGrid * blockDimension;
+    elementIdxShift += blockIndex * blockDimension;
 
     // Return element indexes, shifted by elementIdxShift.
     return element_index_range_in_block(acc, elementIdxShift, dimIndex);
   }
 
   /*
-   * Computes the range of the elements indexes in grid.
-   * Truncated by the max number of elements of interest.
-   */
+   * Computes the range of the elements indexes in grid.
+   * Truncated by the max number of elements of interest.
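
For illustration, the new 3D aliases compose with make_workdiv exactly like the 1D/2D ones; a hypothetical host-side sketch (the launch dimensions are invented for the example):

    // Sketch: build a 3D work division from the Dim3D/Vec3D/WorkDiv3D aliases added above.
    // On the GPU backends the second argument means threads per block; on the
    // CPU backends it means elements per thread (see make_workdiv in this file).
    const Vec3D blocksPerGrid(Idx{4}, Idx{4}, Idx{2});
    const Vec3D threadsOrElements(Idx{8}, Idx{8}, Idx{1});
    const WorkDiv3D workDiv =
        cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(blocksPerGrid, threadsOrElements);
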
+ */ template ALPAKA_FN_ACC std::pair element_index_range_in_grid_truncated(const TAcc& acc, const Idx maxNumberOfElements, @@ -120,9 +120,9 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { } /* - * Computes the range of the element(s) index(es) in grid. - * Truncated by the max number of elements of interest. - */ + * Computes the range of the element(s) index(es) in grid. + * Truncated by the max number of elements of interest. + */ template ALPAKA_FN_ACC std::pair element_index_range_in_grid_truncated(const TAcc& acc, const Idx maxNumberOfElements, @@ -132,14 +132,14 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { } /********************************************* - * LOOP ON ALL ELEMENTS - ********************************************/ + * LOOP ON ALL ELEMENTS + ********************************************/ /* - * Loop on all (CPU) elements. - * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. - * Indexes are local to the BLOCK. - */ + * Loop on all (CPU) elements. + * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. + * Indexes are local to the BLOCK. + */ template ALPAKA_FN_ACC void for_each_element_in_block(const TAcc& acc, const Idx maxNumberOfElements, @@ -147,8 +147,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { const Func func, const unsigned int dimIndex = 0) { const auto& [firstElementIdx, endElementIdx] = - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::element_index_range_in_block_truncated( - acc, maxNumberOfElements, elementIdxShift, dimIndex); + element_index_range_in_block_truncated(acc, maxNumberOfElements, elementIdxShift, dimIndex); for (Idx elementIdx = firstElementIdx; elementIdx < endElementIdx; ++elementIdx) { func(elementIdx); @@ -156,23 +155,22 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { } /* - * Overload for elementIdxShift = 0 - */ + * Overload for elementIdxShift = 0 + */ template ALPAKA_FN_ACC void for_each_element_in_block(const TAcc& acc, const Idx maxNumberOfElements, const Func func, const unsigned int dimIndex = 0) { const Idx elementIdxShift = 0; - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block( - acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + for_each_element_in_block(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); } /* - * Loop on all (CPU) elements. - * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. - * Indexes are expressed in GRID 'frame-of-reference'. - */ + * Loop on all (CPU) elements. + * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. + * Indexes are expressed in GRID 'frame-of-reference'. + */ template ALPAKA_FN_ACC void for_each_element_in_grid(const TAcc& acc, const Idx maxNumberOfElements, @@ -180,36 +178,35 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { const Func func, const unsigned int dimIndex = 0) { // Take into account the block index in grid to compute the element indices. 
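
As a usage sketch for these loop helpers (the kernel functor and output buffer are invented names): the helper hides the per-thread loop over elements, so the kernel body reduces to a lambda over global element indices, truncated at n:

    struct init_to_zero {
      template <typename TAcc>
      ALPAKA_FN_ACC void operator()(const TAcc& acc, float* out, Idx n) const {
        // Indices are expressed in the grid frame of reference and truncated at n;
        // on GPU each thread sees exactly one index, on CPU a consecutive range.
        ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid(
            acc, n, [&](Idx i) { out[i] = 0.f; });
      }
    };
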
- const Idx blockIdxInGrid(alpaka::getIdx(acc)[dimIndex]); + const Idx blockIndex(alpaka::getIdx(acc)[dimIndex]); const Idx blockDimension(alpaka::getWorkDiv(acc)[dimIndex]); - elementIdxShift += blockIdxInGrid * blockDimension; + elementIdxShift += blockIndex * blockDimension; for_each_element_in_block(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); } /* - * Overload for elementIdxShift = 0 - */ + * Overload for elementIdxShift = 0 + */ template ALPAKA_FN_ACC void for_each_element_in_grid(const TAcc& acc, const Idx maxNumberOfElements, const Func func, const unsigned int dimIndex = 0) { const Idx elementIdxShift = 0; - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid( - acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + for_each_element_in_grid(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); } /************************************************************** - * LOOP ON ALL ELEMENTS, WITH STRIDED ACCESS - **************************************************************/ + * LOOP ON ALL ELEMENTS, WITH STRIDED ACCESS + **************************************************************/ /* - * (CPU) Loop on all elements + (CPU/GPU) Strided access. - * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. - * Stride to full problem size, by BLOCK size. - * Indexes are local to the BLOCK. - */ + * (CPU) Loop on all elements + (CPU/GPU) Strided access. + * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. + * Stride to full problem size, by BLOCK size. + * Indexes are local to the BLOCK. + */ template ALPAKA_FN_ACC void for_each_element_in_block_strided(const TAcc& acc, const Idx maxNumberOfElements, @@ -218,7 +215,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { const unsigned int dimIndex = 0) { // Get thread / element indices in block. const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::element_index_range_in_block(acc, elementIdxShift, dimIndex); + element_index_range_in_block(acc, elementIdxShift, dimIndex); // Stride = block size. const Idx blockDimension(alpaka::getWorkDiv(acc)[dimIndex]); @@ -238,24 +235,23 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { } /* - * Overload for elementIdxShift = 0 - */ + * Overload for elementIdxShift = 0 + */ template ALPAKA_FN_ACC void for_each_element_in_block_strided(const TAcc& acc, const Idx maxNumberOfElements, const Func func, const unsigned int dimIndex = 0) { const Idx elementIdxShift = 0; - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( - acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + for_each_element_in_block_strided(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); } /* - * (CPU) Loop on all elements + (CPU/GPU) Strided access. - * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. - * Stride to full problem size, by GRID size. - * Indexes are local to the GRID. - */ + * (CPU) Loop on all elements + (CPU/GPU) Strided access. + * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. + * Stride to full problem size, by GRID size. + * Indexes are local to the GRID. 
+ */ template ALPAKA_FN_ACC void for_each_element_in_grid_strided(const TAcc& acc, const Idx maxNumberOfElements, @@ -264,7 +260,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { const unsigned int dimIndex = 0) { // Get thread / element indices in block. const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::element_index_range_in_grid(acc, elementIdxShift, dimIndex); + element_index_range_in_grid(acc, elementIdxShift, dimIndex); // Stride = grid size. const Idx gridDimension(alpaka::getWorkDiv(acc)[dimIndex]); @@ -284,32 +280,31 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { } /* - * Overload for elementIdxShift = 0 - */ + * Overload for elementIdxShift = 0 + */ template ALPAKA_FN_ACC void for_each_element_in_grid_strided(const TAcc& acc, const Idx maxNumberOfElements, const Func func, const unsigned int dimIndex = 0) { const Idx elementIdxShift = 0; - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + for_each_element_in_grid_strided(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); } /************************************************************** - * LOOP ON ALL ELEMENTS WITH ONE LOOP - **************************************************************/ + * LOOP ON ALL ELEMENTS WITH ONE LOOP + **************************************************************/ /* - * Case where the input index i has reached the end of threadDimension: strides the input index. - * Otherwise: do nothing. - * NB 1: This helper function is used as a trick to only have one loop (like in legacy), instead of 2 loops - * (like in all the other Alpaka helpers, 'for_each_element_in_block_strided' for example, - * because of the additional loop over elements in Alpaka model). - * This allows to keep the 'continue' and 'break' statements as-is from legacy code, - * and hence avoids a lot of legacy code reshuffling. - * NB 2: Modifies i, firstElementIdx and endElementIdx. - */ + * Case where the input index i has reached the end of threadDimension: strides the input index. + * Otherwise: do nothing. + * NB 1: This helper function is used as a trick to only have one loop (like in legacy), instead of 2 loops + * (like in all the other Alpaka helpers, 'for_each_element_in_block_strided' for example, + * because of the additional loop over elements in Alpaka model). + * This allows to keep the 'continue' and 'break' statements as-is from legacy code, + * and hence avoids a lot of legacy code reshuffling. + * NB 2: Modifies i, firstElementIdx and endElementIdx. 
+   */
  ALPAKA_FN_ACC ALPAKA_FN_INLINE bool next_valid_element_index_strided(
      Idx& i, Idx& firstElementIdx, Idx& endElementIdx, const Idx stride, const Idx maxNumberOfElements) {
    bool isNextStrideElementValid = true;
@@ -324,6 +319,464 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
    return isNextStrideElementValid;
  }
 
+  /*
+   * Class which simplifies "for" loops over elements index
+   */
+  template <typename T_Acc, typename T = Idx>
+  class elements_with_stride {
+  public:
+    ALPAKA_FN_ACC elements_with_stride(const T_Acc& acc,
+                                       T extent,
+                                       Idx elementIdxShift = 0,
+                                       const unsigned int dimIndex = 0) {
+      const Idx threadIndex = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[dimIndex];
+      const Idx blockIndex = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[dimIndex];
+      const Idx blockDimension = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[dimIndex];
+      const Idx gridDimension = alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[dimIndex];
+
+      thread_ = blockDimension * blockIndex + threadIndex;
+      thread_ = thread_ + elementIdxShift;  // Add the shift
+      stride_ = gridDimension * blockDimension;
+      blockDim_ = blockDimension;
+
+      extent_ = extent;
+    }
+
+    // Default extent: the total number of elements in the grid.
+    // Delegates to the constructor above so that all members are initialised.
+    ALPAKA_FN_ACC elements_with_stride(const T_Acc& acc)
+        : elements_with_stride(acc, alpaka::getWorkDiv<alpaka::Grid, alpaka::Elems>(acc)[0u]) {}
+
+    class iterator {
+      friend class elements_with_stride;
+
+    public:
+      ALPAKA_FN_ACC constexpr T operator*() const { return index_; }
+
+      ALPAKA_FN_ACC constexpr iterator& operator++() {
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+        // CUDA backend: iterate over all the elements with a grid-wise stride
+        index_ += stride_;
+        if (index_ < extent_) {
+          return *this;
+        }
+#else
+        // CPU backend: iterate over all the elements for one thread
+        index_ += 1;
+        if (index_ < old_index_ + blockDim_ and index_ < extent_) {
+          return *this;
+        }
+#endif
+        // the iterator has reached or overflown the end of the extent, clamp it to the extent
+        index_ = extent_;
+        return *this;
+      }
+
+      ALPAKA_FN_ACC constexpr iterator operator++(int) {
+        iterator old = *this;
+        ++(*this);
+        return old;
+      }
+
+      ALPAKA_FN_ACC constexpr bool operator==(iterator const& other) const { return (index_ == other.index_); }
+
+      ALPAKA_FN_ACC constexpr bool operator!=(iterator const& other) const { return index_ < other.index_; }
+
+    private:
+      ALPAKA_FN_ACC constexpr iterator(T thread, T stride, T extent, T blockDim)
+          : thread_{thread}, stride_{stride}, extent_{extent}, index_{thread_}, old_index_{index_}, blockDim_{blockDim} {}
+
+      ALPAKA_FN_ACC constexpr iterator(T thread, T stride, T extent, T index, T blockDim)
+          : thread_{thread}, stride_{stride}, extent_{extent}, index_{index}, old_index_{index_}, blockDim_{blockDim} {}
+
+      T thread_;
+      T stride_;
+      T extent_;
+      T index_;
+      T old_index_;
+      T blockDim_;
+    };
+
+    ALPAKA_FN_ACC constexpr iterator begin() const { return iterator(thread_, stride_, extent_, blockDim_); }
+
+    ALPAKA_FN_ACC constexpr iterator end() const { return iterator(thread_, stride_, extent_, extent_, blockDim_); }
+
+  private:
+    T thread_;
+    T stride_;
+    T extent_;
+    T blockDim_;
+  };
+
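With this class a kernel can iterate with a range-based for loop instead of the lambda-based helpers above; a minimal sketch (the functor name and buffer are invented, and the 1D/2D/3D variants defined below work the same way but yield Vec3D indices):

    struct vector_scale {
      template <typename T_Acc>
      ALPAKA_FN_ACC void operator()(const T_Acc& acc, float* data, uint32_t size) const {
        // Strided 1D loop: on GPU the stride is grid-wide, on CPU each thread
        // walks its own consecutive elements; both cases come from elements_with_stride.
        for (uint32_t i : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, size)) {
          data[i] *= 2.f;
        }
      }
    };

+  /*
+   * Class which simplifies "for" loops over elements index
+   * Iterates over one dimension
+   */
+  template <typename T_Acc>
+  class elements_with_stride_1d {
+  public:
+    ALPAKA_FN_ACC elements_with_stride_1d(const T_Acc& acc) {
+      const Vec3D threadIndex(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc));
+      const Vec3D blockIndex(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc));
+
+      const Vec3D blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc));
+      const Vec3D gridDimension(alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc));
+
+      thread_ = {blockDimension[0u] * blockIndex[0u] + threadIndex[0u],
+                 blockDimension[1u] * blockIndex[1u] + threadIndex[1u],
+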
blockDimension[2u] * blockIndex[2u] + threadIndex[2u]}; + stride_ = {blockDimension[0u] * gridDimension[0u], Idx{1}, Idx{1}}; + extent_ = stride_; + + blockDim_ = blockDimension; + } + + ALPAKA_FN_ACC elements_with_stride_1d(const T_Acc& acc, Vec3D extent, Vec3D elementIdxShift = Vec3D::all(0)) + : extent_(extent + elementIdxShift) { + const Vec3D threadIndex(alpaka::getIdx(acc)); + const Vec3D blockIndex(alpaka::getIdx(acc)); + const Vec3D blockDimension(alpaka::getWorkDiv(acc)); + const Vec3D gridDimension(alpaka::getWorkDiv(acc)); + + thread_ = {blockDimension[0u] * blockIndex[0u] + threadIndex[0u], + blockDimension[1u] * blockIndex[1u] + threadIndex[1u], + blockDimension[2u] * blockIndex[2u] + threadIndex[2u]}; + thread_ = thread_ + elementIdxShift; + stride_ = {blockDimension[0u] * gridDimension[0u], Idx{1}, Idx{1}}; + + blockDim_ = blockDimension; + } + + class iterator { + friend class elements_with_stride_1d; + + public: + ALPAKA_FN_ACC Vec3D operator*() const { return index_; } + + ALPAKA_FN_ACC constexpr iterator& operator++() { +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + // increment the first coordinate + index_[0u] += stride_[0u]; + if (index_[0u] < extent_[0u]) + return *this; +#else + // increment the 3rd index and check its value + index_[2u] += 1; + if (index_[2u] == old_index_[2u] + blockDim_[2u]) + index_[2u] = old_index_[2u]; + + // if the 3rd index was reset, increment the 2nd index + if (index_[2u] == old_index_[2u]) + index_[1u] += 1; + if (index_[1u] == old_index_[1u] + blockDim_[1u]) + index_[1u] = old_index_[1u]; + + // if the 3rd and 2nd indices were set, increment the first coordinate + if (index_[1u] == old_index_[1u] and index_[2u] == old_index_[2u]) + index_[0u] += 1; + + if (index_[0u] < old_index_[0u] + blockDim_[0u] and index_[0u] < extent_[0u]) { + return *this; + } +#endif + + // the iterator has reached or overflown the end of the extent, clamp it + // to the extent + index_ = extent_; + return *this; + } + + ALPAKA_FN_ACC constexpr iterator operator++(int) { + iterator old = *this; + ++(*this); + return old; + } + + ALPAKA_FN_ACC constexpr bool operator==(iterator const& other) const { return (index_ == other.index_); } + + ALPAKA_FN_ACC constexpr bool operator!=(iterator const& other) const { return index_[0u] < other.index_[0u]; } + + private: + ALPAKA_FN_ACC iterator(Vec3D thread, Vec3D stride, Vec3D extent, Vec3D blockDim) + : thread_{thread}, stride_{stride}, extent_{extent}, index_{thread_}, old_index_{index_}, blockDim_{blockDim} {} + + ALPAKA_FN_ACC iterator(Vec3D thread, Vec3D stride, Vec3D extent, Vec3D index, Vec3D blockDim) + : thread_{thread}, stride_{stride}, extent_{extent}, index_{index}, old_index_{index_}, blockDim_{blockDim} {} + + Vec3D thread_; + Vec3D stride_; + Vec3D extent_; + Vec3D index_; + Vec3D old_index_; + Vec3D blockDim_; + }; + + ALPAKA_FN_ACC constexpr iterator begin() const { return iterator(thread_, stride_, extent_, blockDim_); } + + ALPAKA_FN_ACC constexpr iterator end() const { return iterator(thread_, stride_, extent_, extent_, blockDim_); } + + private: + Vec3D thread_ = Vec3D::all(1); + Vec3D stride_ = Vec3D::all(1); + Vec3D extent_ = Vec3D::all(1); + Vec3D blockDim_ = Vec3D::all(1); + }; + + /* + * Class which simplifies "for" loops over elements index + * Iterates over two dimensions + */ + template + class elements_with_stride_2d { + public: + ALPAKA_FN_ACC elements_with_stride_2d(const T_Acc& acc) { + const Vec3D threadIndex(alpaka::getIdx(acc)); + const Vec3D blockIndex(alpaka::getIdx(acc)); + const Vec3D 
blockDimension(alpaka::getWorkDiv(acc)); + const Vec3D gridDimension(alpaka::getWorkDiv(acc)); + + thread_ = {blockDimension[0u] * blockIndex[0u] + threadIndex[0u], + blockDimension[1u] * blockIndex[1u] + threadIndex[1u], + blockDimension[2u] * blockIndex[2u] + threadIndex[2u]}; + stride_ = {blockDimension[0u] * gridDimension[0u], blockDimension[1u] * gridDimension[1u], Idx{1}}; + extent_ = stride_; + + blockDim_ = blockDimension; + } + + ALPAKA_FN_ACC elements_with_stride_2d(const T_Acc& acc, Vec3D extent, Vec3D elementIdxShift = Vec3D::all(0)) + : extent_(extent + elementIdxShift) { + const Vec3D threadIndex(alpaka::getIdx(acc)); + const Vec3D blockIndex(alpaka::getIdx(acc)); + const Vec3D blockDimension(alpaka::getWorkDiv(acc)); + const Vec3D gridDimension(alpaka::getWorkDiv(acc)); + + thread_ = {blockDimension[0u] * blockIndex[0u] + threadIndex[0u], + blockDimension[1u] * blockIndex[1u] + threadIndex[1u], + blockDimension[2u] * blockIndex[2u] + threadIndex[2u]}; + thread_ = thread_ + elementIdxShift; + stride_ = {blockDimension[0u] * gridDimension[0u], blockDimension[1u] * gridDimension[1u], Idx{1}}; + + blockDim_ = blockDimension; + } + + class iterator { + friend class elements_with_stride_2d; + + public: + ALPAKA_FN_ACC Vec3D operator*() const { return index_; } + + ALPAKA_FN_ACC constexpr iterator& operator++() { +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + // increment the first coordinate + index_[0u] += stride_[0u]; + if (index_[0u] < extent_[0u]) + return *this; + + // if the first coordinate overflows, reset it and increment the second + // coordinate + index_[0u] = thread_[0u]; + index_[1u] += stride_[1u]; + if (index_[1u] < extent_[1u]) + return *this; +#else + // increment the 3rd index and check its value + index_[2u] += 1; + if (index_[2u] == old_index_[2u] + blockDim_[2u]) + index_[2u] = old_index_[2u]; + + // if the 3rd index was reset, increment the 2nd index + if (index_[2u] == old_index_[2u]) + index_[1u] += 1; + if (index_[1u] == old_index_[1u] + blockDim_[1u] || index_[1u] == extent_[1u]) + index_[1u] = old_index_[1u]; + + // if the 3rd and 2nd indices were set, increment the first coordinate + if (index_[1u] == old_index_[1u] and index_[2u] == old_index_[2u]) + index_[0u] += 1; + + if (index_[0u] < old_index_[0u] + blockDim_[0u] and index_[0u] < extent_[0u] and index_[1u] < extent_[1u]) { + return *this; + } +#endif + + // the iterator has reached or overflown the end of the extent, clamp it + // to the extent + index_ = extent_; + return *this; + } + + ALPAKA_FN_ACC constexpr iterator operator++(int) { + iterator old = *this; + ++(*this); + return old; + } + + ALPAKA_FN_ACC constexpr bool operator==(iterator const& other) const { return (index_ == other.index_); } + + ALPAKA_FN_ACC constexpr bool operator!=(iterator const& other) const { + return (index_[0u] < other.index_[0u] and index_[1u] < other.index_[1u]); + } + + private: + ALPAKA_FN_ACC iterator(Vec3D thread, Vec3D stride, Vec3D extent, Vec3D blockDim) + : thread_{thread}, stride_{stride}, extent_{extent}, index_{thread_}, old_index_{index_}, blockDim_{blockDim} {} + + ALPAKA_FN_ACC iterator(Vec3D thread, Vec3D stride, Vec3D extent, Vec3D index, Vec3D blockDim) + : thread_{thread}, stride_{stride}, extent_{extent}, index_{index}, old_index_{index_}, blockDim_{blockDim} {} + + Vec3D thread_; + Vec3D stride_; + Vec3D extent_; + Vec3D index_; + Vec3D old_index_; + Vec3D blockDim_; + }; + + ALPAKA_FN_ACC constexpr iterator begin() const { return iterator(thread_, stride_, extent_, blockDim_); } + + ALPAKA_FN_ACC 
constexpr iterator end() const { return iterator(thread_, stride_, extent_, extent_, blockDim_); } + + private: + Vec3D thread_ = Vec3D::all(1); + Vec3D stride_ = Vec3D::all(1); + Vec3D extent_ = Vec3D::all(1); + Vec3D blockDim_ = Vec3D::all(1); + }; + + /* + * Class which simplifies "for" loops over elements index + * Iterates over three dimensions + */ + template + class elements_with_stride_3d { + public: + ALPAKA_FN_ACC elements_with_stride_3d(const T_Acc& acc) { + const Vec3D threadIndex(alpaka::getIdx(acc)); + const Vec3D blockIndex(alpaka::getIdx(acc)); + const Vec3D blockDimension(alpaka::getWorkDiv(acc)); + const Vec3D gridDimension(alpaka::getWorkDiv(acc)); + + thread_ = {blockDimension[0u] * blockIndex[0u] + threadIndex[0u], + blockDimension[1u] * blockIndex[1u] + threadIndex[1u], + blockDimension[2u] * blockIndex[2u] + threadIndex[2u]}; + stride_ = {blockDimension[0u] * gridDimension[0u], + blockDimension[1u] * gridDimension[1u], + blockDimension[2u] * gridDimension[2u]}; + extent_ = stride_; + + blockDim_ = blockDimension; + } + + ALPAKA_FN_ACC elements_with_stride_3d(const T_Acc& acc, Vec3D extent, Vec3D elementIdxShift = Vec3D::all(0)) + : extent_(extent + elementIdxShift) { + const Vec3D threadIndex(alpaka::getIdx(acc)); + const Vec3D blockIndex(alpaka::getIdx(acc)); + const Vec3D blockDimension(alpaka::getWorkDiv(acc)); + const Vec3D gridDimension(alpaka::getWorkDiv(acc)); + + thread_ = {blockDimension[0u] * blockIndex[0u] + threadIndex[0u], + blockDimension[1u] * blockIndex[1u] + threadIndex[1u], + blockDimension[2u] * blockIndex[2u] + threadIndex[2u]}; + thread_ = thread_ + elementIdxShift; + stride_ = {blockDimension[0u] * gridDimension[0u], + blockDimension[1u] * gridDimension[1u], + blockDimension[2u] * gridDimension[2u]}; + + blockDim_ = blockDimension; + } + + class iterator { + friend class elements_with_stride_3d; + + public: + ALPAKA_FN_ACC Vec3D operator*() const { return index_; } + + ALPAKA_FN_ACC constexpr iterator& operator++() { +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + // increment the first coordinate + index_[0u] += stride_[0u]; + if (index_[0u] < extent_[0u]) + return *this; + + // if the first coordinate overflows, reset it and increment the second + // coordinate + index_[0u] = thread_[0u]; + index_[1u] += stride_[1u]; + if (index_[1u] < extent_[1u]) + return *this; + + // if the second coordinate overflows, reset it and increment the third + // coordinate + index_[1u] = thread_[1u]; + index_[2u] += stride_[2u]; + if (index_[2u] < extent_[2u]) + return *this; +#else + // increment the 3rd index and check its value + index_[2u] += 1; + if (index_[2u] == old_index_[2u] + blockDim_[2u] || index_[2u] == extent_[2u]) + index_[2u] = old_index_[2u]; + + // if the 3rd index was reset, increment the 2nd index + if (index_[2u] == old_index_[2u]) + index_[1u] += 1; + if (index_[1u] == old_index_[1u] + blockDim_[1u] || index_[1u] == extent_[1u]) + index_[1u] = old_index_[1u]; + + // if the 3rd and 2nd indices were set, increment the first coordinate + if (index_[1u] == old_index_[1u] and index_[2u] == old_index_[2u]) + index_[0u] += 1; + if (index_[0u] < old_index_[0u] + blockDim_[0u] and index_[0u] < extent_[0u] and index_[1u] < extent_[1u] and + index_[2u] < extent_[2u]) { + return *this; + } +#endif + + // the iterator has reached or overflown the end of the extent, clamp it + // to the extent + index_ = extent_; + return *this; + } + + ALPAKA_FN_ACC constexpr iterator operator++(int) { + iterator old = *this; + ++(*this); + return old; + } + + ALPAKA_FN_ACC 
constexpr bool operator==(iterator const& other) const { return (index_ == other.index_); } + + ALPAKA_FN_ACC constexpr bool operator!=(iterator const& other) const { + return (index_[0u] < other.index_[0u] and index_[1u] < other.index_[1u] and index_[2u] < other.index_[2u]); + } + + private: + ALPAKA_FN_ACC iterator(Vec3D thread, Vec3D stride, Vec3D extent, Vec3D blockDim) + : thread_{thread}, stride_{stride}, extent_{extent}, index_{thread_}, old_index_{index_}, blockDim_{blockDim} {} + + ALPAKA_FN_ACC iterator(Vec3D thread, Vec3D stride, Vec3D extent, Vec3D index, Vec3D blockDim) + : thread_{thread}, stride_{stride}, extent_{extent}, index_{index}, old_index_{index_}, blockDim_{blockDim} {} + + Vec3D thread_; + Vec3D stride_; + Vec3D extent_; + Vec3D index_; + Vec3D old_index_; + Vec3D blockDim_; + }; + + ALPAKA_FN_ACC constexpr iterator begin() const { return iterator(thread_, stride_, extent_, blockDim_); } + + ALPAKA_FN_ACC constexpr iterator end() const { return iterator(thread_, stride_, extent_, extent_, blockDim_); } + + private: + Vec3D thread_ = Vec3D::all(1); + Vec3D stride_ = Vec3D::all(1); + Vec3D extent_ = Vec3D::all(1); + Vec3D blockDim_ = Vec3D::all(1); + }; + } // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE #endif // AlpakaCore_alpakaWorkDivHelper_h diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernelsImpl.h b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernelsImpl.h index 0295fb783..51a4ca715 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernelsImpl.h +++ b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernelsImpl.h @@ -72,14 +72,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } const auto ntNbins = foundNtuplets->nbins(); - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, ntNbins, [&](uint32_t idx) { - if (foundNtuplets->size(idx) > 5) - printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); - ALPAKA_ASSERT_OFFLOAD(foundNtuplets->size(idx) < 6); - for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) - ALPAKA_ASSERT_OFFLOAD(*ih < nHits); - }); + + for (uint32_t idx : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, ntNbins)) { + if (foundNtuplets->size(idx) > 5) + printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); + ALPAKA_ASSERT_OFFLOAD(foundNtuplets->size(idx) < 6); + for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) + ALPAKA_ASSERT_OFFLOAD(*ih < nHits); + } #endif if (0 == threadIdx) { @@ -94,25 +94,24 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } const auto ntNCells = (*nCells); - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, ntNCells, [&](uint32_t idx) { - auto const &thisCell = cells[idx]; - if (thisCell.outerNeighbors().full()) //++tooManyNeighbors[thisCell.theLayerPairId]; - printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.theLayerPairId); - if (thisCell.tracks().full()) //++tooManyTracks[thisCell.theLayerPairId]; - printf("Tracks overflow %d in %d\n", idx, thisCell.theLayerPairId); - if (thisCell.theDoubletId < 0) - alpaka::atomicAdd(acc, &c.nKilledCells, 1ull, alpaka::hierarchy::Blocks{}); - if (0 == thisCell.theUsed) - alpaka::atomicAdd(acc, &c.nEmptyCells, 1ull, alpaka::hierarchy::Blocks{}); - if (thisCell.tracks().empty()) - alpaka::atomicAdd(acc, &c.nZeroTrackCells, 1ull, alpaka::hierarchy::Blocks{}); - }); - - 
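
The refactoring applied throughout this file follows one mechanical pattern, shown schematically below (the helper `process` and extent `nItems` are invented; the real hunks use the fully qualified names): the callback-style strided helper becomes a plain range-based loop, so legacy `continue`/`break` logic can be kept as-is.

    template <typename TAcc>
    ALPAKA_FN_ACC void example(const TAcc& acc, float* data, uint32_t nItems) {
      auto process = [&](uint32_t i) { data[i] += 1.f; };

      // before: callback style, one lambda invocation per element
      ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided(
          acc, nItems, [&](uint32_t i) { process(i); });

      // after: iterator style over the same strided element range
      for (uint32_t i : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, nItems)) {
        process(i);
      }
    }
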
::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided(acc, nHits, [&](uint32_t idx) { + for (uint32_t idx : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, ntNCells)) { + auto const &thisCell = cells[idx]; + if (thisCell.outerNeighbors().full()) //++tooManyNeighbors[thisCell.theLayerPairId]; + printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.theLayerPairId); + if (thisCell.tracks().full()) //++tooManyTracks[thisCell.theLayerPairId]; + printf("Tracks overflow %d in %d\n", idx, thisCell.theLayerPairId); + if (thisCell.theDoubletId < 0) + alpaka::atomicAdd(acc, &c.nKilledCells, 1ull, alpaka::hierarchy::Blocks{}); + if (0 == thisCell.theUsed) + alpaka::atomicAdd(acc, &c.nEmptyCells, 1ull, alpaka::hierarchy::Blocks{}); + if (thisCell.tracks().empty()) + alpaka::atomicAdd(acc, &c.nZeroTrackCells, 1ull, alpaka::hierarchy::Blocks{}); + } + + for (uint32_t idx : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, nHits)) { if (isOuterHitOfCell[idx].full()) // ++tooManyOuterHitOfCell; printf("OuterHitOfCell overflow %d\n", idx); - }); + } } }; @@ -125,15 +124,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { constexpr auto bad = trackQuality::bad; const auto ntNCells = (*nCells); - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, ntNCells, [&](uint32_t idx) { - auto const &thisCell = cells[idx]; - if (thisCell.theDoubletId < 0) { - for (auto it : thisCell.tracks()) - quality[it] = bad; - } - }); + for (uint32_t idx : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, ntNCells)) { + auto const &thisCell = cells[idx]; + + if (thisCell.theDoubletId < 0) { + for (auto it : thisCell.tracks()) + quality[it] = bad; + } + } } }; @@ -150,28 +149,28 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_ASSERT_OFFLOAD(nCells); const auto ntNCells = (*nCells); - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, ntNCells, [&](uint32_t idx) { - auto const &thisCell = cells[idx]; - if (thisCell.tracks().size() >= 2) { - //if (0==thisCell.theUsed) continue; - // if (thisCell.theDoubletId < 0) continue; + for (uint32_t idx : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, ntNCells)) { + auto const &thisCell = cells[idx]; - uint32_t maxNh = 0; + if (thisCell.tracks().size() >= 2) { + //if (0==thisCell.theUsed) continue; + // if (thisCell.theDoubletId < 0) continue; - // find maxNh - for (auto it : thisCell.tracks()) { - auto nh = foundNtuplets->size(it); - maxNh = std::max(nh, maxNh); - } + uint32_t maxNh = 0; - for (auto it : thisCell.tracks()) { - if (foundNtuplets->size(it) != maxNh) - quality[it] = dup; //no race: simple assignment of the same constant - } - } - }); + // find maxNh + for (auto it : thisCell.tracks()) { + auto nh = foundNtuplets->size(it); + maxNh = std::max(nh, maxNh); + } + + for (auto it : thisCell.tracks()) { + if (foundNtuplets->size(it) != maxNh) + quality[it] = dup; //no race: simple assignment of the same constant + } + } + } } }; @@ -188,34 +187,33 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_ASSERT_OFFLOAD(nCells); - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, (*nCells), [&](uint32_t idx) { - auto const &thisCell = cells[idx]; - if (thisCell.tracks().size() >= 2) { - // if (thisCell.theDoubletId < 0) continue; - - float mc = 10000.f; - uint16_t im = 60000; - - auto score = [&](auto it) { - return 
std::abs(tracks->tip(it)); // tip - // return tracks->chi2(it); //chi2 - }; - - // find min socre - for (auto it : thisCell.tracks()) { - if (tracks->quality(it) == loose && score(it) < mc) { - mc = score(it); - im = it; - } - } - // mark all other duplicates - for (auto it : thisCell.tracks()) { - if (tracks->quality(it) != bad && it != im) - tracks->quality(it) = dup; //no race: simple assignment of the same constant - } + for (uint32_t idx : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, *nCells)) { + auto const &thisCell = cells[idx]; + if (thisCell.tracks().size() >= 2) { + // if (thisCell.theDoubletId < 0) continue; + + float mc = 10000.f; + uint16_t im = 60000; + + auto score = [&](auto it) { + return std::abs(tracks->tip(it)); // tip + // return tracks->chi2(it); //chi2 + }; + + // find min socre + for (auto it : thisCell.tracks()) { + if (tracks->quality(it) == loose && score(it) < mc) { + mc = score(it); + im = it; } - }); + } + // mark all other duplicates + for (auto it : thisCell.tracks()) { + if (tracks->quality(it) != bad && it != im) + tracks->quality(it) = dup; //no race: simple assignment of the same constant + } + } + } } }; @@ -237,6 +235,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float dcaCutOuterTriplet) const { auto const &hh = *hhp; + const Idx elementShift = 0; const uint32_t dimIndexY = 0u; const uint32_t dimIndexX = 1u; const uint32_t threadIdxY(alpaka::getIdx(acc)[dimIndexY]); @@ -247,63 +246,59 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { (*apc2) = 0; } // ready for next kernel - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, - (*nCells), - 0u, - [&](uint32_t idx) { - auto cellIndex = idx; - auto &thisCell = cells[idx]; - //if (thisCell.theDoubletId < 0 || thisCell.theUsed>1) - // continue; - auto innerHitId = thisCell.get_inner_hit_id(); - int numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size(); - const auto *__restrict__ vi = isOuterHitOfCell[innerHitId].data(); - - constexpr uint32_t last_bpix1_detIndex = 96; - constexpr uint32_t last_barrel_detIndex = 1184; - auto ri = thisCell.get_inner_r(hh); - auto zi = thisCell.get_inner_z(hh); - - auto ro = thisCell.get_outer_r(hh); - auto zo = thisCell.get_outer_z(hh); - auto isBarrel = thisCell.get_inner_detIndex(hh) < last_barrel_detIndex; - - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( - acc, - numberOfPossibleNeighbors, - 0u, - [&](uint32_t j) { - auto otherCell = vi[j]; // NB: Was with __ldg in legacy - auto &oc = cells[otherCell]; - // if (cells[otherCell].theDoubletId < 0 || - // cells[otherCell].theUsed>1 ) - // continue; - auto r1 = oc.get_inner_r(hh); - auto z1 = oc.get_inner_z(hh); - // auto isBarrel = oc.get_outer_detIndex(hh) < last_barrel_detIndex; - bool aligned = GPUCACell::areAlignedRZ( - r1, - z1, - ri, - zi, - ro, - zo, - ptmin, - isBarrel ? CAThetaCutBarrel : CAThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts - if (aligned && thisCell.dcaCut(hh, - oc, - oc.get_inner_detIndex(hh) < last_bpix1_detIndex ? 
dcaCutInnerTriplet - : dcaCutOuterTriplet, - hardCurvCut)) { // FIXME tune cuts - oc.addOuterNeighbor(acc, cellIndex, *cellNeighbors); - thisCell.theUsed |= 1; - oc.theUsed |= 1; - } - }, - dimIndexX); // loop on inner cells - }, - dimIndexY); // loop on outer cells + for (uint32_t idx : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride( + acc, *nCells, elementShift, dimIndexY)) { + auto cellIndex = idx; + auto &thisCell = cells[idx]; + //if (thisCell.theDoubletId < 0 || thisCell.theUsed>1) + // continue; + auto innerHitId = thisCell.get_inner_hit_id(); + int numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size(); + const auto *__restrict__ vi = isOuterHitOfCell[innerHitId].data(); + + constexpr uint32_t last_bpix1_detIndex = 96; + constexpr uint32_t last_barrel_detIndex = 1184; + auto ri = thisCell.get_inner_r(hh); + auto zi = thisCell.get_inner_z(hh); + + auto ro = thisCell.get_outer_r(hh); + auto zo = thisCell.get_outer_z(hh); + auto isBarrel = thisCell.get_inner_detIndex(hh) < last_barrel_detIndex; + + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, + numberOfPossibleNeighbors, + 0u, + [&](uint32_t j) { + auto otherCell = vi[j]; // NB: Was with __ldg in legacy + auto &oc = cells[otherCell]; + // if (cells[otherCell].theDoubletId < 0 || + // cells[otherCell].theUsed>1 ) + // continue; + auto r1 = oc.get_inner_r(hh); + auto z1 = oc.get_inner_z(hh); + // auto isBarrel = oc.get_outer_detIndex(hh) < last_barrel_detIndex; + bool aligned = GPUCACell::areAlignedRZ( + r1, + z1, + ri, + zi, + ro, + zo, + ptmin, + isBarrel ? CAThetaCutBarrel : CAThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts + if (aligned && thisCell.dcaCut(hh, + oc, + oc.get_inner_detIndex(hh) < last_bpix1_detIndex ? dcaCutInnerTriplet + : dcaCutOuterTriplet, + hardCurvCut)) { // FIXME tune cuts + oc.addOuterNeighbor(acc, cellIndex, *cellNeighbors); + thisCell.theUsed |= 1; + oc.theUsed |= 1; + } + }, + dimIndexX); // loop on inner cells + } // loop on outer cells } }; @@ -323,23 +318,22 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { //auto first = threadIdx.x + blockIdx.x * blockDim.x; //for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, (*nCells), [&](uint32_t idx) { - auto const &thisCell = cells[idx]; - if (thisCell.theDoubletId >= 0) { // cut by earlyFishbone - - auto pid = thisCell.theLayerPairId; - auto doit = minHitsPerNtuplet > 3 ? pid < 3 : pid < 8 || pid > 12; - if (doit) { - GPUCACell::TmpTuple stack; - stack.reset(); - thisCell.find_ntuplets<6>( - acc, hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3); - ALPAKA_ASSERT_OFFLOAD(stack.empty()); - // printf("in %d found quadruplets: %d\n", cellIndex, apc->get()); - } - } - }); + for (uint32_t idx : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, *nCells)) { + auto const &thisCell = cells[idx]; + if (thisCell.theDoubletId >= 0) { // cut by earlyFishbone + + auto pid = thisCell.theLayerPairId; + auto doit = minHitsPerNtuplet > 3 ? 
pid < 3 : pid < 8 || pid > 12; + if (doit) { + GPUCACell::TmpTuple stack; + stack.reset(); + thisCell.find_ntuplets<6>( + acc, hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3); + ALPAKA_ASSERT_OFFLOAD(stack.empty()); + // printf("in %d found quadruplets: %d\n", cellIndex, apc->get()); + } + } + } } }; @@ -350,12 +344,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { GPUCACell *__restrict__ cells, uint32_t const *nCells) const { // auto const &hh = *hhp; - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, (*nCells), [&](uint32_t idx) { - auto &thisCell = cells[idx]; - if (!thisCell.tracks().empty()) - thisCell.theUsed |= 2; - }); + + for (uint32_t idx : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, *nCells)) { + auto &thisCell = cells[idx]; + if (!thisCell.tracks().empty()) + thisCell.theUsed |= 2; + } } }; @@ -365,17 +359,17 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { HitContainer const *__restrict__ foundNtuplets, Quality const *__restrict__ quality, CAConstants::TupleMultiplicity *tupleMultiplicity) const { - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, foundNtuplets->nbins(), [&](uint32_t it) { - auto nhits = foundNtuplets->size(it); - if (nhits >= 3 && quality[it] != trackQuality::dup) { - ALPAKA_ASSERT_OFFLOAD(quality[it] == trackQuality::bad); - if (nhits > 5) - printf("wrong mult %d %d\n", it, nhits); - ALPAKA_ASSERT_OFFLOAD(nhits < 8); - tupleMultiplicity->countDirect(acc, nhits); - } - }); + for (uint32_t it : + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, foundNtuplets->nbins())) { + auto nhits = foundNtuplets->size(it); + if (nhits >= 3 && quality[it] != trackQuality::dup) { + ALPAKA_ASSERT_OFFLOAD(quality[it] == trackQuality::bad); + if (nhits > 5) + printf("wrong mult %d %d\n", it, nhits); + ALPAKA_ASSERT_OFFLOAD(nhits < 8); + tupleMultiplicity->countDirect(acc, nhits); + } + } } }; @@ -385,17 +379,17 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { HitContainer const *__restrict__ foundNtuplets, Quality const *__restrict__ quality, CAConstants::TupleMultiplicity *tupleMultiplicity) const { - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, foundNtuplets->nbins(), [&](uint32_t it) { - auto nhits = foundNtuplets->size(it); - if (nhits >= 3 && quality[it] != trackQuality::dup) { - ALPAKA_ASSERT_OFFLOAD(quality[it] == trackQuality::bad); - if (nhits > 5) - printf("wrong mult %d %d\n", it, nhits); - ALPAKA_ASSERT_OFFLOAD(nhits < 8); - tupleMultiplicity->fillDirect(acc, nhits, it); - } - }); + for (uint32_t it : + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, foundNtuplets->nbins())) { + auto nhits = foundNtuplets->size(it); + if (nhits >= 3 && quality[it] != trackQuality::dup) { + ALPAKA_ASSERT_OFFLOAD(quality[it] == trackQuality::bad); + if (nhits > 5) + printf("wrong mult %d %d\n", it, nhits); + ALPAKA_ASSERT_OFFLOAD(nhits < 8); + tupleMultiplicity->fillDirect(acc, nhits, it); + } + } } }; @@ -406,64 +400,63 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { TkSoA const *__restrict__ tracks, CAHitNtupletGeneratorKernels::QualityCuts cuts, Quality *__restrict__ quality) const { - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, tuples->nbins(), [&](uint32_t it) { - auto nhits = tuples->size(it); - if (nhits == 0) - return; // guard - - // if duplicate: not even fit - // mark doublets as bad - if 
(quality[it] != trackQuality::dup && nhits >= 3) { - ALPAKA_ASSERT_OFFLOAD(quality[it] == trackQuality::bad); - - // if the fit has any invalid parameters, mark it as bad - bool isNaN = false; - for (int i = 0; i < 5; ++i) { - isNaN |= std::isnan(tracks->stateAtBS.state(it)(i)); - } - if (!isNaN) { + for (uint32_t it : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, tuples->nbins())) { + auto nhits = tuples->size(it); + if (nhits == 0) + return; // guard + + // if duplicate: not even fit + // mark doublets as bad + if (quality[it] != trackQuality::dup && nhits >= 3) { + ALPAKA_ASSERT_OFFLOAD(quality[it] == trackQuality::bad); + + // if the fit has any invalid parameters, mark it as bad + bool isNaN = false; + for (int i = 0; i < 5; ++i) { + isNaN |= std::isnan(tracks->stateAtBS.state(it)(i)); + } + if (!isNaN) { #ifdef NTUPLE_DEBUG - printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks->chi2(it)); + printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks->chi2(it)); #endif - // compute a pT-dependent chi2 cut - // default parameters: - // - chi2MaxPt = 10 GeV - // - chi2Coeff = { 0.68177776, 0.74609577, -0.08035491, 0.00315399 } - // - chi2Scale = 30 for broken line fit, 45 for Riemann fit - // (see CAHitNtupletGeneratorGPU.cc) - float pt = std::min(tracks->pt(it), cuts.chi2MaxPt); - float chi2Cut = - cuts.chi2Scale * - (cuts.chi2Coeff[0] + pt * (cuts.chi2Coeff[1] + pt * (cuts.chi2Coeff[2] + pt * cuts.chi2Coeff[3]))); - // above number were for Quads not normalized so for the time being just multiple by ndof for Quads (triplets to be understood) - if (3.f * tracks->chi2(it) < chi2Cut) { + // compute a pT-dependent chi2 cut + // default parameters: + // - chi2MaxPt = 10 GeV + // - chi2Coeff = { 0.68177776, 0.74609577, -0.08035491, 0.00315399 } + // - chi2Scale = 30 for broken line fit, 45 for Riemann fit + // (see CAHitNtupletGeneratorGPU.cc) + float pt = std::min(tracks->pt(it), cuts.chi2MaxPt); + float chi2Cut = + cuts.chi2Scale * + (cuts.chi2Coeff[0] + pt * (cuts.chi2Coeff[1] + pt * (cuts.chi2Coeff[2] + pt * cuts.chi2Coeff[3]))); + // above number were for Quads not normalized so for the time being just multiple by ndof for Quads (triplets to be understood) + if (3.f * tracks->chi2(it) < chi2Cut) { #ifdef NTUPLE_DEBUG - printf("Bad fit %d size %d pt %f eta %f chi2 %f\n", - it, - tuples->size(it), - tracks->pt(it), - tracks->eta(it), - 3.f * tracks->chi2(it)); + printf("Bad fit %d size %d pt %f eta %f chi2 %f\n", + it, + tuples->size(it), + tracks->pt(it), + tracks->eta(it), + 3.f * tracks->chi2(it)); #endif - // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) - // default cuts: - // - for triplets: |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm - // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm - // (see CAHitNtupletGeneratorGPU.cc) - auto const ®ion = (nhits > 3) ? cuts.quadruplet : cuts.triplet; - bool isOk = (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and - (std::abs(tracks->zip(it)) < region.maxZip); - - if (isOk) - quality[it] = trackQuality::loose; - - } // chi2Cut - } // !isNaN - } // trackQuality and nhits - }); + // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) + // default cuts: + // - for triplets: |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm + // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm + // (see CAHitNtupletGeneratorGPU.cc) + auto const ®ion = (nhits > 3) ? 
cuts.quadruplet : cuts.triplet; + bool isOk = (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and + (std::abs(tracks->zip(it)) < region.maxZip); + + if (isOk) + quality[it] = trackQuality::loose; + + } // chi2Cut + } // !isNaN + } // trackQuality and nhits + } } }; @@ -473,14 +466,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { HitContainer const *__restrict__ tuples, Quality const *__restrict__ quality, CAHitNtupletGeneratorKernels::Counters *counters) const { - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, tuples->nbins(), [&](uint32_t idx) { - if (tuples->size(idx) == 0) - return; //guard - if (quality[idx] == trackQuality::loose) { - alpaka::atomicAdd(acc, &(counters->nGoodTracks), 1ull, alpaka::hierarchy::Blocks{}); - } - }); + for (uint32_t idx : + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, tuples->nbins())) { + if (tuples->size(idx) == 0) + return; //guard + if (quality[idx] == trackQuality::loose) { + alpaka::atomicAdd(acc, &(counters->nGoodTracks), 1ull, alpaka::hierarchy::Blocks{}); + } + } } }; @@ -490,15 +483,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { HitContainer const *__restrict__ tuples, Quality const *__restrict__ quality, CAHitNtupletGeneratorKernels::HitToTuple *hitToTuple) const { - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, tuples->nbins(), [&](uint32_t idx) { - if (tuples->size(idx) == 0) - return; // guard - if (quality[idx] == trackQuality::loose) { - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) - hitToTuple->countDirect(acc, *h); - } - }); + for (uint32_t idx : + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, tuples->nbins())) { + if (tuples->size(idx) == 0) + return; // guard + if (quality[idx] == trackQuality::loose) { + for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + hitToTuple->countDirect(acc, *h); + } + } } }; @@ -508,15 +501,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { HitContainer const *__restrict__ tuples, Quality const *__restrict__ quality, CAHitNtupletGeneratorKernels::HitToTuple *hitToTuple) const { - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, tuples->nbins(), [&](uint32_t idx) { - if (tuples->size(idx) == 0) - return; // guard - if (quality[idx] == trackQuality::loose) { - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) - hitToTuple->fillDirect(acc, *h, idx); - } - }); + for (uint32_t idx : + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, tuples->nbins())) { + if (tuples->size(idx) == 0) + return; // guard + if (quality[idx] == trackQuality::loose) { + for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + hitToTuple->fillDirect(acc, *h, idx); + } + } } }; @@ -527,18 +520,19 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { TrackingRecHit2DSOAView const *__restrict__ hhp, HitContainer *__restrict__ hitDetIndices) const { // copy offsets - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, tuples->totbins(), [&](uint32_t idx) { hitDetIndices->off[idx] = tuples->off[idx]; }); + for (uint32_t idx : + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, tuples->totbins())) { + hitDetIndices->off[idx] = tuples->off[idx]; + } // fill hit indices auto const &hh = *hhp; #ifndef NDEBUG auto nhits = hh.nHits(); #endif - 
::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, tuples->size(), [&](uint32_t idx) { - ALPAKA_ASSERT_OFFLOAD(tuples->bins[idx] < nhits); - hitDetIndices->bins[idx] = hh.detectorIndex(tuples->bins[idx]); - }); + for (uint32_t idx : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, tuples->size())) { + ALPAKA_ASSERT_OFFLOAD(tuples->bins[idx] < nhits); + hitDetIndices->bins[idx] = hh.detectorIndex(tuples->bins[idx]); + } } }; @@ -548,14 +542,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { CAHitNtupletGeneratorKernels::HitToTuple const *__restrict__ hitToTuple, CAHitNtupletGeneratorKernels::Counters *counters) const { auto &c = *counters; - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, hitToTuple->nbins(), [&](uint32_t idx) { - if (hitToTuple->size(idx) != 0) { // SHALL NOT BE break - alpaka::atomicAdd(acc, &c.nUsedHits, 1ull, alpaka::hierarchy::Blocks{}); - if (hitToTuple->size(idx) > 1) - alpaka::atomicAdd(acc, &c.nDupHits, 1ull, alpaka::hierarchy::Blocks{}); - } - }); + + for (uint32_t idx : + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, hitToTuple->nbins())) { + if (hitToTuple->size(idx) != 0) { // SHALL NOT BE break + alpaka::atomicAdd(acc, &c.nUsedHits, 1ull, alpaka::hierarchy::Blocks{}); + if (hitToTuple->size(idx) > 1) + alpaka::atomicAdd(acc, &c.nDupHits, 1ull, alpaka::hierarchy::Blocks{}); + } + } } }; @@ -578,45 +573,45 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // auto const & hh = *hhp; // auto l1end = hh.hitsLayerStart_d[1]; - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, phitToTuple->nbins(), [&](uint32_t idx) { - if (hitToTuple.size(idx) >= 2) { - float mc = 10000.f; - uint16_t im = 60000; - uint32_t maxNh = 0; - - // find maxNh - for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - uint32_t nh = foundNtuplets.size(*it); - maxNh = std::max(nh, maxNh); - } - // kill all tracks shorter than maxHn (only triplets???) - for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - uint32_t nh = foundNtuplets.size(*it); - if (maxNh != nh) - quality[*it] = dup; + for (uint32_t idx : + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, phitToTuple->nbins())) { + if (hitToTuple.size(idx) >= 2) { + float mc = 10000.f; + uint16_t im = 60000; + uint32_t maxNh = 0; + + // find maxNh + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + uint32_t nh = foundNtuplets.size(*it); + maxNh = std::max(nh, maxNh); + } + // kill all tracks shorter than maxHn (only triplets???) + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + uint32_t nh = foundNtuplets.size(*it); + if (maxNh != nh) + quality[*it] = dup; + } + + if (maxNh <= 3) { + // if (idx>=l1end) continue; // only for layer 1 + // for triplets choose best tip! + for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (quality[it] != bad && std::abs(tracks.tip(it)) < mc) { + mc = std::abs(tracks.tip(it)); + im = it; } + } + // mark duplicates + for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (quality[it] != bad && it != im) + quality[it] = dup; //no race: simple assignment of the same constant + } - if (maxNh <= 3) { - // if (idx>=l1end) continue; // only for layer 1 - // for triplets choose best tip! 
-              for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
-                auto const it = *ip;
-                if (quality[it] != bad && std::abs(tracks.tip(it)) < mc) {
-                  mc = std::abs(tracks.tip(it));
-                  im = it;
-                }
-              }
-              // mark duplicates
-              for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
-                auto const it = *ip;
-                if (quality[it] != bad && it != im)
-                  quality[it] = dup;  //no race: simple assignment of the same constant
-              }
-
-            }  // maxNh
-          }    // hitToTuple.size
-        });    // loop over hits
+          }  // maxNh
+        }    // hitToTuple.size
+      }      // loop over hits
     }
   };
 
@@ -633,7 +628,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
       auto const &foundNtuplets = *ptuples;
       auto const &tracks = *ptracks;
       const auto np = std::min(maxPrint, foundNtuplets.nbins());
-      ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided(acc, np, [&](uint32_t i) {
+
+      for (uint32_t i : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, np)) {
         auto nh = foundNtuplets.size(i);
         if (nh >= 3) {
           printf("TK: %d %d %d %f %f %f %f %f %f %f %d %d %d %d %d\n",
@@ -654,7 +650,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
                  nh > 3 ? int(*(foundNtuplets.begin(i) + 3)) : -1,
                  nh > 4 ? int(*(foundNtuplets.begin(i) + 4)) : -1);
         }  // nh
-      });
+      }
     }
   };
 
diff --git a/src/alpaka/test/alpaka/atomtest.cc b/src/alpaka/test/alpaka/atomtest.cc
new file mode 100644
index 000000000..678143272
--- /dev/null
+++ b/src/alpaka/test/alpaka/atomtest.cc
@@ -0,0 +1,205 @@
+#include <chrono>
+#include <iostream>
+
+#include "AlpakaCore/alpakaConfig.h"
+#include "AlpakaCore/alpakaWorkDivHelper.h"
+
+using namespace ALPAKA_ACCELERATOR_NAMESPACE;
+
+template <typename Data>
+struct shared_block {
+  template <typename T_Acc, typename T>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
+    auto threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
+    auto blockIdxInGrid(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
+    Data b = 1.0;
+    Data c = -1.0;
+
+    auto& s = alpaka::declareSharedVar<Data, __COUNTER__>(acc);
+
+    if (threadIdxLocal == 0) {
+      s = 0;
+    }
+
+    syncBlockThreads(acc);
+
+    for ([[maybe_unused]] T index :
+         ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, elements)) {
+      for (int i = 0; i < 200000; i++) {
+        alpaka::atomicAdd(acc, &s, b, alpaka::hierarchy::Blocks{});
+        alpaka::atomicAdd(acc, &s, c, alpaka::hierarchy::Blocks{});
+      }
+      alpaka::atomicAdd(acc, &s, b, alpaka::hierarchy::Blocks{});
+    }
+
+    syncBlockThreads(acc);
+
+    if (threadIdxLocal == 0) {
+      vec[blockIdxInGrid] = s;
+    }
+  }
+};
+
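+// The hierarchy tag passed to alpaka::atomicAdd is a visibility hint:
+// Threads < Blocks < Grids. shared_block above only needs the update to be
+// coherent within one block (hierarchy::Blocks), while the global-memory
+// kernels below ask for grid-wide coherence (hierarchy::Grids).
+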
+template <typename Data>
+struct global_block {
+  template <typename T_Acc, typename T>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
+    auto blockIdxInGrid(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
+    Data b = 1.0;
+    Data c = -1.0;
+
+    for ([[maybe_unused]] T index :
+         ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, elements)) {
+      for (int i = 0; i < 200000; i++) {
+        alpaka::atomicAdd(acc, &vec[blockIdxInGrid], b, alpaka::hierarchy::Grids{});
+        alpaka::atomicAdd(acc, &vec[blockIdxInGrid], c, alpaka::hierarchy::Grids{});
+      }
+      alpaka::atomicAdd(acc, &vec[blockIdxInGrid], b, alpaka::hierarchy::Grids{});
+    }
+  }
+};
+
+template <typename Data>
+struct global_grid {
+  template <typename T_Acc, typename T>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
+    Data b = 1.0;
+    Data c = -1.0;
+
+    for ([[maybe_unused]] T index :
+         ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, elements)) {
+      for (int i = 0; i < 200000; i++) {
+        alpaka::atomicAdd(acc, &vec[0], b, alpaka::hierarchy::Grids{});  // alpaka::hierarchy::Blocks/Threads/Grids
+        alpaka::atomicAdd(acc, &vec[0], c, alpaka::hierarchy::Grids{});
+      }
+      alpaka::atomicAdd(acc, &vec[0], b, alpaka::hierarchy::Grids{});
+    }
+  }
+};
+
+template <typename Data>
+struct shared_grid {
+  template <typename T_Acc, typename T>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
+    auto threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
+    Data b = 1.0;
+    Data c = -1.0;
+
+    auto& s = alpaka::declareSharedVar<Data, __COUNTER__>(acc);
+
+    if (threadIdxLocal == 0) {
+      s = 0;
+    }
+
+    syncBlockThreads(acc);
+
+    for ([[maybe_unused]] T index :
+         ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, elements)) {
+      for (int i = 0; i < 200000; i++) {
+        alpaka::atomicAdd(acc, &s, b, alpaka::hierarchy::Blocks{});  // alpaka::hierarchy::Blocks/Threads/Grids
+        alpaka::atomicAdd(acc, &s, c, alpaka::hierarchy::Blocks{});
+      }
+      alpaka::atomicAdd(acc, &s, b, alpaka::hierarchy::Blocks{});
+    }
+
+    syncBlockThreads(acc);
+
+    if (threadIdxLocal == 0) {
+      alpaka::atomicAdd(acc, &vec[0], s, alpaka::hierarchy::Grids{});
+    }
+  }
+};
+
+int main(void) {
+  using Dim = alpaka::DimInt<1u>;
+  using Data = float;
+  const Idx num_items = 1 << 15;
+  Idx nThreadsInit = 256;
+  Idx nBlocksInit = (num_items + nThreadsInit - 1) / nThreadsInit;
+
+  const Device device_1(alpaka::getDevByIdx<Platform>(0u));
+  alpaka::Queue<Device, alpaka::NonBlocking> queue_1_0(device_1);
+  alpaka::Queue<Device, alpaka::NonBlocking> queue_1_1(device_1);
+
+  const Vec1D threadsPerBlockOrElementsPerThread1(Vec1D::all(nThreadsInit));
+  const Vec1D blocksPerGrid1(Vec1D::all(nBlocksInit));
+  auto workDivMultiBlockInit1 = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(
+      blocksPerGrid1, threadsPerBlockOrElementsPerThread1);
+
+  using DevHost = alpaka::DevCpu;
+  auto const devHost = alpaka::getDevByIdx<alpaka::PltfCpu>(0u);
+
+  using BufHost = alpaka::Buf<DevHost, Data, Dim, Idx>;
+  BufHost bufHostA(alpaka::allocBuf<Data, Idx>(devHost, num_items));
+  BufHost res(alpaka::allocBuf<Data, Idx>(devHost, num_items));
+
+  Data* const pBufHostA(alpaka::getPtrNative(bufHostA));
+  Data* const res_ptr(alpaka::getPtrNative(res));
+
+  for (Idx i = 0; i < num_items; i++) {
+    pBufHostA[i] = 0.0;
+  }
+
+  using BufAcc = alpaka::Buf<Device, Data, Dim, Idx>;
+  BufAcc order(alpaka::allocBuf<Data, Idx>(device_1, num_items));
+
+  printf("Threads/block:%d blocks/grid:%d\n", threadsPerBlockOrElementsPerThread1[0u], blocksPerGrid1[0u]);
+
+  // Run shared_block: per-block counter in shared memory
+  alpaka::memcpy(queue_1_0, order, bufHostA, num_items);
+  auto beginT = std::chrono::high_resolution_clock::now();
+  alpaka::enqueue(queue_1_0,
+                  alpaka::createTaskKernel<Acc1D>(
+                      workDivMultiBlockInit1, shared_block<Data>(), alpaka::getPtrNative(order), num_items));
+  alpaka::wait(queue_1_0);
+  auto endT = std::chrono::high_resolution_clock::now();
+  std::cout << "Shared Block: " << std::chrono::duration<double>(endT - beginT).count() << " s" << std::endl;
+  alpaka::memcpy(queue_1_0, res, order, num_items);
+  for (Idx i = 0; i < nBlocksInit; i++) {
+    if (res_ptr[i] != (Data)nThreadsInit)
+      std::cout << "[" << i << "]: " << res_ptr[i] << " != " << (Data)nThreadsInit << std::endl;
+  }
+
+  // Run global_block: per-block counter in global memory
+  alpaka::memcpy(queue_1_0, order, bufHostA, num_items);
+  beginT = std::chrono::high_resolution_clock::now();
+  alpaka::enqueue(queue_1_0,
+                  alpaka::createTaskKernel<Acc1D>(
+                      workDivMultiBlockInit1, global_block<Data>(), alpaka::getPtrNative(order), num_items));
+  alpaka::wait(queue_1_0);
+  endT = std::chrono::high_resolution_clock::now();
+  std::cout << "Global Block: " << std::chrono::duration<double>(endT - beginT).count() << " s" << std::endl;
+  alpaka::memcpy(queue_1_0, res, order, num_items);
+  for (Idx i = 0; i < nBlocksInit; i++) {
+    if (res_ptr[i] != (Data)nThreadsInit)
+      std::cout << "[" << i << "]: " << res_ptr[i] << " != " << (Data)nThreadsInit << std::endl;
+  }
+
+  // Run shared_grid: grid-wide counter accumulated via shared memory
+  alpaka::memcpy(queue_1_0, order, bufHostA, num_items);
+  beginT = std::chrono::high_resolution_clock::now();
+  alpaka::enqueue(queue_1_0,
+                  alpaka::createTaskKernel<Acc1D>(
+                      workDivMultiBlockInit1, shared_grid<Data>(), alpaka::getPtrNative(order), num_items));
+  alpaka::wait(queue_1_0);
+  endT = std::chrono::high_resolution_clock::now();
+  std::cout << "Shared Grid: " << std::chrono::duration<double>(endT - beginT).count() << " s" << std::endl;
+  alpaka::memcpy(queue_1_0, res, order, num_items);
+  if (res_ptr[0] != (Data)num_items)
+    std::cout << "[0]: " << res_ptr[0] << " != " << (Data)num_items << std::endl << std::endl;
+
+  // Run global_grid: grid-wide counter in global memory
+  alpaka::memcpy(queue_1_0, order, bufHostA, num_items);
+  beginT = std::chrono::high_resolution_clock::now();
+  alpaka::enqueue(queue_1_0,
+                  alpaka::createTaskKernel<Acc1D>(
+                      workDivMultiBlockInit1, global_grid<Data>(), alpaka::getPtrNative(order), num_items));
+  alpaka::wait(queue_1_0);
+  endT = std::chrono::high_resolution_clock::now();
+  std::cout << "Global Grid: " << std::chrono::duration<double>(endT - beginT).count() << " s" << std::endl;
+  alpaka::memcpy(queue_1_0, res, order, num_items);
+  if (res_ptr[0] != (Data)num_items)
+    std::cout << "[0]: " << res_ptr[0] << " != " << (Data)num_items << std::endl;
+
+  return 0;
+}
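[Note] The pass criteria follow directly from the kernel arithmetic: with num_items = 1 << 15 and 256 threads per block, every thread owns exactly one element and contributes a net +1 (200000 cancelling +1/-1 pairs plus one final +1). Each per-block counter must therefore end at 256 (nThreadsInit) and the grid-wide counters at 32768 (num_items); only the timings are expected to differ between the shared- and global-memory variants.
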
diff --git a/src/alpaka/test/alpaka/barrier_fence.cc b/src/alpaka/test/alpaka/barrier_fence.cc
new file mode 100644
index 000000000..3e42104ea
--- /dev/null
+++ b/src/alpaka/test/alpaka/barrier_fence.cc
@@ -0,0 +1,115 @@
+#include <chrono>
+#include <iostream>
+
+#include "AlpakaCore/alpakaConfig.h"
+#include "AlpakaCore/alpakaWorkDivHelper.h"
+#include "AlpakaCore/threadfence.h"
+
+using namespace ALPAKA_ACCELERATOR_NAMESPACE;
+
+template <typename Data>
+struct global_fence {
+  template <typename T_Acc, typename T>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
+    auto blockIdxInGrid(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
+    int no_blocks = 128;
+
+    // each round lets exactly one block extend the chain vec[b] = vec[b - 1] + 1;
+    // the threadfence makes the previous block's write visible grid-wide
+    for (int i = 0; i < no_blocks * no_blocks * 10; i++) {
+      if (i % no_blocks == (int)blockIdxInGrid) {
+        if (i % no_blocks > 0) {
+          vec[blockIdxInGrid] = vec[blockIdxInGrid - 1] + 1;
+        }
+      }
+      cms::alpakatools::threadfence(acc);
+    }
+  }
+};
+
+template <typename Data>
+struct shared_fence {
+  template <typename T_Acc, typename T>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
+    auto threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
+    auto blockIdxInGrid(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
+
+    auto& s = alpaka::declareSharedVar<Data[256], __COUNTER__>(acc);
+
+    // same chain, per block, in shared memory; note that s[0] is never
+    // written, so the check in main() relies on it reading as 0
+    for (int i = 0; i < 256 * 256 * 10; i++) {
+      if (i % 256 == (int)threadIdxLocal && threadIdxLocal > 0) {
+        s[threadIdxLocal] = s[threadIdxLocal - 1] + 1;
+      }
+      cms::alpakatools::threadfence(acc);
+    }
+
+    if (threadIdxLocal == 0) {
+      vec[blockIdxInGrid] = s[127] + s[129];
+    }
+  }
+};
+
+int main(void) {
+  using Dim = alpaka::DimInt<1u>;
+  using Data = float;
+  const Idx num_items = 1 << 15;
+  Idx nThreadsInit = 256;
+  Idx nBlocksInit = (num_items + nThreadsInit - 1) / nThreadsInit;
+
+  const Device device_1(alpaka::getDevByIdx<Platform>(0u));
+  alpaka::Queue<Device, alpaka::NonBlocking> queue_1_0(device_1);
+  alpaka::Queue<Device, alpaka::NonBlocking> queue_1_1(device_1);
+
+  const Vec1D threadsPerBlockOrElementsPerThread1(Vec1D::all(nThreadsInit));
+  const Vec1D blocksPerGrid1(Vec1D::all(nBlocksInit));
+  auto workDivMultiBlockInit1 = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(
+      blocksPerGrid1, threadsPerBlockOrElementsPerThread1);
+
+  using DevHost = alpaka::DevCpu;
+  auto const devHost = alpaka::getDevByIdx<alpaka::PltfCpu>(0u);
+
+  using BufHost = alpaka::Buf<DevHost, Data, Dim, Idx>;
+  BufHost bufHostA(alpaka::allocBuf<Data, Idx>(devHost, num_items));
+  BufHost res(alpaka::allocBuf<Data, Idx>(devHost, num_items));
+
+  Data* const pBufHostA(alpaka::getPtrNative(bufHostA));
+  Data* const res_ptr(alpaka::getPtrNative(res));
+
+  for (Idx i = 0; i < num_items; i++) {
+    pBufHostA[i] = 0.0;
+  }
+
+  using BufAcc = alpaka::Buf<Device, Data, Dim, Idx>;
+  BufAcc order(alpaka::allocBuf<Data, Idx>(device_1, num_items));
+
+  printf("Threads/block:%d blocks/grid:%d\n", threadsPerBlockOrElementsPerThread1[0u], blocksPerGrid1[0u]);
+
+  // Run shared_fence: chain through block-shared memory
+  alpaka::memcpy(queue_1_0, order, bufHostA, num_items);
+  auto beginT = std::chrono::high_resolution_clock::now();
+  alpaka::enqueue(queue_1_0,
+                  alpaka::createTaskKernel<Acc1D>(
+                      workDivMultiBlockInit1, shared_fence<Data>(), alpaka::getPtrNative(order), num_items));
+  alpaka::wait(queue_1_0);
+  auto endT = std::chrono::high_resolution_clock::now();
+  std::cout << "Shared time: " << std::chrono::duration<double>(endT - beginT).count() << " s" << std::endl;
+  alpaka::memcpy(queue_1_0, res, order, num_items);
+  for (int i = 0; i < 128; i++) {
+    if (res_ptr[i] != 256.0)
+      printf("Error: d[%d] != r (%f, %d)\n", i, res_ptr[i], i);
+  }
+
+  // Run global_fence: chain through global memory
+  alpaka::memcpy(queue_1_0, order, bufHostA, num_items);
+  beginT = std::chrono::high_resolution_clock::now();
+  alpaka::enqueue(queue_1_0,
+                  alpaka::createTaskKernel<Acc1D>(
+                      workDivMultiBlockInit1, global_fence<Data>(), alpaka::getPtrNative(order), num_items));
+  alpaka::wait(queue_1_0);
+  endT = std::chrono::high_resolution_clock::now();
+  std::cout << "Global time: " << std::chrono::duration<double>(endT - beginT).count() << " s" << std::endl;
+  alpaka::memcpy(queue_1_0, res, order, num_items);
+  for (int i = 0; i < 128; i++) {
+    if (res_ptr[i] != Data(i))
+      printf("Error: d[%d] != r (%f, %d)\n", i, res_ptr[i], i);
+  }
+
+  return 0;
+}
diff --git a/src/alpaka/test/alpaka/barrier_sync.cc b/src/alpaka/test/alpaka/barrier_sync.cc
new file mode 100644
index 000000000..96541209a
--- /dev/null
+++ b/src/alpaka/test/alpaka/barrier_sync.cc
@@ -0,0 +1,62 @@
+#include <chrono>
+#include <iostream>
+
+#include "AlpakaCore/alpakaConfig.h"
+#include "AlpakaCore/alpakaWorkDivHelper.h"
+
+using namespace ALPAKA_ACCELERATOR_NAMESPACE;
+
+template <typename Data>
+struct check_sync {
+  template <typename T_Acc, typename T>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
+    int n = (int)elements;
+
+    auto threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
+    // in every round exactly one thread of the block does work while the
+    // others wait at the barrier
+    for (int i = 0; i < n * n; i++) {
+      if (i % n == (int)threadIdxLocal) {
+        for (int j = i; j < 10000; j++) {
+          if (j % 2 == 0) {
+            // throw-away busy work
+            [[maybe_unused]] int sum = 0;
+            for (int k = 0; k < 1000; k++)
+              sum += k;
+          }
+        }
+      }
+      syncBlockThreads(acc);
+    }
+  }
+};
+
+int main(void) {
+  using Dim = alpaka::DimInt<1u>;
+  using Data = float;
+  const Idx num_items = 1 << 10;
+  Idx nThreadsInit = 1024;
+  Idx nBlocksInit = (num_items + nThreadsInit - 1) / nThreadsInit;
+
+  const Device device_1(alpaka::getDevByIdx<Platform>(0u));
+  alpaka::Queue<Device, alpaka::NonBlocking> queue_1_0(device_1);
+  alpaka::Queue<Device, alpaka::NonBlocking> queue_1_1(device_1);
+
+  const Vec1D threadsPerBlockOrElementsPerThread1(Vec1D::all(nThreadsInit));
+  const Vec1D blocksPerGrid1(Vec1D::all(nBlocksInit));
+  auto workDivMultiBlockInit1 = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(
+      blocksPerGrid1, threadsPerBlockOrElementsPerThread1);
+
+  using BufAcc = alpaka::Buf<Device, Data, Dim, Idx>;
+  BufAcc order(alpaka::allocBuf<Data, Idx>(device_1, num_items));
+
+  printf("Threads/block:%d blocks/grid:%d\n", threadsPerBlockOrElementsPerThread1[0u], blocksPerGrid1[0u]);
+
+  // Run function
+  auto beginT = std::chrono::high_resolution_clock::now();
+  alpaka::enqueue(queue_1_0,
+                  alpaka::createTaskKernel<Acc1D>(
+                      workDivMultiBlockInit1, check_sync<Data>(), alpaka::getPtrNative(order), nThreadsInit));
+  alpaka::wait(queue_1_0);
+  auto endT = std::chrono::high_resolution_clock::now();
+  std::cout << "Time: " << std::chrono::duration<double>(endT - beginT).count() << " s" << std::endl;
+
+  return 0;
+}
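[Note] These alpaka tests are line-for-line counterparts of the CUDA tests added below under src/cuda/test/. The correspondence between the portability-layer calls they exercise and the native CUDA primitives is:

    syncBlockThreads(acc);                                 // CUDA: __syncthreads()
    cms::alpakatools::threadfence(acc);                    // CUDA: __threadfence()
    alpaka::atomicAdd(acc, p, v, alpaka::hierarchy::X{});  // CUDA: atomicAdd(p, v)
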
diff --git a/src/alpaka/test/alpaka/elements_stride.cc b/src/alpaka/test/alpaka/elements_stride.cc
new file mode 100644
index 000000000..15301a2a6
--- /dev/null
+++ b/src/alpaka/test/alpaka/elements_stride.cc
@@ -0,0 +1,184 @@
+#include <cstdio>
+#include <iostream>
+
+#include "AlpakaCore/alpakaConfig.h"
+#include "AlpakaCore/alpakaWorkDivHelper.h"
+
+using namespace ALPAKA_ACCELERATOR_NAMESPACE;
+
+template <typename T>
+struct explicit_loop {
+  template <typename T_Acc>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, T elements) const {
+    const T threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
+    const T blockIdxInGrid(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
+
+    const T blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]);
+    const T gridDimension(alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
+
+    T thread = threadIdxLocal + blockDimension * blockIdxInGrid;
+    T stride = blockDimension * gridDimension;
+
+    for (T index(thread); index < elements; index += stride) {
+      printf("explicit_loop:%d\n", index);
+    }
+  }
+};
+
+template <typename T>
+struct range_loop {
+  template <typename T_Acc>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, T elements) const {
+    for (T index : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, elements)) {
+      printf("range:%d\n", index);
+    }
+  }
+};
+
+template <typename T>
+struct explicit_loop_1d {
+  template <typename T_Acc>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Vec3D elements) const {
+    auto threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc));
+    auto blockIdxInGrid(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc));
+
+    auto blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc));
+    auto gridDimension(alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc));
+
+    Vec3D thread({blockDimension[0u] * blockIdxInGrid[0u] + threadIdxLocal[0u], Idx{0}, Idx{0}});
+    Vec3D stride({blockDimension[0u] * gridDimension[0u], Idx{1}, Idx{1}});
+    Vec3D index(thread);
+
+    for (T i = index[0u]; i < elements[0u]; i += stride[0u]) {
+      printf("explicit_loop_1d:%d:%d:%d\n", i, index[1u], index[2u]);
+    }
+  }
+};
+
+template <typename T>
+struct range_loop_1d {
+  template <typename T_Acc>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Vec3D elements) const {
+    for (Vec3D index : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride_1d(acc, elements)) {
+      printf("range_1d:%d:%d:%d\n", index[0u], index[1u], index[2u]);
+    }
+  }
+};
+
+template <typename T>
+struct explicit_loop_2d {
+  template <typename T_Acc>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Vec3D elements) const {
+    auto threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc));
+    auto blockIdxInGrid(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc));
+
+    auto blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc));
+    auto gridDimension(alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc));
+
+    Vec3D thread({blockDimension[0u] * blockIdxInGrid[0u] + threadIdxLocal[0u],
+                  blockDimension[1u] * blockIdxInGrid[1u] + threadIdxLocal[1u],
+                  Idx{0}});
+    Vec3D stride({blockDimension[0u] * gridDimension[0u], blockDimension[1u] * gridDimension[1u], Idx{1}});
+    Vec3D index(thread);
+
+    // distinct loop variables, so that the printed indices actually vary
+    for (T j = index[1u]; j < elements[1u]; j += stride[1u]) {
+      for (T i = index[0u]; i < elements[0u]; i += stride[0u]) {
+        printf("explicit_loop_2d:%d:%d:%d\n", i, j, index[2u]);
+      }
+    }
+  }
+};
+
+template <typename T>
+struct range_loop_2d {
+  template <typename T_Acc>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Vec3D elements) const {
+    for (Vec3D index : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride_2d(acc, elements)) {
+      printf("range_2d:%d:%d:%d\n", index[0u], index[1u], index[2u]);
+    }
+  }
+};
+
+template <typename T>
+struct explicit_loop_3d {
+  template <typename T_Acc>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Vec3D elements) const {
+    auto threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc));
+    auto blockIdxInGrid(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc));
+
+    auto blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc));
+    auto gridDimension(alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc));
+
+    Vec3D thread({blockDimension[0u] * blockIdxInGrid[0u] + threadIdxLocal[0u],
+                  blockDimension[1u] * blockIdxInGrid[1u] + threadIdxLocal[1u],
+                  blockDimension[2u] * blockIdxInGrid[2u] + threadIdxLocal[2u]});
+    Vec3D stride({blockDimension[0u] * gridDimension[0u],
+                  blockDimension[1u] * gridDimension[1u],
+                  blockDimension[2u] * gridDimension[2u]});
+    Vec3D index(thread);
+
+    // distinct loop variables, so that the printed indices actually vary
+    for (T k = index[2u]; k < elements[2u]; k += stride[2u]) {
+      for (T j = index[1u]; j < elements[1u]; j += stride[1u]) {
+        for (T i = index[0u]; i < elements[0u]; i += stride[0u]) {
+          printf("explicit_loop_3d:%d:%d:%d\n", i, j, k);
+        }
+      }
+    }
+  }
+};
+
+template <typename T>
+struct range_loop_3d {
+  template <typename T_Acc>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Vec3D elements) const {
+    for (Vec3D index : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride_3d(acc, elements)) {
+      printf("range_3d:%d:%d:%d\n", index[0u], index[1u], index[2u]);
+    }
+  }
+};
+
+int main(void) {
+  const Idx num_items = 10;
+  Idx nThreadsInit = 7;
+  Vec3D elements(Vec3D::all(num_items));
+  Idx nBlocksInit = (num_items + nThreadsInit - 1) / nThreadsInit;
+
+  /*
+  // Case for elements_with_stride (only one dimension)
+  const Device device_1(alpaka::getDevByIdx<Platform>(0u));
+
+  alpaka::Queue<Device, alpaka::NonBlocking> queue_1_0(device_1);
+  alpaka::Queue<Device, alpaka::NonBlocking> queue_1_1(device_1);
+
+  const Vec1D threadsPerBlockOrElementsPerThread1(Vec1D::all(nThreadsInit));
+  const Vec1D blocksPerGrid1(Vec1D::all(nBlocksInit));
+  auto workDivMultiBlockInit1 = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(
+      blocksPerGrid1, threadsPerBlockOrElementsPerThread1);
+
+  printf("threads/block = %d and blocks/grid = %d\n", threadsPerBlockOrElementsPerThread1[0u], blocksPerGrid1[0u]);
+
+  alpaka::enqueue(queue_1_0,
+                  alpaka::createTaskKernel<Acc1D>(workDivMultiBlockInit1, explicit_loop<Idx>(), num_items));
+
+  alpaka::enqueue(queue_1_1,
+                  alpaka::createTaskKernel<Acc1D>(workDivMultiBlockInit1, range_loop<Idx>(), num_items));
+  */
+
+  // Case for elements_with_stride_Xd (3 dimensions)
+  const Device device3(alpaka::getDevByIdx<Platform>(0u));
+
+  alpaka::Queue<Device, alpaka::NonBlocking> queue_3_0(device3);
+  alpaka::Queue<Device, alpaka::NonBlocking> queue_3_1(device3);
+
+  const Vec3D threadsPerBlockOrElementsPerThread3(Vec3D::all(nThreadsInit));
+  const Vec3D blocksPerGrid3(Vec3D::all(nBlocksInit));
+  auto workDivMultiBlockInit3 = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(
+      blocksPerGrid3, threadsPerBlockOrElementsPerThread3);
+
+  printf("threads/block = %d and blocks/grid = %d\n", threadsPerBlockOrElementsPerThread3[0u], blocksPerGrid3[0u]);
+  alpaka::enqueue(queue_3_0,
+                  alpaka::createTaskKernel<Acc3D>(workDivMultiBlockInit3, explicit_loop_1d<Idx>(), elements));
+  alpaka::enqueue(queue_3_1,
+                  alpaka::createTaskKernel<Acc3D>(workDivMultiBlockInit3, range_loop_1d<Idx>(), elements));
+
+  // drain the asynchronous queues before exiting
+  alpaka::wait(queue_3_0);
+  alpaka::wait(queue_3_1);
+
+  return 0;
+}
diff --git a/src/cuda/test/atomtest.cu b/src/cuda/test/atomtest.cu
new file mode 100644
index 000000000..a2ee8fc4d
--- /dev/null
+++ b/src/cuda/test/atomtest.cu
@@ -0,0 +1,137 @@
+#include <cstdio>
+#include <cstdlib>
+#include <sys/time.h>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+
+using Data = float;
+
+__global__ void shared_block(Data *d, int n) {
+  __shared__ Data s;
+  int block = blockIdx.x;
+  if (threadIdx.x == 0) {
+    s = 0.0;
+  }
+  __syncthreads();
+
+  for (int i = 0; i < 200000; i++) {
+    atomicAdd(&s, 1.0f);
+    atomicAdd(&s, -1.0f);
+  }
+  atomicAdd(&s, 1.0f);
+
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    d[block] = s;
+  }
+}
+
+__global__ void global_block(Data *d, int n) {
+  int block = blockIdx.x;
+  for (int i = 0; i < 200000; i++) {
+    atomicAdd(&d[block], 1.0f);
+    atomicAdd(&d[block], -1.0f);
+  }
+  atomicAdd(&d[block], 1.0f);
+}
+
+__global__ void shared_grid(Data *d, int n) {
+  __shared__ Data var;
+  if (threadIdx.x == 0) {
+    var = 0.0;
+  }
+
+  __syncthreads();
+
+  for (int i = 0; i < 200000; i++) {
+    atomicAdd(&var, 1.0f);
+    atomicAdd(&var, -1.0f);
+  }
+  atomicAdd(&var, 1.0f);
+
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    atomicAdd(&d[0], var);
+  }
+}
+
+__global__ void global_grid(Data *d, int n) {
+  for (int i = 0; i < 200000; i++) {
+    atomicAdd(&d[0], 1.0f);
+    atomicAdd(&d[0], -1.0f);
+  }
+  atomicAdd(&d[0], 1.0f);
+}
+
+int main(void) {
+  const int n = 1 << 15;
+
+  const int thr = 256;
+  const int blocks = (n + thr - 1) / thr;
+  Data order[n], backup[n];
+
+  printf("Threads/block:%d blocks/grid:%d\n", thr, blocks);
+
+  for (int i = 0; i < n; i++)
+    order[i] = 0.0;
+
+  Data *d_d;
+  cudaMalloc(&d_d, n * sizeof(Data));
+  struct timeval t1, t2;
+
+  // run version with static shared memory
+  cudaMemcpy(d_d, order, n * sizeof(Data), cudaMemcpyHostToDevice);
+  gettimeofday(&t1, 0);
+  shared_block<<<blocks, thr>>>(d_d, n);
+  cudaDeviceSynchronize();
+  gettimeofday(&t2, 0);
+  double time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0 / 1000.0;
+  printf("Shared block: %f s \n", time);
+  cudaMemcpy(backup, d_d, n * sizeof(Data), cudaMemcpyDeviceToHost);
+  for (int i = 0; i < blocks; i++) {
+    if (backup[i] != (Data)thr)
+      printf("Error: d[%d] != r (%f, %d)\n", i, backup[i], thr);
+  }
+
+  // run version with global memory
+  cudaMemcpy(d_d, order, n * sizeof(Data), cudaMemcpyHostToDevice);
+  gettimeofday(&t1, 0);
+  global_block<<<blocks, thr>>>(d_d, n);
+  cudaDeviceSynchronize();
+  gettimeofday(&t2, 0);
+  time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0 / 1000.0;
+  printf("Global block: %f s \n", time);
+  cudaMemcpy(backup, d_d, n * sizeof(Data), cudaMemcpyDeviceToHost);
+  for (int i = 0; i < blocks; i++) {
+    if (backup[i] != (Data)thr)
+      printf("Error: d[%d] != r (%f, %d)\n", i, backup[i], thr);
+  }
+
+  // run grid-wide version accumulating through shared memory
+  cudaMemcpy(d_d, order, n * sizeof(Data), cudaMemcpyHostToDevice);
+  gettimeofday(&t1, 0);
+  shared_grid<<<blocks, thr>>>(d_d, n);
+  cudaDeviceSynchronize();
+  gettimeofday(&t2, 0);
+  time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0 / 1000.0;
+  printf("Shared grid: %f s \n", time);
+  cudaMemcpy(backup, d_d, n * sizeof(Data), cudaMemcpyDeviceToHost);
+  if (backup[0] != (Data)n)
+    printf("Error: d != r (%f, %d)\n", backup[0], n);
+
+  // run grid-wide version in global memory
+  cudaMemcpy(d_d, order, n * sizeof(Data), cudaMemcpyHostToDevice);
+  gettimeofday(&t1, 0);
+  global_grid<<<blocks, thr>>>(d_d, n);
+  cudaDeviceSynchronize();
+  gettimeofday(&t2, 0);
+  time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0 / 1000.0;
+  printf("Global grid: %f s \n", time);
+  cudaMemcpy(backup, d_d, 1 * sizeof(Data), cudaMemcpyDeviceToHost);
+  if (backup[0] != (Data)n)
+    printf("Error: d != r (%f, %d)\n", backup[0], n);
+
+  return 0;
+}
\ No newline at end of file
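[Note] The gettimeofday/cudaDeviceSynchronize bracket above measures host-side wall-clock time per launch. Device-side timing could equivalently be done with CUDA events (standard runtime API; a sketch only, not part of this patch):

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    global_grid<<<blocks, thr>>>(d_d, n);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.f;
    cudaEventElapsedTime(&ms, start, stop);  // elapsed time in milliseconds
    printf("Global grid (events): %f s\n", ms / 1000.f);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
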
diff --git a/src/cuda/test/barrier_fence.cu b/src/cuda/test/barrier_fence.cu
new file mode 100644
index 000000000..905dc0874
--- /dev/null
+++ b/src/cuda/test/barrier_fence.cu
@@ -0,0 +1,85 @@
+#include <cstdio>
+#include <cstdlib>
+#include <sys/time.h>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+#include <device_launch_parameters.h>
+
+using Data = float;
+
+__global__ void global_thread(Data *d, int n) {
+  int no_blocks = 128;
+
+  // each round lets exactly one block extend the chain d[b] = d[b - 1] + 1;
+  // __threadfence() makes the previous block's write visible grid-wide
+  for (int i = 0; i < no_blocks * no_blocks * 10; i++) {
+    if (i % no_blocks == (int)blockIdx.x) {
+      if (i % no_blocks > 0) {
+        d[blockIdx.x] = d[blockIdx.x - 1] + 1;
+      }
+    }
+    __threadfence();
+  }
+}
+
+__global__ void shared_fence(Data *d, int n) {
+  __shared__ Data s[256];
+  int block = blockIdx.x;
+  int no_threads = 256;
+
+  // same chain, per block, in shared memory; s[0] is never written, so the
+  // check in main() relies on it reading as 0
+  for (int i = 0; i < no_threads * no_threads * 10; i++) {
+    if (i % no_threads == (int)threadIdx.x && threadIdx.x > 0) {
+      s[threadIdx.x] = s[threadIdx.x - 1] + 1;
+    }
+    __threadfence();
+  }
+
+  if (threadIdx.x == 0) {
+    d[block] = s[127] + s[129];
+  }
+}
+
+int main(void) {
+  const int n = 1 << 15;
+
+  const int thr = 256;
+  const int blocks = (n + thr - 1) / thr;
+  Data order[n], backup[n];
+
+  printf("Threads/block:%d blocks/grid:%d\n", thr, blocks);
+
+  for (int i = 0; i < n; i++)
+    order[i] = 0.0;
+
+  Data *d_d;
+  cudaMalloc(&d_d, n * sizeof(Data));
+  struct timeval t1, t2;
+
+  // run version with shared memory
+  cudaMemcpy(d_d, order, n * sizeof(Data), cudaMemcpyHostToDevice);
+  gettimeofday(&t1, 0);
+  shared_fence<<<blocks, thr>>>(d_d, n);
+  cudaDeviceSynchronize();
+  gettimeofday(&t2, 0);
+  auto time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0 / 1000.0;
+  printf("Shared time: %f\n", time);
+  cudaMemcpy(backup, d_d, n * sizeof(Data), cudaMemcpyDeviceToHost);
+  for (int i = 0; i < 128; i++) {
+    if (backup[i] != 256.0)
+      printf("Error: d[%d] != r (%f, %d)\n", i, backup[i], n / 128);
+  }
+
+  // run version with global memory
+  cudaMemcpy(d_d, order, n * sizeof(Data), cudaMemcpyHostToDevice);
+  gettimeofday(&t1, 0);
+  global_thread<<<blocks, thr>>>(d_d, n);
+  cudaDeviceSynchronize();
+  gettimeofday(&t2, 0);
+  time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0 / 1000.0;
+  printf("Global time: %f\n", time);
+  cudaMemcpy(backup, d_d, n * sizeof(Data), cudaMemcpyDeviceToHost);
+  for (int i = 0; i < 128; i++) {
+    if (backup[i] != Data(i))
+      printf("Error: d[%d] != r (%f, %d)\n", i, backup[i], i);
+  }
+
+  return 0;
+}
\ No newline at end of file
diff --git a/src/cuda/test/barrier_sync.cu b/src/cuda/test/barrier_sync.cu
new file mode 100644
index 000000000..c3fba7d96
--- /dev/null
+++ b/src/cuda/test/barrier_sync.cu
@@ -0,0 +1,52 @@
+#include <cstdio>
+#include <cstdlib>
+#include <sys/time.h>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+
+using Data = float;
+
+__device__ void rand_func() {
+  // throw-away busy work
+  int sum = 0;
+  for (int i = 0; i < 1000; i++)
+    sum += i;
+  (void)sum;
+}
+
+__device__ void iterate(int id) {
+  for (int j = id; j < 10000; j++) {
+    if (j % 2 == 0) {
+      rand_func();
+    }
+  }
+}
+
+__global__ void check_sync(Data *d, int no_threads) {
+  // in every round exactly one thread of the block does work while the
+  // others wait at the barrier
+  for (int i = 0; i < no_threads * no_threads; i++) {
+    if ((i % no_threads) == (int)threadIdx.x) {
+      iterate(threadIdx.x);
+    }
+    __syncthreads();
+  }
+}
+
+int main(void) {
+  const int n = 1 << 10;
+
+  const int thr = 1024;
+  const int blocks = (n + thr - 1) / thr;
+
+  printf("Threads/block:%d blocks/grid:%d\n", thr, blocks);
+
+  Data *d_d;
+  cudaMalloc(&d_d, n * sizeof(Data));
+  struct timeval t1, t2;
+
+  gettimeofday(&t1, 0);
+  check_sync<<<blocks, thr>>>(d_d, n);
+  cudaDeviceSynchronize();
+  gettimeofday(&t2, 0);
+  auto time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0 / 1000.0;
+  printf("Time: %f s\n", time);
+
+  return 0;
+}
\ No newline at end of file