diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
index 7449bb153c9f7..39f19fe463745 100644
--- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
+++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
@@ -75,6 +75,19 @@ namespace cms::alpakatools {
     }
   }
 
+  /* ElementIndex
+   *
+   * an aggregate that contains the .global and .local indices of an element; returned by iterating over elements_in_block.
+   */
+
+  struct ElementIndex {
+    Idx global;
+    Idx local;
+  };
+
+  /* elements_with_stride
+   */
+
   template <typename TAcc,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
   class elements_with_stride {
   public:
@@ -326,6 +339,212 @@ namespace cms::alpakatools {
     const Vec extent_;
   };
 
+  /* blocks_with_stride
+   *
+   * `blocks_with_stride(acc, size)` returns a range that spans the (virtual) block indices required to cover the given
+   * problem size.
+   *
+   * For example, if size is 1000 and the block size is 16, it will return the range from 0 to 62.
+   * If the work division has more than 63 blocks, only the first 63 will perform one iteration of the loop, and the
+   * others will exit immediately.
+   * If the work division has fewer than 63 blocks, some of the blocks will perform more than one iteration, in order
+   * to cover the whole problem space.
+   *
+   * All threads in a block see the same loop iterations, while threads in different blocks may see a different number
+   * of iterations.
+   */
+
+  template <typename TAcc,
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
+  class blocks_with_stride {
+  public:
+    ALPAKA_FN_ACC inline blocks_with_stride(TAcc const& acc)
+        : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
+          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
+          extent_{stride_} {}
+
+    // extent is the total number of elements (not blocks)
+    ALPAKA_FN_ACC inline blocks_with_stride(TAcc const& acc, Idx extent)
+        : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
+          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
+          extent_{divide_up_by(extent, alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u])} {}
+
+    class iterator {
+      friend class blocks_with_stride;
+
+      ALPAKA_FN_ACC inline iterator(Idx stride, Idx extent, Idx first)
+          : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {}
+
+    public:
+      ALPAKA_FN_ACC inline Idx operator*() const { return first_; }
+
+      // pre-increment the iterator
+      ALPAKA_FN_ACC inline iterator& operator++() {
+        // increment the first-element-in-block index by the grid stride
+        first_ += stride_;
+        if (first_ < extent_)
+          return *this;
+
+        // the iterator has reached or passed the end of the extent, clamp it to the extent
+        first_ = extent_;
+        return *this;
+      }
+
+      // post-increment the iterator
+      ALPAKA_FN_ACC inline iterator operator++(int) {
+        iterator old = *this;
+        ++(*this);
+        return old;
+      }
+
+      ALPAKA_FN_ACC inline bool operator==(iterator const& other) const { return (first_ == other.first_); }
+
+      ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }
+
+    private:
+      // non-const to support iterator copy and assignment
+      Idx stride_;
+      Idx extent_;
+      // modified by the pre/post-increment operator
+      Idx first_;
+    };
+
+    ALPAKA_FN_ACC inline iterator begin() const { return iterator(stride_, extent_, first_); }
+
+    ALPAKA_FN_ACC inline iterator end() const { return iterator(stride_, extent_, extent_); }
+
+  private:
+    const Idx first_;
+    const Idx stride_;
+    const Idx extent_;
+  };
+
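+  /* Usage sketch, not part of the API: a grid-stride loop over the "virtual" blocks
+   * needed to cover the problem size; `acc` and `size` are assumed to be the kernel
+   * accelerator and the problem size.
+   *
+   *   for (auto block : cms::alpakatools::blocks_with_stride(acc, size)) {
+   *     // process the elements of this block, e.g. with elements_in_block (below)
+   *   }
+   */
+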
+  /* elements_in_block
+   *
+   * `elements_in_block(acc, block, size)` returns a range that spans all the elements within the given block.
+   * Iterating over the range yields values of type ElementIndex, which contain both the .global and .local indices
+   * of the corresponding element.
+   *
+   * If the work division has only one element per thread, the loop will perform at most one iteration.
+   * If the work division has more than one element per thread, the loop will perform that number of iterations,
+   * or fewer if it reaches size.
+   */
+
+  template <typename TAcc,
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
+  class elements_in_block {
+  public:
+    ALPAKA_FN_ACC inline elements_in_block(TAcc const& acc, Idx block)
+        : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]},
+          local_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u] *
+                 alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
+          range_{local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]} {}
+
+    ALPAKA_FN_ACC inline elements_in_block(TAcc const& acc, Idx block, Idx extent)
+        : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]},
+          local_{std::min(extent - first_,
+                          alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u] *
+                              alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u])},
+          range_{std::min(extent - first_, local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u])} {}
+
+    class iterator {
+      friend class elements_in_block;
+
+      ALPAKA_FN_ACC inline iterator(Idx local, Idx first, Idx range) : index_{local}, first_{first}, range_{range} {}
+
+    public:
+      ALPAKA_FN_ACC inline ElementIndex operator*() const { return ElementIndex{index_ + first_, index_}; }
+
+      // pre-increment the iterator
+      ALPAKA_FN_ACC inline iterator& operator++() {
+        if constexpr (requires_single_thread_per_block_v<TAcc>) {
+          // increment the index along the elements processed by the current thread
+          ++index_;
+          if (index_ < range_)
+            return *this;
+        }
+
+        // the iterator has reached or passed the end of the extent, clamp it to the extent
+        index_ = range_;
+        return *this;
+      }
+
+      // post-increment the iterator
+      ALPAKA_FN_ACC inline iterator operator++(int) {
+        iterator old = *this;
+        ++(*this);
+        return old;
+      }
+
+      ALPAKA_FN_ACC inline bool operator==(iterator const& other) const { return (index_ == other.index_); }
+
+      ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }
+
+    private:
+      // modified by the pre/post-increment operator
+      Idx index_;
+      // non-const to support iterator copy and assignment
+      Idx first_;
+      Idx range_;
+    };
+
+    ALPAKA_FN_ACC inline iterator begin() const { return iterator(local_, first_, range_); }
+
+    ALPAKA_FN_ACC inline iterator end() const { return iterator(range_, first_, range_); }
+
+  private:
+    const Idx first_;
+    const Idx local_;
+    const Idx range_;
+  };
+
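+  /* Usage sketch, not part of the API: iterate over the elements of one virtual block;
+   * `in` is a hypothetical input buffer and `buffer` a per-block shared memory buffer.
+   * The .global index spans the whole problem space, the .local index spans the block.
+   *
+   *   for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) {
+   *     buffer[index.local] = in[index.global];
+   *   }
+   */
+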
+ */ + + template >> + ALPAKA_FN_ACC inline constexpr bool once_per_block(TAcc const& acc) { + return alpaka::getIdx(acc) == Vec>::zeros(); + } + } // namespace cms::alpakatools #endif // HeterogeneousCore_AlpakaInterface_interface_workdivision_h diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc index c35965fa8793b..300f139b0c6e3 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc @@ -13,8 +13,6 @@ // each test binary is built for a single Alpaka backend using namespace ALPAKA_ACCELERATOR_NAMESPACE; -static constexpr auto s_tag = "[" ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel) "]"; - struct VectorAddKernel { template ALPAKA_FN_ACC void operator()( @@ -58,233 +56,316 @@ struct VectorAddKernel3D { } }; -TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel), s_tag) { - SECTION("VectorAddKernel") { - // get the list of devices on the current platform - auto const& devices = cms::alpakatools::devices(); - if (devices.empty()) { - std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) - << ", the test will be skipped.\n"; - return; - } - - // random number generator with a gaussian distribution - std::random_device rd{}; - std::default_random_engine rand{rd()}; - std::normal_distribution dist{0., 1.}; - - // tolerance - constexpr float epsilon = 0.000001; - - // buffer size - constexpr size_t size = 1024 * 1024; - - // allocate input and output host buffers in pinned memory accessible by the Platform devices - auto in1_h = cms::alpakatools::make_host_buffer(size); - auto in2_h = cms::alpakatools::make_host_buffer(size); - auto out_h = cms::alpakatools::make_host_buffer(size); +/* This is not an efficient approach; it is only a test of using dynamic shared memory, + * split block and element loops, and block-level synchronisation + */ - // fill the input buffers with random data, and the output buffer with zeros - for (size_t i = 0; i < size; ++i) { - in1_h[i] = dist(rand); - in2_h[i] = dist(rand); - out_h[i] = 0.; +struct VectorAddBlockKernel { + template + ALPAKA_FN_ACC void operator()( + TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const { + // block size + auto const blockSize = alpaka::getWorkDiv(acc)[0u]; + // get the dynamic shared memory buffer + T* buffer = alpaka::getDynSharedMem(acc); + // the outer loop is needed to repeat the "block" as many times as needed to cover the whole problem space + // the inner loop is needed for backends that use more than one element per thread + for (auto block : cms::alpakatools::blocks_with_stride(acc, size)) { + // only one thread per block: initialise the shared memory + if (cms::alpakatools::once_per_block(acc)) { + // not really necessary, just to show how to use "once_per_block" + for (Idx local = 0; local < blockSize; ++local) + buffer[local] = 0.; + } + // synchronise all threads in the block + alpaka::syncBlockThreads(acc); + // read the first set of data into shared memory + for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) { + buffer[index.local] = in1[index.global]; + } + // synchronise all threads in the block + alpaka::syncBlockThreads(acc); + // add the second set of data into shared memory + for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) { + buffer[index.local] += in2[index.global]; + } 
 }  // namespace cms::alpakatools
 
 #endif  // HeterogeneousCore_AlpakaInterface_interface_workdivision_h
diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
index c35965fa8793b..300f139b0c6e3 100644
--- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
+++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
@@ -13,8 +13,6 @@
 // each test binary is built for a single Alpaka backend
 using namespace ALPAKA_ACCELERATOR_NAMESPACE;
 
-static constexpr auto s_tag = "[" ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel) "]";
-
 struct VectorAddKernel {
   template <typename TAcc, typename T>
   ALPAKA_FN_ACC void operator()(
@@ -58,233 +56,335 @@
   }
 };
 
-TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel), s_tag) {
-  SECTION("VectorAddKernel") {
-    // get the list of devices on the current platform
-    auto const& devices = cms::alpakatools::devices<Platform>();
-    if (devices.empty()) {
-      std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE)
-                << ", the test will be skipped.\n";
-      return;
-    }
-
-    // random number generator with a gaussian distribution
-    std::random_device rd{};
-    std::default_random_engine rand{rd()};
-    std::normal_distribution<float> dist{0., 1.};
-
-    // tolerance
-    constexpr float epsilon = 0.000001;
-
-    // buffer size
-    constexpr size_t size = 1024 * 1024;
-
-    // allocate input and output host buffers in pinned memory accessible by the Platform devices
-    auto in1_h = cms::alpakatools::make_host_buffer<float[]>(size);
-    auto in2_h = cms::alpakatools::make_host_buffer<float[]>(size);
-    auto out_h = cms::alpakatools::make_host_buffer<float[]>(size);
+/* This is not an efficient approach; it is only a test of using dynamic shared memory,
+ * split block and element loops, and block-level synchronisation
+ */
 
-    // fill the input buffers with random data, and the output buffer with zeros
-    for (size_t i = 0; i < size; ++i) {
-      in1_h[i] = dist(rand);
-      in2_h[i] = dist(rand);
-      out_h[i] = 0.;
+struct VectorAddBlockKernel {
+  template <typename TAcc, typename T>
+  ALPAKA_FN_ACC void operator()(
+      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
+    // block size
+    auto const blockSize = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
+    // get the dynamic shared memory buffer
+    T* buffer = alpaka::getDynSharedMem<T>(acc);
+    // the outer loop is needed to repeat the "block" as many times as needed to cover the whole problem space
+    // the inner loop is needed for backends that use more than one element per thread
+    for (auto block : cms::alpakatools::blocks_with_stride(acc, size)) {
+      // only one thread per block: initialise the shared memory
+      if (cms::alpakatools::once_per_block(acc)) {
+        // not really necessary, just to show how to use "once_per_block"
+        for (Idx local = 0; local < blockSize; ++local)
+          buffer[local] = 0.;
+      }
+      // synchronise all threads in the block
+      alpaka::syncBlockThreads(acc);
+      // read the first set of data into shared memory
+      for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) {
+        buffer[index.local] = in1[index.global];
+      }
+      // synchronise all threads in the block
+      alpaka::syncBlockThreads(acc);
+      // add the second set of data into shared memory
+      for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) {
+        buffer[index.local] += in2[index.global];
+      }
+      // synchronise all threads in the block
+      alpaka::syncBlockThreads(acc);
+      // store the results into global memory
+      for (auto index : cms::alpakatools::elements_in_block(acc, block, size)) {
+        out[index.global] = buffer[index.local];
+      }
     }
+  }
+};
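+
+/* Launch sketch, not part of this test: the kernel above is launched like any other
+ * kernel; alpaka sizes the per-block dynamic shared memory by calling the
+ * BlockSharedMemDynSizeBytes trait specialisation defined further below
+ * (`queue`, `div` and the device buffers are assumed to exist):
+ *
+ *   alpaka::exec<Acc1D>(queue, div, VectorAddBlockKernel{}, in1, in2, out, size);
+ */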
 
-    // run the test on each device
-    for (auto const& device : devices) {
-      std::cout << "Test 1D vector addition on " << alpaka::getName(device) << '\n';
-      auto queue = Queue(device);
-
-      // allocate input and output buffers on the device
-      auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
-      auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
-      auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
-
-      // copy the input data to the device; the size is known from the buffer objects
-      alpaka::memcpy(queue, in1_d, in1_h);
-      alpaka::memcpy(queue, in2_d, in2_h);
-
-      // fill the output buffer with zeros; the size is known from the buffer objects
-      alpaka::memset(queue, out_d, 0.);
-
-      // launch the 1-dimensional kernel with scalar size
-      auto div = cms::alpakatools::make_workdiv<Acc1D>(4, 4);
-      alpaka::exec<Acc1D>(queue, div, VectorAddKernel{}, in1_d.data(), in2_d.data(), out_d.data(), size);
-
-      // copy the results from the device to the host
-      alpaka::memcpy(queue, out_h, out_d);
-
-      // wait for all the operations to complete
-      alpaka::wait(queue);
+/* Run all operations in a single thread.
+ * Written in an inefficient way to test "once_per_grid".
+ */
 
-      // check the results
-      for (size_t i = 0; i < size; ++i) {
-        float sum = in1_h[i] + in2_h[i];
-        REQUIRE(out_h[i] < sum + epsilon);
-        REQUIRE(out_h[i] > sum - epsilon);
+struct VectorAddKernelSerial {
+  template <typename TAcc, typename T>
+  ALPAKA_FN_ACC void operator()(
+      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
+    // the operations are performed by a single thread
+    if (cms::alpakatools::once_per_grid(acc)) {
+      for (Idx index = 0; index < size; ++index) {
+        out[index] += in1[index];
+        out[index] += in2[index];
       }
+    }
+  }
+};
 
-      // reset the output buffer on the device to all zeros
-      alpaka::memset(queue, out_d, 0.);
-
-      // launch the 1-dimensional kernel with vector size
-      alpaka::exec<Acc1D>(queue, div, VectorAddKernel1D{}, in1_d.data(), in2_d.data(), out_d.data(), size);
-
-      // copy the results from the device to the host
-      alpaka::memcpy(queue, out_h, out_d);
-
-      // wait for all the operations to complete
-      alpaka::wait(queue);
+/* Run all operations in one thread per block.
+ * Written in an inefficient way to test "once_per_block".
+ */
 
-      // check the results
-      for (size_t i = 0; i < size; ++i) {
-        float sum = in1_h[i] + in2_h[i];
-        REQUIRE(out_h[i] < sum + epsilon);
-        REQUIRE(out_h[i] > sum - epsilon);
+struct VectorAddKernelBlockSerial {
+  template <typename TAcc, typename T>
+  ALPAKA_FN_ACC void operator()(
+      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, size_t size) const {
+    // block size
+    auto const blockSize = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
+    // the loop is used to repeat the "block" as many times as needed to cover the whole problem space
+    for (auto block : cms::alpakatools::blocks_with_stride(acc, size)) {
+      // the operations are performed by a single thread in each "logical" block
+      const auto first = blockSize * block;
+      const auto range = std::min<size_t>(first + blockSize, size);
+      if (cms::alpakatools::once_per_block(acc)) {
+        for (Idx index = first; index < range; ++index) {
+          out[index] += in1[index];
+          out[index] += in2[index];
+        }
       }
     }
   }
-}
+};
 
-TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel2D), s_tag) {
-  SECTION("VectorAddKernel2D") {
-    // get the list of devices on the current platform
-    auto const& devices = cms::alpakatools::devices<Platform>();
-    if (devices.empty()) {
-      std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE)
-                << ", the test will be skipped.\n";
-      return;
+namespace alpaka::trait {
+  // specialize the BlockSharedMemDynSizeBytes trait to specify the amount of
+  // block shared dynamic memory for the VectorAddBlockKernel kernel
+  template <typename TAcc>
+  struct BlockSharedMemDynSizeBytes<VectorAddBlockKernel, TAcc> {
+    // the size in bytes of the shared memory allocated for a block
+    template <typename T>
+    ALPAKA_FN_HOST_ACC static std::size_t getBlockSharedMemDynSizeBytes(VectorAddBlockKernel const& /* kernel */,
+                                                                        Vec1D threads,
+                                                                        Vec1D elements,
+                                                                        T const* __restrict__ /* in1 */,
+                                                                        T const* __restrict__ /* in2 */,
+                                                                        T* __restrict__ /* out */,
+                                                                        size_t size) {
+      return static_cast<std::size_t>(threads[0] * elements[0] * sizeof(T));
     }
+  };
+}  // namespace alpaka::trait
+
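+/* For example, with a work division of 32 threads per block, 4 elements per thread,
+ * and T = float, the trait above reserves 32 * 4 * sizeof(float) = 512 bytes of
+ * dynamic shared memory per block.
+ */
+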
+// test the 1-dimensional kernel on all devices
+template <typename TKernel>
+void testVectorAddKernel(std::size_t problem_size, std::size_t grid_size, std::size_t block_size, TKernel kernel) {
+  // random number generator with a gaussian distribution
+  std::random_device rd{};
+  std::default_random_engine rand{rd()};
+  std::normal_distribution<float> dist{0., 1.};
+
+  // tolerance
+  constexpr float epsilon = 0.000001;
+
+  // buffer size
+  const size_t size = problem_size;
+
+  // allocate input and output host buffers in pinned memory accessible by the Platform devices
+  auto in1_h = cms::alpakatools::make_host_buffer<float[]>(size);
+  auto in2_h = cms::alpakatools::make_host_buffer<float[]>(size);
+  auto out_h = cms::alpakatools::make_host_buffer<float[]>(size);
+
+  // fill the input buffers with random data, and the output buffer with zeros
+  for (size_t i = 0; i < size; ++i) {
+    in1_h[i] = dist(rand);
+    in2_h[i] = dist(rand);
+    out_h[i] = 0.;
+  }
 
-    // random number generator with a gaussian distribution
-    std::random_device rd{};
-    std::default_random_engine rand{rd()};
-    std::normal_distribution<float> dist{0., 1.};
+  // run the test on each device
+  for (auto const& device : cms::alpakatools::devices<Platform>()) {
+    std::cout << "Test 1D vector addition on " << alpaka::getName(device) << " over " << problem_size << " values with "
+              << grid_size << " blocks of " << block_size << " elements\n";
+    auto queue = Queue(device);
 
-    // tolerance
-    constexpr float epsilon = 0.000001;
+    // allocate input and output buffers on the device
+    auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+    auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+    auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
 
-    // 3-dimensional and linearised buffer size
-    constexpr Vec2D ndsize = {16, 16};
-    constexpr size_t size = ndsize.prod();
+    // copy the input data to the device; the size is known from the buffer objects
+    alpaka::memcpy(queue, in1_d, in1_h);
+    alpaka::memcpy(queue, in2_d, in2_h);
 
-    // allocate input and output host buffers in pinned memory accessible by the Platform devices
-    auto in1_h = cms::alpakatools::make_host_buffer<float[]>(size);
-    auto in2_h = cms::alpakatools::make_host_buffer<float[]>(size);
-    auto out_h = cms::alpakatools::make_host_buffer<float[]>(size);
+    // fill the output buffer with zeros; the size is known from the buffer objects
+    alpaka::memset(queue, out_d, 0.);
 
-    // fill the input buffers with random data, and the output buffer with zeros
+    // launch the 1-dimensional kernel with scalar size
+    auto div = cms::alpakatools::make_workdiv<Acc1D>(grid_size, block_size);
+    alpaka::exec<Acc1D>(queue, div, kernel, in1_d.data(), in2_d.data(), out_d.data(), size);
+
+    // copy the results from the device to the host
+    alpaka::memcpy(queue, out_h, out_d);
+
+    // wait for all the operations to complete
+    alpaka::wait(queue);
+
+    // check the results
     for (size_t i = 0; i < size; ++i) {
-      in1_h[i] = dist(rand);
-      in2_h[i] = dist(rand);
-      out_h[i] = 0.;
+      float sum = in1_h[i] + in2_h[i];
+      REQUIRE(out_h[i] < sum + epsilon);
+      REQUIRE(out_h[i] > sum - epsilon);
     }
+  }
+}
+
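+/* Note, for readers of this test: cms::alpakatools::make_workdiv<Acc1D>(blocks, elements)
+ * adapts the work division to the backend in use; on GPU-like backends each block runs
+ * `elements` threads with one element per thread, while on single-threaded CPU backends
+ * each block runs one thread that processes `elements` elements.
+ */
+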
+// test the N-dimensional kernels on all devices
+template <typename TDim, typename TKernel>
+void testVectorAddKernelND(Vec<TDim> problem_size, Vec<TDim> grid_size, Vec<TDim> block_size, TKernel kernel) {
+  // random number generator with a gaussian distribution
+  std::random_device rd{};
+  std::default_random_engine rand{rd()};
+  std::normal_distribution<float> dist{0., 1.};
+
+  // tolerance
+  constexpr float epsilon = 0.000001;
+
+  // linearised buffer size
+  const size_t size = problem_size.prod();
+
+  // allocate input and output host buffers in pinned memory accessible by the Platform devices
+  auto in1_h = cms::alpakatools::make_host_buffer<float[]>(size);
+  auto in2_h = cms::alpakatools::make_host_buffer<float[]>(size);
+  auto out_h = cms::alpakatools::make_host_buffer<float[]>(size);
+
+  // fill the input buffers with random data, and the output buffer with zeros
+  for (size_t i = 0; i < size; ++i) {
+    in1_h[i] = dist(rand);
+    in2_h[i] = dist(rand);
+    out_h[i] = 0.;
+  }
 
-    // run the test on each device
-    for (auto const& device : devices) {
-      std::cout << "Test 2D vector addition on " << alpaka::getName(device) << '\n';
-      auto queue = Queue(device);
+  // run the test on each device
+  for (auto const& device : cms::alpakatools::devices<Platform>()) {
+    std::cout << "Test " << TDim::value << "D vector addition on " << alpaka::getName(device) << " over "
+              << problem_size << " values with " << grid_size << " blocks of " << block_size << " elements\n";
+    auto queue = Queue(device);
 
-      // allocate input and output buffers on the device
-      auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
-      auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
-      auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+    // allocate input and output buffers on the device
+    auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+    auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+    auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
 
-      // copy the input data to the device; the size is known from the buffer objects
-      alpaka::memcpy(queue, in1_d, in1_h);
-      alpaka::memcpy(queue, in2_d, in2_h);
+    // copy the input data to the device; the size is known from the buffer objects
+    alpaka::memcpy(queue, in1_d, in1_h);
+    alpaka::memcpy(queue, in2_d, in2_h);
 
-      // fill the output buffer with zeros; the size is known from the buffer objects
-      alpaka::memset(queue, out_d, 0.);
+    // fill the output buffer with zeros; the size is known from the buffer objects
+    alpaka::memset(queue, out_d, 0.);
 
-      // launch the 3-dimensional kernel
-      auto div = cms::alpakatools::make_workdiv<Acc2D>({4, 4}, {32, 32});
-      alpaka::exec<Acc2D>(queue, div, VectorAddKernel2D{}, in1_d.data(), in2_d.data(), out_d.data(), ndsize);
+    // launch the N-dimensional kernel
+    using AccND = Acc<TDim>;
+    auto div = cms::alpakatools::make_workdiv<AccND>(grid_size, block_size);
+    alpaka::exec<AccND>(queue, div, kernel, in1_d.data(), in2_d.data(), out_d.data(), problem_size);
 
-      // copy the results from the device to the host
-      alpaka::memcpy(queue, out_h, out_d);
+    // copy the results from the device to the host
+    alpaka::memcpy(queue, out_h, out_d);
 
-      // wait for all the operations to complete
-      alpaka::wait(queue);
+    // wait for all the operations to complete
+    alpaka::wait(queue);
 
-      // check the results
-      for (size_t i = 0; i < size; ++i) {
-        float sum = in1_h[i] + in2_h[i];
-        REQUIRE(out_h[i] < sum + epsilon);
-        REQUIRE(out_h[i] > sum - epsilon);
-      }
+    // check the results
+    for (size_t i = 0; i < size; ++i) {
+      float sum = in1_h[i] + in2_h[i];
+      REQUIRE(out_h[i] < sum + epsilon);
+      REQUIRE(out_h[i] > sum - epsilon);
     }
   }
 }
 
-TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel3D), s_tag) {
-  SECTION("VectorAddKernel3D") {
+TEST_CASE("Test alpaka kernels for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend",
+          "[" EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) "]") {
+  SECTION("Alpaka N-dimensional kernels") {
     // get the list of devices on the current platform
     auto const& devices = cms::alpakatools::devices<Platform>();
     if (devices.empty()) {
-      std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE)
-                << ", the test will be skipped.\n";
-      return;
+      INFO("No devices available on the platform " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE));
+      REQUIRE(not devices.empty());
     }
 
-    // random number generator with a gaussian distribution
-    std::random_device rd{};
-    std::default_random_engine rand{rd()};
-    std::normal_distribution<float> dist{0., 1.};
-
-    // tolerance
-    constexpr float epsilon = 0.000001;
-
-    // 3-dimensional and linearised buffer size
-    constexpr Vec3D ndsize = {50, 125, 16};
-    constexpr size_t size = ndsize.prod();
-
-    // allocate input and output host buffers in pinned memory accessible by the Platform devices
-    auto in1_h = cms::alpakatools::make_host_buffer<float[]>(size);
-    auto in2_h = cms::alpakatools::make_host_buffer<float[]>(size);
-    auto out_h = cms::alpakatools::make_host_buffer<float[]>(size);
-
-    // fill the input buffers with random data, and the output buffer with zeros
-    for (size_t i = 0; i < size; ++i) {
-      in1_h[i] = dist(rand);
-      in2_h[i] = dist(rand);
-      out_h[i] = 0.;
-    }
-
-    // run the test on each device
-    for (auto const& device : devices) {
-      std::cout << "Test 3D vector addition on " << alpaka::getName(device) << '\n';
-      auto queue = Queue(device);
-
-      // allocate input and output buffers on the device
-      auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
-      auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
-      auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
-
-      // copy the input data to the device; the size is known from the buffer objects
-      alpaka::memcpy(queue, in1_d, in1_h);
-      alpaka::memcpy(queue, in2_d, in2_h);
-
-      // fill the output buffer with zeros; the size is known from the buffer objects
-      alpaka::memset(queue, out_d, 0.);
-
-      // launch the 3-dimensional kernel
-      auto div = cms::alpakatools::make_workdiv<Acc3D>({5, 5, 1}, {4, 4, 4});
-      alpaka::exec<Acc3D>(queue, div, VectorAddKernel3D{}, in1_d.data(), in2_d.data(), out_d.data(), ndsize);
-
-      // copy the results from the device to the host
-      alpaka::memcpy(queue, out_h, out_d);
-
-      // wait for all the operations to complete
-      alpaka::wait(queue);
-
-      // check the results
-      for (size_t i = 0; i < size; ++i) {
-        float sum = in1_h[i] + in2_h[i];
-        REQUIRE(out_h[i] < sum + epsilon);
-        REQUIRE(out_h[i] > sum - epsilon);
-      }
-    }
+    // launch the 1-dimensional kernel with a small block size and a small number of blocks;
+    // this relies on the kernel to loop over the "problem space" and do more work per block
+    std::cout << "Test 1D vector addition with small block size, using scalar dimensions\n";
+    testVectorAddKernel(10000, 32, 32, VectorAddKernel{});
+
+    // launch the 1-dimensional kernel with a large block size and a single block;
+    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
+    std::cout << "Test 1D vector addition with large block size, using scalar dimensions\n";
+    testVectorAddKernel(100, 1, 1024, VectorAddKernel{});
+
+    // launch the 1-dimensional kernel with a small block size and a small number of blocks;
+    // this relies on the kernel to loop over the "problem space" and do more work per block
+    std::cout << "Test 1D vector addition with small block size\n";
+    testVectorAddKernelND<Dim1D>({10000}, {32}, {32}, VectorAddKernel1D{});
+
+    // launch the 1-dimensional kernel with a large block size and a single block;
+    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
+    std::cout << "Test 1D vector addition with large block size\n";
+    testVectorAddKernelND<Dim1D>({100}, {1}, {1024}, VectorAddKernel1D{});
+
+    // launch the 2-dimensional kernel with a small block size and a small number of blocks;
+    // this relies on the kernel to loop over the "problem space" and do more work per block
+    std::cout << "Test 2D vector addition with small block size\n";
+    testVectorAddKernelND<Dim2D>({400, 250}, {4, 4}, {16, 16}, VectorAddKernel2D{});
+
+    // launch the 2-dimensional kernel with a large block size and a single block;
+    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
+    std::cout << "Test 2D vector addition with large block size\n";
+    testVectorAddKernelND<Dim2D>({20, 20}, {1, 1}, {32, 32}, VectorAddKernel2D{});
+
+    // launch the 3-dimensional kernel with a small block size and a small number of blocks;
+    // this relies on the kernel to loop over the "problem space" and do more work per block
+    std::cout << "Test 3D vector addition with small block size\n";
+    testVectorAddKernelND<Dim3D>({50, 125, 16}, {5, 5, 1}, {4, 4, 4}, VectorAddKernel3D{});
+
+    // launch the 3-dimensional kernel with a large block size and a single block;
+    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
+    std::cout << "Test 3D vector addition with large block size\n";
+    testVectorAddKernelND<Dim3D>({5, 5, 5}, {1, 1, 1}, {8, 8, 8}, VectorAddKernel3D{});
+
+    // launch the 1-dimensional kernel with a small block size and a small number of blocks;
+    // this relies on the kernel to loop over the "problem space" and do more work per block
+    std::cout << "Test 1D vector block-level addition with small block size, using scalar dimensions\n";
+    testVectorAddKernel(10000, 32, 32, VectorAddBlockKernel{});
+
+    // launch the 1-dimensional kernel with a large block size and a single block;
+    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
+    std::cout << "Test 1D vector block-level addition with large block size, using scalar dimensions\n";
+    testVectorAddKernel(100, 1, 1024, VectorAddBlockKernel{});
+
+    // launch the 1-dimensional kernel with a small block size and a small number of blocks;
+    // this relies on the kernel to loop over the "problem space" and do more work per block
+    std::cout << "Test 1D vector single-threaded serial addition with small block size, using scalar dimensions\n";
+    testVectorAddKernel(10000, 32, 32, VectorAddKernelSerial{});
+
+    // launch the 1-dimensional kernel with a large block size and a single block;
+    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
+    std::cout << "Test 1D vector single-threaded serial addition with large block size, using scalar dimensions\n";
+    testVectorAddKernel(100, 1, 1024, VectorAddKernelSerial{});
+
+    // launch the 1-dimensional kernel with a small block size and a small number of blocks;
+    // this relies on the kernel to loop over the "problem space" and do more work per block
+    std::cout << "Test 1D vector block-level serial addition with small block size, using scalar dimensions\n";
+    testVectorAddKernel(10000, 32, 32, VectorAddKernelBlockSerial{});
+
+    // launch the 1-dimensional kernel with a large block size and a single block;
+    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
+    std::cout << "Test 1D vector block-level serial addition with large block size, using scalar dimensions\n";
+    testVectorAddKernel(100, 1, 1024, VectorAddKernelBlockSerial{});
   }
 }