diff --git a/include/CLUEstering/data_structures/detail/AssociationMap.hpp b/include/CLUEstering/data_structures/detail/AssociationMap.hpp
index 7e379565..1790979b 100644
--- a/include/CLUEstering/data_structures/detail/AssociationMap.hpp
+++ b/include/CLUEstering/data_structures/detail/AssociationMap.hpp
@@ -304,27 +304,11 @@ namespace clue {
                     bin_buffer.data(),
                     sizes_buffer.data(),
                     size);
-
-      auto block_counter = make_device_buffer<int32_t>(queue);
-      alpaka::memset(queue, block_counter, 0);
+      alpaka::wait(queue);
 
       auto temp_offsets = make_device_buffer<uint32_t[]>(queue, m_nbins + 1);
-      alpaka::memset(queue, temp_offsets, 0u, 1u);
-      const auto blocksize_multiblockscan = 1024;
-      auto gridsize_multiblockscan = divide_up_by(m_nbins, blocksize_multiblockscan);
-      const auto workdiv_multiblockscan =
-          make_workdiv<Acc1D>(gridsize_multiblockscan, blocksize_multiblockscan);
-      const auto dev = alpaka::getDev(queue);
-      auto warp_size = alpaka::getPreferredWarpSize(dev);
-      alpaka::exec<Acc1D>(queue,
-                          workdiv_multiblockscan,
-                          multiBlockPrefixScan<uint32_t>{},
-                          sizes_buffer.data(),
-                          temp_offsets.data() + 1,
-                          m_nbins,
-                          gridsize_multiblockscan,
-                          block_counter.data(),
-                          warp_size);
+      internal::algorithm::inclusive_scan(
+          sizes_buffer.data(), sizes_buffer.data() + m_nbins, temp_offsets.data() + 1);
 
       alpaka::memcpy(queue,
                      make_device_view(alpaka::getDev(queue), m_offsets.data(), m_nbins + 1),
@@ -381,27 +365,11 @@
                     associations.data(),
                     sizes_buffer.data(),
                     size);
-
-      auto block_counter = make_device_buffer<int32_t>(queue);
-      alpaka::memset(queue, block_counter, 0);
+      alpaka::wait(queue);
 
       auto temp_offsets = make_device_buffer<uint32_t[]>(queue, m_nbins + 1);
-      alpaka::memset(queue, temp_offsets, 0u, 1u);
-      const auto blocksize_multiblockscan = 1024;
-      auto gridsize_multiblockscan = divide_up_by(m_nbins, blocksize_multiblockscan);
-      const auto workdiv_multiblockscan =
-          make_workdiv<Acc1D>(gridsize_multiblockscan, blocksize_multiblockscan);
-      const auto dev = alpaka::getDev(queue);
-      auto warp_size = alpaka::getPreferredWarpSize(dev);
-      alpaka::exec<Acc1D>(queue,
-                          workdiv_multiblockscan,
-                          multiBlockPrefixScan<uint32_t>{},
-                          sizes_buffer.data(),
-                          temp_offsets.data() + 1,
-                          m_nbins,
-                          gridsize_multiblockscan,
-                          block_counter.data(),
-                          warp_size);
+      internal::algorithm::inclusive_scan(
+          sizes_buffer.data(), sizes_buffer.data() + m_nbins, temp_offsets.data() + 1);
 
       alpaka::memcpy(queue,
                      make_device_view(alpaka::getDev(queue), m_offsets.data(), m_nbins + 1),
diff --git a/include/CLUEstering/internal/algorithm/scan/scan.hpp b/include/CLUEstering/internal/algorithm/scan/scan.hpp
index 421a8e4c..fb7a7e43 100644
--- a/include/CLUEstering/internal/algorithm/scan/scan.hpp
+++ b/include/CLUEstering/internal/algorithm/scan/scan.hpp
@@ -1,250 +1,184 @@
 #pragma once
 
+#include "CLUEstering/internal/algorithm/default_policy.hpp"
+
 #include <alpaka/alpaka.hpp>
 
-#include "CLUEstering/internal/alpaka/config.hpp"
-#include "CLUEstering/internal/alpaka/work_division.hpp"
-#include "CLUEstering/detail/concepts.hpp"
-
-namespace clue {
-
-  template <typename T>
-  constexpr bool isPowerOf2(T v) {
-    // returns true iif v has only one bit set.
-    while (v) {
-      if (v & 1)
-        return !(v >> 1);
-      else
-        v >>= 1;
-    }
-    return false;
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+#include <thrust/execution_policy.h>
+#include <thrust/scan.h>
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+#include <oneapi/dpl/execution>
+#include <oneapi/dpl/algorithm>
+#else
+#include <numeric>
+#endif
+
+namespace clue::internal::algorithm {
+
+  template <typename InputIterator, typename OutputIterator>
+  ALPAKA_FN_HOST inline constexpr void inclusive_scan(InputIterator first,
+                                                      InputIterator last,
+                                                      OutputIterator output) {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    thrust::inclusive_scan(thrust::device, first, last, output);
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+    thrust::inclusive_scan(thrust::hip::par, first, last, output);
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+    oneapi::dpl::inclusive_scan(oneapi::dpl::execution::dpcpp_default, first, last, output);
+#else
+    std::inclusive_scan(default_policy, first, last, output);
+#endif
   }
 
-  template <typename TAcc, typename T>
-  ALPAKA_FN_ACC ALPAKA_FN_INLINE void warpPrefixScan(
-      const TAcc& acc, int32_t laneId, T const* ci, T* co, int32_t i, bool active = true) {
-    // ci and co may be the same
-    T x = active ? ci[i] : 0;
-    for (int32_t offset = 1; offset < alpaka::warp::getSize(acc); offset <<= 1) {
-      // Force the exact type for integer types otherwise the compiler will find the template resolution ambiguous.
-      using dataType = std::conditional_t<std::is_floating_point_v<T>, T, std::int32_t>;
-      T y = alpaka::warp::shfl(acc, static_cast<dataType>(x), laneId - offset);
-      if (laneId >= offset)
-        x += y;
-    }
-    if (active)
-      co[i] = x;
+  template <typename ExecutionPolicy, typename ForwardIterator>
+  ALPAKA_FN_HOST inline constexpr void inclusive_scan(ExecutionPolicy&& policy,
+                                                      ForwardIterator first,
+                                                      ForwardIterator last,
+                                                      ForwardIterator output) {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    thrust::inclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output);
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+    thrust::inclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output);
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+    oneapi::dpl::inclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output);
+#else
+    std::inclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output);
+#endif
   }
 
-  template <typename TAcc, typename T>
-  ALPAKA_FN_ACC ALPAKA_FN_INLINE void warpPrefixScan(
-      const TAcc& acc, int32_t laneId, T* c, int32_t i, bool active = true) {
-    warpPrefixScan(acc, laneId, c, c, i, active);
+  template <typename InputIterator, typename OutputIterator, typename BinaryOperator>
+  ALPAKA_FN_HOST inline constexpr void inclusive_scan(InputIterator first,
+                                                      InputIterator last,
+                                                      OutputIterator output,
+                                                      BinaryOperator op) {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    thrust::inclusive_scan(thrust::device, first, last, output, op);
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+    thrust::inclusive_scan(thrust::hip::par, first, last, output, op);
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+    oneapi::dpl::inclusive_scan(oneapi::dpl::execution::dpcpp_default, first, last, output, op);
+#else
+    std::inclusive_scan(default_policy, first, last, output, op);
+#endif
   }
 
-  // limited to warpSize² elements
-  template <typename TAcc, typename T>
-  ALPAKA_FN_ACC ALPAKA_FN_INLINE void blockPrefixScan(
-      const TAcc& acc, T const* ci, T* co, int32_t size, T* ws = nullptr) {
-    if constexpr (!requires_single_thread_per_block_v<TAcc>) {
-      const auto warpSize = alpaka::warp::getSize(acc);
-      int32_t const blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);
-      int32_t const blockThreadIdx(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
-      ALPAKA_ASSERT_ACC(ws);
-      ALPAKA_ASSERT_ACC(size <= warpSize * warpSize);
-      ALPAKA_ASSERT_ACC(0 == blockDimension % warpSize);
-      auto first = blockThreadIdx;
-      ALPAKA_ASSERT_ACC(isPowerOf2(warpSize));
-      auto laneId = blockThreadIdx & (warpSize - 1);
-      auto warpUpRoundedSize = (size + warpSize - 1) / warpSize * warpSize;
-
-      for (auto i = first; i < warpUpRoundedSize; i += blockDimension) {
-        // When padding the warp, warpPrefixScan is a noop
-        warpPrefixScan(acc, laneId, ci, co, i, i < size);
-        if (i < size) {
-          // Skipped in warp padding threads.
-          auto warpId = i / warpSize;
-          ALPAKA_ASSERT_ACC(warpId < warpSize);
-          if ((warpSize - 1) == laneId)
-            ws[warpId] = co[i];
-        }
-      }
-      alpaka::syncBlockThreads(acc);
-      if (size <= warpSize)
-        return;
-      if (blockThreadIdx < warpSize) {
-        warpPrefixScan(acc, laneId, ws, blockThreadIdx);
-      }
-      alpaka::syncBlockThreads(acc);
-      for (auto i = first + warpSize; i < size; i += blockDimension) {
-        int32_t warpId = i / warpSize;
-        co[i] += ws[warpId - 1];
-      }
-      alpaka::syncBlockThreads(acc);
-    } else {
-      co[0] = ci[0];
-      for (int32_t i = 1; i < size; ++i)
-        co[i] = ci[i] + co[i - 1];
-    }
+  template <typename ExecutionPolicy, typename ForwardIterator, typename BinaryOperator>
+  ALPAKA_FN_HOST inline constexpr void inclusive_scan(ExecutionPolicy&& policy,
+                                                      ForwardIterator first,
+                                                      ForwardIterator last,
+                                                      ForwardIterator output,
+                                                      BinaryOperator op) {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    thrust::inclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, op);
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+    thrust::inclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, op);
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+    oneapi::dpl::inclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, op);
+#else
+    std::inclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, op);
+#endif
   }
 
-  template <typename TAcc, typename T>
-  ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void blockPrefixScan(const TAcc& acc,
-                                                           T* __restrict__ c,
-                                                           int32_t size,
-                                                           T* __restrict__ ws = nullptr) {
-    if constexpr (!requires_single_thread_per_block_v<TAcc>) {
-      const auto warpSize = alpaka::warp::getSize(acc);
-      int32_t const blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);
-      int32_t const blockThreadIdx(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
-      ALPAKA_ASSERT_ACC(ws);
-      ALPAKA_ASSERT_ACC(size <= warpSize * warpSize);
-      ALPAKA_ASSERT_ACC(0 == blockDimension % warpSize);
-      auto first = blockThreadIdx;
-      auto laneId = blockThreadIdx & (warpSize - 1);
-      auto warpUpRoundedSize = (size + warpSize - 1) / warpSize * warpSize;
-
-      for (auto i = first; i < warpUpRoundedSize; i += blockDimension) {
-        // When padding the warp, warpPrefixScan is a noop
-        warpPrefixScan(acc, laneId, c, i, i < size);
-        if (i < size) {
-          // Skipped in warp padding threads.
-          auto warpId = i / warpSize;
-          ALPAKA_ASSERT_ACC(warpId < warpSize);
-          if ((warpSize - 1) == laneId)
-            ws[warpId] = c[i];
-        }
-      }
-      alpaka::syncBlockThreads(acc);
-      if (size <= warpSize)
-        return;
-      if (blockThreadIdx < warpSize) {
-        warpPrefixScan(acc, laneId, ws, blockThreadIdx);
-      }
-      alpaka::syncBlockThreads(acc);
-      for (auto i = first + warpSize; i < size; i += blockDimension) {
-        auto warpId = i / warpSize;
-        c[i] += ws[warpId - 1];
-      }
-      alpaka::syncBlockThreads(acc);
-    } else {
-      for (int32_t i = 1; i < size; ++i)
-        c[i] += c[i - 1];
-    }
+  template <typename InputIterator, typename OutputIterator, typename BinaryOperator, typename T>
+  ALPAKA_FN_HOST inline constexpr void inclusive_scan(
+      InputIterator first, InputIterator last, OutputIterator output, BinaryOperator op, T init) {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    thrust::inclusive_scan(thrust::device, first, last, output, op, init);
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+    thrust::inclusive_scan(thrust::hip::par, first, last, output, op, init);
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+    oneapi::dpl::inclusive_scan(
+        oneapi::dpl::execution::dpcpp_default, first, last, output, op, init);
+#else
+    std::inclusive_scan(default_policy, first, last, output, op, init);
+#endif
   }
 
-  // in principle not limited....
-  template <typename T>
-  struct multiBlockPrefixScan {
-    template <typename TAcc>
-    ALPAKA_FN_ACC void operator()(const TAcc& acc,
-                                  T const* ci,
-                                  T* co,
-                                  std::size_t size,
-                                  int32_t /* numBlocks */,
-                                  int32_t* pc,
-                                  std::size_t warpSize) const {
-      // Get shared variable. The workspace is needed only for multi-threaded accelerators.
-      T* ws = nullptr;
-      if constexpr (!requires_single_thread_per_block_v<TAcc>) {
-        ws = alpaka::getDynSharedMem<T>(acc);
-      }
-      ALPAKA_ASSERT_ACC(warpSize == static_cast<std::size_t>(alpaka::warp::getSize(acc)));
-      [[maybe_unused]] const auto elementsPerGrid =
-          alpaka::getWorkDiv<alpaka::Grid, alpaka::Elems>(acc)[0u];
-      const auto elementsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
-      const auto threadsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u];
-      const auto blocksPerGrid = alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u];
-      const auto blockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u];
-      const auto threadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u];
-      ALPAKA_ASSERT_ACC(elementsPerGrid >= size);
-      // first each block does a scan
-      [[maybe_unused]] int off = elementsPerBlock * blockIdx;
-      if (size - off > 0) {
-        blockPrefixScan(acc,
-                        ci + off,
-                        co + off,
-                        std::min(elementsPerBlock, static_cast<std::size_t>(size - off)),
-                        ws);
-      }
-
-      // count blocks that finished
-      auto& isLastBlockDone = alpaka::declareSharedVar<bool, __COUNTER__>(acc);
-      //__shared__ bool isLastBlockDone;
-      if (0 == threadIdx) {
-        alpaka::mem_fence(acc, alpaka::memory_scope::Device{});
-        auto value = alpaka::atomicAdd(acc, pc, 1, alpaka::hierarchy::Blocks{});  // block counter
-        isLastBlockDone = (value == (int(blocksPerGrid) - 1));
-      }
-
-      alpaka::syncBlockThreads(acc);
-
-      if (!isLastBlockDone)
-        return;
-
-      ALPAKA_ASSERT_ACC(int(blocksPerGrid) == *pc);
-
-      // good each block has done its work and now we are left in last block
+  template <typename ExecutionPolicy, typename ForwardIterator, typename BinaryOperator, typename T>
+  ALPAKA_FN_HOST inline constexpr void inclusive_scan(ExecutionPolicy&& policy,
+                                                      ForwardIterator first,
+                                                      ForwardIterator last,
+                                                      ForwardIterator output,
+                                                      BinaryOperator op,
+                                                      T init) {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    thrust::inclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, op, init);
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+    thrust::inclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, op, init);
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+    oneapi::dpl::inclusive_scan(
+        std::forward<ExecutionPolicy>(policy), first, last, output, op, init);
+#else
+    std::inclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, op, init);
+#endif
+  }
 
-      // let's get the partial sums from each block except the last, which receives 0.
-      T* psum = nullptr;
-      if constexpr (!requires_single_thread_per_block_v<TAcc>) {
-        psum = ws + warpSize;
-      } else {
-        psum = alpaka::getDynSharedMem<T>(acc);
-      }
-      for (int32_t i = threadIdx, ni = blocksPerGrid; i < ni; i += threadsPerBlock) {
-        auto j = elementsPerBlock * i + elementsPerBlock - 1;
-        psum[i] = (j < size) ? co[j] : T(0);
-      }
-      alpaka::syncBlockThreads(acc);
-      blockPrefixScan(acc, psum, psum, blocksPerGrid, ws);
+  template <typename InputIterator, typename OutputIterator, typename T>
+  ALPAKA_FN_HOST inline constexpr void exclusive_scan(InputIterator first,
+                                                      InputIterator last,
+                                                      OutputIterator output,
+                                                      T init) {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    thrust::exclusive_scan(thrust::device, first, last, output, init);
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+    thrust::exclusive_scan(thrust::hip::par, first, last, output, init);
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+    oneapi::dpl::exclusive_scan(oneapi::dpl::execution::dpcpp_default, first, last, output, init);
+#else
+    std::exclusive_scan(default_policy, first, last, output, init);
+#endif
+  }
 
-      // now it would have been handy to have the other blocks around...
-      // Simplify the computation by having one version where threads per block = block size
-      // and a second for the one thread per block accelerator.
-      if constexpr (!requires_single_thread_per_block_v<TAcc>) {
-        // Here threadsPerBlock == elementsPerBlock
-        for (std::size_t i = threadIdx + threadsPerBlock, k = 0; i < size;
-             i += threadsPerBlock, ++k) {
-          co[i] += psum[k];
-        }
-      } else {
-        // We are single threaded here, adding partial sums starting with the 2nd block.
-        for (std::size_t i = elementsPerBlock; i < size; i++) {
-          co[i] += psum[i / elementsPerBlock - 1];
-        }
-      }
-    }
-  };
-}  // namespace clue
+  template <typename ExecutionPolicy, typename ForwardIterator, typename T>
+  ALPAKA_FN_HOST inline constexpr void exclusive_scan(ExecutionPolicy&& policy,
+                                                      ForwardIterator first,
+                                                      ForwardIterator last,
+                                                      ForwardIterator output,
+                                                      T init) {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    thrust::exclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, init);
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+    thrust::exclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, init);
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+    oneapi::dpl::exclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, init);
+#else
+    std::exclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, init);
+#endif
+  }
 
-// declare the amount of block shared memory used by the multiBlockPrefixScan kernel
-namespace alpaka::trait {
+  template <typename InputIterator, typename OutputIterator, typename T, typename BinaryOperator>
+  ALPAKA_FN_HOST inline constexpr void exclusive_scan(
+      InputIterator first, InputIterator last, OutputIterator output, T init, BinaryOperator op) {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    thrust::exclusive_scan(thrust::device, first, last, output, init, op);
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+    thrust::exclusive_scan(thrust::hip::par, first, last, output, init, op);
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+    oneapi::dpl::exclusive_scan(
+        oneapi::dpl::execution::dpcpp_default, first, last, output, init, op);
+#else
+    std::exclusive_scan(default_policy, first, last, output, init, op);
+#endif
+  }
 
-  // Variable size shared mem
-  template <typename TAcc, typename T>
-  struct BlockSharedMemDynSizeBytes<clue::multiBlockPrefixScan<T>, TAcc> {
-    template <typename TVec>
-    ALPAKA_FN_HOST_ACC static std::size_t getBlockSharedMemDynSizeBytes(
-        clue::multiBlockPrefixScan<T> const& /* kernel */,
-        TVec const& /* blockThreadExtent */,
-        TVec const& /* threadElemExtent */,
-        T const* /* ci */,
-        T const* /* co */,
-        int32_t /* size */,
-        int32_t numBlocks,
-        int32_t const* /* pc */,
-        // This trait function does not receive the accelerator object to look up the warp size
-        std::size_t warpSize) {
-      // We need workspace (T[warpsize]) + partial sums (T[numblocks]).
-      if constexpr (clue::requires_single_thread_per_block_v<TAcc>) {
-        return sizeof(T) * numBlocks;
-      } else {
-        return sizeof(T) * (warpSize + numBlocks);
-      }
-    }
-  };
+  template <typename ExecutionPolicy, typename ForwardIterator, typename T, typename BinaryOperator>
+  ALPAKA_FN_HOST inline constexpr void exclusive_scan(ExecutionPolicy&& policy,
+                                                      ForwardIterator first,
+                                                      ForwardIterator last,
+                                                      ForwardIterator output,
+                                                      T init,
+                                                      BinaryOperator op) {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    thrust::exclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, init, op);
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+    thrust::exclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, init, op);
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+    oneapi::dpl::exclusive_scan(
+        std::forward<ExecutionPolicy>(policy), first, last, output, init, op);
+#else
+    std::exclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, init, op);
+#endif
+  }
 
-}  // namespace alpaka::trait
+}  // namespace clue::internal::algorithm
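
For reference, the offsets construction that both `AssociationMap.hpp` hunks now delegate to `internal::algorithm::inclusive_scan` reduces to the following standalone host-side sketch. It exercises the `#else` branch of the new wrapper (plain `std::inclusive_scan`); the buffer names are illustrative and not taken from the repository:

```cpp
#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

int main() {
  // Per-bin element counts, as the sizing kernel would leave them in sizes_buffer.
  const std::vector<uint32_t> sizes = {3, 0, 2, 5};

  // offsets[0] stays 0; scanning into offsets.begin() + 1 mirrors writing the
  // inclusive scan of the bin sizes into temp_offsets.data() + 1.
  std::vector<uint32_t> offsets(sizes.size() + 1, 0);
  std::inclusive_scan(sizes.begin(), sizes.end(), offsets.begin() + 1);

  // offsets == {0, 3, 3, 5, 10}; bin i owns the range [offsets[i], offsets[i + 1]).
  assert(offsets.back() == 10);
  return 0;
}
```

Note the `alpaka::wait(queue)` added in front of each call: the wrapper dispatches to Thrust, oneDPL, or the standard library outside the alpaka queue, so presumably the queue has to be drained before the scan reads `sizes_buffer`, whereas the removed `multiBlockPrefixScan` kernel was ordered by the queue itself.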