diff --git a/include/CLUEstering/data_structures/detail/AssociationMap.hpp b/include/CLUEstering/data_structures/detail/AssociationMap.hpp
index 7e379565..1790979b 100644
--- a/include/CLUEstering/data_structures/detail/AssociationMap.hpp
+++ b/include/CLUEstering/data_structures/detail/AssociationMap.hpp
@@ -304,27 +304,11 @@ namespace clue {
                     bin_buffer.data(),
                     sizes_buffer.data(),
                     size);
-
-      auto block_counter = make_device_buffer<int32_t>(queue);
-      alpaka::memset(queue, block_counter, 0);
+      alpaka::wait(queue);
 
       auto temp_offsets = make_device_buffer<uint32_t[]>(queue, m_nbins + 1);
-      alpaka::memset(queue, temp_offsets, 0u, 1u);
-      const auto blocksize_multiblockscan = 1024;
-      auto gridsize_multiblockscan = divide_up_by(m_nbins, blocksize_multiblockscan);
-      const auto workdiv_multiblockscan =
-          make_workdiv<Acc1D>(gridsize_multiblockscan, blocksize_multiblockscan);
-      const auto dev = alpaka::getDev(queue);
-      auto warp_size = alpaka::getPreferredWarpSize(dev);
-      alpaka::exec<Acc1D>(queue,
-                          workdiv_multiblockscan,
-                          multiBlockPrefixScan<uint32_t>{},
-                          sizes_buffer.data(),
-                          temp_offsets.data() + 1,
-                          m_nbins,
-                          gridsize_multiblockscan,
-                          block_counter.data(),
-                          warp_size);
+      internal::algorithm::inclusive_scan(
+          sizes_buffer.data(), sizes_buffer.data() + m_nbins, temp_offsets.data() + 1);
 
       alpaka::memcpy(queue,
                      make_device_view(alpaka::getDev(queue), m_offsets.data(), m_nbins + 1),
@@ -381,27 +365,11 @@
                     associations.data(),
                     sizes_buffer.data(),
                     size);
-
-      auto block_counter = make_device_buffer<int32_t>(queue);
-      alpaka::memset(queue, block_counter, 0);
+      alpaka::wait(queue);
 
       auto temp_offsets = make_device_buffer<uint32_t[]>(queue, m_nbins + 1);
-      alpaka::memset(queue, temp_offsets, 0u, 1u);
-      const auto blocksize_multiblockscan = 1024;
-      auto gridsize_multiblockscan = divide_up_by(m_nbins, blocksize_multiblockscan);
-      const auto workdiv_multiblockscan =
-          make_workdiv<Acc1D>(gridsize_multiblockscan, blocksize_multiblockscan);
-      const auto dev = alpaka::getDev(queue);
-      auto warp_size = alpaka::getPreferredWarpSize(dev);
-      alpaka::exec<Acc1D>(queue,
-                          workdiv_multiblockscan,
-                          multiBlockPrefixScan<uint32_t>{},
-                          sizes_buffer.data(),
-                          temp_offsets.data() + 1,
-                          m_nbins,
-                          gridsize_multiblockscan,
-                          block_counter.data(),
-                          warp_size);
+      internal::algorithm::inclusive_scan(
+          sizes_buffer.data(), sizes_buffer.data() + m_nbins, temp_offsets.data() + 1);
 
       alpaka::memcpy(queue,
                      make_device_view(alpaka::getDev(queue), m_offsets.data(), m_nbins + 1),
diff --git a/include/CLUEstering/internal/algorithm/scan/scan.hpp b/include/CLUEstering/internal/algorithm/scan/scan.hpp
index 421a8e4c..fb7a7e43 100644
--- a/include/CLUEstering/internal/algorithm/scan/scan.hpp
+++ b/include/CLUEstering/internal/algorithm/scan/scan.hpp
@@ -1,250 +1,184 @@
 #pragma once
 
+#include "CLUEstering/internal/algorithm/default_policy.hpp"
+
 #include <alpaka/alpaka.hpp>
 
-#include "CLUEstering/internal/alpaka/config.hpp"
-#include "CLUEstering/internal/alpaka/work_division.hpp"
-#include "CLUEstering/detail/concepts.hpp"
-
-namespace clue {
-
-  template <typename T>
-  constexpr bool isPowerOf2(T v) {
-    // returns true iif v has only one bit set.
-    while (v) {
-      if (v & 1)
-        return !(v >> 1);
-      else
-        v >>= 1;
-    }
-    return false;
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+#include <thrust/execution_policy.h>
+#include <thrust/scan.h>
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+#include <oneapi/dpl/execution>
+#include <oneapi/dpl/algorithm>
+#else
+#include <numeric>
+#endif
+
+namespace clue::internal::algorithm {
+
+  template <typename InputIterator, typename OutputIterator>
+  ALPAKA_FN_HOST inline constexpr void inclusive_scan(InputIterator first,
+                                                      InputIterator last,
+                                                      OutputIterator output) {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    thrust::inclusive_scan(thrust::device, first, last, output);
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+    thrust::inclusive_scan(thrust::hip::par, first, last, output);
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+    oneapi::dpl::inclusive_scan(oneapi::dpl::execution::dpcpp_default, first, last, output);
+#else
+    std::inclusive_scan(default_policy, first, last, output);
+#endif
   }
 
-  template <typename TAcc, typename T>
-  ALPAKA_FN_ACC ALPAKA_FN_INLINE void warpPrefixScan(
-      const TAcc& acc, int32_t laneId, T const* ci, T* co, int32_t i, bool active = true) {
-    // ci and co may be the same
-    T x = active ? ci[i] : 0;
-    for (int32_t offset = 1; offset < alpaka::warp::getSize(acc); offset <<= 1) {
-      // Force the exact type for integer types otherwise the compiler will find the template resolution ambiguous.
-      using dataType = std::conditional_t<std::is_floating_point_v<T>, T, std::int32_t>;
-      T y = alpaka::warp::shfl(acc, static_cast<dataType>(x), laneId - offset);
-      if (laneId >= offset)
-        x += y;
-    }
-    if (active)
-      co[i] = x;
+  template <typename ExecutionPolicy, typename ForwardIterator>
+  ALPAKA_FN_HOST inline constexpr void inclusive_scan(ExecutionPolicy&& policy,
+                                                      ForwardIterator first,
+                                                      ForwardIterator last,
+                                                      ForwardIterator output) {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    thrust::inclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output);
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+    thrust::inclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output);
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+    oneapi::dpl::inclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output);
+#else
+    std::inclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output);
+#endif
   }
 
-  template <typename TAcc, typename T>
-  ALPAKA_FN_ACC ALPAKA_FN_INLINE void warpPrefixScan(
-      const TAcc& acc, int32_t laneId, T* c, int32_t i, bool active = true) {
-    warpPrefixScan(acc, laneId, c, c, i, active);
+  template <typename InputIterator, typename OutputIterator, typename BinaryOperator>
+  ALPAKA_FN_HOST inline constexpr void inclusive_scan(InputIterator first,
+                                                      InputIterator last,
+                                                      OutputIterator output,
+                                                      BinaryOperator op) {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    thrust::inclusive_scan(thrust::device, first, last, output, op);
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+    thrust::inclusive_scan(thrust::hip::par, first, last, output, op);
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+    oneapi::dpl::inclusive_scan(oneapi::dpl::execution::dpcpp_default, first, last, output, op);
+#else
+    std::inclusive_scan(default_policy, first, last, output, op);
+#endif
   }
 
-  // limited to warpSize² elements
-  template <typename TAcc, typename T>
-  ALPAKA_FN_ACC ALPAKA_FN_INLINE void blockPrefixScan(
-      const TAcc& acc, T const* ci, T* co, int32_t size, T* ws = nullptr) {
-    if constexpr (!requires_single_thread_per_block_v<TAcc>) {
-      const auto warpSize = alpaka::warp::getSize(acc);
-      int32_t const blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);
-      int32_t const blockThreadIdx(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
-      ALPAKA_ASSERT_ACC(ws);
-      ALPAKA_ASSERT_ACC(size <= warpSize * warpSize);
-      ALPAKA_ASSERT_ACC(0 == blockDimension % warpSize);
-      auto first = blockThreadIdx;
-      ALPAKA_ASSERT_ACC(isPowerOf2(warpSize));
-      auto laneId = blockThreadIdx & (warpSize - 1);
-      auto warpUpRoundedSize = (size + warpSize - 1) / warpSize * warpSize;
-
-      for (auto i = first; i < warpUpRoundedSize; i += blockDimension) {
-        // When padding the warp, warpPrefixScan is a noop
-        warpPrefixScan(acc, laneId, ci, co, i, i < size);
-        if (i < size) {
-          // Skipped in warp padding threads.
-          auto warpId = i / warpSize;
-          ALPAKA_ASSERT_ACC(warpId < warpSize);
-          if ((warpSize - 1) == laneId)
-            ws[warpId] = co[i];
-        }
-      }
-      alpaka::syncBlockThreads(acc);
-      if (size <= warpSize)
-        return;
-      if (blockThreadIdx < warpSize) {
-        warpPrefixScan(acc, laneId, ws, blockThreadIdx);
-      }
-      alpaka::syncBlockThreads(acc);
-      for (auto i = first + warpSize; i < size; i += blockDimension) {
-        int32_t warpId = i / warpSize;
-        co[i] += ws[warpId - 1];
-      }
-      alpaka::syncBlockThreads(acc);
-    } else {
-      co[0] = ci[0];
-      for (int32_t i = 1; i < size; ++i)
-        co[i] = ci[i] + co[i - 1];
-    }
+  template <typename ExecutionPolicy, typename ForwardIterator, typename BinaryOperator>
+  ALPAKA_FN_HOST inline constexpr void inclusive_scan(ExecutionPolicy&& policy,
+                                                      ForwardIterator first,
+                                                      ForwardIterator last,
+                                                      ForwardIterator output,
+                                                      BinaryOperator op) {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    thrust::inclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, op);
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+    thrust::inclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, op);
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+    oneapi::dpl::inclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, op);
+#else
+    std::inclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, op);
+#endif
   }
 
-  template <typename TAcc, typename T>
-  ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void blockPrefixScan(const TAcc& acc,
-                                                           T* __restrict__ c,
-                                                           int32_t size,
-                                                           T* __restrict__ ws = nullptr) {
-    if constexpr (!requires_single_thread_per_block_v<TAcc>) {
-      const auto warpSize = alpaka::warp::getSize(acc);
-      int32_t const blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);
-      int32_t const blockThreadIdx(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
-      ALPAKA_ASSERT_ACC(ws);
-      ALPAKA_ASSERT_ACC(size <= warpSize * warpSize);
-      ALPAKA_ASSERT_ACC(0 == blockDimension % warpSize);
-      auto first = blockThreadIdx;
-      auto laneId = blockThreadIdx & (warpSize - 1);
-      auto warpUpRoundedSize = (size + warpSize - 1) / warpSize * warpSize;
-
-      for (auto i = first; i < warpUpRoundedSize; i += blockDimension) {
-        // When padding the warp, warpPrefixScan is a noop
-        warpPrefixScan(acc, laneId, c, i, i < size);
-        if (i < size) {
-          // Skipped in warp padding threads.
-          auto warpId = i / warpSize;
-          ALPAKA_ASSERT_ACC(warpId < warpSize);
-          if ((warpSize - 1) == laneId)
-            ws[warpId] = c[i];
-        }
-      }
-      alpaka::syncBlockThreads(acc);
-      if (size <= warpSize)
-        return;
-      if (blockThreadIdx < warpSize) {
-        warpPrefixScan(acc, laneId, ws, blockThreadIdx);
-      }
-      alpaka::syncBlockThreads(acc);
-      for (auto i = first + warpSize; i < size; i += blockDimension) {
-        auto warpId = i / warpSize;
-        c[i] += ws[warpId - 1];
-      }
-      alpaka::syncBlockThreads(acc);
-    } else {
-      for (int32_t i = 1; i < size; ++i)
-        c[i] += c[i - 1];
-    }
+  template <typename InputIterator, typename OutputIterator, typename BinaryOperator, typename T>
+  ALPAKA_FN_HOST inline constexpr void inclusive_scan(
+      InputIterator first, InputIterator last, OutputIterator output, BinaryOperator op, T init) {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    thrust::inclusive_scan(thrust::device, first, last, output, op, init);
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+    thrust::inclusive_scan(thrust::hip::par, first, last, output, op, init);
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+    oneapi::dpl::inclusive_scan(
+        oneapi::dpl::execution::dpcpp_default, first, last, output, op, init);
+#else
+    std::inclusive_scan(default_policy, first, last, output, op, init);
+#endif
   }
 
-  // in principle not limited....
-  template <typename T>
-  struct multiBlockPrefixScan {
-    template <typename TAcc>
-    ALPAKA_FN_ACC void operator()(const TAcc& acc,
-                                  T const* ci,
-                                  T* co,
-                                  std::size_t size,
-                                  int32_t /* numBlocks */,
-                                  int32_t* pc,
-                                  std::size_t warpSize) const {
-      // Get shared variable. The workspace is needed only for multi-threaded accelerators.
-      T* ws = nullptr;
-      if constexpr (!requires_single_thread_per_block_v<TAcc>) {
-        ws = alpaka::getDynSharedMem<T>(acc);
-      }
-      ALPAKA_ASSERT_ACC(warpSize == static_cast<std::size_t>(alpaka::warp::getSize(acc)));
-      [[maybe_unused]] const auto elementsPerGrid =
-          alpaka::getWorkDiv<alpaka::Grid, alpaka::Elems>(acc)[0u];
-      const auto elementsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
-      const auto threadsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u];
-      const auto blocksPerGrid = alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u];
-      const auto blockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u];
-      const auto threadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u];
-      ALPAKA_ASSERT_ACC(elementsPerGrid >= size);
-      // first each block does a scan
-      [[maybe_unused]] int off = elementsPerBlock * blockIdx;
-      if (size - off > 0) {
-        blockPrefixScan(acc,
-                        ci + off,
-                        co + off,
-                        std::min(elementsPerBlock, static_cast<std::size_t>(size - off)),
-                        ws);
-      }
-
-      // count blocks that finished
-      auto& isLastBlockDone = alpaka::declareSharedVar<bool, __COUNTER__>(acc);
-      //__shared__ bool isLastBlockDone;
-      if (0 == threadIdx) {
-        alpaka::mem_fence(acc, alpaka::memory_scope::Device{});
-        auto value = alpaka::atomicAdd(acc, pc, 1, alpaka::hierarchy::Blocks{});  // block counter
-        isLastBlockDone = (value == (int(blocksPerGrid) - 1));
-      }
-
-      alpaka::syncBlockThreads(acc);
-
-      if (!isLastBlockDone)
-        return;
-
-      ALPAKA_ASSERT_ACC(int(blocksPerGrid) == *pc);
-
-      // good each block has done its work and now we are left in last block
+  template <typename ExecutionPolicy, typename ForwardIterator, typename BinaryOperator, typename T>
+  ALPAKA_FN_HOST inline constexpr void inclusive_scan(ExecutionPolicy&& policy,
+                                                      ForwardIterator first,
+                                                      ForwardIterator last,
+                                                      ForwardIterator output,
+                                                      BinaryOperator op,
+                                                      T init) {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    thrust::inclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, op, init);
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+    thrust::inclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, op, init);
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+    oneapi::dpl::inclusive_scan(
+        std::forward<ExecutionPolicy>(policy), first, last, output, op, init);
+#else
+    std::inclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, op, init);
+#endif
+  }
 
-      // let's get the partial sums from each block except the last, which receives 0.
-      T* psum = nullptr;
-      if constexpr (!requires_single_thread_per_block_v<TAcc>) {
-        psum = ws + warpSize;
-      } else {
-        psum = alpaka::getDynSharedMem<T>(acc);
-      }
-      for (int32_t i = threadIdx, ni = blocksPerGrid; i < ni; i += threadsPerBlock) {
-        auto j = elementsPerBlock * i + elementsPerBlock - 1;
-        psum[i] = (j < size) ? co[j] : T(0);
-      }
-      alpaka::syncBlockThreads(acc);
-      blockPrefixScan(acc, psum, psum, blocksPerGrid, ws);
+  template <typename InputIterator, typename OutputIterator, typename T>
+  ALPAKA_FN_HOST inline constexpr void exclusive_scan(InputIterator first,
+                                                      InputIterator last,
+                                                      OutputIterator output,
+                                                      T init) {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    thrust::exclusive_scan(thrust::device, first, last, output, init);
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+    thrust::exclusive_scan(thrust::hip::par, first, last, output, init);
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+    oneapi::dpl::exclusive_scan(oneapi::dpl::execution::dpcpp_default, first, last, output, init);
+#else
+    std::exclusive_scan(default_policy, first, last, output, init);
+#endif
+  }
 
-      // now it would have been handy to have the other blocks around...
-      // Simplify the computation by having one version where threads per block = block size
-      // and a second for the one thread per block accelerator.
-      if constexpr (!requires_single_thread_per_block_v<TAcc>) {
-        // Here threadsPerBlock == elementsPerBlock
-        for (std::size_t i = threadIdx + threadsPerBlock, k = 0; i < size;
-             i += threadsPerBlock, ++k) {
-          co[i] += psum[k];
-        }
-      } else {
-        // We are single threaded here, adding partial sums starting with the 2nd block.
-        for (std::size_t i = elementsPerBlock; i < size; i++) {
-          co[i] += psum[i / elementsPerBlock - 1];
-        }
-      }
-    }
-  };
-}  // namespace clue
+  template <typename ExecutionPolicy, typename ForwardIterator, typename T>
+  ALPAKA_FN_HOST inline constexpr void exclusive_scan(ExecutionPolicy&& policy,
+                                                      ForwardIterator first,
+                                                      ForwardIterator last,
+                                                      ForwardIterator output,
+                                                      T init) {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    thrust::exclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, init);
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+    thrust::exclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, init);
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+    oneapi::dpl::exclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, init);
+#else
+    std::exclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, init);
+#endif
+  }
 
-// declare the amount of block shared memory used by the multiBlockPrefixScan kernel
-namespace alpaka::trait {
+  template <typename InputIterator, typename OutputIterator, typename T, typename BinaryOperator>
+  ALPAKA_FN_HOST inline constexpr void exclusive_scan(
+      InputIterator first, InputIterator last, OutputIterator output, T init, BinaryOperator op) {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    thrust::exclusive_scan(thrust::device, first, last, output, init, op);
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+    thrust::exclusive_scan(thrust::hip::par, first, last, output, init, op);
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+    oneapi::dpl::exclusive_scan(
+        oneapi::dpl::execution::dpcpp_default, first, last, output, init, op);
+#else
+    std::exclusive_scan(default_policy, first, last, output, init, op);
+#endif
+  }
 
-  // Variable size shared mem
-  template <typename TAcc, typename T>
-  struct BlockSharedMemDynSizeBytes<clue::multiBlockPrefixScan<T>, TAcc> {
-    template <typename TVec>
-    ALPAKA_FN_HOST_ACC static std::size_t getBlockSharedMemDynSizeBytes(
-        clue::multiBlockPrefixScan<T> const& /* kernel */,
-        TVec const& /* blockThreadExtent */,
-        TVec const& /* threadElemExtent */,
-        T const* /* ci */,
-        T const* /* co */,
-        int32_t /* size */,
-        int32_t numBlocks,
-        int32_t const* /* pc */,
-        // This trait function does not receive the accelerator object to look up the warp size
-        std::size_t warpSize) {
-      // We need workspace (T[warpsize]) + partial sums (T[numblocks]).
-      if constexpr (clue::requires_single_thread_per_block_v<TAcc>) {
-        return sizeof(T) * numBlocks;
-      } else {
-        return sizeof(T) * (warpSize + numBlocks);
-      }
-    }
-  };
+  template <typename ExecutionPolicy, typename ForwardIterator, typename T, typename BinaryOperator>
+  ALPAKA_FN_HOST inline constexpr void exclusive_scan(ExecutionPolicy&& policy,
+                                                      ForwardIterator first,
+                                                      ForwardIterator last,
+                                                      ForwardIterator output,
+                                                      T init,
+                                                      BinaryOperator op) {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    thrust::exclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, init, op);
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+    thrust::exclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, init, op);
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+    oneapi::dpl::exclusive_scan(
+        std::forward<ExecutionPolicy>(policy), first, last, output, init, op);
+#else
+    std::exclusive_scan(std::forward<ExecutionPolicy>(policy), first, last, output, init, op);
+#endif
+  }
 
-}  // namespace alpaka::trait
+}  // namespace clue::internal::algorithm
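
For reference, the offsets construction that both `AssociationMap.hpp` hunks now delegate to `internal::algorithm::inclusive_scan` reduces to the following standalone host-side sketch. It exercises the `#else` branch of the new wrapper (plain `std::inclusive_scan`); the buffer names are illustrative and not taken from the repository:

```cpp
#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

int main() {
  // Per-bin element counts, as the sizing kernel would leave them in sizes_buffer.
  const std::vector<uint32_t> sizes = {3, 0, 2, 5};

  // offsets[0] stays 0; scanning into offsets.begin() + 1 mirrors writing the
  // inclusive scan of the bin sizes into temp_offsets.data() + 1.
  std::vector<uint32_t> offsets(sizes.size() + 1, 0);
  std::inclusive_scan(sizes.begin(), sizes.end(), offsets.begin() + 1);

  // offsets == {0, 3, 3, 5, 10}; bin i owns the range [offsets[i], offsets[i + 1]).
  assert(offsets.back() == 10);
  return 0;
}
```

Note the `alpaka::wait(queue)` added in front of each call: the wrapper dispatches to Thrust, oneDPL, or the standard library outside the alpaka queue, so presumably the queue has to be drained before the scan reads `sizes_buffer`, whereas the removed `multiBlockPrefixScan` kernel was ordered by the queue itself.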