diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index 3cb2918f9..5af661f79 100644 --- a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -75,18 +75,15 @@ namespace cms { const unsigned int nblocks = (num_items + nthreads - 1) / nthreads; const Vec1D blocksPerGrid(nblocks); + auto d_pc = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u); + int32_t *pc = alpaka::getPtrNative(d_pc); + alpaka::memset(queue, d_pc, 0, 1u); + const WorkDiv1D &workDiv = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( blocksPerGrid, threadsPerBlockOrElementsPerThread); alpaka::enqueue(queue, alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>( - workDiv, ::cms::alpakatools::multiBlockPrefixScanFirstStep(), poff, poff, num_items)); - - const WorkDiv1D &workDivWith1Block = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( - Vec1D::all(1), threadsPerBlockOrElementsPerThread); - alpaka::enqueue( - queue, - alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>( - workDivWith1Block, ::cms::alpakatools::multiBlockPrefixScanSecondStep(), poff, poff, num_items, nblocks)); + workDiv, ::cms::alpakatools::multiBlockPrefixScan(), poff, poff, num_items, pc)); } template @@ -106,14 +103,14 @@ namespace cms { const WorkDiv1D &workDiv = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( blocksPerGrid, threadsPerBlockOrElementsPerThread); - alpaka::enqueue( - queue, - alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>(workDiv, countFromVector(), h, nh, v, offsets)); + alpaka::enqueue(queue, + alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>( + workDiv, countFromVector(), h, nh, v, offsets)); launchFinalize(h, queue); - alpaka::enqueue( - queue, - alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>(workDiv, fillFromVector(), h, nh, v, offsets)); + alpaka::enqueue(queue, + alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>( + workDiv, fillFromVector(), h, nh, v, offsets)); } struct finalizeBulk { diff --git a/src/alpaka/AlpakaCore/prefixScan.h b/src/alpaka/AlpakaCore/prefixScan.h index 88471512b..797266d66 100644 --- a/src/alpaka/AlpakaCore/prefixScan.h +++ b/src/alpaka/AlpakaCore/prefixScan.h @@ -4,6 +4,7 @@ #include #include "AlpakaCore/alpakaConfig.h" +#include "AlpakaCore/threadfence.h" #include "Framework/CMSUnrollLoop.h" #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED @@ -49,10 +50,10 @@ namespace cms { #endif ) { #if defined ALPAKA_ACC_GPU_CUDA_ENABLED and __CUDA_ARCH__ - uint32_t const blockDimension(alpaka::getWorkDiv(acc)[0u]); - uint32_t const gridBlockIdx(alpaka::getIdx(acc)[0u]); - uint32_t const blockThreadIdx(alpaka::getIdx(acc)[0u]); - assert(ws); + const int32_t blockDimension(alpaka::getWorkDiv(acc)[0u]); + const int32_t gridBlockIdx(alpaka::getIdx(acc)[0u]); + const int32_t blockThreadIdx(alpaka::getIdx(acc)[0u]); + ALPAKA_ASSERT_OFFLOAD(ws); ALPAKA_ASSERT_OFFLOAD(size <= 1024); ALPAKA_ASSERT_OFFLOAD(0 == blockDimension % 32); auto first = blockThreadIdx; @@ -97,10 +98,10 @@ namespace cms { #endif ) { #if defined ALPAKA_ACC_GPU_CUDA_ENABLED and __CUDA_ARCH__ - uint32_t const blockDimension(alpaka::getWorkDiv(acc)[0u]); - uint32_t const gridBlockIdx(alpaka::getIdx(acc)[0u]); - uint32_t const blockThreadIdx(alpaka::getIdx(acc)[0u]); - assert(ws); + const int32_t blockDimension(alpaka::getWorkDiv(acc)[0u]); + const int32_t gridBlockIdx(alpaka::getIdx(acc)[0u]); + const int32_t blockThreadIdx(alpaka::getIdx(acc)[0u]); + ALPAKA_ASSERT_OFFLOAD(ws); ALPAKA_ASSERT_OFFLOAD(size <= 1024); ALPAKA_ASSERT_OFFLOAD(0 == blockDimension % 32); auto first = blockThreadIdx; @@ -135,43 +136,45 @@ namespace cms { // limited to 1024*1024 elements.... template - struct multiBlockPrefixScanFirstStep { + struct multiBlockPrefixScan { template - ALPAKA_FN_ACC void operator()(const T_Acc& acc, T const* ci, T* co, int32_t size) const { - uint32_t const blockDimension(alpaka::getWorkDiv(acc)[0u]); - uint32_t const threadDimension(alpaka::getWorkDiv(acc)[0u]); - uint32_t const blockIdx(alpaka::getIdx(acc)[0u]); + ALPAKA_FN_ACC void operator()(const T_Acc& acc, T const* ci, T* co, int32_t size, int32_t* pc) const { + const int32_t gridDimension(alpaka::getWorkDiv(acc)[0u]); + const int32_t blockDimension(alpaka::getWorkDiv(acc)[0u]); + const int32_t threadDimension(alpaka::getWorkDiv(acc)[0u]); + const int32_t blockIdx(alpaka::getIdx(acc)[0u]); + const int32_t threadIdx(alpaka::getIdx(acc)[0u]); - auto& ws = alpaka::declareSharedVar(acc); // first each block does a scan of size 1024; (better be enough blocks....) -#ifndef NDEBUG - uint32_t const gridDimension(alpaka::getWorkDiv(acc)[0u]); ALPAKA_ASSERT_OFFLOAD(gridDimension / threadDimension <= 1024); -#endif + + auto& ws = alpaka::declareSharedVar(acc); int off = blockDimension * blockIdx * threadDimension; if (size - off > 0) blockPrefixScan(acc, ci + off, co + off, std::min(int(blockDimension * threadDimension), size - off), ws); - } - }; - // limited to 1024*1024 elements.... - template - struct multiBlockPrefixScanSecondStep { - template - ALPAKA_FN_ACC void operator()(const T_Acc& acc, T const* ci, T* co, int32_t size, int32_t numBlocks) const { - uint32_t const blockDimension(alpaka::getWorkDiv(acc)[0u]); - uint32_t const threadDimension(alpaka::getWorkDiv(acc)[0u]); + auto& isLastBlockDone = alpaka::declareSharedVar(acc); + if (0 == threadIdx) { + ::cms::alpakatools::threadfence(acc); + auto value = alpaka::atomicAdd(acc, pc, 1, alpaka::hierarchy::Blocks{}); // block counter + isLastBlockDone = (value == (gridDimension - 1)); + } - uint32_t const threadIdx(alpaka::getIdx(acc)[0u]); + alpaka::syncBlockThreads(acc); - auto* const psum(alpaka::getDynSharedMem(acc)); + if (!isLastBlockDone) + return; + + ALPAKA_ASSERT_OFFLOAD(gridDimension == *pc); + + auto& psum = alpaka::declareSharedVar(acc); + + ALPAKA_ASSERT_OFFLOAD(static_cast(blockDimension * threadDimension) >= gridDimension); - // first each block does a scan of size 1024; (better be enough blocks....) - ALPAKA_ASSERT_OFFLOAD(static_cast(blockDimension * threadDimension) >= numBlocks); for (int elemId = 0; elemId < static_cast(threadDimension); ++elemId) { - int index = +threadIdx * threadDimension + elemId; + int index = threadIdx * threadDimension + elemId; - if (index < numBlocks) { + if (index < gridDimension) { int lastElementOfPreviousBlockId = index * blockDimension * threadDimension - 1; psum[index] = (lastElementOfPreviousBlockId < size and lastElementOfPreviousBlockId >= 0) ? co[lastElementOfPreviousBlockId] @@ -180,9 +183,7 @@ namespace cms { } alpaka::syncBlockThreads(acc); - - auto& ws = alpaka::declareSharedVar(acc); - blockPrefixScan(acc, psum, psum, numBlocks, ws); + blockPrefixScan(acc, psum, psum, gridDimension, ws); for (int elemId = 0; elemId < static_cast(threadDimension); ++elemId) { int first = threadIdx * threadDimension + elemId; @@ -197,36 +198,4 @@ namespace cms { } // namespace alpakatools } // namespace cms -namespace alpaka { - namespace traits { - - //############################################################################# - //! The trait for getting the size of the block shared dynamic memory for a kernel. - template - struct BlockSharedMemDynSizeBytes<::cms::alpakatools::multiBlockPrefixScanSecondStep, TAcc> { - //----------------------------------------------------------------------------- - //! \return The size of the shared memory allocated for a block. - template - ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes( - ::cms::alpakatools::multiBlockPrefixScanSecondStep const& myKernel, - TVec const& blockThreadExtent, - TVec const& threadElemExtent, - T const* ci, - T* co, - int32_t size, - int32_t numBlocks) -> T { - alpaka::ignore_unused(myKernel); - alpaka::ignore_unused(blockThreadExtent); - alpaka::ignore_unused(threadElemExtent); - alpaka::ignore_unused(ci); - alpaka::ignore_unused(co); - alpaka::ignore_unused(size); - - return static_cast(numBlocks) * sizeof(T); - } - }; - - } // namespace traits -} // namespace alpaka - #endif // HeterogeneousCore_AlpakaUtilities_interface_prefixScan_h diff --git a/src/alpaka/test/alpaka/prefixScan_t.cc b/src/alpaka/test/alpaka/prefixScan_t.cc index 0f10e450f..592884845 100644 --- a/src/alpaka/test/alpaka/prefixScan_t.cc +++ b/src/alpaka/test/alpaka/prefixScan_t.cc @@ -178,24 +178,14 @@ int main() { blocksPerGrid4, threadsPerBlockOrElementsPerThread4); std::cout << "launch multiBlockPrefixScan " << num_items << ' ' << nBlocks << std::endl; - alpaka::enqueue(queue, - alpaka::createTaskKernel(workDivMultiBlock, - ::cms::alpakatools::multiBlockPrefixScanFirstStep(), - input_d, - output1_d, - num_items)); - - const Vec1D blocksPerGridSecondStep(Vec1D::all(1)); - const WorkDiv1D& workDivMultiBlockSecondStep = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( - blocksPerGridSecondStep, threadsPerBlockOrElementsPerThread4); - alpaka::enqueue(queue, - alpaka::createTaskKernel(workDivMultiBlockSecondStep, - ::cms::alpakatools::multiBlockPrefixScanSecondStep(), - input_d, - output1_d, - num_items, - nBlocks)); - + auto d_pc(alpaka::allocBuf(device, size)); + int32_t* pc = alpaka::getPtrNative(d_pc); + + alpaka::memset(queue, d_pc, 0, size); + alpaka::enqueue( + queue, + alpaka::createTaskKernel( + workDivMultiBlock, ::cms::alpakatools::multiBlockPrefixScan(), input_d, output1_d, num_items, pc)); alpaka::enqueue(queue, alpaka::createTaskKernel(workDivMultiBlock, verify(), output1_d, num_items)); alpaka::wait(queue); // input_dBuf and output1_dBuf end of scope