diff --git a/src/alpaka/AlpakaCore/alpakaConfigAcc.h b/src/alpaka/AlpakaCore/alpakaConfigAcc.h
index e0da1cf1b..65629e9ef 100644
--- a/src/alpaka/AlpakaCore/alpakaConfigAcc.h
+++ b/src/alpaka/AlpakaCore/alpakaConfigAcc.h
@@ -9,6 +9,7 @@ namespace alpaka_cuda_async {
   using namespace alpaka_common;
   using Acc1D = alpaka::AccGpuCudaRt<Dim1D, Idx>;
   using Acc2D = alpaka::AccGpuCudaRt<Dim2D, Idx>;
+  using Acc3D = alpaka::AccGpuCudaRt<Dim3D, Idx>;
   using Queue = alpaka::QueueCudaRtNonBlocking;
 }  // namespace alpaka_cuda_async
 
@@ -25,6 +26,7 @@ namespace alpaka_serial_sync {
   using namespace alpaka_common;
   using Acc1D = alpaka::AccCpuSerial<Dim1D, Idx>;
   using Acc2D = alpaka::AccCpuSerial<Dim2D, Idx>;
+  using Acc3D = alpaka::AccCpuSerial<Dim3D, Idx>;
   using Queue = alpaka::QueueCpuBlocking;
 }  // namespace alpaka_serial_sync
 
@@ -41,6 +43,7 @@ namespace alpaka_tbb_async {
   using namespace alpaka_common;
   using Acc1D = alpaka::AccCpuTbbBlocks<Dim1D, Idx>;
   using Acc2D = alpaka::AccCpuTbbBlocks<Dim2D, Idx>;
+  using Acc3D = alpaka::AccCpuTbbBlocks<Dim3D, Idx>;
   using Queue = alpaka::QueueCpuNonBlocking;
 }  // namespace alpaka_tbb_async
 
@@ -57,6 +60,7 @@ namespace alpaka_omp2_async {
   using namespace alpaka_common;
   using Acc1D = alpaka::AccCpuOmp2Blocks<Dim1D, Idx>;
   using Acc2D = alpaka::AccCpuOmp2Blocks<Dim2D, Idx>;
+  using Acc3D = alpaka::AccCpuOmp2Blocks<Dim3D, Idx>;
   using Queue = alpaka::QueueCpuNonBlocking;
 }  // namespace alpaka_omp2_async
 
@@ -73,6 +77,7 @@ namespace alpaka_omp4_async {
   using namespace alpaka_common;
   using Acc1D = alpaka::AccCpuOmp4<Dim1D, Idx>;
   using Acc2D = alpaka::AccCpuOmp4<Dim2D, Idx>;
+  using Acc3D = alpaka::AccCpuOmp4<Dim3D, Idx>;
   using Queue = alpaka::QueueCpuNonBlocking;
 }  // namespace alpaka_omp4_async
 
@@ -92,11 +97,17 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
   using Device = alpaka::Dev<Acc1D>;
   using Platform = alpaka::Pltf<Device>;
   static_assert(std::is_same_v<Device, alpaka::Dev<Acc2D>>,
-                STRINGIFY(alpaka::Dev<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>) " and " STRINGIFY(
-                    alpaka::Dev<::ALPAKA_ACCELERATOR_NAMESPACE::Acc2D>) " are different types.");
+                STRINGIFY(alpaka::Dev<Acc1D>) " and " STRINGIFY(
+                    alpaka::Dev<Acc2D>) " are different types.");
+  static_assert(std::is_same_v<Device, alpaka::Dev<Acc3D>>,
+                STRINGIFY(alpaka::Dev<Acc1D>) " and " STRINGIFY(
+                    alpaka::Dev<Acc3D>) " are different types.");
   static_assert(std::is_same_v<Platform, alpaka::Pltf<alpaka::Dev<Acc2D>>>,
-                STRINGIFY(alpaka::Pltf<alpaka::Dev<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>>) " and " STRINGIFY(
-                    alpaka::Pltf<alpaka::Dev<::ALPAKA_ACCELERATOR_NAMESPACE::Acc2D>>) " are different types.");
+                STRINGIFY(alpaka::Pltf<alpaka::Dev<Acc1D>>) " and " STRINGIFY(
+                    alpaka::Pltf<alpaka::Dev<Acc2D>>) " are different types.");
+  static_assert(std::is_same_v<Platform, alpaka::Pltf<alpaka::Dev<Acc3D>>>,
+                STRINGIFY(alpaka::Pltf<alpaka::Dev<Acc1D>>) " and " STRINGIFY(
+                    alpaka::Pltf<alpaka::Dev<Acc3D>>) " are different types.");
 
   using Event = alpaka::Event<Queue>;
 
diff --git a/src/alpaka/AlpakaCore/alpakaConfigCommon.h b/src/alpaka/AlpakaCore/alpakaConfigCommon.h
index c160ef243..47d9f7e92 100644
--- a/src/alpaka/AlpakaCore/alpakaConfigCommon.h
+++ b/src/alpaka/AlpakaCore/alpakaConfigCommon.h
@@ -12,16 +12,19 @@ namespace alpaka_common {
 
   using Dim1D = alpaka::DimInt<1u>;
   using Dim2D = alpaka::DimInt<2u>;
+  using Dim3D = alpaka::DimInt<3u>;
 
   template <typename TDim>
   using Vec = alpaka::Vec<TDim, Idx>;
   using Vec1D = Vec<Dim1D>;
   using Vec2D = Vec<Dim2D>;
+  using Vec3D = Vec<Dim3D>;
 
   template <typename TDim>
   using WorkDiv = alpaka::WorkDivMembers<TDim, Idx>;
   using WorkDiv1D = WorkDiv<Dim1D>;
   using WorkDiv2D = WorkDiv<Dim2D>;
+  using WorkDiv3D = WorkDiv<Dim3D>;
 
   template <typename TData>
   using AlpakaHostBuf = alpaka::Buf<DevHost, TData, Dim1D, Idx>;
 
diff --git a/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h b/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h
index 1bcec521a..472d07ee6 100644
--- a/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h
+++ b/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h
@@ -8,12 +8,12 @@ using namespace alpaka_common;
 namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
 
   /*********************************************
-   * WORKDIV CREATION
-   ********************************************/
+   * WORKDIV CREATION
+   ********************************************/
 
   /*
-   * Creates the accelerator-dependent workdiv.
-   */
+   * Creates the accelerator-dependent workdiv.
+   */
   template <typename TDim>
   WorkDiv<TDim> make_workdiv(const Vec<TDim>& blocksPerGrid, const Vec<TDim>& threadsPerBlockOrElementsPerThread) {
     // On the GPU:
@@ -32,25 +32,25 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
   }
 
   /*********************************************
-   * RANGE COMPUTATION
-   ********************************************/
+   * RANGE COMPUTATION
+   ********************************************/
 
   /*
-   * Computes the range of the elements indexes, local to the block.
-   * Warning: the max index is not truncated by the max number of elements of interest.
-   */
+   * Computes the range of the elements indexes, local to the block.
+   * Warning: the max index is not truncated by the max number of elements of interest.
+   */
   template <typename TAcc>
   ALPAKA_FN_ACC std::pair<Idx, Idx> element_index_range_in_block(const TAcc& acc,
                                                                  const Idx elementIdxShift,
                                                                  const unsigned int dimIndex = 0u) {
     // Take into account the thread index in block.
-    const Idx threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[dimIndex]);
+    const Idx threadIndex(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[dimIndex]);
     const Idx threadDimension(alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[dimIndex]);
 
     // Compute the elements indexes in block.
     // Obviously relevant for CPU only.
     // For GPU, threadDimension == 1, and elementIdx == firstElementIdx == threadIdx + elementIdxShift.
-    const Idx firstElementIdxLocal = threadIdxLocal * threadDimension;
+    const Idx firstElementIdxLocal = threadIndex * threadDimension;
     const Idx firstElementIdx = firstElementIdxLocal + elementIdxShift;  // Add the shift!
     const Idx endElementIdxUncut = firstElementIdx + threadDimension;
 
@@ -59,9 +59,9 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
   }
 
   /*
-   * Computes the range of the elements indexes, local to the block.
-   * Truncated by the max number of elements of interest.
-   */
+   * Computes the range of the elements indexes, local to the block.
+   * Truncated by the max number of elements of interest.
+   */
   template <typename TAcc>
   ALPAKA_FN_ACC std::pair<Idx, Idx> element_index_range_in_block_truncated(const TAcc& acc,
                                                                            const Idx maxNumberOfElements,
@@ -80,28 +80,28 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
   }
 
   /*
-   * Computes the range of the elements indexes in grid.
-   * Warning: the max index is not truncated by the max number of elements of interest.
-   */
+   * Computes the range of the elements indexes in grid.
+   * Warning: the max index is not truncated by the max number of elements of interest.
+   */
   template <typename TAcc>
   ALPAKA_FN_ACC std::pair<Idx, Idx> element_index_range_in_grid(const TAcc& acc,
                                                                 Idx elementIdxShift,
                                                                 const unsigned int dimIndex = 0u) {
     // Take into account the block index in grid.
-    const Idx blockIdxInGrid(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[dimIndex]);
+    const Idx blockIndex(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[dimIndex]);
     const Idx blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[dimIndex]);
 
     // Shift to get global indices in grid (instead of local to the block)
-    elementIdxShift += blockIdxInGrid * blockDimension;
+    elementIdxShift += blockIndex * blockDimension;
 
     // Return element indexes, shifted by elementIdxShift.
     return element_index_range_in_block(acc, elementIdxShift, dimIndex);
   }
 
   /*
-   * Computes the range of the elements indexes in grid.
-   * Truncated by the max number of elements of interest.
-   */
+   * Computes the range of the elements indexes in grid.
+   * Truncated by the max number of elements of interest.
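
For illustration, the new 3D aliases compose with make_workdiv exactly like the 1D/2D ones; a hypothetical host-side sketch (the launch dimensions are invented for the example):

    // Sketch: build a 3D work division from the Dim3D/Vec3D/WorkDiv3D aliases added above.
    // On the GPU backends the second argument means threads per block; on the
    // CPU backends it means elements per thread (see make_workdiv in this file).
    const Vec3D blocksPerGrid(Idx{4}, Idx{4}, Idx{2});
    const Vec3D threadsOrElements(Idx{8}, Idx{8}, Idx{1});
    const WorkDiv3D workDiv =
        cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(blocksPerGrid, threadsOrElements);
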
+ */ template ALPAKA_FN_ACC std::pair element_index_range_in_grid_truncated(const TAcc& acc, const Idx maxNumberOfElements, @@ -120,9 +120,9 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { } /* - * Computes the range of the element(s) index(es) in grid. - * Truncated by the max number of elements of interest. - */ + * Computes the range of the element(s) index(es) in grid. + * Truncated by the max number of elements of interest. + */ template ALPAKA_FN_ACC std::pair element_index_range_in_grid_truncated(const TAcc& acc, const Idx maxNumberOfElements, @@ -132,14 +132,14 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { } /********************************************* - * LOOP ON ALL ELEMENTS - ********************************************/ + * LOOP ON ALL ELEMENTS + ********************************************/ /* - * Loop on all (CPU) elements. - * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. - * Indexes are local to the BLOCK. - */ + * Loop on all (CPU) elements. + * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. + * Indexes are local to the BLOCK. + */ template ALPAKA_FN_ACC void for_each_element_in_block(const TAcc& acc, const Idx maxNumberOfElements, @@ -147,8 +147,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { const Func func, const unsigned int dimIndex = 0) { const auto& [firstElementIdx, endElementIdx] = - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::element_index_range_in_block_truncated( - acc, maxNumberOfElements, elementIdxShift, dimIndex); + element_index_range_in_block_truncated(acc, maxNumberOfElements, elementIdxShift, dimIndex); for (Idx elementIdx = firstElementIdx; elementIdx < endElementIdx; ++elementIdx) { func(elementIdx); @@ -156,23 +155,22 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { } /* - * Overload for elementIdxShift = 0 - */ + * Overload for elementIdxShift = 0 + */ template ALPAKA_FN_ACC void for_each_element_in_block(const TAcc& acc, const Idx maxNumberOfElements, const Func func, const unsigned int dimIndex = 0) { const Idx elementIdxShift = 0; - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block( - acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + for_each_element_in_block(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); } /* - * Loop on all (CPU) elements. - * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. - * Indexes are expressed in GRID 'frame-of-reference'. - */ + * Loop on all (CPU) elements. + * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. + * Indexes are expressed in GRID 'frame-of-reference'. + */ template ALPAKA_FN_ACC void for_each_element_in_grid(const TAcc& acc, const Idx maxNumberOfElements, @@ -180,36 +178,35 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { const Func func, const unsigned int dimIndex = 0) { // Take into account the block index in grid to compute the element indices. 
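
As a usage sketch for these loop helpers (the kernel functor and output buffer are invented names): the helper hides the per-thread loop over elements, so the kernel body reduces to a lambda over global element indices, truncated at n:

    struct init_to_zero {
      template <typename TAcc>
      ALPAKA_FN_ACC void operator()(const TAcc& acc, float* out, Idx n) const {
        // Indices are expressed in the grid frame of reference and truncated at n;
        // on GPU each thread sees exactly one index, on CPU a consecutive range.
        ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid(
            acc, n, [&](Idx i) { out[i] = 0.f; });
      }
    };
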
- const Idx blockIdxInGrid(alpaka::getIdx(acc)[dimIndex]); + const Idx blockIndex(alpaka::getIdx(acc)[dimIndex]); const Idx blockDimension(alpaka::getWorkDiv(acc)[dimIndex]); - elementIdxShift += blockIdxInGrid * blockDimension; + elementIdxShift += blockIndex * blockDimension; for_each_element_in_block(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); } /* - * Overload for elementIdxShift = 0 - */ + * Overload for elementIdxShift = 0 + */ template ALPAKA_FN_ACC void for_each_element_in_grid(const TAcc& acc, const Idx maxNumberOfElements, const Func func, const unsigned int dimIndex = 0) { const Idx elementIdxShift = 0; - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid( - acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + for_each_element_in_grid(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); } /************************************************************** - * LOOP ON ALL ELEMENTS, WITH STRIDED ACCESS - **************************************************************/ + * LOOP ON ALL ELEMENTS, WITH STRIDED ACCESS + **************************************************************/ /* - * (CPU) Loop on all elements + (CPU/GPU) Strided access. - * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. - * Stride to full problem size, by BLOCK size. - * Indexes are local to the BLOCK. - */ + * (CPU) Loop on all elements + (CPU/GPU) Strided access. + * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. + * Stride to full problem size, by BLOCK size. + * Indexes are local to the BLOCK. + */ template ALPAKA_FN_ACC void for_each_element_in_block_strided(const TAcc& acc, const Idx maxNumberOfElements, @@ -218,7 +215,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { const unsigned int dimIndex = 0) { // Get thread / element indices in block. const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::element_index_range_in_block(acc, elementIdxShift, dimIndex); + element_index_range_in_block(acc, elementIdxShift, dimIndex); // Stride = block size. const Idx blockDimension(alpaka::getWorkDiv(acc)[dimIndex]); @@ -238,24 +235,23 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { } /* - * Overload for elementIdxShift = 0 - */ + * Overload for elementIdxShift = 0 + */ template ALPAKA_FN_ACC void for_each_element_in_block_strided(const TAcc& acc, const Idx maxNumberOfElements, const Func func, const unsigned int dimIndex = 0) { const Idx elementIdxShift = 0; - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( - acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + for_each_element_in_block_strided(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); } /* - * (CPU) Loop on all elements + (CPU/GPU) Strided access. - * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. - * Stride to full problem size, by GRID size. - * Indexes are local to the GRID. - */ + * (CPU) Loop on all elements + (CPU/GPU) Strided access. + * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. + * Stride to full problem size, by GRID size. + * Indexes are local to the GRID. 
+ */ template ALPAKA_FN_ACC void for_each_element_in_grid_strided(const TAcc& acc, const Idx maxNumberOfElements, @@ -264,7 +260,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { const unsigned int dimIndex = 0) { // Get thread / element indices in block. const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::element_index_range_in_grid(acc, elementIdxShift, dimIndex); + element_index_range_in_grid(acc, elementIdxShift, dimIndex); // Stride = grid size. const Idx gridDimension(alpaka::getWorkDiv(acc)[dimIndex]); @@ -284,32 +280,31 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { } /* - * Overload for elementIdxShift = 0 - */ + * Overload for elementIdxShift = 0 + */ template ALPAKA_FN_ACC void for_each_element_in_grid_strided(const TAcc& acc, const Idx maxNumberOfElements, const Func func, const unsigned int dimIndex = 0) { const Idx elementIdxShift = 0; - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + for_each_element_in_grid_strided(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); } /************************************************************** - * LOOP ON ALL ELEMENTS WITH ONE LOOP - **************************************************************/ + * LOOP ON ALL ELEMENTS WITH ONE LOOP + **************************************************************/ /* - * Case where the input index i has reached the end of threadDimension: strides the input index. - * Otherwise: do nothing. - * NB 1: This helper function is used as a trick to only have one loop (like in legacy), instead of 2 loops - * (like in all the other Alpaka helpers, 'for_each_element_in_block_strided' for example, - * because of the additional loop over elements in Alpaka model). - * This allows to keep the 'continue' and 'break' statements as-is from legacy code, - * and hence avoids a lot of legacy code reshuffling. - * NB 2: Modifies i, firstElementIdx and endElementIdx. - */ + * Case where the input index i has reached the end of threadDimension: strides the input index. + * Otherwise: do nothing. + * NB 1: This helper function is used as a trick to only have one loop (like in legacy), instead of 2 loops + * (like in all the other Alpaka helpers, 'for_each_element_in_block_strided' for example, + * because of the additional loop over elements in Alpaka model). + * This allows to keep the 'continue' and 'break' statements as-is from legacy code, + * and hence avoids a lot of legacy code reshuffling. + * NB 2: Modifies i, firstElementIdx and endElementIdx. 
+   */
  ALPAKA_FN_ACC ALPAKA_FN_INLINE bool next_valid_element_index_strided(
      Idx& i, Idx& firstElementIdx, Idx& endElementIdx, const Idx stride, const Idx maxNumberOfElements) {
    bool isNextStrideElementValid = true;
@@ -324,6 +319,464 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
    return isNextStrideElementValid;
  }
 
+  /*
+   * Class which simplifies "for" loops over elements index
+   */
+  template <typename T_Acc, typename T = Idx>
+  class elements_with_stride {
+  public:
+    ALPAKA_FN_ACC elements_with_stride(const T_Acc& acc,
+                                       T extent,
+                                       Idx elementIdxShift = 0,
+                                       const unsigned int dimIndex = 0) {
+      const Idx threadIndex = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[dimIndex];
+      const Idx blockIndex = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[dimIndex];
+      const Idx blockDimension = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[dimIndex];
+      const Idx gridDimension = alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[dimIndex];
+
+      thread_ = blockDimension * blockIndex + threadIndex;
+      thread_ = thread_ + elementIdxShift;  // Add the shift
+      stride_ = gridDimension * blockDimension;
+      blockDim_ = blockDimension;
+
+      extent_ = extent;
+    }
+
+    // Default extent: the total number of elements in the grid.
+    // Delegates to the constructor above so that all members are initialised.
+    ALPAKA_FN_ACC elements_with_stride(const T_Acc& acc)
+        : elements_with_stride(acc, alpaka::getWorkDiv<alpaka::Grid, alpaka::Elems>(acc)[0u]) {}
+
+    class iterator {
+      friend class elements_with_stride;
+
+    public:
+      ALPAKA_FN_ACC constexpr T operator*() const { return index_; }
+
+      ALPAKA_FN_ACC constexpr iterator& operator++() {
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+        // CUDA backend: iterate over all the elements with a grid-wise stride
+        index_ += stride_;
+        if (index_ < extent_) {
+          return *this;
+        }
+#else
+        // CPU backend: iterate over all the elements for one thread
+        index_ += 1;
+        if (index_ < old_index_ + blockDim_ and index_ < extent_) {
+          return *this;
+        }
+#endif
+        // the iterator has reached or overflown the end of the extent, clamp it to the extent
+        index_ = extent_;
+        return *this;
+      }
+
+      ALPAKA_FN_ACC constexpr iterator operator++(int) {
+        iterator old = *this;
+        ++(*this);
+        return old;
+      }
+
+      ALPAKA_FN_ACC constexpr bool operator==(iterator const& other) const { return (index_ == other.index_); }
+
+      ALPAKA_FN_ACC constexpr bool operator!=(iterator const& other) const { return index_ < other.index_; }
+
+    private:
+      ALPAKA_FN_ACC constexpr iterator(T thread, T stride, T extent, T blockDim)
+          : thread_{thread}, stride_{stride}, extent_{extent}, index_{thread_}, old_index_{index_}, blockDim_{blockDim} {}
+
+      ALPAKA_FN_ACC constexpr iterator(T thread, T stride, T extent, T index, T blockDim)
+          : thread_{thread}, stride_{stride}, extent_{extent}, index_{index}, old_index_{index_}, blockDim_{blockDim} {}
+
+      T thread_;
+      T stride_;
+      T extent_;
+      T index_;
+      T old_index_;
+      T blockDim_;
+    };
+
+    ALPAKA_FN_ACC constexpr iterator begin() const { return iterator(thread_, stride_, extent_, blockDim_); }
+
+    ALPAKA_FN_ACC constexpr iterator end() const { return iterator(thread_, stride_, extent_, extent_, blockDim_); }
+
+  private:
+    T thread_;
+    T stride_;
+    T extent_;
+    T blockDim_;
+  };
+
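With this class a kernel can iterate with a range-based for loop instead of the lambda-based helpers above; a minimal sketch (the functor name and buffer are invented, and the 1D/2D/3D variants defined below work the same way but yield Vec3D indices):

    struct vector_scale {
      template <typename T_Acc>
      ALPAKA_FN_ACC void operator()(const T_Acc& acc, float* data, uint32_t size) const {
        // Strided 1D loop: on GPU the stride is grid-wide, on CPU each thread
        // walks its own consecutive elements; both cases come from elements_with_stride.
        for (uint32_t i : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, size)) {
          data[i] *= 2.f;
        }
      }
    };

+  /*
+   * Class which simplifies "for" loops over elements index
+   * Iterates over one dimension
+   */
+  template <typename T_Acc>
+  class elements_with_stride_1d {
+  public:
+    ALPAKA_FN_ACC elements_with_stride_1d(const T_Acc& acc) {
+      const Vec3D threadIndex(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc));
+      const Vec3D blockIndex(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc));
+
+      const Vec3D blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc));
+      const Vec3D gridDimension(alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc));
+
+      thread_ = {blockDimension[0u] * blockIndex[0u] + threadIndex[0u],
+                 blockDimension[1u] * blockIndex[1u] + threadIndex[1u],
+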
blockDimension[2u] * blockIndex[2u] + threadIndex[2u]}; + stride_ = {blockDimension[0u] * gridDimension[0u], Idx{1}, Idx{1}}; + extent_ = stride_; + + blockDim_ = blockDimension; + } + + ALPAKA_FN_ACC elements_with_stride_1d(const T_Acc& acc, Vec3D extent, Vec3D elementIdxShift = Vec3D::all(0)) + : extent_(extent + elementIdxShift) { + const Vec3D threadIndex(alpaka::getIdx(acc)); + const Vec3D blockIndex(alpaka::getIdx(acc)); + const Vec3D blockDimension(alpaka::getWorkDiv(acc)); + const Vec3D gridDimension(alpaka::getWorkDiv(acc)); + + thread_ = {blockDimension[0u] * blockIndex[0u] + threadIndex[0u], + blockDimension[1u] * blockIndex[1u] + threadIndex[1u], + blockDimension[2u] * blockIndex[2u] + threadIndex[2u]}; + thread_ = thread_ + elementIdxShift; + stride_ = {blockDimension[0u] * gridDimension[0u], Idx{1}, Idx{1}}; + + blockDim_ = blockDimension; + } + + class iterator { + friend class elements_with_stride_1d; + + public: + ALPAKA_FN_ACC Vec3D operator*() const { return index_; } + + ALPAKA_FN_ACC constexpr iterator& operator++() { +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + // increment the first coordinate + index_[0u] += stride_[0u]; + if (index_[0u] < extent_[0u]) + return *this; +#else + // increment the 3rd index and check its value + index_[2u] += 1; + if (index_[2u] == old_index_[2u] + blockDim_[2u]) + index_[2u] = old_index_[2u]; + + // if the 3rd index was reset, increment the 2nd index + if (index_[2u] == old_index_[2u]) + index_[1u] += 1; + if (index_[1u] == old_index_[1u] + blockDim_[1u]) + index_[1u] = old_index_[1u]; + + // if the 3rd and 2nd indices were set, increment the first coordinate + if (index_[1u] == old_index_[1u] and index_[2u] == old_index_[2u]) + index_[0u] += 1; + + if (index_[0u] < old_index_[0u] + blockDim_[0u] and index_[0u] < extent_[0u]) { + return *this; + } +#endif + + // the iterator has reached or overflown the end of the extent, clamp it + // to the extent + index_ = extent_; + return *this; + } + + ALPAKA_FN_ACC constexpr iterator operator++(int) { + iterator old = *this; + ++(*this); + return old; + } + + ALPAKA_FN_ACC constexpr bool operator==(iterator const& other) const { return (index_ == other.index_); } + + ALPAKA_FN_ACC constexpr bool operator!=(iterator const& other) const { return index_[0u] < other.index_[0u]; } + + private: + ALPAKA_FN_ACC iterator(Vec3D thread, Vec3D stride, Vec3D extent, Vec3D blockDim) + : thread_{thread}, stride_{stride}, extent_{extent}, index_{thread_}, old_index_{index_}, blockDim_{blockDim} {} + + ALPAKA_FN_ACC iterator(Vec3D thread, Vec3D stride, Vec3D extent, Vec3D index, Vec3D blockDim) + : thread_{thread}, stride_{stride}, extent_{extent}, index_{index}, old_index_{index_}, blockDim_{blockDim} {} + + Vec3D thread_; + Vec3D stride_; + Vec3D extent_; + Vec3D index_; + Vec3D old_index_; + Vec3D blockDim_; + }; + + ALPAKA_FN_ACC constexpr iterator begin() const { return iterator(thread_, stride_, extent_, blockDim_); } + + ALPAKA_FN_ACC constexpr iterator end() const { return iterator(thread_, stride_, extent_, extent_, blockDim_); } + + private: + Vec3D thread_ = Vec3D::all(1); + Vec3D stride_ = Vec3D::all(1); + Vec3D extent_ = Vec3D::all(1); + Vec3D blockDim_ = Vec3D::all(1); + }; + + /* + * Class which simplifies "for" loops over elements index + * Iterates over two dimensions + */ + template + class elements_with_stride_2d { + public: + ALPAKA_FN_ACC elements_with_stride_2d(const T_Acc& acc) { + const Vec3D threadIndex(alpaka::getIdx(acc)); + const Vec3D blockIndex(alpaka::getIdx(acc)); + const Vec3D 
blockDimension(alpaka::getWorkDiv(acc)); + const Vec3D gridDimension(alpaka::getWorkDiv(acc)); + + thread_ = {blockDimension[0u] * blockIndex[0u] + threadIndex[0u], + blockDimension[1u] * blockIndex[1u] + threadIndex[1u], + blockDimension[2u] * blockIndex[2u] + threadIndex[2u]}; + stride_ = {blockDimension[0u] * gridDimension[0u], blockDimension[1u] * gridDimension[1u], Idx{1}}; + extent_ = stride_; + + blockDim_ = blockDimension; + } + + ALPAKA_FN_ACC elements_with_stride_2d(const T_Acc& acc, Vec3D extent, Vec3D elementIdxShift = Vec3D::all(0)) + : extent_(extent + elementIdxShift) { + const Vec3D threadIndex(alpaka::getIdx(acc)); + const Vec3D blockIndex(alpaka::getIdx(acc)); + const Vec3D blockDimension(alpaka::getWorkDiv(acc)); + const Vec3D gridDimension(alpaka::getWorkDiv(acc)); + + thread_ = {blockDimension[0u] * blockIndex[0u] + threadIndex[0u], + blockDimension[1u] * blockIndex[1u] + threadIndex[1u], + blockDimension[2u] * blockIndex[2u] + threadIndex[2u]}; + thread_ = thread_ + elementIdxShift; + stride_ = {blockDimension[0u] * gridDimension[0u], blockDimension[1u] * gridDimension[1u], Idx{1}}; + + blockDim_ = blockDimension; + } + + class iterator { + friend class elements_with_stride_2d; + + public: + ALPAKA_FN_ACC Vec3D operator*() const { return index_; } + + ALPAKA_FN_ACC constexpr iterator& operator++() { +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + // increment the first coordinate + index_[0u] += stride_[0u]; + if (index_[0u] < extent_[0u]) + return *this; + + // if the first coordinate overflows, reset it and increment the second + // coordinate + index_[0u] = thread_[0u]; + index_[1u] += stride_[1u]; + if (index_[1u] < extent_[1u]) + return *this; +#else + // increment the 3rd index and check its value + index_[2u] += 1; + if (index_[2u] == old_index_[2u] + blockDim_[2u]) + index_[2u] = old_index_[2u]; + + // if the 3rd index was reset, increment the 2nd index + if (index_[2u] == old_index_[2u]) + index_[1u] += 1; + if (index_[1u] == old_index_[1u] + blockDim_[1u] || index_[1u] == extent_[1u]) + index_[1u] = old_index_[1u]; + + // if the 3rd and 2nd indices were set, increment the first coordinate + if (index_[1u] == old_index_[1u] and index_[2u] == old_index_[2u]) + index_[0u] += 1; + + if (index_[0u] < old_index_[0u] + blockDim_[0u] and index_[0u] < extent_[0u] and index_[1u] < extent_[1u]) { + return *this; + } +#endif + + // the iterator has reached or overflown the end of the extent, clamp it + // to the extent + index_ = extent_; + return *this; + } + + ALPAKA_FN_ACC constexpr iterator operator++(int) { + iterator old = *this; + ++(*this); + return old; + } + + ALPAKA_FN_ACC constexpr bool operator==(iterator const& other) const { return (index_ == other.index_); } + + ALPAKA_FN_ACC constexpr bool operator!=(iterator const& other) const { + return (index_[0u] < other.index_[0u] and index_[1u] < other.index_[1u]); + } + + private: + ALPAKA_FN_ACC iterator(Vec3D thread, Vec3D stride, Vec3D extent, Vec3D blockDim) + : thread_{thread}, stride_{stride}, extent_{extent}, index_{thread_}, old_index_{index_}, blockDim_{blockDim} {} + + ALPAKA_FN_ACC iterator(Vec3D thread, Vec3D stride, Vec3D extent, Vec3D index, Vec3D blockDim) + : thread_{thread}, stride_{stride}, extent_{extent}, index_{index}, old_index_{index_}, blockDim_{blockDim} {} + + Vec3D thread_; + Vec3D stride_; + Vec3D extent_; + Vec3D index_; + Vec3D old_index_; + Vec3D blockDim_; + }; + + ALPAKA_FN_ACC constexpr iterator begin() const { return iterator(thread_, stride_, extent_, blockDim_); } + + ALPAKA_FN_ACC 
constexpr iterator end() const { return iterator(thread_, stride_, extent_, extent_, blockDim_); } + + private: + Vec3D thread_ = Vec3D::all(1); + Vec3D stride_ = Vec3D::all(1); + Vec3D extent_ = Vec3D::all(1); + Vec3D blockDim_ = Vec3D::all(1); + }; + + /* + * Class which simplifies "for" loops over elements index + * Iterates over three dimensions + */ + template + class elements_with_stride_3d { + public: + ALPAKA_FN_ACC elements_with_stride_3d(const T_Acc& acc) { + const Vec3D threadIndex(alpaka::getIdx(acc)); + const Vec3D blockIndex(alpaka::getIdx(acc)); + const Vec3D blockDimension(alpaka::getWorkDiv(acc)); + const Vec3D gridDimension(alpaka::getWorkDiv(acc)); + + thread_ = {blockDimension[0u] * blockIndex[0u] + threadIndex[0u], + blockDimension[1u] * blockIndex[1u] + threadIndex[1u], + blockDimension[2u] * blockIndex[2u] + threadIndex[2u]}; + stride_ = {blockDimension[0u] * gridDimension[0u], + blockDimension[1u] * gridDimension[1u], + blockDimension[2u] * gridDimension[2u]}; + extent_ = stride_; + + blockDim_ = blockDimension; + } + + ALPAKA_FN_ACC elements_with_stride_3d(const T_Acc& acc, Vec3D extent, Vec3D elementIdxShift = Vec3D::all(0)) + : extent_(extent + elementIdxShift) { + const Vec3D threadIndex(alpaka::getIdx(acc)); + const Vec3D blockIndex(alpaka::getIdx(acc)); + const Vec3D blockDimension(alpaka::getWorkDiv(acc)); + const Vec3D gridDimension(alpaka::getWorkDiv(acc)); + + thread_ = {blockDimension[0u] * blockIndex[0u] + threadIndex[0u], + blockDimension[1u] * blockIndex[1u] + threadIndex[1u], + blockDimension[2u] * blockIndex[2u] + threadIndex[2u]}; + thread_ = thread_ + elementIdxShift; + stride_ = {blockDimension[0u] * gridDimension[0u], + blockDimension[1u] * gridDimension[1u], + blockDimension[2u] * gridDimension[2u]}; + + blockDim_ = blockDimension; + } + + class iterator { + friend class elements_with_stride_3d; + + public: + ALPAKA_FN_ACC Vec3D operator*() const { return index_; } + + ALPAKA_FN_ACC constexpr iterator& operator++() { +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + // increment the first coordinate + index_[0u] += stride_[0u]; + if (index_[0u] < extent_[0u]) + return *this; + + // if the first coordinate overflows, reset it and increment the second + // coordinate + index_[0u] = thread_[0u]; + index_[1u] += stride_[1u]; + if (index_[1u] < extent_[1u]) + return *this; + + // if the second coordinate overflows, reset it and increment the third + // coordinate + index_[1u] = thread_[1u]; + index_[2u] += stride_[2u]; + if (index_[2u] < extent_[2u]) + return *this; +#else + // increment the 3rd index and check its value + index_[2u] += 1; + if (index_[2u] == old_index_[2u] + blockDim_[2u] || index_[2u] == extent_[2u]) + index_[2u] = old_index_[2u]; + + // if the 3rd index was reset, increment the 2nd index + if (index_[2u] == old_index_[2u]) + index_[1u] += 1; + if (index_[1u] == old_index_[1u] + blockDim_[1u] || index_[1u] == extent_[1u]) + index_[1u] = old_index_[1u]; + + // if the 3rd and 2nd indices were set, increment the first coordinate + if (index_[1u] == old_index_[1u] and index_[2u] == old_index_[2u]) + index_[0u] += 1; + if (index_[0u] < old_index_[0u] + blockDim_[0u] and index_[0u] < extent_[0u] and index_[1u] < extent_[1u] and + index_[2u] < extent_[2u]) { + return *this; + } +#endif + + // the iterator has reached or overflown the end of the extent, clamp it + // to the extent + index_ = extent_; + return *this; + } + + ALPAKA_FN_ACC constexpr iterator operator++(int) { + iterator old = *this; + ++(*this); + return old; + } + + ALPAKA_FN_ACC 
constexpr bool operator==(iterator const& other) const { return (index_ == other.index_); } + + ALPAKA_FN_ACC constexpr bool operator!=(iterator const& other) const { + return (index_[0u] < other.index_[0u] and index_[1u] < other.index_[1u] and index_[2u] < other.index_[2u]); + } + + private: + ALPAKA_FN_ACC iterator(Vec3D thread, Vec3D stride, Vec3D extent, Vec3D blockDim) + : thread_{thread}, stride_{stride}, extent_{extent}, index_{thread_}, old_index_{index_}, blockDim_{blockDim} {} + + ALPAKA_FN_ACC iterator(Vec3D thread, Vec3D stride, Vec3D extent, Vec3D index, Vec3D blockDim) + : thread_{thread}, stride_{stride}, extent_{extent}, index_{index}, old_index_{index_}, blockDim_{blockDim} {} + + Vec3D thread_; + Vec3D stride_; + Vec3D extent_; + Vec3D index_; + Vec3D old_index_; + Vec3D blockDim_; + }; + + ALPAKA_FN_ACC constexpr iterator begin() const { return iterator(thread_, stride_, extent_, blockDim_); } + + ALPAKA_FN_ACC constexpr iterator end() const { return iterator(thread_, stride_, extent_, extent_, blockDim_); } + + private: + Vec3D thread_ = Vec3D::all(1); + Vec3D stride_ = Vec3D::all(1); + Vec3D extent_ = Vec3D::all(1); + Vec3D blockDim_ = Vec3D::all(1); + }; + } // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE #endif // AlpakaCore_alpakaWorkDivHelper_h diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernelsImpl.h b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernelsImpl.h index 0295fb783..51a4ca715 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernelsImpl.h +++ b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernelsImpl.h @@ -72,14 +72,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } const auto ntNbins = foundNtuplets->nbins(); - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, ntNbins, [&](uint32_t idx) { - if (foundNtuplets->size(idx) > 5) - printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); - ALPAKA_ASSERT_OFFLOAD(foundNtuplets->size(idx) < 6); - for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) - ALPAKA_ASSERT_OFFLOAD(*ih < nHits); - }); + + for (uint32_t idx : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, ntNbins)) { + if (foundNtuplets->size(idx) > 5) + printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); + ALPAKA_ASSERT_OFFLOAD(foundNtuplets->size(idx) < 6); + for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) + ALPAKA_ASSERT_OFFLOAD(*ih < nHits); + } #endif if (0 == threadIdx) { @@ -94,25 +94,24 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } const auto ntNCells = (*nCells); - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, ntNCells, [&](uint32_t idx) { - auto const &thisCell = cells[idx]; - if (thisCell.outerNeighbors().full()) //++tooManyNeighbors[thisCell.theLayerPairId]; - printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.theLayerPairId); - if (thisCell.tracks().full()) //++tooManyTracks[thisCell.theLayerPairId]; - printf("Tracks overflow %d in %d\n", idx, thisCell.theLayerPairId); - if (thisCell.theDoubletId < 0) - alpaka::atomicAdd(acc, &c.nKilledCells, 1ull, alpaka::hierarchy::Blocks{}); - if (0 == thisCell.theUsed) - alpaka::atomicAdd(acc, &c.nEmptyCells, 1ull, alpaka::hierarchy::Blocks{}); - if (thisCell.tracks().empty()) - alpaka::atomicAdd(acc, &c.nZeroTrackCells, 1ull, alpaka::hierarchy::Blocks{}); - }); - - 
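
The refactoring applied throughout this file follows one mechanical pattern, shown schematically below (the helper `process` and extent `nItems` are invented; the real hunks use the fully qualified names): the callback-style strided helper becomes a plain range-based loop, so legacy `continue`/`break` logic can be kept as-is.

    template <typename TAcc>
    ALPAKA_FN_ACC void example(const TAcc& acc, float* data, uint32_t nItems) {
      auto process = [&](uint32_t i) { data[i] += 1.f; };

      // before: callback style, one lambda invocation per element
      ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided(
          acc, nItems, [&](uint32_t i) { process(i); });

      // after: iterator style over the same strided element range
      for (uint32_t i : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, nItems)) {
        process(i);
      }
    }
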
::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided(acc, nHits, [&](uint32_t idx) { + for (uint32_t idx : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, ntNCells)) { + auto const &thisCell = cells[idx]; + if (thisCell.outerNeighbors().full()) //++tooManyNeighbors[thisCell.theLayerPairId]; + printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.theLayerPairId); + if (thisCell.tracks().full()) //++tooManyTracks[thisCell.theLayerPairId]; + printf("Tracks overflow %d in %d\n", idx, thisCell.theLayerPairId); + if (thisCell.theDoubletId < 0) + alpaka::atomicAdd(acc, &c.nKilledCells, 1ull, alpaka::hierarchy::Blocks{}); + if (0 == thisCell.theUsed) + alpaka::atomicAdd(acc, &c.nEmptyCells, 1ull, alpaka::hierarchy::Blocks{}); + if (thisCell.tracks().empty()) + alpaka::atomicAdd(acc, &c.nZeroTrackCells, 1ull, alpaka::hierarchy::Blocks{}); + } + + for (uint32_t idx : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, nHits)) { if (isOuterHitOfCell[idx].full()) // ++tooManyOuterHitOfCell; printf("OuterHitOfCell overflow %d\n", idx); - }); + } } }; @@ -125,15 +124,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { constexpr auto bad = trackQuality::bad; const auto ntNCells = (*nCells); - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, ntNCells, [&](uint32_t idx) { - auto const &thisCell = cells[idx]; - if (thisCell.theDoubletId < 0) { - for (auto it : thisCell.tracks()) - quality[it] = bad; - } - }); + for (uint32_t idx : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, ntNCells)) { + auto const &thisCell = cells[idx]; + + if (thisCell.theDoubletId < 0) { + for (auto it : thisCell.tracks()) + quality[it] = bad; + } + } } }; @@ -150,28 +149,28 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_ASSERT_OFFLOAD(nCells); const auto ntNCells = (*nCells); - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, ntNCells, [&](uint32_t idx) { - auto const &thisCell = cells[idx]; - if (thisCell.tracks().size() >= 2) { - //if (0==thisCell.theUsed) continue; - // if (thisCell.theDoubletId < 0) continue; + for (uint32_t idx : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, ntNCells)) { + auto const &thisCell = cells[idx]; - uint32_t maxNh = 0; + if (thisCell.tracks().size() >= 2) { + //if (0==thisCell.theUsed) continue; + // if (thisCell.theDoubletId < 0) continue; - // find maxNh - for (auto it : thisCell.tracks()) { - auto nh = foundNtuplets->size(it); - maxNh = std::max(nh, maxNh); - } + uint32_t maxNh = 0; - for (auto it : thisCell.tracks()) { - if (foundNtuplets->size(it) != maxNh) - quality[it] = dup; //no race: simple assignment of the same constant - } - } - }); + // find maxNh + for (auto it : thisCell.tracks()) { + auto nh = foundNtuplets->size(it); + maxNh = std::max(nh, maxNh); + } + + for (auto it : thisCell.tracks()) { + if (foundNtuplets->size(it) != maxNh) + quality[it] = dup; //no race: simple assignment of the same constant + } + } + } } }; @@ -188,34 +187,33 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_ASSERT_OFFLOAD(nCells); - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, (*nCells), [&](uint32_t idx) { - auto const &thisCell = cells[idx]; - if (thisCell.tracks().size() >= 2) { - // if (thisCell.theDoubletId < 0) continue; - - float mc = 10000.f; - uint16_t im = 60000; - - auto score = [&](auto it) { - return 
std::abs(tracks->tip(it)); // tip - // return tracks->chi2(it); //chi2 - }; - - // find min socre - for (auto it : thisCell.tracks()) { - if (tracks->quality(it) == loose && score(it) < mc) { - mc = score(it); - im = it; - } - } - // mark all other duplicates - for (auto it : thisCell.tracks()) { - if (tracks->quality(it) != bad && it != im) - tracks->quality(it) = dup; //no race: simple assignment of the same constant - } + for (uint32_t idx : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, *nCells)) { + auto const &thisCell = cells[idx]; + if (thisCell.tracks().size() >= 2) { + // if (thisCell.theDoubletId < 0) continue; + + float mc = 10000.f; + uint16_t im = 60000; + + auto score = [&](auto it) { + return std::abs(tracks->tip(it)); // tip + // return tracks->chi2(it); //chi2 + }; + + // find min socre + for (auto it : thisCell.tracks()) { + if (tracks->quality(it) == loose && score(it) < mc) { + mc = score(it); + im = it; } - }); + } + // mark all other duplicates + for (auto it : thisCell.tracks()) { + if (tracks->quality(it) != bad && it != im) + tracks->quality(it) = dup; //no race: simple assignment of the same constant + } + } + } } }; @@ -237,6 +235,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float dcaCutOuterTriplet) const { auto const &hh = *hhp; + const Idx elementShift = 0; const uint32_t dimIndexY = 0u; const uint32_t dimIndexX = 1u; const uint32_t threadIdxY(alpaka::getIdx(acc)[dimIndexY]); @@ -247,63 +246,59 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { (*apc2) = 0; } // ready for next kernel - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, - (*nCells), - 0u, - [&](uint32_t idx) { - auto cellIndex = idx; - auto &thisCell = cells[idx]; - //if (thisCell.theDoubletId < 0 || thisCell.theUsed>1) - // continue; - auto innerHitId = thisCell.get_inner_hit_id(); - int numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size(); - const auto *__restrict__ vi = isOuterHitOfCell[innerHitId].data(); - - constexpr uint32_t last_bpix1_detIndex = 96; - constexpr uint32_t last_barrel_detIndex = 1184; - auto ri = thisCell.get_inner_r(hh); - auto zi = thisCell.get_inner_z(hh); - - auto ro = thisCell.get_outer_r(hh); - auto zo = thisCell.get_outer_z(hh); - auto isBarrel = thisCell.get_inner_detIndex(hh) < last_barrel_detIndex; - - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( - acc, - numberOfPossibleNeighbors, - 0u, - [&](uint32_t j) { - auto otherCell = vi[j]; // NB: Was with __ldg in legacy - auto &oc = cells[otherCell]; - // if (cells[otherCell].theDoubletId < 0 || - // cells[otherCell].theUsed>1 ) - // continue; - auto r1 = oc.get_inner_r(hh); - auto z1 = oc.get_inner_z(hh); - // auto isBarrel = oc.get_outer_detIndex(hh) < last_barrel_detIndex; - bool aligned = GPUCACell::areAlignedRZ( - r1, - z1, - ri, - zi, - ro, - zo, - ptmin, - isBarrel ? CAThetaCutBarrel : CAThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts - if (aligned && thisCell.dcaCut(hh, - oc, - oc.get_inner_detIndex(hh) < last_bpix1_detIndex ? 
dcaCutInnerTriplet - : dcaCutOuterTriplet, - hardCurvCut)) { // FIXME tune cuts - oc.addOuterNeighbor(acc, cellIndex, *cellNeighbors); - thisCell.theUsed |= 1; - oc.theUsed |= 1; - } - }, - dimIndexX); // loop on inner cells - }, - dimIndexY); // loop on outer cells + for (uint32_t idx : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride( + acc, *nCells, elementShift, dimIndexY)) { + auto cellIndex = idx; + auto &thisCell = cells[idx]; + //if (thisCell.theDoubletId < 0 || thisCell.theUsed>1) + // continue; + auto innerHitId = thisCell.get_inner_hit_id(); + int numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size(); + const auto *__restrict__ vi = isOuterHitOfCell[innerHitId].data(); + + constexpr uint32_t last_bpix1_detIndex = 96; + constexpr uint32_t last_barrel_detIndex = 1184; + auto ri = thisCell.get_inner_r(hh); + auto zi = thisCell.get_inner_z(hh); + + auto ro = thisCell.get_outer_r(hh); + auto zo = thisCell.get_outer_z(hh); + auto isBarrel = thisCell.get_inner_detIndex(hh) < last_barrel_detIndex; + + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, + numberOfPossibleNeighbors, + 0u, + [&](uint32_t j) { + auto otherCell = vi[j]; // NB: Was with __ldg in legacy + auto &oc = cells[otherCell]; + // if (cells[otherCell].theDoubletId < 0 || + // cells[otherCell].theUsed>1 ) + // continue; + auto r1 = oc.get_inner_r(hh); + auto z1 = oc.get_inner_z(hh); + // auto isBarrel = oc.get_outer_detIndex(hh) < last_barrel_detIndex; + bool aligned = GPUCACell::areAlignedRZ( + r1, + z1, + ri, + zi, + ro, + zo, + ptmin, + isBarrel ? CAThetaCutBarrel : CAThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts + if (aligned && thisCell.dcaCut(hh, + oc, + oc.get_inner_detIndex(hh) < last_bpix1_detIndex ? dcaCutInnerTriplet + : dcaCutOuterTriplet, + hardCurvCut)) { // FIXME tune cuts + oc.addOuterNeighbor(acc, cellIndex, *cellNeighbors); + thisCell.theUsed |= 1; + oc.theUsed |= 1; + } + }, + dimIndexX); // loop on inner cells + } // loop on outer cells } }; @@ -323,23 +318,22 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { //auto first = threadIdx.x + blockIdx.x * blockDim.x; //for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, (*nCells), [&](uint32_t idx) { - auto const &thisCell = cells[idx]; - if (thisCell.theDoubletId >= 0) { // cut by earlyFishbone - - auto pid = thisCell.theLayerPairId; - auto doit = minHitsPerNtuplet > 3 ? pid < 3 : pid < 8 || pid > 12; - if (doit) { - GPUCACell::TmpTuple stack; - stack.reset(); - thisCell.find_ntuplets<6>( - acc, hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3); - ALPAKA_ASSERT_OFFLOAD(stack.empty()); - // printf("in %d found quadruplets: %d\n", cellIndex, apc->get()); - } - } - }); + for (uint32_t idx : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, *nCells)) { + auto const &thisCell = cells[idx]; + if (thisCell.theDoubletId >= 0) { // cut by earlyFishbone + + auto pid = thisCell.theLayerPairId; + auto doit = minHitsPerNtuplet > 3 ? 
pid < 3 : pid < 8 || pid > 12; + if (doit) { + GPUCACell::TmpTuple stack; + stack.reset(); + thisCell.find_ntuplets<6>( + acc, hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3); + ALPAKA_ASSERT_OFFLOAD(stack.empty()); + // printf("in %d found quadruplets: %d\n", cellIndex, apc->get()); + } + } + } } }; @@ -350,12 +344,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { GPUCACell *__restrict__ cells, uint32_t const *nCells) const { // auto const &hh = *hhp; - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, (*nCells), [&](uint32_t idx) { - auto &thisCell = cells[idx]; - if (!thisCell.tracks().empty()) - thisCell.theUsed |= 2; - }); + + for (uint32_t idx : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, *nCells)) { + auto &thisCell = cells[idx]; + if (!thisCell.tracks().empty()) + thisCell.theUsed |= 2; + } } }; @@ -365,17 +359,17 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { HitContainer const *__restrict__ foundNtuplets, Quality const *__restrict__ quality, CAConstants::TupleMultiplicity *tupleMultiplicity) const { - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, foundNtuplets->nbins(), [&](uint32_t it) { - auto nhits = foundNtuplets->size(it); - if (nhits >= 3 && quality[it] != trackQuality::dup) { - ALPAKA_ASSERT_OFFLOAD(quality[it] == trackQuality::bad); - if (nhits > 5) - printf("wrong mult %d %d\n", it, nhits); - ALPAKA_ASSERT_OFFLOAD(nhits < 8); - tupleMultiplicity->countDirect(acc, nhits); - } - }); + for (uint32_t it : + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, foundNtuplets->nbins())) { + auto nhits = foundNtuplets->size(it); + if (nhits >= 3 && quality[it] != trackQuality::dup) { + ALPAKA_ASSERT_OFFLOAD(quality[it] == trackQuality::bad); + if (nhits > 5) + printf("wrong mult %d %d\n", it, nhits); + ALPAKA_ASSERT_OFFLOAD(nhits < 8); + tupleMultiplicity->countDirect(acc, nhits); + } + } } }; @@ -385,17 +379,17 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { HitContainer const *__restrict__ foundNtuplets, Quality const *__restrict__ quality, CAConstants::TupleMultiplicity *tupleMultiplicity) const { - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, foundNtuplets->nbins(), [&](uint32_t it) { - auto nhits = foundNtuplets->size(it); - if (nhits >= 3 && quality[it] != trackQuality::dup) { - ALPAKA_ASSERT_OFFLOAD(quality[it] == trackQuality::bad); - if (nhits > 5) - printf("wrong mult %d %d\n", it, nhits); - ALPAKA_ASSERT_OFFLOAD(nhits < 8); - tupleMultiplicity->fillDirect(acc, nhits, it); - } - }); + for (uint32_t it : + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, foundNtuplets->nbins())) { + auto nhits = foundNtuplets->size(it); + if (nhits >= 3 && quality[it] != trackQuality::dup) { + ALPAKA_ASSERT_OFFLOAD(quality[it] == trackQuality::bad); + if (nhits > 5) + printf("wrong mult %d %d\n", it, nhits); + ALPAKA_ASSERT_OFFLOAD(nhits < 8); + tupleMultiplicity->fillDirect(acc, nhits, it); + } + } } }; @@ -406,64 +400,63 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { TkSoA const *__restrict__ tracks, CAHitNtupletGeneratorKernels::QualityCuts cuts, Quality *__restrict__ quality) const { - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, tuples->nbins(), [&](uint32_t it) { - auto nhits = tuples->size(it); - if (nhits == 0) - return; // guard - - // if duplicate: not even fit - // mark doublets as bad - if 
(quality[it] != trackQuality::dup && nhits >= 3) { - ALPAKA_ASSERT_OFFLOAD(quality[it] == trackQuality::bad); - - // if the fit has any invalid parameters, mark it as bad - bool isNaN = false; - for (int i = 0; i < 5; ++i) { - isNaN |= std::isnan(tracks->stateAtBS.state(it)(i)); - } - if (!isNaN) { + for (uint32_t it : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, tuples->nbins())) { + auto nhits = tuples->size(it); + if (nhits == 0) + return; // guard + + // if duplicate: not even fit + // mark doublets as bad + if (quality[it] != trackQuality::dup && nhits >= 3) { + ALPAKA_ASSERT_OFFLOAD(quality[it] == trackQuality::bad); + + // if the fit has any invalid parameters, mark it as bad + bool isNaN = false; + for (int i = 0; i < 5; ++i) { + isNaN |= std::isnan(tracks->stateAtBS.state(it)(i)); + } + if (!isNaN) { #ifdef NTUPLE_DEBUG - printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks->chi2(it)); + printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks->chi2(it)); #endif - // compute a pT-dependent chi2 cut - // default parameters: - // - chi2MaxPt = 10 GeV - // - chi2Coeff = { 0.68177776, 0.74609577, -0.08035491, 0.00315399 } - // - chi2Scale = 30 for broken line fit, 45 for Riemann fit - // (see CAHitNtupletGeneratorGPU.cc) - float pt = std::min(tracks->pt(it), cuts.chi2MaxPt); - float chi2Cut = - cuts.chi2Scale * - (cuts.chi2Coeff[0] + pt * (cuts.chi2Coeff[1] + pt * (cuts.chi2Coeff[2] + pt * cuts.chi2Coeff[3]))); - // above number were for Quads not normalized so for the time being just multiple by ndof for Quads (triplets to be understood) - if (3.f * tracks->chi2(it) < chi2Cut) { + // compute a pT-dependent chi2 cut + // default parameters: + // - chi2MaxPt = 10 GeV + // - chi2Coeff = { 0.68177776, 0.74609577, -0.08035491, 0.00315399 } + // - chi2Scale = 30 for broken line fit, 45 for Riemann fit + // (see CAHitNtupletGeneratorGPU.cc) + float pt = std::min(tracks->pt(it), cuts.chi2MaxPt); + float chi2Cut = + cuts.chi2Scale * + (cuts.chi2Coeff[0] + pt * (cuts.chi2Coeff[1] + pt * (cuts.chi2Coeff[2] + pt * cuts.chi2Coeff[3]))); + // above number were for Quads not normalized so for the time being just multiple by ndof for Quads (triplets to be understood) + if (3.f * tracks->chi2(it) < chi2Cut) { #ifdef NTUPLE_DEBUG - printf("Bad fit %d size %d pt %f eta %f chi2 %f\n", - it, - tuples->size(it), - tracks->pt(it), - tracks->eta(it), - 3.f * tracks->chi2(it)); + printf("Bad fit %d size %d pt %f eta %f chi2 %f\n", + it, + tuples->size(it), + tracks->pt(it), + tracks->eta(it), + 3.f * tracks->chi2(it)); #endif - // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) - // default cuts: - // - for triplets: |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm - // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm - // (see CAHitNtupletGeneratorGPU.cc) - auto const ®ion = (nhits > 3) ? cuts.quadruplet : cuts.triplet; - bool isOk = (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and - (std::abs(tracks->zip(it)) < region.maxZip); - - if (isOk) - quality[it] = trackQuality::loose; - - } // chi2Cut - } // !isNaN - } // trackQuality and nhits - }); + // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) + // default cuts: + // - for triplets: |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm + // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm + // (see CAHitNtupletGeneratorGPU.cc) + auto const ®ion = (nhits > 3) ? 
cuts.quadruplet : cuts.triplet; + bool isOk = (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and + (std::abs(tracks->zip(it)) < region.maxZip); + + if (isOk) + quality[it] = trackQuality::loose; + + } // chi2Cut + } // !isNaN + } // trackQuality and nhits + } } }; @@ -473,14 +466,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { HitContainer const *__restrict__ tuples, Quality const *__restrict__ quality, CAHitNtupletGeneratorKernels::Counters *counters) const { - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, tuples->nbins(), [&](uint32_t idx) { - if (tuples->size(idx) == 0) - return; //guard - if (quality[idx] == trackQuality::loose) { - alpaka::atomicAdd(acc, &(counters->nGoodTracks), 1ull, alpaka::hierarchy::Blocks{}); - } - }); + for (uint32_t idx : + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, tuples->nbins())) { + if (tuples->size(idx) == 0) + return; //guard + if (quality[idx] == trackQuality::loose) { + alpaka::atomicAdd(acc, &(counters->nGoodTracks), 1ull, alpaka::hierarchy::Blocks{}); + } + } } }; @@ -490,15 +483,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { HitContainer const *__restrict__ tuples, Quality const *__restrict__ quality, CAHitNtupletGeneratorKernels::HitToTuple *hitToTuple) const { - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, tuples->nbins(), [&](uint32_t idx) { - if (tuples->size(idx) == 0) - return; // guard - if (quality[idx] == trackQuality::loose) { - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) - hitToTuple->countDirect(acc, *h); - } - }); + for (uint32_t idx : + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, tuples->nbins())) { + if (tuples->size(idx) == 0) + return; // guard + if (quality[idx] == trackQuality::loose) { + for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + hitToTuple->countDirect(acc, *h); + } + } } }; @@ -508,15 +501,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { HitContainer const *__restrict__ tuples, Quality const *__restrict__ quality, CAHitNtupletGeneratorKernels::HitToTuple *hitToTuple) const { - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, tuples->nbins(), [&](uint32_t idx) { - if (tuples->size(idx) == 0) - return; // guard - if (quality[idx] == trackQuality::loose) { - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) - hitToTuple->fillDirect(acc, *h, idx); - } - }); + for (uint32_t idx : + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, tuples->nbins())) { + if (tuples->size(idx) == 0) + return; // guard + if (quality[idx] == trackQuality::loose) { + for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + hitToTuple->fillDirect(acc, *h, idx); + } + } } }; @@ -527,18 +520,19 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { TrackingRecHit2DSOAView const *__restrict__ hhp, HitContainer *__restrict__ hitDetIndices) const { // copy offsets - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, tuples->totbins(), [&](uint32_t idx) { hitDetIndices->off[idx] = tuples->off[idx]; }); + for (uint32_t idx : + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, tuples->totbins())) { + hitDetIndices->off[idx] = tuples->off[idx]; + } // fill hit indices auto const &hh = *hhp; #ifndef NDEBUG auto nhits = hh.nHits(); #endif - 
::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, tuples->size(), [&](uint32_t idx) { - ALPAKA_ASSERT_OFFLOAD(tuples->bins[idx] < nhits); - hitDetIndices->bins[idx] = hh.detectorIndex(tuples->bins[idx]); - }); + for (uint32_t idx : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, tuples->size())) { + ALPAKA_ASSERT_OFFLOAD(tuples->bins[idx] < nhits); + hitDetIndices->bins[idx] = hh.detectorIndex(tuples->bins[idx]); + } } }; @@ -548,14 +542,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { CAHitNtupletGeneratorKernels::HitToTuple const *__restrict__ hitToTuple, CAHitNtupletGeneratorKernels::Counters *counters) const { auto &c = *counters; - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, hitToTuple->nbins(), [&](uint32_t idx) { - if (hitToTuple->size(idx) != 0) { // SHALL NOT BE break - alpaka::atomicAdd(acc, &c.nUsedHits, 1ull, alpaka::hierarchy::Blocks{}); - if (hitToTuple->size(idx) > 1) - alpaka::atomicAdd(acc, &c.nDupHits, 1ull, alpaka::hierarchy::Blocks{}); - } - }); + + for (uint32_t idx : + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, hitToTuple->nbins())) { + if (hitToTuple->size(idx) != 0) { // SHALL NOT BE break + alpaka::atomicAdd(acc, &c.nUsedHits, 1ull, alpaka::hierarchy::Blocks{}); + if (hitToTuple->size(idx) > 1) + alpaka::atomicAdd(acc, &c.nDupHits, 1ull, alpaka::hierarchy::Blocks{}); + } + } } }; @@ -578,45 +573,45 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // auto const & hh = *hhp; // auto l1end = hh.hitsLayerStart_d[1]; - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( - acc, phitToTuple->nbins(), [&](uint32_t idx) { - if (hitToTuple.size(idx) >= 2) { - float mc = 10000.f; - uint16_t im = 60000; - uint32_t maxNh = 0; - - // find maxNh - for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - uint32_t nh = foundNtuplets.size(*it); - maxNh = std::max(nh, maxNh); - } - // kill all tracks shorter than maxHn (only triplets???) - for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - uint32_t nh = foundNtuplets.size(*it); - if (maxNh != nh) - quality[*it] = dup; + for (uint32_t idx : + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, phitToTuple->nbins())) { + if (hitToTuple.size(idx) >= 2) { + float mc = 10000.f; + uint16_t im = 60000; + uint32_t maxNh = 0; + + // find maxNh + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + uint32_t nh = foundNtuplets.size(*it); + maxNh = std::max(nh, maxNh); + } + // kill all tracks shorter than maxHn (only triplets???) + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + uint32_t nh = foundNtuplets.size(*it); + if (maxNh != nh) + quality[*it] = dup; + } + + if (maxNh <= 3) { + // if (idx>=l1end) continue; // only for layer 1 + // for triplets choose best tip! + for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (quality[it] != bad && std::abs(tracks.tip(it)) < mc) { + mc = std::abs(tracks.tip(it)); + im = it; } + } + // mark duplicates + for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (quality[it] != bad && it != im) + quality[it] = dup; //no race: simple assignment of the same constant + } - if (maxNh <= 3) { - // if (idx>=l1end) continue; // only for layer 1 - // for triplets choose best tip! 
-              for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
-                auto const it = *ip;
-                if (quality[it] != bad && std::abs(tracks.tip(it)) < mc) {
-                  mc = std::abs(tracks.tip(it));
-                  im = it;
-                }
-              }
-              // mark duplicates
-              for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
-                auto const it = *ip;
-                if (quality[it] != bad && it != im)
-                  quality[it] = dup;  //no race: simple assignment of the same constant
-              }
-
-            }  // maxNh
-          }    // hitToTuple.size
-        });    // loop over hits
+          }  // maxNh
+        }    // hitToTuple.size
+      }      // loop over hits
     }
   };
 
@@ -633,7 +628,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
       auto const &foundNtuplets = *ptuples;
       auto const &tracks = *ptracks;
       const auto np = std::min(maxPrint, foundNtuplets.nbins());
-      ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided(acc, np, [&](uint32_t i) {
+
+      for (uint32_t i : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, np)) {
         auto nh = foundNtuplets.size(i);
         if (nh >= 3) {
           printf("TK: %d %d %d %f %f %f %f %f %f %f %d %d %d %d %d\n",
@@ -654,7 +650,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
                  nh > 3 ? int(*(foundNtuplets.begin(i) + 3)) : -1,
                  nh > 4 ? int(*(foundNtuplets.begin(i) + 4)) : -1);
         }  // nh
-      });
+      }
     }
   };
 
diff --git a/src/alpaka/test/alpaka/atomtest.cc b/src/alpaka/test/alpaka/atomtest.cc
new file mode 100644
index 000000000..678143272
--- /dev/null
+++ b/src/alpaka/test/alpaka/atomtest.cc
@@ -0,0 +1,205 @@
+#include <chrono>
+#include <iostream>
+
+#include "AlpakaCore/alpakaConfig.h"
+#include "AlpakaCore/alpakaWorkDivHelper.h"
+
+using namespace ALPAKA_ACCELERATOR_NAMESPACE;
+
+template <typename Data>
+struct shared_block {
+  template <typename T_Acc, typename T>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
+    auto threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
+    auto blockIdxInGrid(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
+    Data b = 1.0;
+    Data c = -1.0;
+
+    auto& s = alpaka::declareSharedVar<Data, __COUNTER__>(acc);
+
+    if (threadIdxLocal == 0) {
+      s = 0;
+    }
+
+    syncBlockThreads(acc);
+
+    for ([[maybe_unused]] T index :
+         ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, elements)) {
+      for (int i = 0; i < 200000; i++) {
+        alpaka::atomicAdd(acc, &s, b, alpaka::hierarchy::Blocks{});
+        alpaka::atomicAdd(acc, &s, c, alpaka::hierarchy::Blocks{});
+      }
+      alpaka::atomicAdd(acc, &s, b, alpaka::hierarchy::Blocks{});
+    }
+
+    syncBlockThreads(acc);
+
+    if (threadIdxLocal == 0) {
+      vec[blockIdxInGrid] = s;
+    }
+  }
+};
+
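+// The hierarchy tag passed to alpaka::atomicAdd is a visibility hint:
+// Threads < Blocks < Grids. shared_block above only needs the update to be
+// coherent within one block (hierarchy::Blocks), while the global-memory
+// kernels below ask for grid-wide coherence (hierarchy::Grids).
+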
+template <typename Data>
+struct global_block {
+  template <typename T_Acc, typename T>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
+    auto blockIdxInGrid(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
+    Data b = 1.0;
+    Data c = -1.0;
+
+    for ([[maybe_unused]] T index :
+         ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, elements)) {
+      for (int i = 0; i < 200000; i++) {
+        alpaka::atomicAdd(acc, &vec[blockIdxInGrid], b, alpaka::hierarchy::Grids{});
+        alpaka::atomicAdd(acc, &vec[blockIdxInGrid], c, alpaka::hierarchy::Grids{});
+      }
+      alpaka::atomicAdd(acc, &vec[blockIdxInGrid], b, alpaka::hierarchy::Grids{});
+    }
+  }
+};
+
+template <typename Data>
+struct global_grid {
+  template <typename T_Acc, typename T>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
+    Data b = 1.0;
+    Data c = -1.0;
+
+    for ([[maybe_unused]] T index :
+         ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, elements)) {
+      for (int i = 0; i < 200000; i++) {
+        alpaka::atomicAdd(acc, &vec[0], b, alpaka::hierarchy::Grids{});  // alpaka::hierarchy::Blocks/Threads/Grids
+        alpaka::atomicAdd(acc, &vec[0], c, alpaka::hierarchy::Grids{});
+      }
+      alpaka::atomicAdd(acc, &vec[0], b, alpaka::hierarchy::Grids{});
+    }
+  }
+};
+
+template <typename Data>
+struct shared_grid {
+  template <typename T_Acc, typename T>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
+    auto threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
+    Data b = 1.0;
+    Data c = -1.0;
+
+    auto& s = alpaka::declareSharedVar<Data, __COUNTER__>(acc);
+
+    if (threadIdxLocal == 0) {
+      s = 0;
+    }
+
+    syncBlockThreads(acc);
+
+    for ([[maybe_unused]] T index :
+         ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, elements)) {
+      for (int i = 0; i < 200000; i++) {
+        alpaka::atomicAdd(acc, &s, b, alpaka::hierarchy::Blocks{});  // alpaka::hierarchy::Blocks/Threads/Grids
+        alpaka::atomicAdd(acc, &s, c, alpaka::hierarchy::Blocks{});
+      }
+      alpaka::atomicAdd(acc, &s, b, alpaka::hierarchy::Blocks{});
+    }
+
+    syncBlockThreads(acc);
+
+    if (threadIdxLocal == 0) {
+      alpaka::atomicAdd(acc, &vec[0], s, alpaka::hierarchy::Grids{});
+    }
+  }
+};
+
+int main(void) {
+  using Dim = alpaka::DimInt<1u>;
+  using Data = float;
+  const Idx num_items = 1 << 15;
+  Idx nThreadsInit = 256;
+  Idx nBlocksInit = (num_items + nThreadsInit - 1) / nThreadsInit;
+
+  const Device device_1(alpaka::getDevByIdx<Platform>(0u));
+  alpaka::Queue<Device, alpaka::NonBlocking> queue_1_0(device_1);
+  alpaka::Queue<Device, alpaka::NonBlocking> queue_1_1(device_1);
+
+  const Vec1D threadsPerBlockOrElementsPerThread1(Vec1D::all(nThreadsInit));
+  const Vec1D blocksPerGrid1(Vec1D::all(nBlocksInit));
+  auto workDivMultiBlockInit1 = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(
+      blocksPerGrid1, threadsPerBlockOrElementsPerThread1);
+
+  using DevHost = alpaka::DevCpu;
+  auto const devHost = alpaka::getDevByIdx<alpaka::PltfCpu>(0u);
+
+  using BufHost = alpaka::Buf<DevHost, Data, Dim, Idx>;
+  BufHost bufHostA(alpaka::allocBuf<Data, Idx>(devHost, num_items));
+  BufHost res(alpaka::allocBuf<Data, Idx>(devHost, num_items));
+
+  Data* const pBufHostA(alpaka::getPtrNative(bufHostA));
+  Data* const res_ptr(alpaka::getPtrNative(res));
+
+  for (Idx i = 0; i < num_items; i++) {
+    pBufHostA[i] = 0.0;
+  }
+
+  using BufAcc = alpaka::Buf<Device, Data, Dim, Idx>;
+  BufAcc order(alpaka::allocBuf<Data, Idx>(device_1, num_items));
+
+  printf("Threads/block:%d blocks/grid:%d\n", threadsPerBlockOrElementsPerThread1[0u], blocksPerGrid1[0u]);
+
+  // Run shared_block: per-block counter in shared memory
+  alpaka::memcpy(queue_1_0, order, bufHostA, num_items);
+  auto beginT = std::chrono::high_resolution_clock::now();
+  alpaka::enqueue(queue_1_0,
+                  alpaka::createTaskKernel<Acc1D>(
+                      workDivMultiBlockInit1, shared_block<Data>(), alpaka::getPtrNative(order), num_items));
+  alpaka::wait(queue_1_0);
+  auto endT = std::chrono::high_resolution_clock::now();
+  std::cout << "Shared Block: " << std::chrono::duration<double>(endT - beginT).count() << " s" << std::endl;
+  alpaka::memcpy(queue_1_0, res, order, num_items);
+  for (Idx i = 0; i < nBlocksInit; i++) {
+    if (res_ptr[i] != (Data)nThreadsInit)
+      std::cout << "[" << i << "]: " << res_ptr[i] << " != " << (Data)nThreadsInit << std::endl;
+  }
+
+  // Run global_block: per-block counter in global memory
+  alpaka::memcpy(queue_1_0, order, bufHostA, num_items);
+  beginT = std::chrono::high_resolution_clock::now();
+  alpaka::enqueue(queue_1_0,
+                  alpaka::createTaskKernel<Acc1D>(
+                      workDivMultiBlockInit1, global_block<Data>(), alpaka::getPtrNative(order), num_items));
+  alpaka::wait(queue_1_0);
+  endT = std::chrono::high_resolution_clock::now();
+  std::cout << "Global Block: " << std::chrono::duration<double>(endT - beginT).count() << " s" << std::endl;
+  alpaka::memcpy(queue_1_0, res, order, num_items);
+  for (Idx i = 0; i < nBlocksInit; i++) {
+    if (res_ptr[i] != (Data)nThreadsInit)
+      std::cout << "[" << i << "]: " << res_ptr[i] << " != " << (Data)nThreadsInit << std::endl;
+  }
+
+  // Run shared_grid: grid-wide counter accumulated via shared memory
+  alpaka::memcpy(queue_1_0, order, bufHostA, num_items);
+  beginT = std::chrono::high_resolution_clock::now();
+  alpaka::enqueue(queue_1_0,
+                  alpaka::createTaskKernel<Acc1D>(
+                      workDivMultiBlockInit1, shared_grid<Data>(), alpaka::getPtrNative(order), num_items));
+  alpaka::wait(queue_1_0);
+  endT = std::chrono::high_resolution_clock::now();
+  std::cout << "Shared Grid: " << std::chrono::duration<double>(endT - beginT).count() << " s" << std::endl;
+  alpaka::memcpy(queue_1_0, res, order, num_items);
+  if (res_ptr[0] != (Data)num_items)
+    std::cout << "[0]: " << res_ptr[0] << " != " << (Data)num_items << std::endl << std::endl;
+
+  // Run global_grid: grid-wide counter in global memory
+  alpaka::memcpy(queue_1_0, order, bufHostA, num_items);
+  beginT = std::chrono::high_resolution_clock::now();
+  alpaka::enqueue(queue_1_0,
+                  alpaka::createTaskKernel<Acc1D>(
+                      workDivMultiBlockInit1, global_grid<Data>(), alpaka::getPtrNative(order), num_items));
+  alpaka::wait(queue_1_0);
+  endT = std::chrono::high_resolution_clock::now();
+  std::cout << "Global Grid: " << std::chrono::duration<double>(endT - beginT).count() << " s" << std::endl;
+  alpaka::memcpy(queue_1_0, res, order, num_items);
+  if (res_ptr[0] != (Data)num_items)
+    std::cout << "[0]: " << res_ptr[0] << " != " << (Data)num_items << std::endl;
+
+  return 0;
+}
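[Note] The pass criteria follow directly from the kernel arithmetic: with num_items = 1 << 15 and 256 threads per block, every thread owns exactly one element and contributes a net +1 (200000 cancelling +1/-1 pairs plus one final +1). Each per-block counter must therefore end at 256 (nThreadsInit) and the grid-wide counters at 32768 (num_items); only the timings are expected to differ between the shared- and global-memory variants.
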
diff --git a/src/alpaka/test/alpaka/barrier_fence.cc b/src/alpaka/test/alpaka/barrier_fence.cc
new file mode 100644
index 000000000..3e42104ea
--- /dev/null
+++ b/src/alpaka/test/alpaka/barrier_fence.cc
@@ -0,0 +1,115 @@
+#include <chrono>
+#include <iostream>
+
+#include "AlpakaCore/alpakaConfig.h"
+#include "AlpakaCore/alpakaWorkDivHelper.h"
+#include "AlpakaCore/threadfence.h"
+
+using namespace ALPAKA_ACCELERATOR_NAMESPACE;
+
+template <typename Data>
+struct global_fence {
+  template <typename T_Acc, typename T>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
+    auto blockIdxInGrid(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
+    int no_blocks = 128;
+
+    // each round lets exactly one block extend the chain vec[b] = vec[b - 1] + 1;
+    // the threadfence makes the previous block's write visible grid-wide
+    for (int i = 0; i < no_blocks * no_blocks * 10; i++) {
+      if (i % no_blocks == (int)blockIdxInGrid) {
+        if (i % no_blocks > 0) {
+          vec[blockIdxInGrid] = vec[blockIdxInGrid - 1] + 1;
+        }
+      }
+      cms::alpakatools::threadfence(acc);
+    }
+  }
+};
+
+template <typename Data>
+struct shared_fence {
+  template <typename T_Acc, typename T>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
+    auto threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
+    auto blockIdxInGrid(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
+
+    auto& s = alpaka::declareSharedVar<Data[256], __COUNTER__>(acc);
+
+    // same chain, per block, in shared memory; note that s[0] is never
+    // written, so the check in main() relies on it reading as 0
+    for (int i = 0; i < 256 * 256 * 10; i++) {
+      if (i % 256 == (int)threadIdxLocal && threadIdxLocal > 0) {
+        s[threadIdxLocal] = s[threadIdxLocal - 1] + 1;
+      }
+      cms::alpakatools::threadfence(acc);
+    }
+
+    if (threadIdxLocal == 0) {
+      vec[blockIdxInGrid] = s[127] + s[129];
+    }
+  }
+};
+
+int main(void) {
+  using Dim = alpaka::DimInt<1u>;
+  using Data = float;
+  const Idx num_items = 1 << 15;
+  Idx nThreadsInit = 256;
+  Idx nBlocksInit = (num_items + nThreadsInit - 1) / nThreadsInit;
+
+  const Device device_1(alpaka::getDevByIdx<Platform>(0u));
+  alpaka::Queue<Device, alpaka::NonBlocking> queue_1_0(device_1);
+  alpaka::Queue<Device, alpaka::NonBlocking> queue_1_1(device_1);
+
+  const Vec1D threadsPerBlockOrElementsPerThread1(Vec1D::all(nThreadsInit));
+  const Vec1D blocksPerGrid1(Vec1D::all(nBlocksInit));
+  auto workDivMultiBlockInit1 = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(
+      blocksPerGrid1, threadsPerBlockOrElementsPerThread1);
+
+  using DevHost = alpaka::DevCpu;
+  auto const devHost = alpaka::getDevByIdx<alpaka::PltfCpu>(0u);
+
+  using BufHost = alpaka::Buf<DevHost, Data, Dim, Idx>;
+  BufHost bufHostA(alpaka::allocBuf<Data, Idx>(devHost, num_items));
+  BufHost res(alpaka::allocBuf<Data, Idx>(devHost, num_items));
+
+  Data* const pBufHostA(alpaka::getPtrNative(bufHostA));
+  Data* const res_ptr(alpaka::getPtrNative(res));
+
+  for (Idx i = 0; i < num_items; i++) {
+    pBufHostA[i] = 0.0;
+  }
+
+  using BufAcc = alpaka::Buf<Device, Data, Dim, Idx>;
+  BufAcc order(alpaka::allocBuf<Data, Idx>(device_1, num_items));
+
+  printf("Threads/block:%d blocks/grid:%d\n", threadsPerBlockOrElementsPerThread1[0u], blocksPerGrid1[0u]);
+
+  // Run shared_fence: chain through block-shared memory
+  alpaka::memcpy(queue_1_0, order, bufHostA, num_items);
+  auto beginT = std::chrono::high_resolution_clock::now();
+  alpaka::enqueue(queue_1_0,
+                  alpaka::createTaskKernel<Acc1D>(
+                      workDivMultiBlockInit1, shared_fence<Data>(), alpaka::getPtrNative(order), num_items));
+  alpaka::wait(queue_1_0);
+  auto endT = std::chrono::high_resolution_clock::now();
+  std::cout << "Shared time: " << std::chrono::duration<double>(endT - beginT).count() << " s" << std::endl;
+  alpaka::memcpy(queue_1_0, res, order, num_items);
+  for (int i = 0; i < 128; i++) {
+    if (res_ptr[i] != 256.0)
+      printf("Error: d[%d] != r (%f, %d)\n", i, res_ptr[i], i);
+  }
+
+  // Run global_fence: chain through global memory
+  alpaka::memcpy(queue_1_0, order, bufHostA, num_items);
+  beginT = std::chrono::high_resolution_clock::now();
+  alpaka::enqueue(queue_1_0,
+                  alpaka::createTaskKernel<Acc1D>(
+                      workDivMultiBlockInit1, global_fence<Data>(), alpaka::getPtrNative(order), num_items));
+  alpaka::wait(queue_1_0);
+  endT = std::chrono::high_resolution_clock::now();
+  std::cout << "Global time: " << std::chrono::duration<double>(endT - beginT).count() << " s" << std::endl;
+  alpaka::memcpy(queue_1_0, res, order, num_items);
+  for (int i = 0; i < 128; i++) {
+    if (res_ptr[i] != Data(i))
+      printf("Error: d[%d] != r (%f, %d)\n", i, res_ptr[i], i);
+  }
+
+  return 0;
+}
diff --git a/src/alpaka/test/alpaka/barrier_sync.cc b/src/alpaka/test/alpaka/barrier_sync.cc
new file mode 100644
index 000000000..96541209a
--- /dev/null
+++ b/src/alpaka/test/alpaka/barrier_sync.cc
@@ -0,0 +1,62 @@
+#include <chrono>
+#include <iostream>
+
+#include "AlpakaCore/alpakaConfig.h"
+#include "AlpakaCore/alpakaWorkDivHelper.h"
+
+using namespace ALPAKA_ACCELERATOR_NAMESPACE;
+
+template <typename Data>
+struct check_sync {
+  template <typename T_Acc, typename T>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
+    int n = (int)elements;
+
+    auto threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
+    // in every round exactly one thread of the block does work while the
+    // others wait at the barrier
+    for (int i = 0; i < n * n; i++) {
+      if (i % n == (int)threadIdxLocal) {
+        for (int j = i; j < 10000; j++) {
+          if (j % 2 == 0) {
+            // throw-away busy work
+            [[maybe_unused]] int sum = 0;
+            for (int k = 0; k < 1000; k++)
+              sum += k;
+          }
+        }
+      }
+      syncBlockThreads(acc);
+    }
+  }
+};
+
+int main(void) {
+  using Dim = alpaka::DimInt<1u>;
+  using Data = float;
+  const Idx num_items = 1 << 10;
+  Idx nThreadsInit = 1024;
+  Idx nBlocksInit = (num_items + nThreadsInit - 1) / nThreadsInit;
+
+  const Device device_1(alpaka::getDevByIdx<Platform>(0u));
+  alpaka::Queue<Device, alpaka::NonBlocking> queue_1_0(device_1);
+  alpaka::Queue<Device, alpaka::NonBlocking> queue_1_1(device_1);
+
+  const Vec1D threadsPerBlockOrElementsPerThread1(Vec1D::all(nThreadsInit));
+  const Vec1D blocksPerGrid1(Vec1D::all(nBlocksInit));
+  auto workDivMultiBlockInit1 = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(
+      blocksPerGrid1, threadsPerBlockOrElementsPerThread1);
+
+  using BufAcc = alpaka::Buf<Device, Data, Dim, Idx>;
+  BufAcc order(alpaka::allocBuf<Data, Idx>(device_1, num_items));
+
+  printf("Threads/block:%d blocks/grid:%d\n", threadsPerBlockOrElementsPerThread1[0u], blocksPerGrid1[0u]);
+
+  // Run function
+  auto beginT = std::chrono::high_resolution_clock::now();
+  alpaka::enqueue(queue_1_0,
+                  alpaka::createTaskKernel<Acc1D>(
+                      workDivMultiBlockInit1, check_sync<Data>(), alpaka::getPtrNative(order), nThreadsInit));
+  alpaka::wait(queue_1_0);
+  auto endT = std::chrono::high_resolution_clock::now();
+  std::cout << "Time: " << std::chrono::duration<double>(endT - beginT).count() << " s" << std::endl;
+
+  return 0;
+}
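[Note] These alpaka tests are line-for-line counterparts of the CUDA tests added below under src/cuda/test/. The correspondence between the portability-layer calls they exercise and the native CUDA primitives is:

    syncBlockThreads(acc);                                 // CUDA: __syncthreads()
    cms::alpakatools::threadfence(acc);                    // CUDA: __threadfence()
    alpaka::atomicAdd(acc, p, v, alpaka::hierarchy::X{});  // CUDA: atomicAdd(p, v)
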
diff --git a/src/alpaka/test/alpaka/elements_stride.cc b/src/alpaka/test/alpaka/elements_stride.cc
new file mode 100644
index 000000000..15301a2a6
--- /dev/null
+++ b/src/alpaka/test/alpaka/elements_stride.cc
@@ -0,0 +1,184 @@
+#include <cstdio>
+#include <iostream>
+
+#include "AlpakaCore/alpakaConfig.h"
+#include "AlpakaCore/alpakaWorkDivHelper.h"
+
+using namespace ALPAKA_ACCELERATOR_NAMESPACE;
+
+template <typename T>
+struct explicit_loop {
+  template <typename T_Acc>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, T elements) const {
+    const T threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
+    const T blockIdxInGrid(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
+
+    const T blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]);
+    const T gridDimension(alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
+
+    T thread = threadIdxLocal + blockDimension * blockIdxInGrid;
+    T stride = blockDimension * gridDimension;
+
+    for (T index(thread); index < elements; index += stride) {
+      printf("explicit_loop:%d\n", index);
+    }
+  }
+};
+
+template <typename T>
+struct range_loop {
+  template <typename T_Acc>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, T elements) const {
+    for (T index : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride(acc, elements)) {
+      printf("range:%d\n", index);
+    }
+  }
+};
+
+template <typename T>
+struct explicit_loop_1d {
+  template <typename T_Acc>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Vec3D elements) const {
+    auto threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc));
+    auto blockIdxInGrid(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc));
+
+    auto blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc));
+    auto gridDimension(alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc));
+
+    Vec3D thread({blockDimension[0u] * blockIdxInGrid[0u] + threadIdxLocal[0u], Idx{0}, Idx{0}});
+    Vec3D stride({blockDimension[0u] * gridDimension[0u], Idx{1}, Idx{1}});
+    Vec3D index(thread);
+
+    for (T i = index[0u]; i < elements[0u]; i += stride[0u]) {
+      printf("explicit_loop_1d:%d:%d:%d\n", i, index[1u], index[2u]);
+    }
+  }
+};
+
+template <typename T>
+struct range_loop_1d {
+  template <typename T_Acc>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Vec3D elements) const {
+    for (Vec3D index : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride_1d(acc, elements)) {
+      printf("range_1d:%d:%d:%d\n", index[0u], index[1u], index[2u]);
+    }
+  }
+};
+
+template <typename T>
+struct explicit_loop_2d {
+  template <typename T_Acc>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Vec3D elements) const {
+    auto threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc));
+    auto blockIdxInGrid(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc));
+
+    auto blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc));
+    auto gridDimension(alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc));
+
+    Vec3D thread({blockDimension[0u] * blockIdxInGrid[0u] + threadIdxLocal[0u],
+                  blockDimension[1u] * blockIdxInGrid[1u] + threadIdxLocal[1u],
+                  Idx{0}});
+    Vec3D stride({blockDimension[0u] * gridDimension[0u], blockDimension[1u] * gridDimension[1u], Idx{1}});
+    Vec3D index(thread);
+
+    // distinct loop variables, so that the printed indices actually vary
+    for (T j = index[1u]; j < elements[1u]; j += stride[1u]) {
+      for (T i = index[0u]; i < elements[0u]; i += stride[0u]) {
+        printf("explicit_loop_2d:%d:%d:%d\n", i, j, index[2u]);
+      }
+    }
+  }
+};
+
+template <typename T>
+struct range_loop_2d {
+  template <typename T_Acc>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Vec3D elements) const {
+    for (Vec3D index : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride_2d(acc, elements)) {
+      printf("range_2d:%d:%d:%d\n", index[0u], index[1u], index[2u]);
+    }
+  }
+};
+
+template <typename T>
+struct explicit_loop_3d {
+  template <typename T_Acc>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Vec3D elements) const {
+    auto threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc));
+    auto blockIdxInGrid(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc));
+
+    auto blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc));
+    auto gridDimension(alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc));
+
+    Vec3D thread({blockDimension[0u] * blockIdxInGrid[0u] + threadIdxLocal[0u],
+                  blockDimension[1u] * blockIdxInGrid[1u] + threadIdxLocal[1u],
+                  blockDimension[2u] * blockIdxInGrid[2u] + threadIdxLocal[2u]});
+    Vec3D stride({blockDimension[0u] * gridDimension[0u],
+                  blockDimension[1u] * gridDimension[1u],
+                  blockDimension[2u] * gridDimension[2u]});
+    Vec3D index(thread);
+
+    // distinct loop variables, so that the printed indices actually vary
+    for (T k = index[2u]; k < elements[2u]; k += stride[2u]) {
+      for (T j = index[1u]; j < elements[1u]; j += stride[1u]) {
+        for (T i = index[0u]; i < elements[0u]; i += stride[0u]) {
+          printf("explicit_loop_3d:%d:%d:%d\n", i, j, k);
+        }
+      }
+    }
+  }
+};
+
+template <typename T>
+struct range_loop_3d {
+  template <typename T_Acc>
+  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Vec3D elements) const {
+    for (Vec3D index : ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::elements_with_stride_3d(acc, elements)) {
+      printf("range_3d:%d:%d:%d\n", index[0u], index[1u], index[2u]);
+    }
+  }
+};
+
+int main(void) {
+  const Idx num_items = 10;
+  Idx nThreadsInit = 7;
+  Vec3D elements(Vec3D::all(num_items));
+  Idx nBlocksInit = (num_items + nThreadsInit - 1) / nThreadsInit;
+
+  /*
+  // Case for elements_with_stride (only one dimension)
+  const Device device_1(alpaka::getDevByIdx<Platform>(0u));
+
+  alpaka::Queue<Device, alpaka::NonBlocking> queue_1_0(device_1);
+  alpaka::Queue<Device, alpaka::NonBlocking> queue_1_1(device_1);
+
+  const Vec1D threadsPerBlockOrElementsPerThread1(Vec1D::all(nThreadsInit));
+  const Vec1D blocksPerGrid1(Vec1D::all(nBlocksInit));
+  auto workDivMultiBlockInit1 = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(
+      blocksPerGrid1, threadsPerBlockOrElementsPerThread1);
+
+  printf("threads/block = %d and blocks/grid = %d\n", threadsPerBlockOrElementsPerThread1[0u], blocksPerGrid1[0u]);
+
+  alpaka::enqueue(queue_1_0,
+                  alpaka::createTaskKernel<Acc1D>(workDivMultiBlockInit1, explicit_loop<Idx>(), num_items));
+
+  alpaka::enqueue(queue_1_1,
+                  alpaka::createTaskKernel<Acc1D>(workDivMultiBlockInit1, range_loop<Idx>(), num_items));
+  */
+
+  // Case for elements_with_stride_Xd (3 dimensions)
+  const Device device3(alpaka::getDevByIdx<Platform>(0u));
+
+  alpaka::Queue<Device, alpaka::NonBlocking> queue_3_0(device3);
+  alpaka::Queue<Device, alpaka::NonBlocking> queue_3_1(device3);
+
+  const Vec3D threadsPerBlockOrElementsPerThread3(Vec3D::all(nThreadsInit));
+  const Vec3D blocksPerGrid3(Vec3D::all(nBlocksInit));
+  auto workDivMultiBlockInit3 = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(
+      blocksPerGrid3, threadsPerBlockOrElementsPerThread3);
+
+  printf("threads/block = %d and blocks/grid = %d\n", threadsPerBlockOrElementsPerThread3[0u], blocksPerGrid3[0u]);
+  alpaka::enqueue(queue_3_0,
+                  alpaka::createTaskKernel<Acc3D>(workDivMultiBlockInit3, explicit_loop_1d<Idx>(), elements));
+  alpaka::enqueue(queue_3_1,
+                  alpaka::createTaskKernel<Acc3D>(workDivMultiBlockInit3, range_loop_1d<Idx>(), elements));
+
+  // drain the asynchronous queues before exiting
+  alpaka::wait(queue_3_0);
+  alpaka::wait(queue_3_1);
+
+  return 0;
+}
diff --git a/src/cuda/test/atomtest.cu b/src/cuda/test/atomtest.cu
new file mode 100644
index 000000000..a2ee8fc4d
--- /dev/null
+++ b/src/cuda/test/atomtest.cu
@@ -0,0 +1,137 @@
+#include <cstdio>
+#include <cstdlib>
+#include <sys/time.h>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+
+using Data = float;
+
+__global__ void shared_block(Data *d, int n) {
+  __shared__ Data s;
+  int block = blockIdx.x;
+  if (threadIdx.x == 0) {
+    s = 0.0;
+  }
+  __syncthreads();
+
+  for (int i = 0; i < 200000; i++) {
+    atomicAdd(&s, 1.0f);
+    atomicAdd(&s, -1.0f);
+  }
+  atomicAdd(&s, 1.0f);
+
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    d[block] = s;
+  }
+}
+
+__global__ void global_block(Data *d, int n) {
+  int block = blockIdx.x;
+  for (int i = 0; i < 200000; i++) {
+    atomicAdd(&d[block], 1.0f);
+    atomicAdd(&d[block], -1.0f);
+  }
+  atomicAdd(&d[block], 1.0f);
+}
+
+__global__ void shared_grid(Data *d, int n) {
+  __shared__ Data var;
+  if (threadIdx.x == 0) {
+    var = 0.0;
+  }
+
+  __syncthreads();
+
+  for (int i = 0; i < 200000; i++) {
+    atomicAdd(&var, 1.0f);
+    atomicAdd(&var, -1.0f);
+  }
+  atomicAdd(&var, 1.0f);
+
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    atomicAdd(&d[0], var);
+  }
+}
+
+__global__ void global_grid(Data *d, int n) {
+  for (int i = 0; i < 200000; i++) {
+    atomicAdd(&d[0], 1.0f);
+    atomicAdd(&d[0], -1.0f);
+  }
+  atomicAdd(&d[0], 1.0f);
+}
+
+int main(void) {
+  const int n = 1 << 15;
+
+  const int thr = 256;
+  const int blocks = (n + thr - 1) / thr;
+  Data order[n], backup[n];
+
+  printf("Threads/block:%d blocks/grid:%d\n", thr, blocks);
+
+  for (int i = 0; i < n; i++)
+    order[i] = 0.0;
+
+  Data *d_d;
+  cudaMalloc(&d_d, n * sizeof(Data));
+  struct timeval t1, t2;
+
+  // run version with static shared memory
+  cudaMemcpy(d_d, order, n * sizeof(Data), cudaMemcpyHostToDevice);
+  gettimeofday(&t1, 0);
+  shared_block<<<blocks, thr>>>(d_d, n);
+  cudaDeviceSynchronize();
+  gettimeofday(&t2, 0);
+  double time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0 / 1000.0;
+  printf("Shared block: %f s \n", time);
+  cudaMemcpy(backup, d_d, n * sizeof(Data), cudaMemcpyDeviceToHost);
+  for (int i = 0; i < blocks; i++) {
+    if (backup[i] != (Data)thr)
+      printf("Error: d[%d] != r (%f, %d)\n", i, backup[i], thr);
+  }
+
+  // run version with global memory
+  cudaMemcpy(d_d, order, n * sizeof(Data), cudaMemcpyHostToDevice);
+  gettimeofday(&t1, 0);
+  global_block<<<blocks, thr>>>(d_d, n);
+  cudaDeviceSynchronize();
+  gettimeofday(&t2, 0);
+  time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0 / 1000.0;
+  printf("Global block: %f s \n", time);
+  cudaMemcpy(backup, d_d, n * sizeof(Data), cudaMemcpyDeviceToHost);
+  for (int i = 0; i < blocks; i++) {
+    if (backup[i] != (Data)thr)
+      printf("Error: d[%d] != r (%f, %d)\n", i, backup[i], thr);
+  }
+
+  // run grid-wide version accumulating through shared memory
+  cudaMemcpy(d_d, order, n * sizeof(Data), cudaMemcpyHostToDevice);
+  gettimeofday(&t1, 0);
+  shared_grid<<<blocks, thr>>>(d_d, n);
+  cudaDeviceSynchronize();
+  gettimeofday(&t2, 0);
+  time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0 / 1000.0;
+  printf("Shared grid: %f s \n", time);
+  cudaMemcpy(backup, d_d, n * sizeof(Data), cudaMemcpyDeviceToHost);
+  if (backup[0] != (Data)n)
+    printf("Error: d != r (%f, %d)\n", backup[0], n);
+
+  // run grid-wide version in global memory
+  cudaMemcpy(d_d, order, n * sizeof(Data), cudaMemcpyHostToDevice);
+  gettimeofday(&t1, 0);
+  global_grid<<<blocks, thr>>>(d_d, n);
+  cudaDeviceSynchronize();
+  gettimeofday(&t2, 0);
+  time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0 / 1000.0;
+  printf("Global grid: %f s \n", time);
+  cudaMemcpy(backup, d_d, 1 * sizeof(Data), cudaMemcpyDeviceToHost);
+  if (backup[0] != (Data)n)
+    printf("Error: d != r (%f, %d)\n", backup[0], n);
+
+  return 0;
+}
\ No newline at end of file
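[Note] The gettimeofday/cudaDeviceSynchronize bracket above measures host-side wall-clock time per launch. Device-side timing could equivalently be done with CUDA events (standard runtime API; a sketch only, not part of this patch):

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    global_grid<<<blocks, thr>>>(d_d, n);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.f;
    cudaEventElapsedTime(&ms, start, stop);  // elapsed time in milliseconds
    printf("Global grid (events): %f s\n", ms / 1000.f);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
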
diff --git a/src/cuda/test/barrier_fence.cu b/src/cuda/test/barrier_fence.cu
new file mode 100644
index 000000000..905dc0874
--- /dev/null
+++ b/src/cuda/test/barrier_fence.cu
@@ -0,0 +1,85 @@
+#include <cstdio>
+#include <cstdlib>
+#include <sys/time.h>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+#include <device_launch_parameters.h>
+
+using Data = float;
+
+__global__ void global_thread(Data *d, int n) {
+  int no_blocks = 128;
+
+  // each round lets exactly one block extend the chain d[b] = d[b - 1] + 1;
+  // __threadfence() makes the previous block's write visible grid-wide
+  for (int i = 0; i < no_blocks * no_blocks * 10; i++) {
+    if (i % no_blocks == (int)blockIdx.x) {
+      if (i % no_blocks > 0) {
+        d[blockIdx.x] = d[blockIdx.x - 1] + 1;
+      }
+    }
+    __threadfence();
+  }
+}
+
+__global__ void shared_fence(Data *d, int n) {
+  __shared__ Data s[256];
+  int block = blockIdx.x;
+  int no_threads = 256;
+
+  // same chain, per block, in shared memory; s[0] is never written, so the
+  // check in main() relies on it reading as 0
+  for (int i = 0; i < no_threads * no_threads * 10; i++) {
+    if (i % no_threads == (int)threadIdx.x && threadIdx.x > 0) {
+      s[threadIdx.x] = s[threadIdx.x - 1] + 1;
+    }
+    __threadfence();
+  }
+
+  if (threadIdx.x == 0) {
+    d[block] = s[127] + s[129];
+  }
+}
+
+int main(void) {
+  const int n = 1 << 15;
+
+  const int thr = 256;
+  const int blocks = (n + thr - 1) / thr;
+  Data order[n], backup[n];
+
+  printf("Threads/block:%d blocks/grid:%d\n", thr, blocks);
+
+  for (int i = 0; i < n; i++)
+    order[i] = 0.0;
+
+  Data *d_d;
+  cudaMalloc(&d_d, n * sizeof(Data));
+  struct timeval t1, t2;
+
+  // run version with shared memory
+  cudaMemcpy(d_d, order, n * sizeof(Data), cudaMemcpyHostToDevice);
+  gettimeofday(&t1, 0);
+  shared_fence<<<blocks, thr>>>(d_d, n);
+  cudaDeviceSynchronize();
+  gettimeofday(&t2, 0);
+  auto time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0 / 1000.0;
+  printf("Shared time: %f\n", time);
+  cudaMemcpy(backup, d_d, n * sizeof(Data), cudaMemcpyDeviceToHost);
+  for (int i = 0; i < 128; i++) {
+    if (backup[i] != 256.0)
+      printf("Error: d[%d] != r (%f, %d)\n", i, backup[i], n / 128);
+  }
+
+  // run version with global memory
+  cudaMemcpy(d_d, order, n * sizeof(Data), cudaMemcpyHostToDevice);
+  gettimeofday(&t1, 0);
+  global_thread<<<blocks, thr>>>(d_d, n);
+  cudaDeviceSynchronize();
+  gettimeofday(&t2, 0);
+  time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0 / 1000.0;
+  printf("Global time: %f\n", time);
+  cudaMemcpy(backup, d_d, n * sizeof(Data), cudaMemcpyDeviceToHost);
+  for (int i = 0; i < 128; i++) {
+    if (backup[i] != Data(i))
+      printf("Error: d[%d] != r (%f, %d)\n", i, backup[i], i);
+  }
+
+  return 0;
+}
\ No newline at end of file
diff --git a/src/cuda/test/barrier_sync.cu b/src/cuda/test/barrier_sync.cu
new file mode 100644
index 000000000..c3fba7d96
--- /dev/null
+++ b/src/cuda/test/barrier_sync.cu
@@ -0,0 +1,52 @@
+#include <cstdio>
+#include <cstdlib>
+#include <sys/time.h>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+
+using Data = float;
+
+__device__ void rand_func() {
+  // throw-away busy work
+  int sum = 0;
+  for (int i = 0; i < 1000; i++)
+    sum += i;
+  (void)sum;
+}
+
+__device__ void iterate(int id) {
+  for (int j = id; j < 10000; j++) {
+    if (j % 2 == 0) {
+      rand_func();
+    }
+  }
+}
+
+__global__ void check_sync(Data *d, int no_threads) {
+  // in every round exactly one thread of the block does work while the
+  // others wait at the barrier
+  for (int i = 0; i < no_threads * no_threads; i++) {
+    if ((i % no_threads) == (int)threadIdx.x) {
+      iterate(threadIdx.x);
+    }
+    __syncthreads();
+  }
+}
+
+int main(void) {
+  const int n = 1 << 10;
+
+  const int thr = 1024;
+  const int blocks = (n + thr - 1) / thr;
+
+  printf("Threads/block:%d blocks/grid:%d\n", thr, blocks);
+
+  Data *d_d;
+  cudaMalloc(&d_d, n * sizeof(Data));
+  struct timeval t1, t2;
+
+  gettimeofday(&t1, 0);
+  check_sync<<<blocks, thr>>>(d_d, n);
+  cudaDeviceSynchronize();
+  gettimeofday(&t2, 0);
+  auto time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0 / 1000.0;
+  printf("Time: %f s\n", time);
+
+  return 0;
+}
\ No newline at end of file