From 650c9710a475839205e920229459d0f0db0e9911 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Tue, 12 Oct 2021 12:28:47 +0200 Subject: [PATCH 01/29] add cooperative groups --- .../CUDAUtilities/interface/cudaCompat.h | 116 +++++++++++++++++- 1 file changed, 115 insertions(+), 1 deletion(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h index a7e4e963a87c6..c5f6d2e0c2af6 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h @@ -21,6 +21,9 @@ #undef __forceinline__ #define __forceinline__ inline __attribute__((always_inline)) +#undef __launch_bounds__ +#define __launch_bounds__(...) + namespace cms { namespace cudacompat { @@ -106,14 +109,125 @@ namespace cms { } inline void __syncthreads() {} - inline void __threadfence() {} inline bool __syncthreads_or(bool x) { return x; } inline bool __syncthreads_and(bool x) { return x; } + + inline void __trap() { abort(); } + inline void __threadfence() {} + template inline T __ldg(T const* x) { return *x; } + namespace cooperative_groups { + + // This class represents the thread block + class thread_block { + private: + thread_block() = default; + + friend thread_block this_thread_block(); + + public: + // Synchronize the threads named in the group. + // On the serial CPU implementation, do nothing. + static void sync() {} + + // Total number of threads in the group. + // On the serial CPU implementation, always 1. + static unsigned long long size() { return 1; } + + // Rank of the calling thread within [0, size-1]. + // On the serial CPU implementation, always 0. + static unsigned long long thread_rank() { return 0; } + + // 3-Dimensional index of the block within the launched grid. + // On the serial CPU implementation, always {0, 0, 0}. + static dim3 group_index() { return blockIdx; } + + // 3-Dimensional index of the thread within the launched block + // On the serial CPU implementation, always {0, 0, 0}. + static dim3 thread_index() { return threadIdx; } + + // Dimensions of the launched block. + // On the serial CPU implementation, always {1, 1, 1}. + static dim3 group_dim() { return blockDim; } + }; + + // Return the current thread block + inline thread_block this_thread_block() { return thread_block{}; } + + // Represent a tiled group of threads, with compile-time fixed size. + // On the serial CPU implementation, the only valid Size is 1 + template + class thread_block_tile { + private: + static_assert( + Size == 1, + "The cudaCompat Cooperative Groups implementation supports only tiled groups of a single thread."); + + thread_block_tile() = default; + + friend thread_block_tile tiled_partition(const ParentT& g); + + public: + // Synchronize the threads named in the group. + // On the serial CPU implementation, do nothing. + void sync() const {} + + // Total number of threads in the group. + // On the serial CPU implementation, always 1. + unsigned long long size() const { return 1; } + + // Rank of the calling thread within [0, size-1]. + // On the serial CPU implementation, always 0. + unsigned long long thread_rank() const { return 0; } + + // Returns the number of groups created when the parent group was partitioned. + // On the serial CPU implementation, always 1. + unsigned long long meta_group_size() const { return 1; } + + // Linear rank of the group within the set of tiles partitioned from a parent group (bounded by meta_group_size). 
+ // On the serial CPU implementation, always 0. + unsigned long long meta_group_rank() const { return 0; } + + // Not implemented - Refer to Warp Shuffle Functions + template + T shfl(T var, unsigned int src_rank) const; + template + T shfl_up(T var, int delta) const; + template + T shfl_down(T var, int delta) const; + template + T shfl_xor(T var, int delta) const; + + // Not implemented - Refer to Warp Vote Functions + template + T any(int predicate) const; + template + T all(int predicate) const; + template + T ballot(int predicate) const; + + // Not implemented - Refer to Warp Match Functions + template + T match_any(T val) const; + template + T match_all(T val, int& pred) const; + }; + + template + inline thread_block_tile tiled_partition(const ParentT& g) { + static_assert( + Size == 1, + "The cudaCompat Cooperative Groups implementation supports only tiled groups of a single thread."); + + return thread_block_tile{}; + } + + } // namespace cooperative_groups + } // namespace cudacompat } // namespace cms From e654481c42b9c5065d5588a6d02613782b3babe9 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Tue, 12 Oct 2021 17:56:10 +0200 Subject: [PATCH 02/29] works with coop --- .../CUDAUtilities/interface/cudaCompat.h | 15 ++++++ .../CUDAUtilities/interface/prefixScan.h | 46 +++++++++++++++++++ .../CUDAUtilities/test/prefixScan_t.cu | 37 ++++++++++++++- 3 files changed, 96 insertions(+), 2 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h index c5f6d2e0c2af6..c2bc3f04450c0 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h @@ -122,6 +122,21 @@ namespace cms { namespace cooperative_groups { + // This class represents the grid group + class grid_group { + private: + grid_group() = default; + + friend grid_group this_grid(); + + public: + // Synchronize the threads named in the group. + // On the serial CPU implementation, do nothing. 
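+      // (Note, stated here as an assumption about CUDA semantics rather than taken
+      // from this patch: on a real GPU a grid-wide sync is only valid for kernels
+      // started with cudaLaunchCooperativeKernel; the serial CPU backend runs a
+      // single thread, so a no-op preserves those semantics.)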
+ static void sync() {} + }; + + inline grid_group this_grid() { return grid_group{};} + // This class represents the thread block class thread_block { private: diff --git a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h index 1a779fc677ff7..e4ff57e0eb415 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h +++ b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h @@ -6,6 +6,7 @@ #include "FWCore/Utilities/interface/CMSUnrollLoop.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include #ifdef __CUDA_ARCH__ @@ -183,6 +184,51 @@ namespace cms { co[i] += psum[k]; } } + + + template + __device__ void coopBlockPrefixScan(T const* ici, T* ico, int32_t size, T * ipsum) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + volatile T const* ci = ici; + volatile T* co = ico; + volatile T* psum = ipsum; + + __shared__ T ws[32]; + + // assert(blockDim.x * gridDim.x >= size); + + int nChunks = size/blockDim.x + 1; + + // first each block does a scan + for (int ib=blockIdx.x; ib 0) { + int ls = std::min(int(blockDim.x), size - off); + blockPrefixScan(ci + off, co + off, ls, ws); + psum[ib] = co[off+ls-1]; + __syncthreads(); + } + } + grid.sync(); + + // good each block has done his work + // let's get the partial sums from each block + if (0==blockIdx.x) { + blockPrefixScan(psum, psum, nChunks, ws); + } + + grid.sync(); + + for (int ib=blockIdx.x; ib +__global__ void doCoop(T const* ici, T* ico, int32_t size, T * ipsum) { + coopBlockPrefixScan(ici,ico,size,ipsum); +} + int main() { cms::cudatest::requireDevices(); @@ -113,9 +118,9 @@ int main() { int num_items = 200; for (int ksize = 1; ksize < 4; ++ksize) { // test multiblock - std::cout << "multiblok" << std::endl; // Declare, allocate, and initialize device-accessible pointers for input and output num_items *= 10; + std::cout << "multiblok " << num_items << std::endl; uint32_t *d_in; uint32_t *d_out1; uint32_t *d_out2; @@ -141,7 +146,35 @@ int main() { cudaCheck(cudaGetLastError()); verify<<>>(d_out1, num_items); cudaCheck(cudaGetLastError()); - cudaDeviceSynchronize(); + cudaCheck(cudaDeviceSynchronize()); + cudaCheck(cudaGetLastError()); + + uint32_t *d_psum; + cudaCheck(cudaMalloc(&d_psum, nblocks*sizeof(uint32_t))); + std::cout << "launch coopBlockPrefixScan " << num_items << ' ' << nblocks << std::endl; + int numBlocksPerSm = 0; + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, (void*)doCoop, nthreads, 0); + std::cout << "max number of blocks is " << deviceProp.multiProcessorCount*numBlocksPerSm << std::endl; + auto ncoopblocks = std::min(nblocks,deviceProp.multiProcessorCount*numBlocksPerSm); + void *kernelArgs[] = { &d_in, &d_out2, &num_items, &d_psum }; + dim3 dimBlock(nthreads, 1, 1); + dim3 dimGrid(ncoopblocks, 1, 1); + // launch + cudaLaunchCooperativeKernel((void*)doCoop, dimGrid, dimBlock, kernelArgs); + cudaCheck(cudaGetLastError()); + verify<<>>(d_out2, num_items); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + cudaCheck(cudaGetLastError()); + + std::cout << "Free" << std::endl; + cudaCheck(cudaFree(d_psum)); + cudaCheck(cudaFree(d_out2)); + cudaCheck(cudaFree(d_out1)); + cudaCheck(cudaFree(d_in)); + } // ksize return 0; From 061469415ea4a4914a162689dac0bf07986ca395 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Wed, 13 Oct 2021 
12:08:45 +0200 Subject: [PATCH 03/29] coop works in assoc --- .../CUDAUtilities/interface/OneToManyAssoc.h | 67 ++++++++++++++++--- .../CUDAUtilities/interface/maxCoopBlocks.h | 16 +++++ .../CUDAUtilities/test/OneToManyAssoc_t.h | 63 ++++++++++++++--- .../CUDAUtilities/test/prefixScan_t.cu | 10 ++- 4 files changed, 130 insertions(+), 26 deletions(-) create mode 100644 HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h diff --git a/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h b/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h index 01f48bca94f4b..7a3876881ab7d 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h +++ b/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h @@ -29,6 +29,25 @@ namespace cms { index_type *contentStorage = nullptr; int32_t offSize = -1; int32_t contentSize = -1; + + constexpr Counter * offsets() const { + Counter *poff = (Counter *)((char *)(assoc) + offsetof(Assoc, off)); + if constexpr (Assoc::ctNOnes() < 0) { + assert(offStorage); + poff = offStorage; + } + return poff; + } + + constexpr int32_t size() const { + auto nOnes = Assoc::ctNOnes(); + if constexpr (Assoc::ctNOnes() < 0) { + nOnes = offSize; + } + assert(nOnes > 0); + return nOnes; + } + }; // this MUST BE DONE in a single block (or in two kernels!) @@ -50,6 +69,29 @@ namespace cms { } } + + template + __device__ void zeroAndInitCoop(OneToManyAssocView view) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + + auto h = view.assoc; + + auto first = blockDim.x * blockIdx.x + threadIdx.x; + + if (0 == first) { + h->psws = 0; + h->initStorage(view); + } + + grid.sync(); + for (int i = first, nt = h->totOnes(); i < nt; i += gridDim.x * blockDim.x) { + h->off[i] = 0; + } + + } + + template inline __attribute__((always_inline)) void launchZero(Assoc *h, cudaStream_t stream @@ -111,16 +153,8 @@ namespace cms { auto h = view.assoc; assert(h); #ifdef __CUDACC__ - using Counter = typename Assoc::Counter; - Counter *poff = (Counter *)((char *)(h) + offsetof(Assoc, off)); - auto nOnes = Assoc::ctNOnes(); - if constexpr (Assoc::ctNOnes() < 0) { - assert(view.offStorage); - assert(view.offSize > 0); - nOnes = view.offSize; - poff = view.offStorage; - } - assert(nOnes > 0); + auto poff = view.offsets(); + auto nOnes = view.size(); int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Assoc, psws)); auto nthreads = 1024; auto nblocks = (nOnes + nthreads - 1) / nthreads; @@ -131,6 +165,19 @@ namespace cms { #endif } + template + __device__ __inline__ void finalizeCoop(OneToManyAssocView view, typename Assoc::Counter * ws) { +#ifdef __CUDACC__ + auto poff = view.offsets(); + auto nOnes = view.size(); + coopBlockPrefixScan(poff, poff, nOnes, ws); +#else + auto h = view.assoc; + assert(h); + h->finalize(); +#endif + } + template __global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) { assoc->bulkFinalizeFill(*apc); diff --git a/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h b/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h new file mode 100644 index 0000000000000..1522af45e247b --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h @@ -0,0 +1,16 @@ + +#ifndef HeterogeneousCore_CUDAUtilities_interface_maxCoopBlocks_h +#define HeterogeneousCore_CUDAUtilities_interface_maxCoopBlocks_h + +#include + +template +inline int maxCoopBlocks(F kernel, int nthreads, int shmem, int device) { + int numBlocksPerSm = 0; + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 
device); + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, kernel, nthreads, shmem); + return deviceProp.multiProcessorCount*numBlocksPerSm; +} + +#endif diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h index 8ba9158cb69b7..c369e8da2c7bd 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h +++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h @@ -11,6 +11,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" +#include "HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h" #endif #include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h" @@ -58,7 +59,7 @@ __global__ void verifyMulti(Multiplicity* __restrict__ m1, Multiplicity* __restr assert(m1->off[i] == m2->off[i]); } -__global__ void count(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { +__device__ __inline__ void count(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int i = first; i < 4 * n; i += gridDim.x * blockDim.x) { auto k = i / 4; @@ -70,8 +71,12 @@ __global__ void count(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int3 assoc->count(tk[k][j]); } } +__global__ void countKernel(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { + count(tk,assoc,n); +} + -__global__ void fill(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { +__device__ __inline__ void fill(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int i = first; i < 4 * n; i += gridDim.x * blockDim.x) { auto k = i / 4; @@ -84,6 +89,24 @@ __global__ void fill(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32 } } +__global__ void fillKernel(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { + fill(tk,assoc,n); +} + + +__global__ void populate(TK const* __restrict__ tk, Assoc::View view, int32_t n, Assoc::View::Counter * ws) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + auto h = view.assoc; + zeroAndInitCoop(view); + grid.sync(); + count(tk,h,n); + grid.sync(); + finalizeCoop(view, ws); + grid.sync(); + fill(tk,h,n); +} + __global__ void verify(Assoc* __restrict__ assoc) { assert(int(assoc->size()) < assoc->capacity()); } template @@ -118,7 +141,7 @@ __global__ void verifyFill(Assoc const* __restrict__ la, int n) { imax = std::max(imax, int(x)); } assert(0 == la->size(n)); - printf("found with %d elements %f %d %d\n", n, double(ave) / n, imax, z); + printf("found with %d elements %f %d %d\n\n", n, double(ave) / n, imax, z); } template @@ -179,7 +202,7 @@ int main() { std::geometric_distribution rdm(0.8); - constexpr uint32_t N = 4000; + uint32_t N = 4000; std::vector> tr(N); @@ -249,21 +272,41 @@ int main() { launchZero(saView, 0); #ifdef __CUDACC__ - auto nThreads = 256; - auto nBlocks = (4 * N + nThreads - 1) / nThreads; + int nThreads = 256; + int nBlocks = (4 * N + nThreads - 1) / nThreads; - count<<>>(v_d.get(), a_d.get(), N); + countKernel<<>>(v_d.get(), a_d.get(), N); launchFinalize(aView, 0); verify<<<1, 1>>>(a_d.get()); - fill<<>>(v_d.get(), a_d.get(), N); + fillKernel<<>>(v_d.get(), a_d.get(), N); + verifyFill<<<1, 1>>>(a_d.get(), n); + + // now with cooperative gropus + + auto nOnes = aView.size(); + auto nchunks = 
nOnes/nThreads + 1; + auto ws = cms::cuda::make_device_unique(nchunks,0); + + int maxBlocks = maxCoopBlocks(populate, nThreads, 0,0); + std::cout << "max number of blocks is " << maxBlocks << std::endl; + auto ncoopblocks = std::min(nBlocks,maxBlocks); + auto a1 = v_d.get(); + auto a4 = ws.get(); + void *kernelArgs[] = { &a1, &aView, &N, &a4 }; + dim3 dimBlock(nThreads, 1, 1); + dim3 dimGrid(ncoopblocks, 1, 1); + // launch + cudaCheck(cudaLaunchCooperativeKernel((void*)populate, dimGrid, dimBlock, kernelArgs)); verifyFill<<<1, 1>>>(a_d.get(), n); + cudaCheck(cudaGetLastError()); + #else - count(v_d, a_d.get(), N); + countKernel(v_d, a_d.get(), N); launchFinalize(aView); verify(a_d.get()); - fill(v_d, a_d.get(), N); + fillKernel(v_d, a_d.get(), N); verifyFill(a_d.get(), n); #endif diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu index 23575fd418e40..909b2886e32ef 100644 --- a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu @@ -3,6 +3,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h" using namespace cms::cuda; @@ -152,12 +153,9 @@ int main() { uint32_t *d_psum; cudaCheck(cudaMalloc(&d_psum, nblocks*sizeof(uint32_t))); std::cout << "launch coopBlockPrefixScan " << num_items << ' ' << nblocks << std::endl; - int numBlocksPerSm = 0; - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, (void*)doCoop, nthreads, 0); - std::cout << "max number of blocks is " << deviceProp.multiProcessorCount*numBlocksPerSm << std::endl; - auto ncoopblocks = std::min(nblocks,deviceProp.multiProcessorCount*numBlocksPerSm); + int maxBlocks = maxCoopBlocks(doCoop, nthreads, 0,0); + std::cout << "max number of blocks is " << maxBlocks << std::endl; + auto ncoopblocks = std::min(nblocks,maxBlocks); void *kernelArgs[] = { &d_in, &d_out2, &num_items, &d_psum }; dim3 dimBlock(nthreads, 1, 1); dim3 dimGrid(ncoopblocks, 1, 1); // launch cudaLaunchCooperativeKernel((void*)doCoop, dimGrid, dimBlock, kernelArgs); cudaCheck(cudaGetLastError()); From df689169dfef93892a1b61095c7ec032bec2f976 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Wed, 13 Oct 2021 13:52:00 +0200 Subject: [PATCH 04/29] coops implemented in histo filling --- .../CUDAUtilities/interface/HistoContainer.h | 93 ++++++++++++++++++- .../CUDAUtilities/test/BuildFile.xml | 8 ++ .../CUDAUtilities/test/HistoContainer_t.cu | 5 + 3 files changed, 102 insertions(+), 4 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index 7bf5db603bccd..02da35a8bd39e 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -3,11 +3,17 @@ #include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h" +#ifdef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h" +#endif + + namespace cms { namespace cuda { template - __global__ void countFromVector(Histo *__restrict__ h, + __device__ __inline__ void countFromVector(Histo *__restrict__ h, uint32_t nh, T const *__restrict__ v, uint32_t const *__restrict__ offsets) { @@ -23,7 +29,15 @@ namespace cms { } template - __global__ void
fillFromVector(Histo *__restrict__ h, + __global__ void countFromVectorKernel(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + countFromVector(h,nh,v,offsets); + } + + template + __device__ __inline__ void fillFromVector(Histo *__restrict__ h, uint32_t nh, T const *__restrict__ v, uint32_t const *__restrict__ offsets) { @@ -38,6 +52,15 @@ namespace cms { } } + template + __global__ void fillFromVectorKernel(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + fillFromVector(h,nh,v,offsets); + } + + template inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, uint32_t nh, @@ -56,18 +79,80 @@ namespace cms { #ifdef __CUDACC__ auto nblocks = (totSize + nthreads - 1) / nthreads; assert(nblocks > 0); - countFromVector<<>>(h, nh, v, offsets); + countFromVectorKernel<<>>(h, nh, v, offsets); cudaCheck(cudaGetLastError()); launchFinalize(view, stream); - fillFromVector<<>>(h, nh, v, offsets); + fillFromVectorKernel<<>>(h, nh, v, offsets); + cudaCheck(cudaGetLastError()); +#else + countFromVectorKernel(h, nh, v, offsets); + h->finalize(); + fillFromVectorKernel(h, nh, v, offsets); +#endif + } + + +#ifdef __CUDACC__ + template + __global__ void fillManyFromVectorCoopKernel(typename Histo::View view, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + int32_t totSize, typename Histo::View::Counter * ws) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + auto h = static_cast(view.assoc); + zeroAndInitCoop(view); + grid.sync(); + countFromVector(h, nh, v, offsets); + grid.sync(); + finalizeCoop(view, ws); + grid.sync(); + fillFromVector(h, nh, v, offsets); + } +#endif + + template + inline __attribute__((always_inline)) void fillManyFromVectorCoop(Histo * h, + uint32_t nh, + T const * v, + uint32_t const * offsets, + int32_t totSize, + int nthreads, + typename Histo::index_type *mem, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + using View = typename Histo::View; + View view = {h, nullptr, mem, -1, totSize}; +#ifdef __CUDACC__ + auto kernel = fillManyFromVectorCoopKernel; + auto nblocks = (totSize + nthreads - 1) / nthreads; + assert(nblocks > 0); + auto nOnes = view.size(); + auto nchunks = nOnes/nthreads + 1; + auto ws = cms::cuda::make_device_unique(nchunks,stream); + auto wsp = ws.get(); + int maxBlocks = maxCoopBlocks(kernel, nthreads, 0,0); + auto ncoopblocks = std::min(nblocks,maxBlocks); + assert(ncoopblocks>0); + void *kernelArgs[] = { &view, &nh, &v, &offsets, &totSize, &wsp }; + dim3 dimBlock(nthreads, 1, 1); + dim3 dimGrid(ncoopblocks, 1, 1); + // launch + cudaCheck(cudaLaunchCooperativeKernel((void*)kernel, dimGrid, dimBlock, kernelArgs, 0, stream)); cudaCheck(cudaGetLastError()); #else + launchZero(view, stream); countFromVector(h, nh, v, offsets); h->finalize(); fillFromVector(h, nh, v, offsets); #endif } + // iteratate over N bins left and right of the one containing "v" template __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index 53d41efcf4236..72f34863ab8dd 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -52,6 +52,14 @@ + + + + + + + + diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu 
b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu index 75f9cc0e626f5..67ec589bb9bc6 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -72,7 +72,12 @@ void go() { cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); +#ifdef COOP + fillManyFromVectorCoop(h_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, nullptr, 0); +#else fillManyFromVector(h_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, nullptr, 0); +#endif + cudaCheck(cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost)); assert(0 == h.off[0]); assert(offsets[10] == h.size()); From f71031aecef3b53d7f9b3f9eaf1383959ec3e34b Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Wed, 13 Oct 2021 14:04:24 +0200 Subject: [PATCH 05/29] format --- .../CUDAUtilities/interface/HistoContainer.h | 79 +++++++++---------- .../CUDAUtilities/interface/OneToManyAssoc.h | 14 ++-- .../CUDAUtilities/interface/cudaCompat.h | 12 +-- .../CUDAUtilities/interface/maxCoopBlocks.h | 14 ++-- .../CUDAUtilities/interface/prefixScan.h | 27 ++++--- .../CUDAUtilities/test/OneToManyAssoc_t.h | 31 +++----- .../CUDAUtilities/test/prefixScan_t.cu | 17 ++-- 7 files changed, 90 insertions(+), 104 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index 02da35a8bd39e..49e198bf43332 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -8,15 +8,14 @@ #include "HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h" #endif - namespace cms { namespace cuda { template __device__ __inline__ void countFromVector(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); @@ -30,17 +29,17 @@ namespace cms { template __global__ void countFromVectorKernel(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { - countFromVector(h,nh,v,offsets); + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + countFromVector(h, nh, v, offsets); } template __device__ __inline__ void fillFromVector(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); @@ -54,12 +53,11 @@ namespace cms { template __global__ void fillFromVectorKernel(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { - fillFromVector(h,nh,v,offsets); - } - + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + fillFromVector(h, nh, v, offsets); + } template inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, @@ -91,14 +89,14 @@ namespace cms { #endif } - #ifdef __CUDACC__ template - __global__ void fillManyFromVectorCoopKernel(typename Histo::View view, - uint32_t 
nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets, - int32_t totSize, typename Histo::View::Counter * ws) { + __global__ void fillManyFromVectorCoopKernel(typename Histo::View view, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + int32_t totSize, + typename Histo::View::Counter *ws) { namespace cg = cooperative_groups; auto grid = cg::this_grid(); auto h = static_cast(view.assoc); @@ -113,36 +111,36 @@ namespace cms { #endif template - inline __attribute__((always_inline)) void fillManyFromVectorCoop(Histo * h, - uint32_t nh, - T const * v, - uint32_t const * offsets, - int32_t totSize, - int nthreads, - typename Histo::index_type *mem, - cudaStream_t stream + inline __attribute__((always_inline)) void fillManyFromVectorCoop(Histo *h, + uint32_t nh, + T const *v, + uint32_t const *offsets, + int32_t totSize, + int nthreads, + typename Histo::index_type *mem, + cudaStream_t stream #ifndef __CUDACC__ - = cudaStreamDefault + = cudaStreamDefault #endif ) { using View = typename Histo::View; View view = {h, nullptr, mem, -1, totSize}; #ifdef __CUDACC__ - auto kernel = fillManyFromVectorCoopKernel; + auto kernel = fillManyFromVectorCoopKernel; auto nblocks = (totSize + nthreads - 1) / nthreads; assert(nblocks > 0); auto nOnes = view.size(); - auto nchunks = nOnes/nthreads + 1; - auto ws = cms::cuda::make_device_unique(nchunks,stream); + auto nchunks = nOnes / nthreads + 1; + auto ws = cms::cuda::make_device_unique(nchunks, stream); auto wsp = ws.get(); - int maxBlocks = maxCoopBlocks(kernel, nthreads, 0,0); - auto ncoopblocks = std::min(nblocks,maxBlocks); - assert(ncoopblocks>0); - void *kernelArgs[] = { &view, &nh, &v, &offsets, &totSize, &wsp }; + int maxBlocks = maxCoopBlocks(kernel, nthreads, 0, 0); + auto ncoopblocks = std::min(nblocks, maxBlocks); + assert(ncoopblocks > 0); + void *kernelArgs[] = {&view, &nh, &v, &offsets, &totSize, &wsp}; dim3 dimBlock(nthreads, 1, 1); dim3 dimGrid(ncoopblocks, 1, 1); // launch - cudaCheck(cudaLaunchCooperativeKernel((void*)kernel, dimGrid, dimBlock, kernelArgs, 0, stream)); + cudaCheck(cudaLaunchCooperativeKernel((void *)kernel, dimGrid, dimBlock, kernelArgs, 0, stream)); cudaCheck(cudaGetLastError()); #else launchZero(view, stream); @@ -152,7 +150,6 @@ namespace cms { #endif } - // iteratate over N bins left and right of the one containing "v" template __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { diff --git a/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h b/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h index 7a3876881ab7d..b7bf7ca4f354f 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h +++ b/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h @@ -30,11 +30,11 @@ namespace cms { int32_t offSize = -1; int32_t contentSize = -1; - constexpr Counter * offsets() const { + constexpr Counter *offsets() const { Counter *poff = (Counter *)((char *)(assoc) + offsetof(Assoc, off)); if constexpr (Assoc::ctNOnes() < 0) { assert(offStorage); - poff = offStorage; + poff = offStorage; } return poff; } @@ -45,9 +45,8 @@ namespace cms { nOnes = offSize; } assert(nOnes > 0); - return nOnes; - } - + return nOnes; + } }; // this MUST BE DONE in a single block (or in two kernels!) 
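The single-block constraint in the comment above is a synchronization-scope limit: in a classic launch the widest in-kernel barrier is __syncthreads(), which only orders the threads of one block, so initStorage() and the zeroing of the counters can only be ordered within a single block (or split across two kernels). zeroAndInitCoop lifts that limit by ordering the same two steps across the whole grid with grid.sync(). A minimal sketch of the contrast, assuming a view type with the assoc, initStorage(), totOnes() and off members used in this series (an illustration, not the actual kernels):

  // Illustration only -- simplified from zeroAndInit/zeroAndInitCoop in this series.
  // Needs #include <cooperative_groups.h> for the second variant.
  template <typename V>
  __global__ void initOneBlock(V view) {  // classic launch: must be <<<1, nthreads>>>
    assert(1 == gridDim.x);
    if (0 == threadIdx.x)
      view.assoc->initStorage(view);
    __syncthreads();  // block-wide barrier: sufficient only because there is one block
    for (int i = threadIdx.x, nt = view.assoc->totOnes(); i < nt; i += blockDim.x)
      view.assoc->off[i] = 0;
  }

  template <typename V>
  __global__ void initCoop(V view) {  // requires cudaLaunchCooperativeKernel
    auto grid = cooperative_groups::this_grid();
    int first = blockDim.x * blockIdx.x + threadIdx.x;
    if (0 == first)
      view.assoc->initStorage(view);
    grid.sync();  // grid-wide barrier: every block then sees the initialized storage
    for (int i = first, nt = view.assoc->totOnes(); i < nt; i += gridDim.x * blockDim.x)
      view.assoc->off[i] = 0;
  }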
@@ -69,7 +68,6 @@ namespace cms { } } - template __device__ void zeroAndInitCoop(OneToManyAssocView view) { namespace cg = cooperative_groups; @@ -88,10 +86,8 @@ namespace cms { for (int i = first, nt = h->totOnes(); i < nt; i += gridDim.x * blockDim.x) { h->off[i] = 0; } - } - template inline __attribute__((always_inline)) void launchZero(Assoc *h, cudaStream_t stream @@ -166,7 +162,7 @@ namespace cms { } template - __device__ __inline__ void finalizeCoop(OneToManyAssocView view, typename Assoc::Counter * ws) { + __device__ __inline__ void finalizeCoop(OneToManyAssocView view, typename Assoc::Counter *ws) { #ifdef __CUDACC__ auto poff = view.offsets(); auto nOnes = view.size(); diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h index c2bc3f04450c0..bce1c9493606c 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h @@ -123,19 +123,19 @@ namespace cms { namespace cooperative_groups { // This class represents the grid group - class grid_group { - private: - grid_group() = default; + class grid_group { + private: + grid_group() = default; - friend grid_group this_grid(); + friend grid_group this_grid(); public: // Synchronize the threads named in the group. // On the serial CPU implementation, do nothing. static void sync() {} - }; + }; - inline grid_group this_grid() { return grid_group{};} + inline grid_group this_grid() { return grid_group{}; } // This class represents the thread block class thread_block { diff --git a/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h b/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h index 1522af45e247b..44490110eeb28 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h +++ b/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h @@ -2,15 +2,15 @@ #ifndef HeterogeneousCore_CUDAUtilities_interface_maxCoopBlocks_h #define HeterogeneousCore_CUDAUtilities_interface_maxCoopBlocks_h -#include +#include -template +template inline int maxCoopBlocks(F kernel, int nthreads, int shmem, int device) { - int numBlocksPerSm = 0; - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, device); - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, kernel, nthreads, shmem); - return deviceProp.multiProcessorCount*numBlocksPerSm; + int numBlocksPerSm = 0; + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, device); + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, kernel, nthreads, shmem); + return deviceProp.multiProcessorCount * numBlocksPerSm; } #endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h index e4ff57e0eb415..d305e2dc13e4e 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h +++ b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h @@ -185,9 +185,8 @@ namespace cms { } } - template - __device__ void coopBlockPrefixScan(T const* ici, T* ico, int32_t size, T * ipsum) { + __device__ void coopBlockPrefixScan(T const* ici, T* ico, int32_t size, T* ipsum) { namespace cg = cooperative_groups; auto grid = cg::this_grid(); volatile T const* ci = ici; @@ -198,34 +197,36 @@ namespace cms { // assert(blockDim.x * gridDim.x >= size); - int nChunks = size/blockDim.x + 1; + int nChunks = size / blockDim.x + 1; // first each block does a scan - for (int ib=blockIdx.x; ib 0) { - int ls = std::min(int(blockDim.x), size - off); - blockPrefixScan(ci 
+ off, co + off, ls, ws); - psum[ib] = co[off+ls-1]; - __syncthreads(); + int ls = std::min(int(blockDim.x), size - off); + blockPrefixScan(ci + off, co + off, ls, ws); + psum[ib] = co[off + ls - 1]; + __syncthreads(); } } grid.sync(); // good each block has done his work // let's get the partial sums from each block - if (0==blockIdx.x) { + if (0 == blockIdx.x) { blockPrefixScan(psum, psum, nChunks, ws); } grid.sync(); - for (int ib=blockIdx.x; ibcount(tk[k][j]); } } -__global__ void countKernel(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { - count(tk,assoc,n); -} - +__global__ void countKernel(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { count(tk, assoc, n); } __device__ __inline__ void fill(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { int first = blockDim.x * blockIdx.x + threadIdx.x; @@ -89,22 +86,19 @@ __device__ __inline__ void fill(TK const* __restrict__ tk, Assoc* __restrict__ a } } -__global__ void fillKernel(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { - fill(tk,assoc,n); -} +__global__ void fillKernel(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { fill(tk, assoc, n); } - -__global__ void populate(TK const* __restrict__ tk, Assoc::View view, int32_t n, Assoc::View::Counter * ws) { +__global__ void populate(TK const* __restrict__ tk, Assoc::View view, int32_t n, Assoc::View::Counter* ws) { namespace cg = cooperative_groups; auto grid = cg::this_grid(); auto h = view.assoc; zeroAndInitCoop(view); grid.sync(); - count(tk,h,n); + count(tk, h, n); grid.sync(); finalizeCoop(view, ws); grid.sync(); - fill(tk,h,n); + fill(tk, h, n); } __global__ void verify(Assoc* __restrict__ assoc) { assert(int(assoc->size()) < assoc->capacity()); } @@ -285,15 +279,15 @@ int main() { // now with cooperative gropus auto nOnes = aView.size(); - auto nchunks = nOnes/nThreads + 1; - auto ws = cms::cuda::make_device_unique(nchunks,0); + auto nchunks = nOnes / nThreads + 1; + auto ws = cms::cuda::make_device_unique(nchunks, 0); - int maxBlocks = maxCoopBlocks(populate, nThreads, 0,0); + int maxBlocks = maxCoopBlocks(populate, nThreads, 0, 0); std::cout << "max number of blocks is " << maxBlocks << std::endl; - auto ncoopblocks = std::min(nBlocks,maxBlocks); - auto a1 = v_d.get(); - auto a4 = ws.get(); - void *kernelArgs[] = { &a1, &aView, &N, &a4 }; + auto ncoopblocks = std::min(nBlocks, maxBlocks); + auto a1 = v_d.get(); + auto a4 = ws.get(); + void* kernelArgs[] = {&a1, &aView, &N, &a4}; dim3 dimBlock(nThreads, 1, 1); dim3 dimGrid(ncoopblocks, 1, 1); // launch @@ -301,7 +295,6 @@ int main() { verifyFill<<<1, 1>>>(a_d.get(), n); cudaCheck(cudaGetLastError()); - #else countKernel(v_d, a_d.get(), N); launchFinalize(aView); diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu index 909b2886e32ef..ef0a785e00bd5 100644 --- a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu @@ -84,9 +84,9 @@ __global__ void verify(uint32_t const *v, uint32_t n) { printf("verify\n"); } -template -__global__ void doCoop(T const* ici, T* ico, int32_t size, T * ipsum) { - coopBlockPrefixScan(ici,ico,size,ipsum); +template +__global__ void doCoop(T const *ici, T *ico, int32_t size, T *ipsum) { + coopBlockPrefixScan(ici, ico, size, ipsum); } int main() { @@ -151,16 +151,16 @@ int main() { cudaCheck(cudaGetLastError()); uint32_t *d_psum; - cudaCheck(cudaMalloc(&d_psum, nblocks*sizeof(uint32_t))); + 
cudaCheck(cudaMalloc(&d_psum, nblocks * sizeof(uint32_t))); std::cout << "launch coopBlockPrefixScan " << num_items << ' ' << nblocks << std::endl; - int maxBlocks = maxCoopBlocks(doCoop, nthreads, 0,0); + int maxBlocks = maxCoopBlocks(doCoop, nthreads, 0, 0); std::cout << "max number of blocks is " << maxBlocks << std::endl; - auto ncoopblocks = std::min(nblocks,maxBlocks); - void *kernelArgs[] = { &d_in, &d_out2, &num_items, &d_psum }; + auto ncoopblocks = std::min(nblocks, maxBlocks); + void *kernelArgs[] = {&d_in, &d_out2, &num_items, &d_psum}; dim3 dimBlock(nthreads, 1, 1); dim3 dimGrid(ncoopblocks, 1, 1); // launch - cudaLaunchCooperativeKernel((void*)doCoop, dimGrid, dimBlock, kernelArgs); + cudaLaunchCooperativeKernel((void *)doCoop, dimGrid, dimBlock, kernelArgs); cudaCheck(cudaGetLastError()); verify<<>>(d_out2, num_items); cudaCheck(cudaGetLastError()); @@ -173,7 +173,6 @@ int main() { cudaCheck(cudaFree(d_out1)); cudaCheck(cudaFree(d_in)); - } // ksize return 0; } From ac394e7d357b0d7fd511349b7f7b5bcf7123d624 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Wed, 13 Oct 2021 14:28:01 +0200 Subject: [PATCH 06/29] use in rechits --- RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu index 30d1ee2353a65..e70acffa5ecd2 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu @@ -61,7 +61,7 @@ namespace pixelgpudetails { setHitsLayerStart<<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart()); cudaCheck(cudaGetLastError()); - cms::cuda::fillManyFromVector( + cms::cuda::fillManyFromVectorCoop( hits_d.phiBinner(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256, hits_d.phiBinnerStorage(), stream); cudaCheck(cudaGetLastError()); From a3ab3fff8aaa84a3e48972c67077ed9cdfa47525 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 14 Oct 2021 12:03:37 +0200 Subject: [PATCH 07/29] factorize away algos --- .../CUDAUtilities/interface/HistoContainer.h | 162 ---------------- .../interface/HistoContainerAlgo.h | 179 ++++++++++++++++++ .../CUDAUtilities/test/HistoContainerRT_t.cu | 2 +- .../CUDAUtilities/test/HistoContainer_t.cpp | 2 +- .../CUDAUtilities/test/HistoContainer_t.cu | 2 +- .../CUDAUtilities/test/OneHistoContainer_t.cu | 2 +- .../plugins/gpuClustering.h | 2 +- .../plugins/PixelRecHitGPUKernel.cu | 1 + .../plugins/SiPixelRecHitSoAFromLegacy.cc | 1 + .../CAHitNtupletGeneratorKernelsImpl.h | 1 + .../plugins/gpuClusterTracksByDensity.h | 2 +- .../plugins/gpuClusterTracksDBSCAN.h | 2 +- .../plugins/gpuClusterTracksIterative.h | 2 +- .../plugins/gpuFitVertices.h | 2 +- .../PixelVertexFinding/plugins/gpuSortByPt2.h | 2 +- .../plugins/gpuSplitVertices.h | 2 +- 16 files changed, 193 insertions(+), 173 deletions(-) create mode 100644 HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index 49e198bf43332..e0329a13b1edb 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -11,168 +11,6 @@ namespace cms { namespace cuda { - template - __device__ __inline__ void countFromVector(Histo *__restrict__ h, - uint32_t nh, - T const 
*__restrict__ v, - uint32_t const *__restrict__ offsets) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { - auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); - assert((*off) > 0); - int32_t ih = off - offsets - 1; - assert(ih >= 0); - assert(ih < int(nh)); - (*h).count(v[i], ih); - } - } - - template - __global__ void countFromVectorKernel(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { - countFromVector(h, nh, v, offsets); - } - - template - __device__ __inline__ void fillFromVector(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { - auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); - assert((*off) > 0); - int32_t ih = off - offsets - 1; - assert(ih >= 0); - assert(ih < int(nh)); - (*h).fill(v[i], i, ih); - } - } - - template - __global__ void fillFromVectorKernel(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { - fillFromVector(h, nh, v, offsets); - } - - template - inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets, - int32_t totSize, - int nthreads, - typename Histo::index_type *mem, - cudaStream_t stream -#ifndef __CUDACC__ - = cudaStreamDefault -#endif - ) { - typename Histo::View view = {h, nullptr, mem, -1, totSize}; - launchZero(view, stream); -#ifdef __CUDACC__ - auto nblocks = (totSize + nthreads - 1) / nthreads; - assert(nblocks > 0); - countFromVectorKernel<<>>(h, nh, v, offsets); - cudaCheck(cudaGetLastError()); - launchFinalize(view, stream); - fillFromVectorKernel<<>>(h, nh, v, offsets); - cudaCheck(cudaGetLastError()); -#else - countFromVectorKernel(h, nh, v, offsets); - h->finalize(); - fillFromVectorKernel(h, nh, v, offsets); -#endif - } - -#ifdef __CUDACC__ - template - __global__ void fillManyFromVectorCoopKernel(typename Histo::View view, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets, - int32_t totSize, - typename Histo::View::Counter *ws) { - namespace cg = cooperative_groups; - auto grid = cg::this_grid(); - auto h = static_cast(view.assoc); - zeroAndInitCoop(view); - grid.sync(); - countFromVector(h, nh, v, offsets); - grid.sync(); - finalizeCoop(view, ws); - grid.sync(); - fillFromVector(h, nh, v, offsets); - } -#endif - - template - inline __attribute__((always_inline)) void fillManyFromVectorCoop(Histo *h, - uint32_t nh, - T const *v, - uint32_t const *offsets, - int32_t totSize, - int nthreads, - typename Histo::index_type *mem, - cudaStream_t stream -#ifndef __CUDACC__ - = cudaStreamDefault -#endif - ) { - using View = typename Histo::View; - View view = {h, nullptr, mem, -1, totSize}; -#ifdef __CUDACC__ - auto kernel = fillManyFromVectorCoopKernel; - auto nblocks = (totSize + nthreads - 1) / nthreads; - assert(nblocks > 0); - auto nOnes = view.size(); - auto nchunks = nOnes / nthreads + 1; - auto ws = cms::cuda::make_device_unique(nchunks, stream); - auto wsp = ws.get(); - int maxBlocks = maxCoopBlocks(kernel, nthreads, 0, 0); - auto ncoopblocks = std::min(nblocks, maxBlocks); - assert(ncoopblocks > 0); - void *kernelArgs[] = {&view, &nh, &v, &offsets, &totSize, &wsp}; - dim3 dimBlock(nthreads, 1, 1); 
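For orientation while this block moves to HistoContainerAlgo.h: fillManyFromVector is the classic two-pass fill (count per-bin multiplicities, prefix-scan the counts into offsets, then place every index), and the coop variant only fuses those passes into a single cooperative kernel with grid.sync() between them. A serial sketch of the two-pass idea itself, with hypothetical names that are not part of the CMSSW interface:

  // Serial sketch of the count / scan / fill pattern (illustration only).
  #include <algorithm>
  #include <cstdint>
  #include <vector>

  void twoPassFill(std::vector<uint32_t> const& bin,   // bin index of each element
                   std::vector<uint32_t>& off,         // nbins + 1 offsets (output)
                   std::vector<uint32_t>& content) {   // element indices grouped by bin
    std::fill(off.begin(), off.end(), 0);
    for (auto b : bin)
      ++off[b + 1];                                    // pass 1: count
    for (std::size_t i = 1; i < off.size(); ++i)
      off[i] += off[i - 1];                            // prefix scan: counts -> offsets
    std::vector<uint32_t> cursor(off.begin(), off.end() - 1);  // insert position per bin
    content.resize(bin.size());
    for (uint32_t i = 0; i < bin.size(); ++i)
      content[cursor[bin[i]]++] = i;                   // pass 2: fill
  }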
- dim3 dimGrid(ncoopblocks, 1, 1); - // launch - cudaCheck(cudaLaunchCooperativeKernel((void *)kernel, dimGrid, dimBlock, kernelArgs, 0, stream)); - cudaCheck(cudaGetLastError()); -#else - launchZero(view, stream); - countFromVector(h, nh, v, offsets); - h->finalize(); - fillFromVector(h, nh, v, offsets); -#endif - } - - // iteratate over N bins left and right of the one containing "v" - template - __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { - int bs = Hist::bin(value); - int be = std::min(int(Hist::nbins() - 1), bs + n); - bs = std::max(0, bs - n); - assert(be >= bs); - for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { - func(*pj); - } - } - - // iteratate over bins containing all values in window wmin, wmax - template - __host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { - auto bs = Hist::bin(wmin); - auto be = Hist::bin(wmax); - assert(be >= bs); - for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { - func(*pj); - } - } - template + __device__ __inline__ void countFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < int(nh)); + (*h).count(v[i], ih); + } + } + + template + __global__ void countFromVectorKernel(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + countFromVector(h, nh, v, offsets); + } + + template + __device__ __inline__ void fillFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < int(nh)); + (*h).fill(v[i], i, ih); + } + } + + template + __global__ void fillFromVectorKernel(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + fillFromVector(h, nh, v, offsets); + } + + template + inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + int32_t totSize, + int nthreads, + typename Histo::index_type *mem, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + typename Histo::View view = {h, nullptr, mem, -1, totSize}; + launchZero(view, stream); +#ifdef __CUDACC__ + auto nblocks = (totSize + nthreads - 1) / nthreads; + assert(nblocks > 0); + countFromVectorKernel<<>>(h, nh, v, offsets); + cudaCheck(cudaGetLastError()); + launchFinalize(view, stream); + fillFromVectorKernel<<>>(h, nh, v, offsets); + cudaCheck(cudaGetLastError()); +#else + countFromVectorKernel(h, nh, v, offsets); + h->finalize(); + fillFromVectorKernel(h, nh, v, offsets); +#endif + } + +#ifdef __CUDACC__ + template + __global__ void fillManyFromVectorCoopKernel(typename Histo::View view, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + int32_t totSize, + typename Histo::View::Counter *ws) { + namespace cg = 
cooperative_groups; + auto grid = cg::this_grid(); + auto h = static_cast(view.assoc); + zeroAndInitCoop(view); + grid.sync(); + countFromVector(h, nh, v, offsets); + grid.sync(); + finalizeCoop(view, ws); + grid.sync(); + fillFromVector(h, nh, v, offsets); + } +#endif + + template + inline __attribute__((always_inline)) void fillManyFromVectorCoop(Histo *h, + uint32_t nh, + T const *v, + uint32_t const *offsets, + int32_t totSize, + int nthreads, + typename Histo::index_type *mem, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + using View = typename Histo::View; + View view = {h, nullptr, mem, -1, totSize}; +#ifdef __CUDACC__ + auto kernel = fillManyFromVectorCoopKernel; + auto nblocks = (totSize + nthreads - 1) / nthreads; + assert(nblocks > 0); + auto nOnes = view.size(); + auto nchunks = nOnes / nthreads + 1; + auto ws = cms::cuda::make_device_unique(nchunks, stream); + auto wsp = ws.get(); + // FIXME: discuss with FW team: cuda calls are expensive and not needed for each event + static int maxBlocks = maxCoopBlocks(kernel, nthreads, 0, 0); + auto ncoopblocks = std::min(nblocks, maxBlocks); + assert(ncoopblocks > 0); + void *kernelArgs[] = {&view, &nh, &v, &offsets, &totSize, &wsp}; + dim3 dimBlock(nthreads, 1, 1); + dim3 dimGrid(ncoopblocks, 1, 1); + // launch + cudaCheck(cudaLaunchCooperativeKernel((void *)kernel, dimGrid, dimBlock, kernelArgs, 0, stream)); + cudaCheck(cudaGetLastError()); +#else + launchZero(view, stream); + countFromVector(h, nh, v, offsets); + h->finalize(); + fillFromVector(h, nh, v, offsets); +#endif + } + + // iteratate over N bins left and right of the one containing "v" + template + __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { + int bs = Hist::bin(value); + int be = std::min(int(Hist::nbins() - 1), bs + n); + bs = std::max(0, bs - n); + assert(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } + } + + // iteratate over bins containing all values in window wmin, wmax + template + __host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { + auto bs = Hist::bin(wmin); + auto be = Hist::bin(wmax); + assert(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } + } + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainerAlgo_h diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainerRT_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainerRT_t.cu index b49498990fcbe..3831e8e384f85 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainerRT_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainerRT_t.cu @@ -6,7 +6,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" using namespace cms::cuda; diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp index 2109e5a0d5b38..d8ffe204bf56e 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp @@ -4,7 +4,7 @@ #include #include -#include 
"HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" using namespace cms::cuda; diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu index 67ec589bb9bc6..b879bc8f17e96 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -6,7 +6,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" using namespace cms::cuda; diff --git a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu index f2d74c8e7ce98..44d487b907699 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu @@ -6,7 +6,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h index b266f2f2a997e..2201e8bddc8b2 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h @@ -6,7 +6,7 @@ #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" #include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" namespace gpuClustering { diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu index e70acffa5ecd2..795b1a35641ef 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu @@ -9,6 +9,7 @@ #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "PixelRecHitGPUKernel.h" #include "gpuPixelRecHits.h" diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc index ab0a1ef97e0f2..f184b47ca3b96 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc @@ -10,6 +10,7 @@ #include "DataFormats/Common/interface/Handle.h" #include 
"DataFormats/SiPixelCluster/interface/SiPixelCluster.h" #include "DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" #include "FWCore/Framework/interface/MakerMacros.h" diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 92c720bb86012..51276ea403264 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -13,6 +13,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" #include "CAConstants.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h index f71aa56842a67..8bd30493e661b 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h @@ -5,7 +5,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "gpuVertexFinder.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h index a11283a7b2065..4132ab0c227da 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h @@ -5,7 +5,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "gpuVertexFinder.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h index 66d246fcfa4fa..8837b04e45654 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h @@ -5,7 +5,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "gpuVertexFinder.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h index 0acf67244528a..e59063a84c0b3 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h @@ -5,7 +5,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include 
"gpuVertexFinder.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h index 93f78d498b26f..28ef595bb4868 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h @@ -5,7 +5,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #ifdef __CUDA_ARCH__ #include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h index 0fe8bd882dcc5..00c7448b51a63 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h @@ -5,7 +5,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "gpuVertexFinder.h" From 8a5d69b0086f3e9f9a2c8eb6220fbd9db273b02d Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 14 Oct 2021 15:45:51 +0200 Subject: [PATCH 08/29] decapsulate and format --- .../interface/HistoContainerAlgo.h | 3 +- .../CAHitNtupletGeneratorKernelsImpl.h | 48 ++++++++++++++----- 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h index bd80b505fdcda..e2b37a3c138b5 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h @@ -134,7 +134,8 @@ namespace cms { auto ws = cms::cuda::make_device_unique(nchunks, stream); auto wsp = ws.get(); // FIXME: discuss with FW team: cuda calls are expensive and not needed for each event - static int maxBlocks = maxCoopBlocks(kernel, nthreads, 0, 0); + // static int maxBlocks = maxCoopBlocks(kernel, nthreads, 0, 0); + static int maxBlocks = std::max(1, maxCoopBlocks(kernel, nthreads, 0, 0) / 10); auto ncoopblocks = std::min(nblocks, maxBlocks); assert(ncoopblocks > 0); void *kernelArgs[] = {&view, &nh, &v, &offsets, &totSize, &wsp}; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 51276ea403264..3d7b307f06e88 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -380,9 +380,9 @@ __global__ void kernel_mark_used(GPUCACell::Hits const *__restrict__ hhp, } } -__global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets, - Quality const *__restrict__ quality, - caConstants::TupleMultiplicity *tupleMultiplicity) { +__device__ __inline__ void countMultiplicity(HitContainer const *__restrict__ foundNtuplets, + Quality const *__restrict__ quality, + caConstants::TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { auto nhits = foundNtuplets->size(it); @@ -398,9 +398,15 @@ __global__ void 
kernel_countMultiplicity(HitContainer const *__restrict__ foundN } } -__global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, - Quality const *__restrict__ quality, - caConstants::TupleMultiplicity *tupleMultiplicity) { +__global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets, + Quality const *__restrict__ quality, + caConstants::TupleMultiplicity *tupleMultiplicity) { + countMultiplicity(foundNtuplets, quality, tupleMultiplicity); +} + +__device__ __inline__ void fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, + Quality const *__restrict__ quality, + caConstants::TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { auto nhits = foundNtuplets->size(it); @@ -416,6 +422,12 @@ __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNt } } +__global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, + Quality const *__restrict__ quality, + caConstants::TupleMultiplicity *tupleMultiplicity) { + fillMultiplicity(foundNtuplets, quality, tupleMultiplicity); +} + __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, TkSoA const *__restrict__ tracks, CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts, @@ -520,9 +532,9 @@ __global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, } } -__global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, - CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { +__device__ __inline__ void countHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { if (tuples->size(idx) == 0) @@ -532,9 +544,15 @@ __global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, } } -__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, - CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { +__global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { + countHitInTracks(tuples, quality, hitToTuple); +} + +__device__ __inline__ void fillHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { if (tuples->size(idx) == 0) @@ -544,6 +562,12 @@ __global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, } } +__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { + fillHitInTracks(tuples, quality, hitToTuple); +} + __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples, TrackingRecHit2DSOAView const *__restrict__ hhp, HitContainer *__restrict__ hitDetIndices) { From 697402983d7da419a912f6732b132c4a3de0a114 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 14 Oct 2021 
16:30:52 +0200 Subject: [PATCH 09/29] encapsulate --- .../plugins/CAHitNtupletGeneratorKernels.cu | 61 +++++++++++++------ .../plugins/CAHitNtupletGeneratorKernels.h | 5 -- .../CAHitNtupletGeneratorKernelsAlloc.cc | 5 +- 3 files changed, 46 insertions(+), 25 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index a47c89b27ed30..85a82f3171270 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -1,5 +1,44 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" + /// Compute the number of quadruplet blocks for block size + inline uint32_t nQuadrupletBlocks(uint32_t blockSize) { + // caConstants::maxNumberOfQuadruplets is a constexpr, so the compiler will precompute the 3*max/4 + return (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; + } + +__inline__ void populateMultiplicity(HitContainer const *__restrict__ tuples_d, + Quality const *__restrict__ quality_d, + caConstants::TupleMultiplicity *tupleMultiplicity_d, cudaStream_t cudaStream) { + + cms::cuda::launchZero(tupleMultiplicity_d, cudaStream); + auto blockSize = 128; + auto numberOfBlocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize; + kernel_countMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>( + tuples_d, quality_d, tupleMultiplicity_d); + cms::cuda::launchFinalize(tupleMultiplicity_d, cudaStream); + kernel_fillMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>( + tuples_d, quality_d, tupleMultiplicity_d); + +} + + +__inline__ void populateHitInTracks(HitContainer const *__restrict__ tuples_d, + Quality const *__restrict__ quality_d, + CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple_d, HitToTuple::View hitToTupleView, cudaStream_t cudaStream) { + + cms::cuda::launchZero(hitToTupleView, cudaStream); + auto blockSize = 64; + auto numberOfBlocks = nQuadrupletBlocks(blockSize); + kernel_countHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>( + tuples_d, quality_d, hitToTuple_d); + cudaCheck(cudaGetLastError()); + cms::cuda::launchFinalize(hitToTupleView, cudaStream); + cudaCheck(cudaGetLastError()); + kernel_fillHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, quality_d, hitToTuple_d); +} + + + template <> void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream) { auto blockSize = 128; @@ -106,13 +145,8 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_theCells_.get(), device_nCells_, tuples_d, quality_d, params_.dupPassThrough_); cudaCheck(cudaGetLastError()); - blockSize = 128; - numberOfBlocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize; - kernel_countMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>( - tuples_d, quality_d, device_tupleMultiplicity_.get()); - cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); - kernel_fillMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>( - tuples_d, quality_d, device_tupleMultiplicity_.get()); + + populateMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get(),cudaStream); cudaCheck(cudaGetLastError()); if (nhits > 1 && params_.lateFishbone_) { @@ -259,17 +293,8 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA #endif if (params_.doSharedHitCut_ || params_.doStats_) { - // fill hit->track "map" - assert(hitToTupleView_.offSize > nhits); - numberOfBlocks = nQuadrupletBlocks(blockSize); - kernel_countHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>( - tuples_d, quality_d, device_hitToTuple_.get()); - 
cudaCheck(cudaGetLastError()); - assert((hitToTupleView_.assoc == device_hitToTuple_.get()) && - (hitToTupleView_.offStorage == device_hitToTupleStorage_.get()) && (hitToTupleView_.offSize > 0)); - cms::cuda::launchFinalize(hitToTupleView_, cudaStream); - cudaCheck(cudaGetLastError()); - kernel_fillHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, quality_d, device_hitToTuple_.get()); + // populate hit->track "map" + populateHitInTracks(tuples_d, quality_d, device_hitToTuple_.get(),hitToTupleView_,cudaStream); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index 684a54f5d2ed4..c77132dfe1c2d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -230,11 +230,6 @@ class CAHitNtupletGeneratorKernels { return (paramsMaxDoubletes3Quarters_ + blockSize - 1) / blockSize; } - /// Compute the number of quadruplet blocks for block size - inline uint32_t nQuadrupletBlocks(uint32_t blockSize) { - // caConstants::maxNumberOfQuadruplets is a constexpr, so the compiler will precompute the 3*max/4 - return (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; - } }; using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc index 5978ef8851c73..5bd8793b32411 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc @@ -40,9 +40,10 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(int32_t nHits, cudaStream_t cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream)); } else { *device_nCells_ = 0; + // for gpu moved to populate functions + cms::cuda::launchZero(device_tupleMultiplicity_.get(), stream); + cms::cuda::launchZero(hitToTupleView_, stream); // we may wish to keep it in the edm } - cms::cuda::launchZero(device_tupleMultiplicity_.get(), stream); - cms::cuda::launchZero(hitToTupleView_, stream); // we may wish to keep it in the edm #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); From 0c1e5f45bf966018850f7163f4ee5440000090a2 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 14 Oct 2021 16:31:20 +0200 Subject: [PATCH 10/29] format --- .../plugins/CAHitNtupletGeneratorKernels.cu | 61 ++++++++----------- .../plugins/CAHitNtupletGeneratorKernels.h | 1 - 2 files changed, 27 insertions(+), 35 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 85a82f3171270..9947033633eb1 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -1,44 +1,38 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" - /// Compute the number of quadruplet blocks for block size - inline uint32_t nQuadrupletBlocks(uint32_t blockSize) { - // caConstants::maxNumberOfQuadruplets is a constexpr, so the compiler will precompute the 3*max/4 - return (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize;
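// A quick arithmetic check of the ceil-division used in nQuadrupletBlocks (the
// constant's value here is assumed purely for illustration): with blockSize = 64
// and maxNumberOfQuadruplets = 32 * 1024, the work size is 3 * 32768 / 4 = 24576
// and (24576 + 63) / 64 = 384 blocks. When the work size is not an exact multiple,
// adding blockSize - 1 rounds the integer division up: (24500 + 63) / 64 = 383,
// and 383 * 64 = 24512 still covers all 24500 items.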
- } - -__inline__ void populateMultiplicity(HitContainer const *__restrict__ tuples_d, - Quality const *__restrict__ quality_d, - caConstants::TupleMultiplicity *tupleMultiplicity_d, cudaStream_t cudaStream) { +/// Compute the number of quadruplet blocks for block size +inline uint32_t nQuadrupletBlocks(uint32_t blockSize) { + // caConstants::maxNumberOfQuadruplets is a constexpr, so the compiler will precompute the 3*max/4 + return (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; +} +__inline__ void populateMultiplicity(HitContainer const *__restrict__ tuples_d, + Quality const *__restrict__ quality_d, + caConstants::TupleMultiplicity *tupleMultiplicity_d, + cudaStream_t cudaStream) { cms::cuda::launchZero(tupleMultiplicity_d, cudaStream); - auto blockSize = 128; + auto blockSize = 128; auto numberOfBlocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize; - kernel_countMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>( - tuples_d, quality_d, tupleMultiplicity_d); + kernel_countMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, quality_d, tupleMultiplicity_d); cms::cuda::launchFinalize(tupleMultiplicity_d, cudaStream); - kernel_fillMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>( - tuples_d, quality_d, tupleMultiplicity_d); - + kernel_fillMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, quality_d, tupleMultiplicity_d); } - -__inline__ void populateHitInTracks(HitContainer const *__restrict__ tuples_d, - Quality const *__restrict__ quality_d, - CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple_d, HitToTuple::View hitToTupleView, cudaStream_t cudaStream) { - - cms::cuda::launchZero(hitToTupleView, cudaStream); - auto blockSize = 64; - auto numberOfBlocks = nQuadrupletBlocks(blockSize); - kernel_countHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>( - tuples_d, quality_d, hitToTuple_d); - cudaCheck(cudaGetLastError()); - cms::cuda::launchFinalize(hitToTupleView, cudaStream); - cudaCheck(cudaGetLastError()); - kernel_fillHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, quality_d, hitToTuple_d); +__inline__ void populateHitInTracks(HitContainer const *__restrict__ tuples_d, + Quality const *__restrict__ quality_d, + CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple_d, + HitToTuple::View hitToTupleView, + cudaStream_t cudaStream) { + cms::cuda::launchZero(hitToTupleView, cudaStream); + auto blockSize = 64; + auto numberOfBlocks = nQuadrupletBlocks(blockSize); + kernel_countHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, quality_d, hitToTuple_d); + cudaCheck(cudaGetLastError()); + cms::cuda::launchFinalize(hitToTupleView, cudaStream); + cudaCheck(cudaGetLastError()); + kernel_fillHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, quality_d, hitToTuple_d); } template <> void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream) { auto blockSize = 128; @@ -145,8 +139,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_theCells_.get(), device_nCells_, tuples_d, quality_d, params_.dupPassThrough_); cudaCheck(cudaGetLastError()); - - populateMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get(),cudaStream); + populateMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get(), cudaStream); cudaCheck(cudaGetLastError()); if (nhits > 1 && params_.lateFishbone_) { @@ -294,7 +287,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA if (params_.doSharedHitCut_ || params_.doStats_) { // populate hit->track "map" - populateHitInTracks(tuples_d, quality_d, device_hitToTuple_.get(),hitToTupleView_,cudaStream); + populateHitInTracks(tuples_d, quality_d, device_hitToTuple_.get(), hitToTupleView_, cudaStream);
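// The populate* helpers above follow the two-pass association-filling idiom used
// throughout this series (sketched here with the names visible in this patch):
//
//   cms::cuda::launchZero(view, cudaStream);       // reset the per-bin counters
//   kernel_count...<<<numberOfBlocks, blockSize, 0, cudaStream>>>(...);  // pass 1: count entries per bin
//   cms::cuda::launchFinalize(view, cudaStream);   // prefix-scan the counts into offsets
//   kernel_fill...<<<numberOfBlocks, blockSize, 0, cudaStream>>>(...);   // pass 2: scatter entries at the offsets
//
// That is four dependent launches on one stream; the cooperative-groups variant
// introduced in the next patch fuses them into a single kernel, replacing the
// inter-kernel ordering with grid.sync() between the phases.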
cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index c77132dfe1c2d..880664007cb7c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -229,7 +229,6 @@ class CAHitNtupletGeneratorKernels { // We want (3 * params_.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize, but first part is pre-computed. return (paramsMaxDoubletes3Quarters_ + blockSize - 1) / blockSize; } - }; using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels; From 151aea71704bad057662fa2ab153359c23cf0c85 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 15 Oct 2021 12:12:18 +0200 Subject: [PATCH 11/29] use coop for other 2 assoc --- .../plugins/CAHitNtupletGeneratorKernels.cu | 98 ++++++++++++++++++- 1 file changed, 95 insertions(+), 3 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 9947033633eb1..cca4b0cde5983 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -6,6 +6,9 @@ inline uint32_t nQuadrupletBlocks(uint32_t blockSize) { return (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; } +#define CMS_USE_COOP_GROUPS + +#ifndef CMS_USE_COOP_GROUPS __inline__ void populateMultiplicity(HitContainer const *__restrict__ tuples_d, Quality const *__restrict__ quality_d, caConstants::TupleMultiplicity *tupleMultiplicity_d, @@ -20,9 +23,9 @@ __inline__ void populateHitInTracks(HitContainer const *__restrict__ tuples_d, Quality const *__restrict__ quality_d, - CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple_d, - HitToTuple::View hitToTupleView, + CAHitNtupletGeneratorKernelsGPU::HitToTuple::View hitToTupleView, cudaStream_t cudaStream) { + auto hitToTuple_d = static_cast<CAHitNtupletGeneratorKernelsGPU::HitToTuple *>(hitToTupleView.assoc); cms::cuda::launchZero(hitToTupleView, cudaStream); auto blockSize = 64; auto numberOfBlocks = nQuadrupletBlocks(blockSize); @@ -33,6 +36,95 @@ __inline__ void populateHitInTracks(HitContainer const *__restrict__ tuples_d, kernel_fillHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, quality_d, hitToTuple_d); } +#else +__global__ void kernel_populateHitInTracks(HitContainer const *__restrict__ tuples_d, + Quality const *__restrict__ quality_d, + HitToTuple::View view, + HitToTuple::View::Counter *ws) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + auto tuple_d = static_cast<HitToTuple *>(view.assoc); + zeroAndInitCoop(view); + grid.sync(); + countHitInTracks(tuples_d, quality_d, tuple_d); + grid.sync(); + finalizeCoop(view, ws); + grid.sync(); + fillHitInTracks(tuples_d, quality_d, tuple_d); +} + +__inline__ void populateHitInTracks(HitContainer const *tuples_d, + Quality const *quality_d, + HitToTuple::View view, + cudaStream_t cudaStream) { + using View = HitToTuple::View; + int blockSize = 64; + int nblocks = nQuadrupletBlocks(blockSize); + + auto kernel = kernel_populateHitInTracks; + + assert(nblocks > 0); + auto nOnes = view.size(); + auto nchunks = nOnes / blockSize + 1; + auto ws = cms::cuda::make_device_unique<View::Counter[]>(nchunks, cudaStream); + auto wsp = ws.get(); + // FIXME: 
discuss with FW team: cuda calls are expensive and not needed for each event + // static int maxBlocks = maxCoopBlocks(kernel, blockSize, 0, 0); + static int maxBlocks = std::max(1, maxCoopBlocks(kernel, blockSize, 0, 0) / 10); + auto ncoopblocks = std::min(nblocks, maxBlocks); + assert(ncoopblocks > 0); + void *kernelArgs[] = {&tuples_d, &quality_d, &view, &wsp}; + dim3 dimBlock(blockSize, 1, 1); + dim3 dimGrid(ncoopblocks, 1, 1); + // launch + cudaCheck(cudaLaunchCooperativeKernel((void *)kernel, dimGrid, dimBlock, kernelArgs, 0, cudaStream)); +} + +__global__ void kernel_populateMultiplicity(HitContainer const *__restrict__ tuples_d, + Quality const *__restrict__ quality_d, + caConstants::TupleMultiplicity::View view, + caConstants::TupleMultiplicity::View::Counter *ws) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + auto tupleMultiplicity_d = static_cast<caConstants::TupleMultiplicity *>(view.assoc); + zeroAndInitCoop(view); + grid.sync(); + countMultiplicity(tuples_d, quality_d, tupleMultiplicity_d); + grid.sync(); + finalizeCoop(view, ws); + grid.sync(); + fillMultiplicity(tuples_d, quality_d, tupleMultiplicity_d); +} + +__inline__ void populateMultiplicity(HitContainer const *tuples_d, + Quality const *quality_d, + caConstants::TupleMultiplicity *tupleMultiplicity_d, + cudaStream_t cudaStream) { + auto kernel = kernel_populateMultiplicity; + using View = caConstants::TupleMultiplicity::View; + View view = {tupleMultiplicity_d, nullptr, nullptr, -1, -1}; + + int blockSize = 128; + int nblocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize; + assert(nblocks > 0); + auto nOnes = view.size(); + auto nchunks = nOnes / blockSize + 1; + auto ws = cms::cuda::make_device_unique<View::Counter[]>(nchunks, cudaStream); + auto wsp = ws.get(); + // FIXME: discuss with FW team: cuda calls are expensive and not needed for each event + // static int maxBlocks = maxCoopBlocks(kernel, blockSize, 0, 0); + static int maxBlocks = std::max(1, maxCoopBlocks(kernel, blockSize, 0, 0) / 10); + auto ncoopblocks = std::min(nblocks, maxBlocks); + assert(ncoopblocks > 0); + void *kernelArgs[] = {&tuples_d, &quality_d, &view, &wsp}; + dim3 dimBlock(blockSize, 1, 1); + dim3 dimGrid(ncoopblocks, 1, 1); + // launch + cudaCheck(cudaLaunchCooperativeKernel((void *)kernel, dimGrid, dimBlock, kernelArgs, 0, cudaStream)); +} + +#endif + template <> void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream) { auto blockSize = 128; @@ -287,7 +379,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA if (params_.doSharedHitCut_ || params_.doStats_) { // populate hit->track "map" - populateHitInTracks(tuples_d, quality_d, device_hitToTuple_.get(), hitToTupleView_, cudaStream); + populateHitInTracks(tuples_d, quality_d, hitToTupleView_, cudaStream); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); From b8e2760f7929128da1fa43619c36306c59c878d1 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sat, 16 Oct 2021 16:03:29 +0200 Subject: [PATCH 12/29] drive performance tests by envvar --- .../CUDAUtilities/interface/HistoContainerAlgo.h | 3 +-- .../CUDAUtilities/interface/maxCoopBlocks.h | 16 ++++++++++++++-- .../CUDAUtilities/test/OneToManyAssoc_t.h | 2 +- .../CUDAUtilities/test/prefixScan_t.cu | 2 +- .../plugins/CAHitNtupletGeneratorKernels.cu | 6 ++---- 5 files changed, 19 insertions(+), 10 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h 
b/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h index e2b37a3c138b5..bd80b505fdcda 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h @@ -134,8 +134,7 @@ namespace cms { auto ws = cms::cuda::make_device_unique(nchunks, stream); auto wsp = ws.get(); // FIXME: discuss with FW team: cuda calls are expensive and not needed for each event - // static int maxBlocks = maxCoopBlocks(kernel, nthreads, 0, 0); - static int maxBlocks = std::max(1, maxCoopBlocks(kernel, nthreads, 0, 0) / 10); + static int maxBlocks = maxCoopBlocks(kernel, nthreads, 0, 0); auto ncoopblocks = std::min(nblocks, maxBlocks); assert(ncoopblocks > 0); void *kernelArgs[] = {&view, &nh, &v, &offsets, &totSize, &wsp}; diff --git a/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h b/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h index 44490110eeb28..9d4db05e9276f 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h +++ b/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h @@ -3,14 +3,26 @@ #define HeterogeneousCore_CUDAUtilities_interface_maxCoopBlocks_h #include <cuda_runtime.h> +#include <algorithm> +#include <cstdlib> template <typename F> -inline int maxCoopBlocks(F kernel, int nthreads, int shmem, int device) { +inline int maxCoopBlocks(F kernel, int nthreads, int shmem, int device, int redFact = 10) { +#define GET_COOP_RED_FACT_FROM_ENV +#ifdef GET_COOP_RED_FACT_FROM_ENV + auto env = getenv("COOP_RED_FACT"); + int redFactFromEnv = env ? atoi(env) : 0; + if (redFactFromEnv != 0) + redFact = redFactFromEnv; +#endif + int numBlocksPerSm = 0; cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, device); cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, kernel, nthreads, shmem); - return deviceProp.multiProcessorCount * numBlocksPerSm; + int mxblocks = deviceProp.multiProcessorCount * numBlocksPerSm; + // reduce number of blocks to account for multiple CPU threads (redFact <= 0 means no reduction) + return redFact > 0 ? std::max(1, mxblocks / redFact) : mxblocks; } #endif diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h index 5685dce628e59..da4e1be0ba782 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h +++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h @@ -282,7 +282,7 @@ int main() { auto nchunks = nOnes / nThreads + 1; auto ws = cms::cuda::make_device_unique(nchunks, 0); - int maxBlocks = maxCoopBlocks(populate, nThreads, 0, 0); + int maxBlocks = maxCoopBlocks(populate, nThreads, 0, 0, 0); std::cout << "max number of blocks is " << maxBlocks << std::endl; auto ncoopblocks = std::min(nBlocks, maxBlocks); auto a1 = v_d.get(); diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu index ef0a785e00bd5..61dc6dfc0a3dd 100644 --- a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu @@ -153,7 +153,7 @@ int main() { uint32_t *d_psum; cudaCheck(cudaMalloc(&d_psum, nblocks * sizeof(uint32_t))); std::cout << "launch coopBlockPrefixScan " << num_items << ' ' << nblocks << std::endl; - int maxBlocks = maxCoopBlocks(doCoop, nthreads, 0, 0); + int maxBlocks = maxCoopBlocks(doCoop, nthreads, 0, 0, 0); std::cout << "max number of blocks is " << maxBlocks << std::endl; auto ncoopblocks = std::min(nblocks, maxBlocks); void *kernelArgs[] = {&d_in, &d_out2, &num_items, &d_psum}; diff --git 
a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index cca4b0cde5983..52c343a470ba5 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -69,8 +69,7 @@ __inline__ void populateHitInTracks(HitContainer const *tuples_d, auto ws = cms::cuda::make_device_unique<View::Counter[]>(nchunks, cudaStream); auto wsp = ws.get(); // FIXME: discuss with FW team: cuda calls are expensive and not needed for each event - // static int maxBlocks = maxCoopBlocks(kernel, blockSize, 0, 0); - static int maxBlocks = std::max(1, maxCoopBlocks(kernel, blockSize, 0, 0) / 10); + static int maxBlocks = maxCoopBlocks(kernel, blockSize, 0, 0); auto ncoopblocks = std::min(nblocks, maxBlocks); assert(ncoopblocks > 0); void *kernelArgs[] = {&tuples_d, &quality_d, &view, &wsp}; @@ -112,8 +111,7 @@ __inline__ void populateMultiplicity(HitContainer const *tuples_d, auto ws = cms::cuda::make_device_unique<View::Counter[]>(nchunks, cudaStream); auto wsp = ws.get(); // FIXME: discuss with FW team: cuda calls are expensive and not needed for each event - // static int maxBlocks = maxCoopBlocks(kernel, blockSize, 0, 0); - static int maxBlocks = std::max(1, maxCoopBlocks(kernel, blockSize, 0, 0) / 10); + static int maxBlocks = maxCoopBlocks(kernel, blockSize, 0, 0); auto ncoopblocks = std::min(nblocks, maxBlocks); assert(ncoopblocks > 0); void *kernelArgs[] = {&tuples_d, &quality_d, &view, &wsp}; From 446d652df9076188755ca1364fe2d716ad4865e3 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sat, 16 Oct 2021 16:29:36 +0200 Subject: [PATCH 13/29] add comment --- HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h b/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h index 9d4db05e9276f..f8e493540aacb 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h +++ b/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h @@ -9,6 +9,8 @@ template <typename F> inline int maxCoopBlocks(F kernel, int nthreads, int shmem, int device, int redFact = 10) { #define GET_COOP_RED_FACT_FROM_ENV + +// to drive performance assessment by envvar #ifdef GET_COOP_RED_FACT_FROM_ENV auto env = getenv("COOP_RED_FACT"); int redFactFromEnv = env ? 
atoi(env) : 0; From 5f6f59676038677f704b8e820046fb052da0231e Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sun, 17 Oct 2021 13:39:54 +0200 Subject: [PATCH 14/29] factorize, encapsulate --- .../interface/HistoContainerAlgo.h | 51 ++++++------------- .../CUDAUtilities/interface/OneToManyAssoc.h | 3 ++ 2 files changed, 18 insertions(+), 36 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h index bd80b505fdcda..4177dfda97f83 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h @@ -11,8 +11,8 @@ namespace cms { namespace cuda { - template <typename Histo, typename T> - __device__ __inline__ void countFromVector(Histo *__restrict__ h, + template <CountOrFill cof, typename Histo, typename T> + __device__ __inline__ void countOrFillFromVector(Histo *__restrict__ h, uint32_t nh, T const *__restrict__ v, uint32_t const *__restrict__ offsets) { @@ -23,40 +23,19 @@ namespace cms { int32_t ih = off - offsets - 1; assert(ih >= 0); assert(ih < int(nh)); - (*h).count(v[i], ih); + if constexpr(CountOrFill::count==cof) + (*h).count(v[i], ih); + else + (*h).fill(v[i], i, ih); } } - template <typename Histo, typename T> - __global__ void countFromVectorKernel(Histo *__restrict__ h, + template <CountOrFill cof, typename Histo, typename T> + __global__ void countOrFillFromVectorKernel(Histo *__restrict__ h, uint32_t nh, T const *__restrict__ v, uint32_t const *__restrict__ offsets) { - countFromVector(h, nh, v, offsets); - } - - template <typename Histo, typename T> - __device__ __inline__ void fillFromVector(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { - auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); - assert((*off) > 0); - int32_t ih = off - offsets - 1; - assert(ih >= 0); - assert(ih < int(nh)); - (*h).fill(v[i], i, ih); - } - } - - template <typename Histo, typename T> - __global__ void fillFromVectorKernel(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { - fillFromVector(h, nh, v, offsets); + countOrFillFromVector<cof>(h, nh, v, offsets); } template <typename Histo, typename T> @@ -77,15 +56,15 @@ namespace cms { #ifdef __CUDACC__ auto nblocks = (totSize + nthreads - 1) / nthreads; assert(nblocks > 0); - countFromVectorKernel<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets); + countOrFillFromVectorKernel<CountOrFill::count><<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets); cudaCheck(cudaGetLastError()); launchFinalize(view, stream); - fillFromVectorKernel<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets); + countOrFillFromVectorKernel<CountOrFill::fill><<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets); cudaCheck(cudaGetLastError()); #else - countFromVectorKernel(h, nh, v, offsets); + countOrFillFromVectorKernel<CountOrFill::count>(h, nh, v, offsets); h->finalize(); - fillFromVectorKernel(h, nh, v, offsets); + countOrFillFromVectorKernel<CountOrFill::fill>(h, nh, v, offsets); #endif } @@ -102,11 +81,11 @@ namespace cms { auto h = static_cast<Histo *>(view.assoc); zeroAndInitCoop(view); grid.sync(); - countFromVector(h, nh, v, offsets); + countOrFillFromVector<CountOrFill::count>(h, nh, v, offsets); grid.sync(); finalizeCoop(view, ws); grid.sync(); - fillFromVector(h, nh, v, offsets); + countOrFillFromVector<CountOrFill::fill>(h, nh, v, offsets); } #endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h b/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h index b7bf7ca4f354f..41a88d6f624ec 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h +++ b/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h @@ -19,6 +19,9 @@ namespace cms { namespace cuda { + 
enum class CountOrFill {count, fill}; + + template <typename Assoc> struct OneToManyAssocView { using Counter = typename Assoc::Counter; From 71631b28890bbc839c84ef22a5f28ca2a405583a Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sun, 17 Oct 2021 14:36:32 +0200 Subject: [PATCH 15/29] propagate factorization --- .../interface/HistoContainerAlgo.h | 44 ++++++---- .../CUDAUtilities/interface/OneToManyAssoc.h | 3 +- .../plugins/CAHitNtupletGeneratorKernels.cc | 8 +- .../plugins/CAHitNtupletGeneratorKernels.cu | 20 +++-- .../CAHitNtupletGeneratorKernelsImpl.h | 86 ++++++------------- 5 files changed, 74 insertions(+), 87 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h index 4177dfda97f83..9de360bc32142 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h @@ -11,11 +11,25 @@ namespace cms { namespace cuda { + template
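// A minimal stand-alone sketch of the CountOrFill dispatch introduced in
// [PATCH 14/29] above. This is an illustration only, not part of the patch
// series: ToyHisto, countOrFill and main are invented stand-ins for the real
// cms::cuda::OneToManyAssoc container and countOrFillFromVector.
#include <cassert>
#include <cstdint>
#include <cstdio>

enum class CountOrFill { count, fill };

struct ToyHisto {
  uint32_t sizes[4] = {0, 0, 0, 0};
  void count(uint32_t bin) { ++sizes[bin]; }  // pass 1: accumulate multiplicities
  void fill(uint32_t bin, uint32_t index) {   // pass 2: store the payload
    std::printf("bin %u <- item %u\n", bin, index);
  }
};

// One templated routine; `if constexpr` compiles only the selected branch,
// so the count and fill passes share their common boilerplate at zero cost.
template <CountOrFill cof, typename Histo>
void countOrFill(Histo &h, uint32_t bin, uint32_t index) {
  if constexpr (cof == CountOrFill::count)
    h.count(bin);
  else
    h.fill(bin, index);
}

int main() {
  ToyHisto h;
  countOrFill<CountOrFill::count>(h, 2, 7);  // instantiates the counting branch
  countOrFill<CountOrFill::fill>(h, 2, 7);   // instantiates the filling branch
  assert(h.sizes[2] == 1);
  return 0;
}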