diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index 7bf5db603bccd..e0329a13b1edb 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -3,93 +3,13 @@ #include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h" -namespace cms { - namespace cuda { - - template - __global__ void countFromVector(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { - auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); - assert((*off) > 0); - int32_t ih = off - offsets - 1; - assert(ih >= 0); - assert(ih < int(nh)); - (*h).count(v[i], ih); - } - } - - template - __global__ void fillFromVector(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { - auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); - assert((*off) > 0); - int32_t ih = off - offsets - 1; - assert(ih >= 0); - assert(ih < int(nh)); - (*h).fill(v[i], i, ih); - } - } - - template - inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets, - int32_t totSize, - int nthreads, - typename Histo::index_type *mem, - cudaStream_t stream -#ifndef __CUDACC__ - = cudaStreamDefault -#endif - ) { - typename Histo::View view = {h, nullptr, mem, -1, totSize}; - launchZero(view, stream); #ifdef __CUDACC__ - auto nblocks = (totSize + nthreads - 1) / nthreads; - assert(nblocks > 0); - countFromVector<<>>(h, nh, v, offsets); - cudaCheck(cudaGetLastError()); - launchFinalize(view, stream); - fillFromVector<<>>(h, nh, v, offsets); - cudaCheck(cudaGetLastError()); -#else - countFromVector(h, nh, v, offsets); - h->finalize(); - fillFromVector(h, nh, v, offsets); +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h" #endif - } - - // iteratate over N bins left and right of the one containing "v" - template - __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { - int bs = Hist::bin(value); - int be = std::min(int(Hist::nbins() - 1), bs + n); - bs = std::max(0, bs - n); - assert(be >= bs); - for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { - func(*pj); - } - } - - // iteratate over bins containing all values in window wmin, wmax - template - __host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { - auto bs = Hist::bin(wmin); - auto be = Hist::bin(wmax); - assert(be >= bs); - for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { - func(*pj); - } - } + +namespace cms { + namespace cuda { template typename Func, typename Histo, typename... Args> + __global__ void kernel_populate(typename Histo::View view, typename Histo::View::Counter *ws, Args... 
args) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + auto histo = static_cast(view.assoc); + zeroAndInitCoop(view); + grid.sync(); + Func::countOrFill(histo, std::forward(args)...); + grid.sync(); + finalizeCoop(view, ws); + grid.sync(); + Func::countOrFill(histo, std::forward(args)...); + } + + template + __device__ __inline__ void countOrFillFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < int(nh)); + if constexpr (CountOrFill::count == cof) + (*h).count(v[i], ih); + else + (*h).fill(v[i], i, ih); + } + } + + template + __global__ void countOrFillFromVectorKernel(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + countOrFillFromVector(h, nh, v, offsets); + } + + template + inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + int32_t totSize, + int nthreads, + typename Histo::index_type *mem, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + typename Histo::View view = {h, nullptr, mem, -1, totSize}; + launchZero(view, stream); +#ifdef __CUDACC__ + auto nblocks = (totSize + nthreads - 1) / nthreads; + assert(nblocks > 0); + countOrFillFromVectorKernel<<>>(h, nh, v, offsets); + cudaCheck(cudaGetLastError()); + launchFinalize(view, stream); + countOrFillFromVectorKernel<<>>(h, nh, v, offsets); + cudaCheck(cudaGetLastError()); +#else + countOrFillFromVectorKernel(h, nh, v, offsets); + h->finalize(); + countOrFillFromVectorKernel(h, nh, v, offsets); +#endif + } + +#ifdef __CUDACC__ + template + __global__ void fillManyFromVectorCoopKernel(typename Histo::View view, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + int32_t totSize, + typename Histo::View::Counter *ws) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + auto h = static_cast(view.assoc); + zeroAndInitCoop(view); + grid.sync(); + countOrFillFromVector(h, nh, v, offsets); + grid.sync(); + finalizeCoop(view, ws); + grid.sync(); + countOrFillFromVector(h, nh, v, offsets); + } +#endif + + template + inline __attribute__((always_inline)) void fillManyFromVectorCoop(Histo *h, + uint32_t nh, + T const *v, + uint32_t const *offsets, + int32_t totSize, + int nthreads, + typename Histo::index_type *mem, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + using View = typename Histo::View; + View view = {h, nullptr, mem, -1, totSize}; +#ifdef __CUDACC__ + auto kernel = fillManyFromVectorCoopKernel; + auto nblocks = (totSize + nthreads - 1) / nthreads; + assert(nblocks > 0); + auto nOnes = view.size(); + auto nchunks = nOnes / nthreads + 1; + auto ws = cms::cuda::make_device_unique(nchunks, stream); + auto wsp = ws.get(); + // FIXME: discuss with FW team: cuda calls are expensive and not needed for each event + static int maxBlocks = maxCoopBlocks(kernel, nthreads, 0, 0); + auto ncoopblocks = std::min(nblocks, maxBlocks); + assert(ncoopblocks > 0); + void *kernelArgs[] = {&view, &nh, &v, &offsets, &totSize, &wsp}; + dim3 dimBlock(nthreads, 1, 1); + dim3 dimGrid(ncoopblocks, 1, 1); + // 
launch + cudaCheck(cudaLaunchCooperativeKernel((void *)kernel, dimGrid, dimBlock, kernelArgs, 0, stream)); +#else + launchZero(view, stream); + countFromVector(h, nh, v, offsets); + h->finalize(); + fillFromVector(h, nh, v, offsets); +#endif + } + + // iteratate over N bins left and right of the one containing "v" + template + __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { + int bs = Hist::bin(value); + int be = std::min(int(Hist::nbins() - 1), bs + n); + bs = std::max(0, bs - n); + assert(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } + } + + // iteratate over bins containing all values in window wmin, wmax + template + __host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func &&func) { + auto bs = Hist::bin(wmin); + auto be = Hist::bin(wmax); + assert(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } + } + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainerAlgo_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h b/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h index 01f48bca94f4b..65baf7c046922 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h +++ b/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h @@ -19,6 +19,8 @@ namespace cms { namespace cuda { + enum class CountOrFill { count, fill }; + template struct OneToManyAssocView { using Counter = typename Assoc::Counter; @@ -29,6 +31,24 @@ namespace cms { index_type *contentStorage = nullptr; int32_t offSize = -1; int32_t contentSize = -1; + + constexpr Counter *offsets() const { + Counter *poff = (Counter *)((char *)(assoc) + offsetof(Assoc, off)); + if constexpr (Assoc::ctNOnes() < 0) { + assert(offStorage); + poff = offStorage; + } + return poff; + } + + constexpr int32_t size() const { + auto nOnes = Assoc::ctNOnes(); + if constexpr (Assoc::ctNOnes() < 0) { + nOnes = offSize; + } + assert(nOnes > 0); + return nOnes; + } }; // this MUST BE DONE in a single block (or in two kernels!) 
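Note for reviewers: fillManyFromVectorCoop above (and the cooperative test code later in this patch) follows one recipe: compute the natural grid size, clamp it with an occupancy query so the whole grid can be resident at once (a requirement for grid.sync()), pack the kernel parameters into an array of pointers, and call cudaLaunchCooperativeKernel. A minimal, self-contained sketch of that recipe follows; the kernel, buffers and block size are hypothetical placeholders, not part of the patch.

#include <algorithm>
#include <cooperative_groups.h>
#include <cuda_runtime.h>

namespace cg = cooperative_groups;

// two phases separated by a grid-wide barrier: phase 2 may read data
// written by any block in phase 1, which a normal launch cannot guarantee
__global__ void myCoopKernel(int const *in, int *tmp, int *out, int n) {
  auto grid = cg::this_grid();
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int i = first; i < n; i += gridDim.x * blockDim.x)
    tmp[i] = 2 * in[i];                // phase 1
  grid.sync();                         // grid-wide barrier: needs a cooperative launch
  for (int i = first; i < n; i += gridDim.x * blockDim.x)
    out[i] = tmp[i] + tmp[n - 1 - i];  // phase 2 reads values produced by other blocks
}

void launchCoop(int const *in, int *tmp, int *out, int n, cudaStream_t stream) {
  int const nthreads = 256;
  int nblocks = (n + nthreads - 1) / nthreads;

  // clamp the grid to what fits on the device simultaneously
  int device = 0;
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, device);
  int blocksPerSm = 0;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSm, myCoopKernel, nthreads, 0);
  nblocks = std::min(nblocks, prop.multiProcessorCount * blocksPerSm);

  // parameters are passed as an array of pointers to the actual arguments
  void *args[] = {&in, &tmp, &out, &n};
  cudaLaunchCooperativeKernel((void *)myCoopKernel, dim3(nblocks), dim3(nthreads), args, 0, stream);
}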
@@ -50,6 +70,26 @@ namespace cms { } } + template + __device__ void zeroAndInitCoop(OneToManyAssocView view) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + + auto h = view.assoc; + + auto first = blockDim.x * blockIdx.x + threadIdx.x; + + if (0 == first) { + h->psws = 0; + h->initStorage(view); + } + + grid.sync(); + for (int i = first, nt = h->totOnes(); i < nt; i += gridDim.x * blockDim.x) { + h->off[i] = 0; + } + } + template inline __attribute__((always_inline)) void launchZero(Assoc *h, cudaStream_t stream @@ -111,16 +151,8 @@ namespace cms { auto h = view.assoc; assert(h); #ifdef __CUDACC__ - using Counter = typename Assoc::Counter; - Counter *poff = (Counter *)((char *)(h) + offsetof(Assoc, off)); - auto nOnes = Assoc::ctNOnes(); - if constexpr (Assoc::ctNOnes() < 0) { - assert(view.offStorage); - assert(view.offSize > 0); - nOnes = view.offSize; - poff = view.offStorage; - } - assert(nOnes > 0); + auto poff = view.offsets(); + auto nOnes = view.size(); int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Assoc, psws)); auto nthreads = 1024; auto nblocks = (nOnes + nthreads - 1) / nthreads; @@ -131,6 +163,19 @@ namespace cms { #endif } + template + __device__ __inline__ void finalizeCoop(OneToManyAssocView view, typename Assoc::Counter *ws) { +#ifdef __CUDACC__ + auto poff = view.offsets(); + auto nOnes = view.size(); + coopBlockPrefixScan(poff, poff, nOnes, ws); +#else + auto h = view.assoc; + assert(h); + h->finalize(); +#endif + } + template __global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) { assoc->bulkFinalizeFill(*apc); diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h index a7e4e963a87c6..bce1c9493606c 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h @@ -21,6 +21,9 @@ #undef __forceinline__ #define __forceinline__ inline __attribute__((always_inline)) +#undef __launch_bounds__ +#define __launch_bounds__(...) + namespace cms { namespace cudacompat { @@ -106,14 +109,140 @@ namespace cms { } inline void __syncthreads() {} - inline void __threadfence() {} inline bool __syncthreads_or(bool x) { return x; } inline bool __syncthreads_and(bool x) { return x; } + + inline void __trap() { abort(); } + inline void __threadfence() {} + template inline T __ldg(T const* x) { return *x; } + namespace cooperative_groups { + + // This class represents the grid group + class grid_group { + private: + grid_group() = default; + + friend grid_group this_grid(); + + public: + // Synchronize the threads named in the group. + // On the serial CPU implementation, do nothing. + static void sync() {} + }; + + inline grid_group this_grid() { return grid_group{}; } + + // This class represents the thread block + class thread_block { + private: + thread_block() = default; + + friend thread_block this_thread_block(); + + public: + // Synchronize the threads named in the group. + // On the serial CPU implementation, do nothing. + static void sync() {} + + // Total number of threads in the group. + // On the serial CPU implementation, always 1. + static unsigned long long size() { return 1; } + + // Rank of the calling thread within [0, size-1]. + // On the serial CPU implementation, always 0. + static unsigned long long thread_rank() { return 0; } + + // 3-Dimensional index of the block within the launched grid. + // On the serial CPU implementation, always {0, 0, 0}. 
+ static dim3 group_index() { return blockIdx; } + + // 3-Dimensional index of the thread within the launched block + // On the serial CPU implementation, always {0, 0, 0}. + static dim3 thread_index() { return threadIdx; } + + // Dimensions of the launched block. + // On the serial CPU implementation, always {1, 1, 1}. + static dim3 group_dim() { return blockDim; } + }; + + // Return the current thread block + inline thread_block this_thread_block() { return thread_block{}; } + + // Represent a tiled group of threads, with compile-time fixed size. + // On the serial CPU implementation, the only valid Size is 1 + template + class thread_block_tile { + private: + static_assert( + Size == 1, + "The cudaCompat Cooperative Groups implementation supports only tiled groups of a single thread."); + + thread_block_tile() = default; + + friend thread_block_tile tiled_partition(const ParentT& g); + + public: + // Synchronize the threads named in the group. + // On the serial CPU implementation, do nothing. + void sync() const {} + + // Total number of threads in the group. + // On the serial CPU implementation, always 1. + unsigned long long size() const { return 1; } + + // Rank of the calling thread within [0, size-1]. + // On the serial CPU implementation, always 0. + unsigned long long thread_rank() const { return 0; } + + // Returns the number of groups created when the parent group was partitioned. + // On the serial CPU implementation, always 1. + unsigned long long meta_group_size() const { return 1; } + + // Linear rank of the group within the set of tiles partitioned from a parent group (bounded by meta_group_size). + // On the serial CPU implementation, always 0. + unsigned long long meta_group_rank() const { return 0; } + + // Not implemented - Refer to Warp Shuffle Functions + template + T shfl(T var, unsigned int src_rank) const; + template + T shfl_up(T var, int delta) const; + template + T shfl_down(T var, int delta) const; + template + T shfl_xor(T var, int delta) const; + + // Not implemented - Refer to Warp Vote Functions + template + T any(int predicate) const; + template + T all(int predicate) const; + template + T ballot(int predicate) const; + + // Not implemented - Refer to Warp Match Functions + template + T match_any(T val) const; + template + T match_all(T val, int& pred) const; + }; + + template + inline thread_block_tile tiled_partition(const ParentT& g) { + static_assert( + Size == 1, + "The cudaCompat Cooperative Groups implementation supports only tiled groups of a single thread."); + + return thread_block_tile{}; + } + + } // namespace cooperative_groups + } // namespace cudacompat } // namespace cms diff --git a/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h b/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h new file mode 100644 index 0000000000000..ecb0ba5c05121 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h @@ -0,0 +1,30 @@ + +#ifndef HeterogeneousCore_CUDAUtilities_interface_maxCoopBlocks_h +#define HeterogeneousCore_CUDAUtilities_interface_maxCoopBlocks_h + +#include +#include +#include + +template +inline int maxCoopBlocks(F kernel, int nthreads, int shmem, int device, int redFact = 10) { +// #define GET_COOP_RED_FACT_FROM_ENV + +// to drive performance assessment by envvar +#ifdef GET_COOP_RED_FACT_FROM_ENV + auto env = std::getenv("COOP_RED_FACT"); + int redFactFromEnv = env ? 
atoi(env) : 0; + if (redFactFromEnv != 0) + redFact = redFactFromEnv; +#endif + + int numBlocksPerSm = 0; + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, device); + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, kernel, nthreads, shmem); + int mxblocks = deviceProp.multiProcessorCount * numBlocksPerSm; + // reduce number of blocks to account for multiple CPU threads + return std::max(1, mxblocks / redFact); +} + +#endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h index 1a779fc677ff7..d305e2dc13e4e 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h +++ b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h @@ -6,6 +6,7 @@ #include "FWCore/Utilities/interface/CMSUnrollLoop.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include #ifdef __CUDA_ARCH__ @@ -183,6 +184,52 @@ namespace cms { co[i] += psum[k]; } } + + template + __device__ void coopBlockPrefixScan(T const* ici, T* ico, int32_t size, T* ipsum) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + volatile T const* ci = ici; + volatile T* co = ico; + volatile T* psum = ipsum; + + __shared__ T ws[32]; + + // assert(blockDim.x * gridDim.x >= size); + + int nChunks = size / blockDim.x + 1; + + // first each block does a scan + for (int ib = blockIdx.x; ib < nChunks; ib += gridDim.x) { + int off = blockDim.x * ib; + if (size - off > 0) { + int ls = std::min(int(blockDim.x), size - off); + blockPrefixScan(ci + off, co + off, ls, ws); + psum[ib] = co[off + ls - 1]; + __syncthreads(); + } + } + grid.sync(); + + // good each block has done his work + // let's get the partial sums from each block + if (0 == blockIdx.x) { + blockPrefixScan(psum, psum, nChunks, ws); + } + + grid.sync(); + + for (int ib = blockIdx.x; ib < nChunks; ib += gridDim.x) { + // now each block updates its piece (but for chunk 0) + int k = ib; + if (0 == k) + continue; + int i = threadIdx.x + k * blockDim.x; + if (i < size) + co[i] += psum[k - 1]; + } + } + } // namespace cuda } // namespace cms diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index 53d41efcf4236..782c46dc66739 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -52,6 +52,14 @@ + + + + + + + + diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainerRT_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainerRT_t.cu index b49498990fcbe..3831e8e384f85 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainerRT_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainerRT_t.cu @@ -6,7 +6,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" using namespace cms::cuda; diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp index 2109e5a0d5b38..d8ffe204bf56e 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp @@ -4,7 +4,7 @@ #include #include -#include 
"HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" using namespace cms::cuda; diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu index 75f9cc0e626f5..c9fe4ee66a2ed 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -6,7 +6,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" using namespace cms::cuda; @@ -72,7 +72,12 @@ void go() { cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); +#ifdef USE_COOPERATIVE_GROUPS + fillManyFromVectorCoop(h_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, nullptr, 0); +#else fillManyFromVector(h_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, nullptr, 0); +#endif + cudaCheck(cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost)); assert(0 == h.off[0]); assert(offsets[10] == h.size()); diff --git a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu index f2d74c8e7ce98..44d487b907699 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu @@ -6,7 +6,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h index 8ba9158cb69b7..da4e1be0ba782 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h +++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h @@ -11,6 +11,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" +#include "HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h" #endif #include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h" @@ -58,7 +59,7 @@ __global__ void verifyMulti(Multiplicity* __restrict__ m1, Multiplicity* __restr assert(m1->off[i] == m2->off[i]); } -__global__ void count(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { +__device__ __inline__ void count(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int i = first; i < 4 * n; i += gridDim.x * blockDim.x) { auto k = i / 4; @@ -70,8 +71,9 @@ __global__ void count(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int3 assoc->count(tk[k][j]); } } +__global__ void countKernel(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { count(tk, assoc, n); } -__global__ void fill(TK const* __restrict__ 
tk, Assoc* __restrict__ assoc, int32_t n) { +__device__ __inline__ void fill(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int i = first; i < 4 * n; i += gridDim.x * blockDim.x) { auto k = i / 4; @@ -84,6 +86,21 @@ __global__ void fill(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32 } } +__global__ void fillKernel(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { fill(tk, assoc, n); } + +__global__ void populate(TK const* __restrict__ tk, Assoc::View view, int32_t n, Assoc::View::Counter* ws) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + auto h = view.assoc; + zeroAndInitCoop(view); + grid.sync(); + count(tk, h, n); + grid.sync(); + finalizeCoop(view, ws); + grid.sync(); + fill(tk, h, n); +} + __global__ void verify(Assoc* __restrict__ assoc) { assert(int(assoc->size()) < assoc->capacity()); } template @@ -118,7 +135,7 @@ __global__ void verifyFill(Assoc const* __restrict__ la, int n) { imax = std::max(imax, int(x)); } assert(0 == la->size(n)); - printf("found with %d elements %f %d %d\n", n, double(ave) / n, imax, z); + printf("found with %d elements %f %d %d\n\n", n, double(ave) / n, imax, z); } template @@ -179,7 +196,7 @@ int main() { std::geometric_distribution rdm(0.8); - constexpr uint32_t N = 4000; + uint32_t N = 4000; std::vector> tr(N); @@ -249,21 +266,40 @@ int main() { launchZero(saView, 0); #ifdef __CUDACC__ - auto nThreads = 256; - auto nBlocks = (4 * N + nThreads - 1) / nThreads; + int nThreads = 256; + int nBlocks = (4 * N + nThreads - 1) / nThreads; - count<<>>(v_d.get(), a_d.get(), N); + countKernel<<>>(v_d.get(), a_d.get(), N); launchFinalize(aView, 0); verify<<<1, 1>>>(a_d.get()); - fill<<>>(v_d.get(), a_d.get(), N); + fillKernel<<>>(v_d.get(), a_d.get(), N); verifyFill<<<1, 1>>>(a_d.get(), n); + // now with cooperative gropus + + auto nOnes = aView.size(); + auto nchunks = nOnes / nThreads + 1; + auto ws = cms::cuda::make_device_unique(nchunks, 0); + + int maxBlocks = maxCoopBlocks(populate, nThreads, 0, 0, 0); + std::cout << "max number of blocks is " << maxBlocks << std::endl; + auto ncoopblocks = std::min(nBlocks, maxBlocks); + auto a1 = v_d.get(); + auto a4 = ws.get(); + void* kernelArgs[] = {&a1, &aView, &N, &a4}; + dim3 dimBlock(nThreads, 1, 1); + dim3 dimGrid(ncoopblocks, 1, 1); + // launch + cudaCheck(cudaLaunchCooperativeKernel((void*)populate, dimGrid, dimBlock, kernelArgs)); + verifyFill<<<1, 1>>>(a_d.get(), n); + cudaCheck(cudaGetLastError()); + #else - count(v_d, a_d.get(), N); + countKernel(v_d, a_d.get(), N); launchFinalize(aView); verify(a_d.get()); - fill(v_d, a_d.get(), N); + fillKernel(v_d, a_d.get(), N); verifyFill(a_d.get(), n); #endif diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu index 5e4a943f5c069..c4ca11a8eddf1 100644 --- a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu @@ -3,6 +3,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h" using namespace cms::cuda; @@ -83,6 +84,11 @@ __global__ void verify(uint32_t const *v, uint32_t n) { printf("verify\n"); } +template +__global__ void doCoop(T const *ici, T *ico, int32_t size, T *ipsum) { + 
coopBlockPrefixScan(ici, ico, size, ipsum); +} + int main() { cms::cudatest::requireDevices(); @@ -113,9 +119,9 @@ int main() { int num_items = 200; for (int ksize = 1; ksize < 4; ++ksize) { // test multiblock - std::cout << "multiblok" << std::endl; // Declare, allocate, and initialize device-accessible pointers for input and output num_items *= 10; + std::cout << "multiblok " << num_items << std::endl; uint32_t *d_in; uint32_t *d_out1; uint32_t *d_out2; @@ -141,7 +147,31 @@ int main() { cudaCheck(cudaGetLastError()); verify<<>>(d_out1, num_items); cudaCheck(cudaGetLastError()); - cudaDeviceSynchronize(); + cudaCheck(cudaDeviceSynchronize()); + cudaCheck(cudaGetLastError()); + + uint32_t *d_psum; + cudaCheck(cudaMalloc(&d_psum, nblocks * sizeof(uint32_t))); + std::cout << "launch coopBlockPrefixScan " << num_items << ' ' << nblocks << std::endl; + int maxBlocks = maxCoopBlocks(doCoop, nthreads, 0, 0, 0); + std::cout << "max number of blocks is " << maxBlocks << std::endl; + auto ncoopblocks = std::min(nblocks, maxBlocks); + void *kernelArgs[] = {&d_in, &d_out2, &num_items, &d_psum}; + dim3 dimBlock(nthreads, 1, 1); + dim3 dimGrid(ncoopblocks, 1, 1); + // launch + cudaLaunchCooperativeKernel((void *)doCoop, dimGrid, dimBlock, kernelArgs); + cudaCheck(cudaGetLastError()); + verify<<>>(d_out2, num_items); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + cudaCheck(cudaGetLastError()); + + std::cout << "Free" << std::endl; + cudaCheck(cudaFree(d_psum)); + cudaCheck(cudaFree(d_out2)); + cudaCheck(cudaFree(d_out1)); + cudaCheck(cudaFree(d_in)); } // ksize return 0; diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h index b4fb89c3a709e..5a4385854e258 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h @@ -6,7 +6,7 @@ #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" namespace gpuClustering { diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu index 135254fa6e9f2..b623f787375df 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu @@ -9,6 +9,7 @@ #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "PixelRecHitGPUKernel.h" #include "gpuPixelRecHits.h" diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc index 51aa7e896cf5d..6d32ef96359c4 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc @@ -10,6 +10,7 @@ #include "DataFormats/Common/interface/Handle.h" #include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" #include 
"DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" #include "FWCore/Framework/interface/MakerMacros.h" diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index bd8151a2a816a..3d2567714790e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -134,9 +134,9 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * // remove duplicates (tracks that share a doublet) kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d, quality_d, params_.dupPassThrough_); - kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); + kernel_countOrFillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); - kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); + kernel_countOrFillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); if (nhits > 1 && params_.lateFishbone_) { gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, true); @@ -163,9 +163,9 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA // fill hit->track "map" if (params_.doSharedHitCut_ || params_.doStats_) { - kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); + kernel_countOrFillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); cms::cuda::launchFinalize(hitToTupleView_, cudaStream); - kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); + kernel_countOrFillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); } // remove duplicates (tracks that share at least one hit) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 689cc0afd052b..27f4e0141456c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -1,5 +1,132 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" +/// Compute the number of quadruplet blocks for block size +inline uint32_t nQuadrupletBlocks(uint32_t blockSize) { + // caConstants::maxNumberOfQuadruplets is a constexpr, so the compiler will pre compute the 3*max/4 + return (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; +} + +// #define CMS_CA_USE_COOPERATIVE_GROUPS + +#ifndef CMS_CA_USE_COOPERATIVE_GROUPS +__inline__ void populateMultiplicity(HitContainer const *__restrict__ tuples_d, + Quality const *__restrict__ quality_d, + caConstants::TupleMultiplicity *tupleMultiplicity_d, + cudaStream_t cudaStream) { + cms::cuda::launchZero(tupleMultiplicity_d, cudaStream); + auto blockSize = 128; + auto numberOfBlocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize; + kernel_countOrFillMultiplicity + <<>>(tuples_d, quality_d, tupleMultiplicity_d); + cms::cuda::launchFinalize(tupleMultiplicity_d, cudaStream); + kernel_countOrFillMultiplicity + <<>>(tuples_d, quality_d, tupleMultiplicity_d); +} + 
+__inline__ void populateHitInTracks(HitContainer const *__restrict__ tuples_d, + Quality const *__restrict__ quality_d, + CAHitNtupletGeneratorKernelsGPU::HitToTuple::View hitToTupleView, + cudaStream_t cudaStream) { + auto hitToTuple_d = static_cast(hitToTupleView.assoc); + cms::cuda::launchZero(hitToTupleView, cudaStream); + auto blockSize = 64; + auto numberOfBlocks = nQuadrupletBlocks(blockSize); + kernel_countOrFillHitInTracks + <<>>(tuples_d, quality_d, hitToTuple_d); + cudaCheck(cudaGetLastError()); + cms::cuda::launchFinalize(hitToTupleView, cudaStream); + cudaCheck(cudaGetLastError()); + kernel_countOrFillHitInTracks + <<>>(tuples_d, quality_d, hitToTuple_d); +} + +#else +__global__ void kernel_populateHitInTracks(HitContainer const *__restrict__ tuples_d, + Quality const *__restrict__ quality_d, + HitToTuple::View view, + HitToTuple::View::Counter *ws) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + auto tuple_d = static_cast(view.assoc); + zeroAndInitCoop(view); + grid.sync(); + countOrFillHitInTracks(tuples_d, quality_d, tuple_d); + grid.sync(); + finalizeCoop(view, ws); + grid.sync(); + countOrFillHitInTracks(tuples_d, quality_d, tuple_d); +} + +__inline__ void populateHitInTracks(HitContainer const *tuples_d, + Quality const *quality_d, + HitToTuple::View view, + cudaStream_t cudaStream) { + using View = HitToTuple::View; + int blockSize = 64; + int nblocks = nQuadrupletBlocks(blockSize); + + auto kernel = kernel_populateHitInTracks; + + assert(nblocks > 0); + auto nOnes = view.size(); + auto nchunks = nOnes / blockSize + 1; + auto ws = cms::cuda::make_device_unique(nchunks, cudaStream); + auto wsp = ws.get(); + // FIXME: discuss with FW team: cuda calls are expensive and not needed for each event + static int const maxBlocks = maxCoopBlocks(kernel, blockSize, 0, 0); + auto ncoopblocks = std::min(nblocks, maxBlocks); + assert(ncoopblocks > 0); + void *kernelArgs[] = {&tuples_d, &quality_d, &view, &wsp}; + dim3 dimBlock(blockSize, 1, 1); + dim3 dimGrid(ncoopblocks, 1, 1); + // launch + cudaCheck(cudaLaunchCooperativeKernel((void *)kernel, dimGrid, dimBlock, kernelArgs, 0, cudaStream)); +} + +__global__ void kernel_populateMultiplicity(HitContainer const *__restrict__ tuples_d, + Quality const *__restrict__ quality_d, + caConstants::TupleMultiplicity::View view, + caConstants::TupleMultiplicity::View::Counter *ws) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + auto tupleMultiplicity_d = static_cast(view.assoc); + zeroAndInitCoop(view); + grid.sync(); + countOrFillMultiplicity(tuples_d, quality_d, tupleMultiplicity_d); + grid.sync(); + finalizeCoop(view, ws); + grid.sync(); + countOrFillMultiplicity(tuples_d, quality_d, tupleMultiplicity_d); +} + +__inline__ void populateMultiplicity(HitContainer const *tuples_d, + Quality const *quality_d, + caConstants::TupleMultiplicity *tupleMultiplicity_d, + cudaStream_t cudaStream) { + auto kernel = kernel_populateMultiplicity; + using View = caConstants::TupleMultiplicity::View; + View view = {tupleMultiplicity_d, nullptr, nullptr, -1, -1}; + + int blockSize = 128; + int nblocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize; + assert(nblocks > 0); + auto nOnes = view.size(); + auto nchunks = nOnes / blockSize + 1; + auto ws = cms::cuda::make_device_unique(nchunks, cudaStream); + auto wsp = ws.get(); + // FIXME: discuss with FW team: cuda calls are expensive and not needed for each event + static int const maxBlocks = maxCoopBlocks(kernel, blockSize, 0, 0); + auto 
ncoopblocks = std::min(nblocks, maxBlocks); + assert(ncoopblocks > 0); + void *kernelArgs[] = {&tuples_d, &quality_d, &view, &wsp}; + dim3 dimBlock(blockSize, 1, 1); + dim3 dimGrid(ncoopblocks, 1, 1); + // launch + cudaCheck(cudaLaunchCooperativeKernel((void *)kernel, dimGrid, dimBlock, kernelArgs, 0, cudaStream)); +} + +#endif + template <> void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { // these are pointer on GPU! @@ -99,13 +226,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_theCells_.get(), device_nCells_, tracks_d, quality_d, params_.dupPassThrough_); cudaCheck(cudaGetLastError()); - blockSize = 128; - numberOfBlocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize; - kernel_countMultiplicity<<>>( - tuples_d, quality_d, device_tupleMultiplicity_.get()); - cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); - kernel_fillMultiplicity<<>>( - tuples_d, quality_d, device_tupleMultiplicity_.get()); + populateMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get(), cudaStream); cudaCheck(cudaGetLastError()); // do not run the fishbone if there are hits only in BPIX1 @@ -253,17 +374,8 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA #endif if (params_.doSharedHitCut_ || params_.doStats_) { - // fill hit->track "map" - assert(hitToTupleView_.offSize > nhits); - numberOfBlocks = nQuadrupletBlocks(blockSize); - kernel_countHitInTracks<<>>( - tuples_d, quality_d, device_hitToTuple_.get()); - cudaCheck(cudaGetLastError()); - assert((hitToTupleView_.assoc == device_hitToTuple_.get()) && - (hitToTupleView_.offStorage == device_hitToTupleStorage_.get()) && (hitToTupleView_.offSize > 0)); - cms::cuda::launchFinalize(hitToTupleView_, cudaStream); - cudaCheck(cudaGetLastError()); - kernel_fillHitInTracks<<>>(tuples_d, quality_d, device_hitToTuple_.get()); + // populate hit->track "map" + populateHitInTracks(tuples_d, quality_d, hitToTupleView_, cudaStream); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index 10a02309185a3..641f68e5fad85 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -227,12 +227,6 @@ class CAHitNtupletGeneratorKernels { // We want (3 * params_.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize, but first part is pre-computed. 
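Note for reviewers: the populate helpers above, like countOrFillFromVector in HistoContainer.h, all rest on the same refactoring: each former count/fill kernel pair collapses into one device function templated on cms::cuda::CountOrFill and dispatched with if constexpr, plus a thin __global__ wrapper per pass. A reduced, self-contained sketch of that idiom, with a hypothetical ToyHisto standing in for the real OneToManyAssoc (not part of the patch):

#include <cuda_runtime.h>

enum class CountOrFill { count, fill };  // mirrors cms::cuda::CountOrFill

struct ToyHisto {
  static constexpr int nbins = 16;
  static constexpr int capacity = 1024;
  int off[nbins + 1];     // counts land in off[b+1]; after the scan, off[b]..off[b+1] delimit bin b
  int content[capacity];  // element indices, ordered by bin after the fill pass

  __device__ void count(int b) { atomicAdd(off + b + 1, 1); }
  // fill advances off[b] as a write cursor inside bin b
  __device__ void fill(int b, int j) { content[atomicAdd(off + b, 1)] = j; }
};

template <CountOrFill cof>
__device__ inline void countOrFill(ToyHisto *h, int const *bins, int n) {
  // bins[i] is assumed to be in [0, ToyHisto::nbins)
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int i = first; i < n; i += gridDim.x * blockDim.x) {
    if constexpr (cof == CountOrFill::count)
      h->count(bins[i]);    // pass 1: per-bin occupancy
    else
      h->fill(bins[i], i);  // pass 2: scatter indices into the prepared offsets
  }
}

template <CountOrFill cof>
__global__ void countOrFillKernel(ToyHisto *h, int const *bins, int n) {
  countOrFill<cof>(h, bins, n);
}

// single-thread prefix scan standing in for launchFinalize / finalizeCoop
__global__ void toyFinalize(ToyHisto *h) {
  for (int i = 1; i <= ToyHisto::nbins; ++i)
    h->off[i] += h->off[i - 1];
}

void toyPopulate(ToyHisto *h, int const *bins, int n, cudaStream_t stream) {
  int threads = 256, blocks = (n + threads - 1) / threads;
  // off is the first member, so zeroing the first (nbins+1) ints zeroes the counters
  cudaMemsetAsync(h, 0, sizeof(int) * (ToyHisto::nbins + 1), stream);
  countOrFillKernel<CountOrFill::count><<<blocks, threads, 0, stream>>>(h, bins, n);
  toyFinalize<<<1, 1, 0, stream>>>(h);
  countOrFillKernel<CountOrFill::fill><<<blocks, threads, 0, stream>>>(h, bins, n);
}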
return (paramsMaxDoubletes3Quarters_ + blockSize - 1) / blockSize; } - - /// Compute the number of quadruplet blocks for block size - inline uint32_t nQuadrupletBlocks(uint32_t blockSize) { - // caConstants::maxNumberOfQuadruplets is a constexpr, so the compiler will pre compute the 3*max/4 - return (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; - } }; using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc index 5978ef8851c73..5bd8793b32411 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc @@ -40,9 +40,10 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(int32_t nHits, cudaStream_t cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream)); } else { *device_nCells_ = 0; + // for gpu moved to populate functions + cms::cuda::launchZero(device_tupleMultiplicity_.get(), stream); + cms::cuda::launchZero(hitToTupleView_, stream); // we may wish to keep it in the edm } - cms::cuda::launchZero(device_tupleMultiplicity_.get(), stream); - cms::cuda::launchZero(hitToTupleView_, stream); // we may wish to keep it in the edm #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 86d7ab8b5a1a6..fb7378890d826 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -13,6 +13,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" #include "CAConstants.h" @@ -31,6 +32,8 @@ using Quality = pixelTrack::Quality; using TkSoA = pixelTrack::TrackSoA; using HitContainer = pixelTrack::HitContainer; +using CountOrFill = cms::cuda::CountOrFill; + namespace { constexpr uint16_t tkNotFound = std::numeric_limits::max(); @@ -376,9 +379,10 @@ __global__ void kernel_mark_used(GPUCACell *__restrict__ cells, uint32_t const * } } -__global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets, - Quality const *__restrict__ quality, - caConstants::TupleMultiplicity *tupleMultiplicity) { +template +__device__ __inline__ void countOrFillMultiplicity(HitContainer const *__restrict__ foundNtuplets, + Quality const *__restrict__ quality, + caConstants::TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { auto nhits = foundNtuplets->size(it); @@ -390,26 +394,18 @@ __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundN if (nhits > 7) // current limit printf("wrong mult %d %d\n", it, nhits); assert(nhits <= caConstants::maxHitsOnTrack); - tupleMultiplicity->count(nhits); + if constexpr (CountOrFill::count == cof) + tupleMultiplicity->count(nhits); + else + tupleMultiplicity->fill(nhits, it); } } -__global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ 
foundNtuplets, - Quality const *__restrict__ quality, - caConstants::TupleMultiplicity *tupleMultiplicity) { - auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = foundNtuplets->size(it); - if (nhits < 3) - continue; - if (quality[it] == pixelTrack::Quality::edup) - continue; - assert(quality[it] == pixelTrack::Quality::bad); - if (nhits > 7) - printf("wrong mult %d %d\n", it, nhits); - assert(nhits <= caConstants::maxHitsOnTrack); - tupleMultiplicity->fill(nhits, it); - } +template +__global__ void kernel_countOrFillMultiplicity(HitContainer const *__restrict__ foundNtuplets, + Quality const *__restrict__ quality, + caConstants::TupleMultiplicity *tupleMultiplicity) { + countOrFillMultiplicity(foundNtuplets, quality, tupleMultiplicity); } __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, @@ -516,28 +512,27 @@ __global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, } } -__global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, - CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { +template +__device__ __inline__ void countOrFillHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { if (tuples->size(idx) == 0) break; // guard for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) - hitToTuple->count(*h); + if constexpr (CountOrFill::count == cof) + hitToTuple->count(*h); + else + hitToTuple->fill(*h, idx); } } -__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, - CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) - break; // guard - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) - hitToTuple->fill(*h, idx); - } +template +__global__ void kernel_countOrFillHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { + countOrFillHitInTracks(tuples, quality, hitToTuple); } __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples, diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h index f71aa56842a67..8bd30493e661b 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h @@ -5,7 +5,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "gpuVertexFinder.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h index a11283a7b2065..4132ab0c227da 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h +++ 
b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h @@ -5,7 +5,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "gpuVertexFinder.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h index 66d246fcfa4fa..8837b04e45654 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h @@ -5,7 +5,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "gpuVertexFinder.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h index 0acf67244528a..e59063a84c0b3 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h @@ -5,7 +5,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "gpuVertexFinder.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h index 93f78d498b26f..28ef595bb4868 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h @@ -5,7 +5,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #ifdef __CUDA_ARCH__ #include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h index 0fe8bd882dcc5..00c7448b51a63 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h @@ -5,7 +5,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "gpuVertexFinder.h"
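Note for reviewers: as background on the new coopBlockPrefixScan in prefixScan.h, it is the standard three-phase multi-block scan, with the two grid.sync() calls separating the phases so a single grid-resident kernel can do the whole job. A host-side model of the same data flow, illustrative only (a "chunk" plays the role of a thread block; sizes and names are not part of the patch):

#include <algorithm>
#include <vector>

std::vector<int> chunkedInclusiveScan(std::vector<int> const &in, int chunkSize) {
  int size = int(in.size());
  int nChunks = size / chunkSize + 1;  // same chunk count formula as coopBlockPrefixScan
  std::vector<int> out(size);
  std::vector<int> psum(nChunks, 0);

  // phase 1: each "block" scans its own chunk and records the chunk total
  for (int ib = 0; ib < nChunks; ++ib) {
    int off = ib * chunkSize;
    if (size - off <= 0)
      continue;
    int ls = std::min(chunkSize, size - off);
    int acc = 0;
    for (int i = 0; i < ls; ++i)
      out[off + i] = (acc += in[off + i]);
    psum[ib] = out[off + ls - 1];
  }

  // phase 2: one "block" scans the per-chunk totals (grid.sync() before and after)
  for (int ib = 1; ib < nChunks; ++ib)
    psum[ib] += psum[ib - 1];

  // phase 3: every chunk but the first adds the total of all preceding chunks
  for (int ib = 1; ib < nChunks; ++ib)
    for (int i = ib * chunkSize; i < std::min((ib + 1) * chunkSize, size); ++i)
      out[i] += psum[ib - 1];

  return out;
}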