diff --git a/src/cudadev/CUDACore/cudaCompat.h b/src/cudadev/CUDACore/cudaCompat.h
index a7e4e963a..dfe784840 100644
--- a/src/cudadev/CUDACore/cudaCompat.h
+++ b/src/cudadev/CUDACore/cudaCompat.h
@@ -21,6 +21,9 @@
 #undef __forceinline__
 #define __forceinline__ inline __attribute__((always_inline))
 
+#undef __launch_bounds__
+#define __launch_bounds__(...)
+
 namespace cms {
   namespace cudacompat {
 
@@ -106,14 +109,124 @@ namespace cms {
     }
 
     inline void __syncthreads() {}
-    inline void __threadfence() {}
     inline bool __syncthreads_or(bool x) { return x; }
     inline bool __syncthreads_and(bool x) { return x; }
+
+    inline void __threadfence() {}
+
     template <typename T>
     inline T __ldg(T const* x) {
       return *x;
     }
 
+    namespace cooperative_groups {
+
+      // This class represents the thread block
+      class thread_block {
+      private:
+        thread_block() = default;
+
+        friend thread_block this_thread_block();
+
+      public:
+        // Synchronize the threads named in the group.
+        // On the serial CPU implementation, do nothing.
+        static void sync() {}
+
+        // Total number of threads in the group.
+        // On the serial CPU implementation, always 1.
+        static unsigned long long size() { return 1; }
+
+        // Rank of the calling thread within [0, size-1].
+        // On the serial CPU implementation, always 0.
+        static unsigned long long thread_rank() { return 0; }
+
+        // 3-Dimensional index of the block within the launched grid.
+        // On the serial CPU implementation, always {0, 0, 0}.
+        static dim3 group_index() { return blockIdx; }
+
+        // 3-Dimensional index of the thread within the launched block.
+        // On the serial CPU implementation, always {0, 0, 0}.
+        static dim3 thread_index() { return threadIdx; }
+
+        // Dimensions of the launched block.
+        // On the serial CPU implementation, always {1, 1, 1}.
+        static dim3 group_dim() { return blockDim; }
+      };
+
+      // Return the current thread block
+      inline thread_block this_thread_block() { return thread_block{}; }
+
+      // Represent a tiled group of threads, with a compile-time fixed size.
+      // On the serial CPU implementation, the only valid Size is 1.
+      template <unsigned int Size, typename ParentT = thread_block>
+      class thread_block_tile {
+      private:
+        static_assert(
+            Size == 1,
+            "The cudaCompat Cooperative Groups implementation supports only tiled groups of a single thread.");
+
+        thread_block_tile() = default;
+
+        friend thread_block_tile tiled_partition(const ParentT& g);
+
+      public:
+        // Synchronize the threads named in the group.
+        // On the serial CPU implementation, do nothing.
+        void sync() const {}
+
+        // Total number of threads in the group.
+        // On the serial CPU implementation, always 1.
+        unsigned long long size() const { return 1; }
+
+        // Rank of the calling thread within [0, size-1].
+        // On the serial CPU implementation, always 0.
+        unsigned long long thread_rank() const { return 0; }
+
+        // Returns the number of groups created when the parent group was partitioned.
+        // On the serial CPU implementation, always 1.
+        unsigned long long meta_group_size() const { return 1; }
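The block above gives host-only builds a serial stand-in for CUDA's cooperative groups API: every tile holds exactly one thread, so all ranks are 0 and synchronization is a no-op. Below is a minimal sketch of how code written against this shim degenerates on the CPU; it assumes the usual cudaCompat setup in which cms::cudacompat is imported into the global namespace for non-CUDA compilation, and scaleByTwo is a hypothetical helper used only for illustration.

    // Sketch only: assumes cudaCompat exposes cms::cudacompat (and its
    // cooperative_groups namespace) at global scope in host-only builds.
    #include <cstdio>

    #include "CUDACore/cudaCompat.h"

    namespace cg = cooperative_groups;

    // A tile-strided loop written against the cooperative groups API.
    // With the serial shim, thread_rank() is 0 and the stride is tileSize,
    // so for tileSize == 1 this is a plain loop over all elements.
    template <int tileSize>
    void scaleByTwo(float* data, int n) {
      auto tile = cg::tiled_partition<tileSize>(cg::this_thread_block());
      for (int i = tile.thread_rank(); i < n; i += tileSize)
        data[i] *= 2.f;
      tile.sync();  // no-op on the CPU
    }

    int main() {
      float buf[4] = {1.f, 2.f, 3.f, 4.f};
      scaleByTwo<1>(buf, 4);  // only tileSize == 1 passes the static_assert
      std::printf("%g %g %g %g\n", buf[0], buf[1], buf[2], buf[3]);
      return 0;
    }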
+        // Linear rank of the group within the set of tiles partitioned from a parent group (bounded by meta_group_size).
+        // On the serial CPU implementation, always 0.
+        unsigned long long meta_group_rank() const { return 0; }
+
+        // Not implemented - Refer to Warp Shuffle Functions
+        template <typename T>
+        T shfl(T var, unsigned int src_rank) const;
+        template <typename T>
+        T shfl_up(T var, int delta) const;
+        template <typename T>
+        T shfl_down(T var, int delta) const;
+        template <typename T>
+        T shfl_xor(T var, int delta) const;
+
+        // Not implemented - Refer to Warp Vote Functions
+        template <typename T>
+        T any(int predicate) const;
+        template <typename T>
+        T all(int predicate) const;
+        template <typename T>
+        T ballot(int predicate) const;
+
+        // Not implemented - Refer to Warp Match Functions
+        template <typename T>
+        T match_any(T val) const;
+        template <typename T>
+        T match_all(T val, int& pred) const;
+      };
+
+      template <unsigned int Size, typename ParentT>
+      inline thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g) {
+        static_assert(
+            Size == 1,
+            "The cudaCompat Cooperative Groups implementation supports only tiled groups of a single thread.");
+
+        return thread_block_tile<Size, ParentT>{};
+      }
+
+    }  // namespace cooperative_groups
+
   }  // namespace cudacompat
 }  // namespace cms
diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc
index f2805d018..745e6a829 100644
--- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc
+++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc
@@ -107,7 +107,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
                                          params_.dcaCutOuterTriplet_);
 
   if (nhits > 1 && params_.earlyFishbone_) {
-    gpuPixelDoublets::fishbone(
+    gpuPixelDoublets::fishbone<1, 1>(
         hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false);
   }
 
@@ -132,7 +132,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
   kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get());
 
   if (nhits > 1 && params_.lateFishbone_) {
-    gpuPixelDoublets::fishbone(
+    gpuPixelDoublets::fishbone<1, 1>(
         hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true);
   }
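On the CPU path the kernel is instantiated as fishbone<1, 1>: the serial shim only accepts single-thread tiles, so both template parameters must be 1, and the strided loops in the kernel body visit every hit and every doublet in order. The other device primitive the kernel relies on, atomicInc, reduces to a plain post-increment when a single thread runs; the sketch below illustrates those semantics with a hypothetical serialAtomicInc helper, not the actual cudaCompat definition.

    #include <cstdint>
    #include <cstdio>

    // CUDA's atomicInc(address, limit) returns the old value and stores
    // old >= limit ? 0 : old + 1. With a single thread there is no race,
    // so it simply claims the next free slot, as in the fishbone fill loop.
    uint32_t serialAtomicInc(uint32_t* address, uint32_t limit) {
      uint32_t old = *address;
      *address = (old >= limit) ? 0 : old + 1;
      return old;
    }

    int main() {
      uint32_t doublets = 0;
      for (int cell = 0; cell < 3; ++cell) {
        uint32_t index = serialAtomicInc(&doublets, 0xFFFFFFFF);
        std::printf("cell %d stored at index %u\n", cell, index);
      }
      return 0;
    }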
diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu
index edc1eb49b..e1661581d 100644
--- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu
+++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu
@@ -64,13 +64,10 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
   cudaCheck(cudaGetLastError());
 
   if (nhits > 1 && params_.earlyFishbone_) {
-    auto nthTot = 128;
-    auto stride = 16;
-    auto blockSize = nthTot / stride;
-    auto numberOfBlocks = (nhits + blockSize - 1) / blockSize;
-    dim3 blks(1, numberOfBlocks, 1);
-    dim3 thrs(stride, blockSize, 1);
-    gpuPixelDoublets::fishbone<<<blks, thrs, 0, cudaStream>>>(
+    constexpr int threadsPerHit = 16;
+    constexpr int hitsPerBlock = 8;
+    auto numberOfBlocks = (nhits + hitsPerBlock - 1) / hitsPerBlock;
+    gpuPixelDoublets::fishbone<hitsPerBlock, threadsPerHit><<<numberOfBlocks, hitsPerBlock * threadsPerHit, 0, cudaStream>>>(
         hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false);
     cudaCheck(cudaGetLastError());
   }
@@ -116,13 +113,10 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
   cudaCheck(cudaGetLastError());
 
   if (nhits > 1 && params_.lateFishbone_) {
-    auto nthTot = 128;
-    auto stride = 16;
-    auto blockSize = nthTot / stride;
-    auto numberOfBlocks = (nhits + blockSize - 1) / blockSize;
-    dim3 blks(1, numberOfBlocks, 1);
-    dim3 thrs(stride, blockSize, 1);
-    gpuPixelDoublets::fishbone<<<blks, thrs, 0, cudaStream>>>(
+    constexpr int threadsPerHit = 16;
+    constexpr int hitsPerBlock = 8;
+    auto numberOfBlocks = (nhits + hitsPerBlock - 1) / hitsPerBlock;
+    gpuPixelDoublets::fishbone<hitsPerBlock, threadsPerHit><<<numberOfBlocks, hitsPerBlock * threadsPerHit, 0, cudaStream>>>(
         hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true);
     cudaCheck(cudaGetLastError());
   }
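The GPU launch geometry is now one-dimensional: 8 hits per block times 16 threads per hit gives 128 threads per block, the same total as the old nthTot, and the grid is sized with a ceiling division over the number of hits. A quick standalone check of that arithmetic:

    #include <cstdio>

    int main() {
      constexpr int threadsPerHit = 16;
      constexpr int hitsPerBlock = 8;
      constexpr int threadsPerBlock = hitsPerBlock * threadsPerHit;  // 128, as before

      // Ceiling division, as used for numberOfBlocks in launchKernels.
      for (unsigned int nhits : {2u, 8u, 9u, 1000u}) {
        unsigned int numberOfBlocks = (nhits + hitsPerBlock - 1) / hitsPerBlock;
        std::printf("nhits = %4u -> %3u blocks x %d threads (%u hit slots)\n",
                    nhits, numberOfBlocks, threadsPerBlock, numberOfBlocks * hitsPerBlock);
      }
      return 0;
    }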
diff --git a/src/cudadev/plugin-PixelTriplets/gpuFishbone.h b/src/cudadev/plugin-PixelTriplets/gpuFishbone.h
index 4a493f0c9..d5c6f393c 100644
--- a/src/cudadev/plugin-PixelTriplets/gpuFishbone.h
+++ b/src/cudadev/plugin-PixelTriplets/gpuFishbone.h
@@ -7,6 +7,9 @@
 #include <cstdint>
 #include <cstdio>
 
+#include <cooperative_groups.h>
+namespace cg = cooperative_groups;
+
 #include "DataFormats/approx_atan2.h"
 #include "Geometry/phase1PixelTopology.h"
 #include "CUDACore/VecArray.h"
@@ -16,76 +19,108 @@ namespace gpuPixelDoublets {
 
-  // __device__
-  // __forceinline__
-  __global__ void fishbone(GPUCACell::Hits const* __restrict__ hhp,
-                           GPUCACell* cells,
-                           uint32_t const* __restrict__ nCells,
-                           GPUCACell::OuterHitOfCell const* __restrict__ isOuterHitOfCell,
-                           uint32_t nHits,
-                           bool checkTrack) {
+  template <int hitsPerBlock, int threadsPerHit>
+  __global__ __launch_bounds__(hitsPerBlock * threadsPerHit) void fishbone(
+      GPUCACell::Hits const* __restrict__ hits_p,
+      GPUCACell* cells,
+      uint32_t const* __restrict__ nCells,
+      GPUCACell::OuterHitOfCell const* __restrict__ isOuterHitOfCell,
+      uint32_t nHits,
+      bool checkTrack) {
     constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit;
 
-    auto const& hh = *hhp;
+    auto const& hits = *hits_p;
+
+    // blockDim must be hitsPerBlock times threadsPerHit
+    assert(blockDim.x == hitsPerBlock * threadsPerHit);
+
+    // partition the thread block into threadsPerHit-sized tiles
+    auto tile = cg::tiled_partition<threadsPerHit>(cg::this_thread_block());
+    assert(hitsPerBlock == tile.meta_group_size());
 
-    // x run faster...
-    auto firstY = threadIdx.y + blockIdx.y * blockDim.y;
-    auto firstX = threadIdx.x;
+    auto numberOfBlocks = gridDim.x;
+    auto hitsPerGrid = numberOfBlocks * hitsPerBlock;
+    auto hitInBlock = tile.meta_group_rank();  // 0 .. hitsPerBlock-1
+    auto innerThread = tile.thread_rank();
+    auto firstHit = hitInBlock + blockIdx.x * hitsPerBlock;
 
-    float x[maxCellsPerHit], y[maxCellsPerHit], z[maxCellsPerHit], n[maxCellsPerHit];
-    uint16_t d[maxCellsPerHit];  // uint8_t l[maxCellsPerHit];
-    uint32_t cc[maxCellsPerHit];
+    // hitsPerBlock buffers in shared memory
+    __shared__ float s_x[hitsPerBlock][maxCellsPerHit];
+    __shared__ float s_y[hitsPerBlock][maxCellsPerHit];
+    __shared__ float s_z[hitsPerBlock][maxCellsPerHit];
+    __shared__ float s_length2[hitsPerBlock][maxCellsPerHit];
+    __shared__ uint32_t s_cc[hitsPerBlock][maxCellsPerHit];
+    __shared__ uint16_t s_detId[hitsPerBlock][maxCellsPerHit];
+    __shared__ uint32_t s_doublets[hitsPerBlock];
 
-    for (int idy = firstY, nt = nHits; idy < nt; idy += gridDim.y * blockDim.y) {
-      auto const& vc = isOuterHitOfCell[idy];
+    // buffers used by the current tile
+    float(&x)[maxCellsPerHit] = s_x[hitInBlock];
+    float(&y)[maxCellsPerHit] = s_y[hitInBlock];
+    float(&z)[maxCellsPerHit] = s_z[hitInBlock];
+    float(&length2)[maxCellsPerHit] = s_length2[hitInBlock];
+    uint32_t(&cc)[maxCellsPerHit] = s_cc[hitInBlock];
+    uint16_t(&detId)[maxCellsPerHit] = s_detId[hitInBlock];
+    uint32_t& doublets = s_doublets[hitInBlock];
+
+    // outer loop: parallelize over the hits
+    for (int hit = firstHit; hit < (int)nHits; hit += hitsPerGrid) {
+      auto const& vc = isOuterHitOfCell[hit];
       auto size = vc.size();
       if (size < 2)
         continue;
+
       // if aligned, kill one of the two.
       // in principle one could try to relax the cut (only in r-z?) for jumping-doublets
       auto const& c0 = cells[vc[0]];
-      auto xo = c0.outer_x(hh);
-      auto yo = c0.outer_y(hh);
-      auto zo = c0.outer_z(hh);
-      auto sg = 0;
-      for (int32_t ic = 0; ic < size; ++ic) {
-        auto& ci = cells[vc[ic]];
-        if (ci.unused())
+      auto xo = c0.outer_x(hits);
+      auto yo = c0.outer_y(hits);
+      auto zo = c0.outer_z(hits);
+      if (innerThread == 0) {
+        doublets = 0;
+      }
+      tile.sync();
+      for (int32_t i = innerThread; i < size; i += threadsPerHit) {
+        auto& cell = cells[vc[i]];
+        if (cell.unused())
           continue;  // for triplets equivalent to next
-        if (checkTrack && ci.tracks().empty())
+        if (checkTrack && cell.tracks().empty())
          continue;
-        cc[sg] = vc[ic];
-        d[sg] = ci.inner_detIndex(hh);
-        x[sg] = ci.inner_x(hh) - xo;
-        y[sg] = ci.inner_y(hh) - yo;
-        z[sg] = ci.inner_z(hh) - zo;
-        n[sg] = x[sg] * x[sg] + y[sg] * y[sg] + z[sg] * z[sg];
-        ++sg;
+        auto index = atomicInc(&doublets, 0xFFFFFFFF);
+        cc[index] = vc[i];
+        detId[index] = cell.inner_detIndex(hits);
+        x[index] = cell.inner_x(hits) - xo;
+        y[index] = cell.inner_y(hits) - yo;
+        z[index] = cell.inner_z(hits) - zo;
+        length2[index] = x[index] * x[index] + y[index] * y[index] + z[index] * z[index];
       }
-      if (sg < 2)
+
+      tile.sync();
+      if (doublets < 2)
        continue;
-      // here we parallelize
-      for (int32_t ic = firstX; ic < sg - 1; ic += blockDim.x) {
-        auto& ci = cells[cc[ic]];
-        for (auto jc = ic + 1; jc < sg; ++jc) {
-          auto& cj = cells[cc[jc]];
-          // must be different detectors (in the same layer)
-          // if (d[ic]==d[jc]) continue;
-          // || l[ic]!=l[jc]) continue;
-          auto cos12 = x[ic] * x[jc] + y[ic] * y[jc] + z[ic] * z[jc];
-          if (d[ic] != d[jc] && cos12 * cos12 >= 0.99999f * n[ic] * n[jc]) {
-            // alligned: kill farthest (prefer consecutive layers)
-            if (n[ic] > n[jc]) {
-              ci.kill();
+
+      // inner loop: parallelize over the doublets
+      for (uint32_t i = innerThread; i < doublets - 1; i += threadsPerHit) {
+        auto& cell_i = cells[cc[i]];
+        for (uint32_t j = i + 1; j < doublets; ++j) {
+          // must be different detectors (potentially in the same layer)
+          if (detId[i] == detId[j])
+            continue;
+          auto& cell_j = cells[cc[j]];
+          auto cos12 = x[i] * x[j] + y[i] * y[j] + z[i] * z[j];
+          if (cos12 * cos12 >= 0.99999f * length2[i] * length2[j]) {
+            // aligned: kill the farthest (prefer consecutive layers)
+            if (length2[i] > length2[j]) {
+              cell_i.kill();
               break;
             } else {
-              cj.kill();
+              cell_j.kill();
             }
           }
-        }  //cj
-      }    // ci
-    }      // hits
+        }  // j
+      }  // i
+    }  // hit
   }
+
 }  // namespace gpuPixelDoublets
 
 #endif  // RecoPixelVertexing_PixelTriplets_plugins_gpuFishbone_h
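For reference, the kill criterion in the kernel never computes an angle explicitly: comparing the squared dot product against 0.99999 times the product of the squared lengths is equivalent to requiring cos^2(theta) >= 0.99999, i.e. the two doublets point within roughly 0.18 degrees of each other (the squared form also accepts the anti-parallel case). A standalone illustration, with a hypothetical aligned() helper:

    #include <cmath>
    #include <cstdio>

    // Same test as in the kernel: (a . b)^2 >= 0.99999 * |a|^2 * |b|^2,
    // which avoids both sqrt and division in the inner loop.
    bool aligned(float ax, float ay, float az, float bx, float by, float bz) {
      float cos12 = ax * bx + ay * by + az * bz;
      float length2a = ax * ax + ay * ay + az * az;
      float length2b = bx * bx + by * by + bz * bz;
      return cos12 * cos12 >= 0.99999f * length2a * length2b;
    }

    int main() {
      const double pi = std::acos(-1.0);
      // Threshold angle: acos(sqrt(0.99999)) ~ 0.0032 rad ~ 0.18 degrees.
      std::printf("threshold: %.3f deg\n", std::acos(std::sqrt(0.99999)) * 180.0 / pi);
      std::printf("parallel:      %d\n", aligned(1.f, 0.f, 0.f, 2.f, 0.f, 0.f));        // 1
      std::printf("anti-parallel: %d\n", aligned(1.f, 0.f, 0.f, -2.f, 0.f, 0.f));       // 1
      std::printf("10 degrees:    %d\n", aligned(1.f, 0.f, 0.f, 0.985f, 0.174f, 0.f));  // 0
      return 0;
    }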