115 changes: 114 additions & 1 deletion src/cudadev/CUDACore/cudaCompat.h
@@ -21,6 +21,9 @@
#undef __forceinline__
#define __forceinline__ inline __attribute__((always_inline))

#undef __launch_bounds__
#define __launch_bounds__(...)

namespace cms {
namespace cudacompat {

@@ -106,14 +109,124 @@ namespace cms {
}

inline void __syncthreads() {}
inline void __threadfence() {}
inline bool __syncthreads_or(bool x) { return x; }
inline bool __syncthreads_and(bool x) { return x; }

inline void __threadfence() {}

template <typename T>
inline T __ldg(T const* x) {
return *x;
}

namespace cooperative_groups {

// This class represents the thread block
class thread_block {
private:
thread_block() = default;

friend thread_block this_thread_block();

public:
// Synchronize the threads named in the group.
// On the serial CPU implementation, do nothing.
static void sync() {}

// Total number of threads in the group.
// On the serial CPU implementation, always 1.
static unsigned long long size() { return 1; }

// Rank of the calling thread within [0, size-1].
// On the serial CPU implementation, always 0.
static unsigned long long thread_rank() { return 0; }

// 3-Dimensional index of the block within the launched grid.
// On the serial CPU implementation, always {0, 0, 0}.
static dim3 group_index() { return blockIdx; }

// 3-Dimensional index of the thread within the launched block.
// On the serial CPU implementation, always {0, 0, 0}.
static dim3 thread_index() { return threadIdx; }

// Dimensions of the launched block.
// On the serial CPU implementation, always {1, 1, 1}.
static dim3 group_dim() { return blockDim; }
};

// Return the current thread block
inline thread_block this_thread_block() { return thread_block{}; }

// Represent a tiled group of threads, with compile-time fixed size.
// On the serial CPU implementation, the only valid Size is 1.
template <unsigned int Size, typename ParentT>
class thread_block_tile {
private:
static_assert(
Size == 1,
"The cudaCompat Cooperative Groups implementation supports only tiled groups of a single thread.");

thread_block_tile() = default;

friend thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g);

public:
// Synchronize the threads named in the group.
// On the serial CPU implementation, do nothing.
void sync() const {}

// Total number of threads in the group.
// On the serial CPU implementation, always 1.
unsigned long long size() const { return 1; }

// Rank of the calling thread within [0, size-1].
// On the serial CPU implementation, always 0.
unsigned long long thread_rank() const { return 0; }

// Returns the number of groups created when the parent group was partitioned.
// On the serial CPU implementation, always 1.
unsigned long long meta_group_size() const { return 1; }

// Linear rank of the group within the set of tiles partitioned from a parent group (bounded by meta_group_size).
// On the serial CPU implementation, always 0.
unsigned long long meta_group_rank() const { return 0; }

// Not implemented - Refer to Warp Shuffle Functions
template <typename T>
T shfl(T var, unsigned int src_rank) const;
template <typename T>
T shfl_up(T var, int delta) const;
template <typename T>
T shfl_down(T var, int delta) const;
template <typename T>
T shfl_xor(T var, int delta) const;

// Not implemented - Refer to Warp Vote Functions
template <typename T>
T any(int predicate) const;
template <typename T>
T all(int predicate) const;
template <typename T>
T ballot(int predicate) const;

// Not implemented - Refer to Warp Match Functions
template <typename T>
T match_any(T val) const;
template <typename T>
T match_all(T val, int& pred) const;
};

template <unsigned int Size, typename ParentT>
inline thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g) {
static_assert(
Size == 1,
"The cudaCompat Cooperative Groups implementation supports only tiled groups of a single thread.");

return thread_block_tile<Size, ParentT>{};
}

} // namespace cooperative_groups

} // namespace cudacompat
} // namespace cms
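
Taken together, these stubs mirror just enough of the CUDA Cooperative Groups interface for a kernel to compile unchanged under the serial CPU backend. Below is a minimal sketch of such a kernel; the kernel name and data are illustrative, and it assumes the compat names are made visible in the CPU build by the cudaCompat machinery, the same way the patched gpuFishbone.h relies on them.

    // Sketch only: one kernel source, two backends. On the GPU this uses the
    // real <cooperative_groups.h>; in the serial CPU build the stubs above
    // provide the same names with fixed single-thread semantics.
    #include <cooperative_groups.h>
    namespace cg = cooperative_groups;

    template <int threadsPerItem>
    __global__ void scaleItems(float* data, int n, float factor) {
      // split the block into tiles of threadsPerItem threads each;
      // the CPU backend accepts only threadsPerItem == 1 (static_assert above)
      auto tile = cg::tiled_partition<threadsPerItem>(cg::this_thread_block());
      // one tile per item, grid-stride over the items
      int itemsPerBlock = tile.meta_group_size();
      for (int i = tile.meta_group_rank() + blockIdx.x * itemsPerBlock; i < n;
           i += gridDim.x * itemsPerBlock) {
        if (tile.thread_rank() == 0)  // a single lane performs the update
          data[i] *= factor;
        tile.sync();
      }
    }

On the CPU backend the compat globals present a single block of a single thread, so the grid-stride loop above degenerates to a plain serial scan over the items.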

@@ -107,7 +107,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
params_.dcaCutOuterTriplet_);

if (nhits > 1 && params_.earlyFishbone_) {
gpuPixelDoublets::fishbone(
gpuPixelDoublets::fishbone<1, 1>(
hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false);
}

@@ -132,7 +132,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get());

if (nhits > 1 && params_.lateFishbone_) {
gpuPixelDoublets::fishbone(
gpuPixelDoublets::fishbone<1, 1>(
hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true);
}
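
On this serial backend the explicit <1, 1> instantiation is the only one the compat layer accepts: the whole block is a single thread, so each tile holds one thread and the kernel walks the hits serially. A small sketch, using only the compat API shown above, of the guarantees the CPU code can rely on (the function name is illustrative):

    #include <cassert>

    void checkCompatTile() {
      using namespace cms::cudacompat;
      // valid: a single-thread tile over the single-thread block
      auto tile = cooperative_groups::tiled_partition<1>(cooperative_groups::this_thread_block());
      assert(tile.size() == 1 && tile.thread_rank() == 0);
      assert(tile.meta_group_size() == 1 && tile.meta_group_rank() == 0);
      // tiled_partition<16>(...) would be rejected at compile time by the
      // static_assert: "... supports only tiled groups of a single thread."
    }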

22 changes: 8 additions & 14 deletions src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu
@@ -64,13 +64,10 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
cudaCheck(cudaGetLastError());

if (nhits > 1 && params_.earlyFishbone_) {
auto nthTot = 128;
auto stride = 16;
auto blockSize = nthTot / stride;
auto numberOfBlocks = (nhits + blockSize - 1) / blockSize;
dim3 blks(1, numberOfBlocks, 1);
dim3 thrs(stride, blockSize, 1);
gpuPixelDoublets::fishbone<<<blks, thrs, 0, cudaStream>>>(
constexpr int threadsPerHit = 16;
constexpr int hitsPerBlock = 8;
auto numberOfBlocks = (nhits + hitsPerBlock - 1) / hitsPerBlock;
gpuPixelDoublets::fishbone<hitsPerBlock, threadsPerHit><<<numberOfBlocks, hitsPerBlock * threadsPerHit, 0, cudaStream>>>(
hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false);
cudaCheck(cudaGetLastError());
}
@@ -116,13 +113,10 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
cudaCheck(cudaGetLastError());

if (nhits > 1 && params_.lateFishbone_) {
auto nthTot = 128;
auto stride = 16;
auto blockSize = nthTot / stride;
auto numberOfBlocks = (nhits + blockSize - 1) / blockSize;
dim3 blks(1, numberOfBlocks, 1);
dim3 thrs(stride, blockSize, 1);
gpuPixelDoublets::fishbone<<<blks, thrs, 0, cudaStream>>>(
constexpr int threadsPerHit = 16;
constexpr int hitsPerBlock = 8;
auto numberOfBlocks = (nhits + hitsPerBlock - 1) / hitsPerBlock;
gpuPixelDoublets::fishbone<hitsPerBlock, threadsPerHit><<<numberOfBlocks, hitsPerBlock * threadsPerHit, 0, cudaStream>>>(
hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true);
cudaCheck(cudaGetLastError());
}
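
Compared to the old 2D launch (a block of stride × blockSize threads addressed through threadIdx.x and threadIdx.y), the geometry is now carried by the two template parameters and a flat 1D block: 8 hits per block × 16 threads per hit gives the same 128 threads as the previous nthTot. A minimal sketch of the arithmetic, with an illustrative hit count:

    #include <cstdio>

    int main() {
      constexpr int threadsPerHit = 16;
      constexpr int hitsPerBlock = 8;
      unsigned int nhits = 1000;  // illustrative value

      // flat 1D block replacing the old dim3(stride, blockSize, 1)
      constexpr int blockSize = hitsPerBlock * threadsPerHit;  // 128 threads
      // round up so every hit is owned by exactly one tile
      unsigned int numberOfBlocks = (nhits + hitsPerBlock - 1) / hitsPerBlock;

      std::printf("%u blocks of %d threads\n", numberOfBlocks, blockSize);
      // prints: 125 blocks of 128 threads
      return 0;
    }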
135 changes: 85 additions & 50 deletions src/cudadev/plugin-PixelTriplets/gpuFishbone.h
@@ -7,6 +7,9 @@
#include <cstdio>
#include <limits>

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

#include "DataFormats/approx_atan2.h"
#include "Geometry/phase1PixelTopology.h"
#include "CUDACore/VecArray.h"
@@ -16,76 +19,108 @@

namespace gpuPixelDoublets {

// __device__
// __forceinline__
__global__ void fishbone(GPUCACell::Hits const* __restrict__ hhp,
GPUCACell* cells,
uint32_t const* __restrict__ nCells,
GPUCACell::OuterHitOfCell const* __restrict__ isOuterHitOfCell,
uint32_t nHits,
bool checkTrack) {
template <int hitsPerBlock, int threadsPerHit>
__global__ __launch_bounds__(hitsPerBlock* threadsPerHit) void fishbone(
GPUCACell::Hits const* __restrict__ hits_p,
GPUCACell* cells,
uint32_t const* __restrict__ nCells,
GPUCACell::OuterHitOfCell const* __restrict__ isOuterHitOfCell,
uint32_t nHits,
bool checkTrack) {
constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit;

auto const& hh = *hhp;
auto const& hits = *hits_p;

// blockDim must be hitsPerBlock times threadsPerHit
assert(blockDim.x == hitsPerBlock * threadsPerHit);

// partition the thread block into threadsPerHit-sized tiles
auto tile = cg::tiled_partition<threadsPerHit>(cg::this_thread_block());
assert(hitsPerBlock == tile.meta_group_size());

// x runs faster...
auto firstY = threadIdx.y + blockIdx.y * blockDim.y;
auto firstX = threadIdx.x;
auto numberOfBlocks = gridDim.x;
auto hitsPerGrid = numberOfBlocks * hitsPerBlock;
auto hitInBlock = tile.meta_group_rank(); // 0 .. hitsPerBlock-1
auto innerThread = tile.thread_rank();
auto firstHit = hitInBlock + blockIdx.x * hitsPerBlock;

float x[maxCellsPerHit], y[maxCellsPerHit], z[maxCellsPerHit], n[maxCellsPerHit];
uint16_t d[maxCellsPerHit]; // uint8_t l[maxCellsPerHit];
uint32_t cc[maxCellsPerHit];
// hitsPerBlock buffers in shared memory
__shared__ float s_x[hitsPerBlock][maxCellsPerHit];
__shared__ float s_y[hitsPerBlock][maxCellsPerHit];
__shared__ float s_z[hitsPerBlock][maxCellsPerHit];
__shared__ float s_length2[hitsPerBlock][maxCellsPerHit];
__shared__ uint32_t s_cc[hitsPerBlock][maxCellsPerHit];
__shared__ uint16_t s_detId[hitsPerBlock][maxCellsPerHit];
__shared__ uint32_t s_doublets[hitsPerBlock];
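
Moving the per-hit scratch buffers from per-thread local arrays into shared memory is what lets threadsPerHit threads cooperate on one hit, and it makes shared memory the resource to watch; the __launch_bounds__(hitsPerBlock * threadsPerHit) annotation above declares the fixed block size to the compiler. A rough budget, assuming (only for this estimate; the real value is GPUCACell::maxCellsPerHit) maxCellsPerHit == 128:

    #include <cstddef>
    #include <cstdint>

    constexpr int hitsPerBlock = 8;
    constexpr int maxCellsPerHit = 128;  // assumption for this estimate only
    // per cell: x, y, z, length2 (4 floats) + cc (uint32_t) + detId (uint16_t)
    constexpr std::size_t bytesPerCell =
        4 * sizeof(float) + sizeof(std::uint32_t) + sizeof(std::uint16_t);  // 22 B
    constexpr std::size_t totalBytes =
        hitsPerBlock * maxCellsPerHit * bytesPerCell  // the six arrays
        + hitsPerBlock * sizeof(std::uint32_t);       // s_doublets
    static_assert(totalBytes == 22560, "~22 kB, within the 48 kB default limit");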

for (int idy = firstY, nt = nHits; idy < nt; idy += gridDim.y * blockDim.y) {
auto const& vc = isOuterHitOfCell[idy];
// buffer used by the current thread
float(&x)[maxCellsPerHit] = s_x[hitInBlock];
[Review comment, Collaborator] I'm wondering if the suggested change

    auto& x = s_x[hitInBlock];

(in place of float(&x)[maxCellsPerHit] = s_x[hitInBlock];) would be more clear.

[Reply, Contributor/Author] I can't say which one is clearer... your suggestion is certainly simpler to write :-)

float(&y)[maxCellsPerHit] = s_y[hitInBlock];
float(&z)[maxCellsPerHit] = s_z[hitInBlock];
float(&length2)[maxCellsPerHit] = s_length2[hitInBlock];
uint32_t(&cc)[maxCellsPerHit] = s_cc[hitInBlock];
uint16_t(&detId)[maxCellsPerHit] = s_detId[hitInBlock];
uint32_t& doublets = s_doublets[hitInBlock];

// outer loop: parallelize over the hits
for (int hit = firstHit; hit < (int)nHits; hit += hitsPerGrid) {
auto const& vc = isOuterHitOfCell[hit];
auto size = vc.size();
if (size < 2)
continue;

// if aligned, kill one of the two.
// in principle one could try to relax the cut (only in r-z?) for jumping-doublets
auto const& c0 = cells[vc[0]];
auto xo = c0.outer_x(hh);
auto yo = c0.outer_y(hh);
auto zo = c0.outer_z(hh);
auto sg = 0;
for (int32_t ic = 0; ic < size; ++ic) {
auto& ci = cells[vc[ic]];
if (ci.unused())
auto xo = c0.outer_x(hits);
auto yo = c0.outer_y(hits);
auto zo = c0.outer_z(hits);
if (innerThread == 0) {
doublets = 0;
}
tile.sync();
for (int32_t i = innerThread; i < size; i += threadsPerHit) {
auto& cell = cells[vc[i]];
if (cell.unused())
continue; // for triplets equivalent to next
if (checkTrack && ci.tracks().empty())
if (checkTrack && cell.tracks().empty())
continue;
cc[sg] = vc[ic];
d[sg] = ci.inner_detIndex(hh);
x[sg] = ci.inner_x(hh) - xo;
y[sg] = ci.inner_y(hh) - yo;
z[sg] = ci.inner_z(hh) - zo;
n[sg] = x[sg] * x[sg] + y[sg] * y[sg] + z[sg] * z[sg];
++sg;
auto index = atomicInc(&doublets, 0xFFFFFFFF);
cc[index] = vc[i];
detId[index] = cell.inner_detIndex(hits);
x[index] = cell.inner_x(hits) - xo;
y[index] = cell.inner_y(hits) - yo;
z[index] = cell.inner_z(hits) - zo;
length2[index] = x[index] * x[index] + y[index] * y[index] + z[index] * z[index];
}
if (sg < 2)

tile.sync();
if (doublets < 2)
continue;
// here we parallelize
for (int32_t ic = firstX; ic < sg - 1; ic += blockDim.x) {
auto& ci = cells[cc[ic]];
for (auto jc = ic + 1; jc < sg; ++jc) {
auto& cj = cells[cc[jc]];
// must be different detectors (in the same layer)
// if (d[ic]==d[jc]) continue;
// || l[ic]!=l[jc]) continue;
auto cos12 = x[ic] * x[jc] + y[ic] * y[jc] + z[ic] * z[jc];
if (d[ic] != d[jc] && cos12 * cos12 >= 0.99999f * n[ic] * n[jc]) {
// aligned: kill farthest (prefer consecutive layers)
if (n[ic] > n[jc]) {
ci.kill();

// inner loop: parallelize over the doublets
for (uint32_t i = innerThread; i < doublets - 1; i += threadsPerHit) {
auto& cell_i = cells[cc[i]];
for (uint32_t j = i + 1; j < doublets; ++j) {
// must be different detectors (potentially in the same layer)
if (detId[i] == detId[j])
continue;
auto& cell_j = cells[cc[j]];
auto cos12 = x[i] * x[j] + y[i] * y[j] + z[i] * z[j];
if (cos12 * cos12 >= 0.99999f * length2[i] * length2[j]) {
// aligned: kill farthest (prefer consecutive layers)
if (length2[i] > length2[j]) {
cell_i.kill();
break;
} else {
cj.kill();
cell_j.kill();
}
}
} //cj
} // ci
} // hits
} // j
} // i
} // hit
}

} // namespace gpuPixelDoublets

#endif // RecoPixelVertexing_PixelTriplets_plugins_gpuFishbone_h
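
A note on the alignment cut used above: for direction vectors a and b (each inner hit relative to the shared outer hit), cos²θ = (a·b)² / (|a|²·|b|²), so the test cos12 * cos12 >= 0.99999f * length2[i] * length2[j] is exactly cos²θ ≥ 0.99999 written without square roots or divisions, reusing the squared lengths cached during the fill phase. A standalone sketch of the same predicate (names are illustrative):

    #include <cstdio>

    // Two doublets are "aligned" when the squared cosine of the angle
    // between their direction vectors is essentially 1; no sqrt, no division.
    bool aligned(float ax, float ay, float az, float bx, float by, float bz) {
      float dot = ax * bx + ay * by + az * bz;
      float a2 = ax * ax + ay * ay + az * az;  // the kernel's length2[i]
      float b2 = bx * bx + by * by + bz * bz;  // the kernel's length2[j]
      return dot * dot >= 0.99999f * a2 * b2;
    }

    int main() {
      // nearly parallel: aligned, the kernel would kill the farther doublet
      std::printf("%d\n", aligned(1.f, 0.f, 0.f, 2.f, 0.001f, 0.f));  // 1
      // clearly separated directions: both doublets survive
      std::printf("%d\n", aligned(1.f, 0.f, 0.f, 0.f, 1.f, 0.f));     // 0
      return 0;
    }

Because the cosine enters squared, the cut also catches anti-parallel pairs; the 0.99999 threshold corresponds to an opening angle of roughly 0.18 degrees.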