Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
if (doit) {
GPUCACell::TmpTuple stack;
stack.reset();
thisCell.find_ntuplets(
thisCell.find_ntuplets<6>(
acc, hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3);
ALPAKA_ASSERT_OFFLOAD(stack.empty());
// printf("in %d found quadruplets: %d\n", cellIndex, apc->get());
Expand Down
82 changes: 43 additions & 39 deletions src/alpaka/plugin-PixelTriplets/alpaka/GPUCACell.h
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {

// trying to free the track building process from hardcoded layers, leaving
// the visit of the graph based on the neighborhood connections between cells.
template <typename T_Acc>
template <int DEPTH, typename T_Acc>
ALPAKA_FN_ACC ALPAKA_FN_INLINE void find_ntuplets(const T_Acc& acc,
Hits const& hh,
GPUCACell* __restrict__ cells,
Expand All @@ -334,49 +334,54 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
TmpTuple& tmpNtuplet,
const unsigned int minHitsPerNtuplet,
bool startAt0) const {
// the building process for a track ends if:
// it has no right neighbor
// it has no compatible neighbor
// the ntuplets is then saved if the number of hits it contains is greater
// than a threshold

tmpNtuplet.push_back_unsafe(theDoubletId);
ALPAKA_ASSERT_OFFLOAD(tmpNtuplet.size() <= 4);

bool last = true;
for (int j = 0; j < outerNeighbors().size(); ++j) {
auto otherCell = outerNeighbors()[j];
if (cells[otherCell].theDoubletId < 0)
continue; // killed by earlyFishbone
last = false;
cells[otherCell].find_ntuplets(
acc, hh, cells, cellTracks, foundNtuplets, apc, quality, tmpNtuplet, minHitsPerNtuplet, startAt0);
}
if (last) { // if long enough save...
if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet - 1) {
if constexpr (DEPTH == 0) {
printf("ERROR: GPUCACell::find_ntuplets reached full depth!\n");
ALPAKA_ASSERT_OFFLOAD(false);
} else {
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@fwyzard @VinInn For Alpaka I went with this instead of the specialization for DEPTH == 0 because partial specializations of functions are not allowed ("partial" caused by the additional T_Acc template argument). By quick test the throughput improvement is similar order (3-4 %) than in cuda without caching/async allocator and in kokkos (that both use the specialization as in the original PR). If you have any better suggestions, let me know.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After seeing it, I actually like the approach with if constexpr better than the one with the specialisation for 0, as it keeps things more localised.

We should check that it works well also for the native CUDA and HIP cases.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be ok for you to do that in a subsequent PR?

// the building process for a track ends if:
// it has no right neighbor
// it has no compatible neighbor
// the ntuplets is then saved if the number of hits it contains is greater
// than a threshold

tmpNtuplet.push_back_unsafe(theDoubletId);
ALPAKA_ASSERT_OFFLOAD(tmpNtuplet.size() <= 4);

bool last = true;
for (int j = 0; j < outerNeighbors().size(); ++j) {
auto otherCell = outerNeighbors()[j];
if (cells[otherCell].theDoubletId < 0)
continue; // killed by earlyFishbone
last = false;
cells[otherCell].find_ntuplets<DEPTH - 1>(
acc, hh, cells, cellTracks, foundNtuplets, apc, quality, tmpNtuplet, minHitsPerNtuplet, startAt0);
}
if (last) { // if long enough save...
if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet - 1) {
#ifdef ONLY_TRIPLETS_IN_HOLE
// triplets accepted only pointing to the hole
if (tmpNtuplet.size() >= 3 || (startAt0 && hole4(hh, cells[tmpNtuplet[0]])) ||
((!startAt0) && hole0(hh, cells[tmpNtuplet[0]])))
// triplets accepted only pointing to the hole
if (tmpNtuplet.size() >= 3 || (startAt0 && hole4(hh, cells[tmpNtuplet[0]])) ||
((!startAt0) && hole0(hh, cells[tmpNtuplet[0]])))
#endif
{
hindex_type hits[6];
auto nh = 0U;
for (auto c : tmpNtuplet) {
hits[nh++] = cells[c].theInnerHitId;
}
hits[nh] = theOuterHitId;
auto it = foundNtuplets.bulkFill(acc, apc, hits, tmpNtuplet.size() + 1);
if (it >= 0) { // if negative is overflow....
for (auto c : tmpNtuplet)
cells[c].addTrack(acc, it, cellTracks);
quality[it] = bad; // initialize to bad
{
hindex_type hits[6];
auto nh = 0U;
for (auto c : tmpNtuplet) {
hits[nh++] = cells[c].theInnerHitId;
}
hits[nh] = theOuterHitId;
auto it = foundNtuplets.bulkFill(acc, apc, hits, tmpNtuplet.size() + 1);
if (it >= 0) { // if negative is overflow....
for (auto c : tmpNtuplet)
cells[c].addTrack(acc, it, cellTracks);
quality[it] = bad; // initialize to bad
}
}
}
}
tmpNtuplet.pop_back();
ALPAKA_ASSERT_OFFLOAD(tmpNtuplet.size() < 4);
}
tmpNtuplet.pop_back();
ALPAKA_ASSERT_OFFLOAD(tmpNtuplet.size() < 4);
}

private:
Expand All @@ -394,7 +399,6 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
hindex_type theInnerHitId;
hindex_type theOuterHitId;
};

} // namespace ALPAKA_ACCELERATOR_NAMESPACE

#endif // RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,8 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp,
if (doit) {
GPUCACell::TmpTuple stack;
stack.reset();
thisCell.find_ntuplets(hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3);
thisCell.find_ntuplets<6>(
hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3);
assert(stack.empty());
// printf("in %d found quadruplets: %d\n", cellIndex, apc->get());
}
Expand Down
21 changes: 20 additions & 1 deletion src/cuda/plugin-PixelTriplets/GPUCACell.h
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,7 @@ class GPUCACell {

// trying to free the track building process from hardcoded layers, leaving
// the visit of the graph based on the neighborhood connections between cells.
template <int DEPTH>
__device__ inline void find_ntuplets(Hits const& hh,
GPUCACell* __restrict__ cells,
CellTracksVector& cellTracks,
Expand All @@ -299,7 +300,7 @@ class GPUCACell {
if (cells[otherCell].theDoubletId < 0)
continue; // killed by earlyFishbone
last = false;
cells[otherCell].find_ntuplets(
cells[otherCell].find_ntuplets<DEPTH - 1>(
hh, cells, cellTracks, foundNtuplets, apc, quality, tmpNtuplet, minHitsPerNtuplet, startAt0);
}
if (last) { // if long enough save...
Expand Down Expand Up @@ -345,4 +346,22 @@ class GPUCACell {
hindex_type theOuterHitId;
};

template <>
__device__ inline void GPUCACell::find_ntuplets<0>(Hits const& hh,
GPUCACell* __restrict__ cells,
CellTracksVector& cellTracks,
HitContainer& foundNtuplets,
cms::cuda::AtomicPairCounter& apc,
Quality* __restrict__ quality,
TmpTuple& tmpNtuplet,
const unsigned int minHitsPerNtuplet,
bool startAt0) const {
printf("ERROR: GPUCACell::find_ntuplets reached full depth!\n");
#ifdef __CUDA_ARCH__
__trap();
#else
abort();
#endif
}

#endif // RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,8 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp,
if (doit) {
GPUCACell::TmpTuple stack;
stack.reset();
thisCell.find_ntuplets(hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3);
thisCell.find_ntuplets<6>(
hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3);
assert(stack.empty());
// printf("in %d found quadruplets: %d\n", cellIndex, apc->get());
}
Expand Down
21 changes: 20 additions & 1 deletion src/cudacompat/plugin-PixelTriplets/GPUCACell.h
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,7 @@ class GPUCACell {

// trying to free the track building process from hardcoded layers, leaving
// the visit of the graph based on the neighborhood connections between cells.
template <int DEPTH>
__device__ inline void find_ntuplets(Hits const& hh,
GPUCACell* __restrict__ cells,
CellTracksVector& cellTracks,
Expand All @@ -299,7 +300,7 @@ class GPUCACell {
if (cells[otherCell].theDoubletId < 0)
continue; // killed by earlyFishbone
last = false;
cells[otherCell].find_ntuplets(
cells[otherCell].find_ntuplets<DEPTH - 1>(
hh, cells, cellTracks, foundNtuplets, apc, quality, tmpNtuplet, minHitsPerNtuplet, startAt0);
}
if (last) { // if long enough save...
Expand Down Expand Up @@ -345,4 +346,22 @@ class GPUCACell {
hindex_type theOuterHitId;
};

template <>
__device__ inline void GPUCACell::find_ntuplets<0>(Hits const& hh,
GPUCACell* __restrict__ cells,
CellTracksVector& cellTracks,
HitContainer& foundNtuplets,
cms::cuda::AtomicPairCounter& apc,
Quality* __restrict__ quality,
TmpTuple& tmpNtuplet,
const unsigned int minHitsPerNtuplet,
bool startAt0) const {
printf("ERROR: GPUCACell::find_ntuplets reached full depth!\n");
#ifdef __CUDA_ARCH__
__trap();
#else
abort();
#endif
}

#endif // RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,8 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp,
if (doit) {
GPUCACell::TmpTuple stack;
stack.reset();
thisCell.find_ntuplets(hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3);
thisCell.find_ntuplets<6>(
hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3);
assert(stack.empty());
// printf("in %d found quadruplets: %d\n", cellIndex, apc->get());
}
Expand Down
21 changes: 20 additions & 1 deletion src/cudadev/plugin-PixelTriplets/GPUCACell.h
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,7 @@ class GPUCACell {

// trying to free the track building process from hardcoded layers, leaving
// the visit of the graph based on the neighborhood connections between cells.
template <int DEPTH>
__device__ inline void find_ntuplets(Hits const& hh,
GPUCACell* __restrict__ cells,
CellTracksVector& cellTracks,
Expand All @@ -294,7 +295,7 @@ class GPUCACell {
if (cells[otherCell].theDoubletId_ < 0)
continue; // killed by earlyFishbone
last = false;
cells[otherCell].find_ntuplets(
cells[otherCell].find_ntuplets<DEPTH - 1>(
hh, cells, cellTracks, foundNtuplets, apc, quality, tmpNtuplet, minHitsPerNtuplet, startAt0);
}
if (last) { // if long enough save...
Expand Down Expand Up @@ -347,4 +348,22 @@ class GPUCACell {
hindex_type theOuterHitId;
};

template <>
__device__ inline void GPUCACell::find_ntuplets<0>(Hits const& hh,
GPUCACell* __restrict__ cells,
CellTracksVector& cellTracks,
HitContainer& foundNtuplets,
cms::cuda::AtomicPairCounter& apc,
Quality* __restrict__ quality,
TmpTuple& tmpNtuplet,
const unsigned int minHitsPerNtuplet,
bool startAt0) const {
printf("ERROR: GPUCACell::find_ntuplets reached full depth!\n");
#ifdef __CUDA_ARCH__
__trap();
#else
abort();
#endif
}

#endif // RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,8 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp,
if (doit) {
GPUCACell::TmpTuple stack;
stack.reset();
thisCell.find_ntuplets(hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3);
thisCell.find_ntuplets<6>(
hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3);
assert(stack.empty());
// printf("in %d found quadruplets: %d\n", cellIndex, apc->get());
}
Expand Down
21 changes: 20 additions & 1 deletion src/cudauvm/plugin-PixelTriplets/GPUCACell.h
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,7 @@ class GPUCACell {

// trying to free the track building process from hardcoded layers, leaving
// the visit of the graph based on the neighborhood connections between cells.
template <int DEPTH>
__device__ inline void find_ntuplets(Hits const& hh,
GPUCACell* __restrict__ cells,
CellTracksVector& cellTracks,
Expand All @@ -299,7 +300,7 @@ class GPUCACell {
if (cells[otherCell].theDoubletId < 0)
continue; // killed by earlyFishbone
last = false;
cells[otherCell].find_ntuplets(
cells[otherCell].find_ntuplets<DEPTH - 1>(
hh, cells, cellTracks, foundNtuplets, apc, quality, tmpNtuplet, minHitsPerNtuplet, startAt0);
}
if (last) { // if long enough save...
Expand Down Expand Up @@ -345,4 +346,22 @@ class GPUCACell {
hindex_type theOuterHitId;
};

template <>
__device__ inline void GPUCACell::find_ntuplets<0>(Hits const& hh,
GPUCACell* __restrict__ cells,
CellTracksVector& cellTracks,
HitContainer& foundNtuplets,
cms::cuda::AtomicPairCounter& apc,
Quality* __restrict__ quality,
TmpTuple& tmpNtuplet,
const unsigned int minHitsPerNtuplet,
bool startAt0) const {
printf("ERROR: GPUCACell::find_ntuplets reached full depth!\n");
#ifdef __CUDA_ARCH__
__trap();
#else
abort();
#endif
}

#endif // RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h
17 changes: 16 additions & 1 deletion src/kokkos/plugin-PixelTriplets/GPUCACell.h
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,7 @@ class GPUCACell {

// trying to free the track building process from hardcoded layers, leaving
// the visit of the graph based on the neighborhood connections between cells.
template <int DEPTH>
KOKKOS_INLINE_FUNCTION void find_ntuplets(Hits const& hh,
GPUCACell* __restrict__ cells,
CellTracksVector& cellTracks,
Expand All @@ -305,7 +306,7 @@ class GPUCACell {
if (cells[otherCell].theDoubletId < 0)
continue; // killed by earlyFishbone
last = false;
cells[otherCell].find_ntuplets(
cells[otherCell].find_ntuplets<DEPTH - 1>(
hh, cells, cellTracks, foundNtuplets, apc, quality, tmpNtuplet, minHitsPerNtuplet, startAt0);
}
if (last) { // if long enough save...
Expand Down Expand Up @@ -351,4 +352,18 @@ class GPUCACell {
hindex_type theOuterHitId;
};

template <>
KOKKOS_INLINE_FUNCTION void GPUCACell::find_ntuplets<0>(Hits const& hh,
GPUCACell* __restrict__ cells,
CellTracksVector& cellTracks,
HitContainer& foundNtuplets,
cms::kokkos::AtomicPairCounter& apc,
Quality* __restrict__ quality,
TmpTuple& tmpNtuplet,
const unsigned int minHitsPerNtuplet,
bool startAt0) const {
printf("ERROR: GPUCACell::find_ntuplets reached full depth!\n");
assert(false);
}

#endif // RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ namespace KOKKOS_NAMESPACE {
if (doit) {
GPUCACell::TmpTuple stack;
stack.reset();
thisCell.find_ntuplets(
thisCell.find_ntuplets<6>(
hh, cells.data(), cellTracks(), *foundNtuplets, apc(), quality, stack, minHitsPerNtuplet, pid < 3);
assert(stack.empty());
// printf("in %d found quadruplets: %d\n", cellIndex, apc->get());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,8 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp,
if (doit) {
GPUCACell::TmpTuple stack;
stack.reset();
thisCell.find_ntuplets(hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3);
thisCell.find_ntuplets<6>(
hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3);
assert(stack.empty());
// printf("in %d found quadruplets: %d\n", cellIndex, apc->get());
}
Expand Down
Loading