119 commits
d15d0dd
use gpu vertices
VinInn Oct 5, 2018
fc8ffad
add vertex splitting
VinInn Oct 5, 2018
9e3a3aa
fix iterations
VinInn Oct 5, 2018
c9418f6
apply outlier rejection, tune error
VinInn Oct 7, 2018
5e9d7cf
fix duplicate cleaning
VinInn Oct 19, 2018
2e7910f
sort and clean
VinInn Oct 20, 2018
2d155f1
fishbone works
VinInn Oct 26, 2018
aad5235
fishbone works
VinInn Oct 26, 2018
3244703
fishbone works
VinInn Oct 26, 2018
d6508a8
add layerid
VinInn Oct 28, 2018
e2fd6d2
copy layer on gpu
VinInn Oct 28, 2018
c68f413
efficient
VinInn Oct 29, 2018
9dc2184
optimize parallelization
VinInn Oct 29, 2018
b3ed9d0
update notebook to include fishbone
VinInn Oct 31, 2018
cc973f6
silence it
VinInn Oct 31, 2018
06365df
mark magic 2
VinInn Oct 31, 2018
f2439af
remove magic 256, reduce it to 128
VinInn Oct 31, 2018
ae7fc3f
reduce size
VinInn Oct 31, 2018
9030763
remove duplicate code lines
VinInn Oct 31, 2018
d79faf5
narrow cut to avoid inefficiency for realistic
VinInn Nov 1, 2018
376a0d4
Merged gpuVertexRedux from repository VinInn with cms-merge-topic
VinInn Nov 1, 2018
ad06e33
build pentuplets
VinInn Nov 2, 2018
7bea72e
simplify
VinInn Nov 3, 2018
932fec9
align to offline
VinInn Nov 4, 2018
d0f3adf
simplify histogrammer: no need of ws in fill
VinInn Nov 4, 2018
6a192fd
test cuda_assert
VinInn Nov 5, 2018
fce72dc
use more stable and gpu friendly version of circle
VinInn Nov 5, 2018
f7dbc25
assoc tested
VinInn Nov 6, 2018
483d591
check cosdir
VinInn Nov 6, 2018
384465e
clean code
VinInn Nov 6, 2018
ada49bd
try to use template errors
VinInn Nov 7, 2018
6acfd9f
retune but still use old params
VinInn Nov 7, 2018
45f65b9
add AtomicPairCounter and implement manyToOne
VinInn Nov 8, 2018
7ac1cf3
tuning cuts
VinInn Nov 9, 2018
2d8e41b
few steps toward persistent gputracks, crashes
VinInn Nov 9, 2018
0adee78
Q production works
VinInn Nov 9, 2018
0198e52
forward hits
VinInn Nov 10, 2018
fd8f49c
compiles
VinInn Nov 11, 2018
fd49d01
runs
VinInn Nov 11, 2018
aef70eb
use less memory
VinInn Nov 11, 2018
545a326
use even less memory
VinInn Nov 11, 2018
511aa99
add quality flag
VinInn Nov 11, 2018
de2333d
factorize
VinInn Nov 12, 2018
2a2ab2b
factorize
VinInn Nov 12, 2018
e028fd1
reading tuples correctly
VinInn Nov 12, 2018
3064dd9
read hits
VinInn Nov 12, 2018
e64286e
Add B-hadron MTV variation to pixel track validation sequence
makortel Nov 12, 2018
5695203
fix errors on gpu
VinInn Nov 12, 2018
bca3120
tip/zip ok
VinInn Nov 13, 2018
9367675
use new version of Riemann fit
VinInn Nov 13, 2018
76cfce2
fix error^2
VinInn Nov 13, 2018
433d1cd
fix pixel errors
VinInn Nov 13, 2018
3bd5c2e
use error from templates
VinInn Nov 13, 2018
6899cd0
dup remover written
VinInn Nov 13, 2018
ae195a2
filter duplicates
VinInn Nov 13, 2018
e135f21
make sure algo is stable
VinInn Nov 14, 2018
a605664
Merged mtvBhadronPixel from repository makortel with cms-merge-topic
VinInn Nov 14, 2018
5763c56
fix for absent lape
VinInn Nov 14, 2018
7ea1bb1
drop quads if sharing cell with pents
VinInn Nov 14, 2018
dd6ad51
add region cuts
VinInn Nov 14, 2018
34989f3
merged, refactorize
VinInn Nov 16, 2018
8cc7562
merged, refactorize
VinInn Nov 16, 2018
5cafde2
Merged gpuTracks104 from repository VinInn with cms-merge-topic
VinInn Nov 16, 2018
5ed6161
back to previous status
VinInn Nov 16, 2018
572613b
prepare vertex finder to read from gpu
VinInn Nov 17, 2018
2797693
produce vertices
VinInn Nov 17, 2018
aa0e4ad
make vertices on gpu only: not scheduled..
VinInn Nov 17, 2018
989019e
make profiling work
VinInn Nov 17, 2018
db66a14
minor cleanup
VinInn Nov 17, 2018
79fd0ae
silenced
VinInn Nov 18, 2018
68f9162
solve conflict
VinInn Nov 18, 2018
ddad076
resize to avoid overflows
VinInn Nov 18, 2018
a6e3e7d
protect and report cell overflow as well
VinInn Nov 19, 2018
0935c5e
more cleanup
VinInn Nov 19, 2018
e695b29
remove all cpu stuff from CA on gpu
VinInn Nov 19, 2018
b2445f2
fix gpu only wf
VinInn Nov 19, 2018
fb73c7a
Address code style and quality issues (#203)
fwyzard Nov 28, 2018
6110cf4
Fix MTV validation of initialStepPreSplitting tracks and add B-hadron…
makortel Nov 29, 2018
f11b911
Merge branch 'CMSSW_10_4_X_Patatrack' into gpuTracks104
VinInn Nov 29, 2018
77bd114
Fix Free issues
VinInn Nov 29, 2018
ef869f5
Merge branch 'CMSSW_10_4_X_Patatrack' into gpuTracks104
VinInn Nov 29, 2018
94b521e
Remove stray empty lines for consistency with upstream
fwyzard Dec 3, 2018
54f759e
Merged gpuTracks104 from repository VinInn with cms-merge-topic
VinInn Dec 3, 2018
76a4ae9
add test, address first set of comments
VinInn Dec 3, 2018
ce143ca
more comments addressed
VinInn Dec 3, 2018
13b7277
silenced
VinInn Dec 3, 2018
c455a96
now works
VinInn Dec 4, 2018
6d9379f
test of fit on gpu works
VinInn Dec 4, 2018
d3cc0b4
Merge branch 'CMSSW_10_4_X_Patatrack' of https://github.com/cms-patat…
VinInn Dec 7, 2018
522cfdf
late fishbone
VinInn Dec 7, 2018
c8623a5
make fishbone configurable
VinInn Dec 8, 2018
807d794
Merge branch 'CMSSW_10_4_X_Patatrack' into gpuTracks104
VinInn Dec 8, 2018
1d517e8
Merged gpuTracks104 from repository VinInn with cms-merge-topic
VinInn Dec 9, 2018
909cfb0
fix long standing bug (minor effect)
VinInn Dec 10, 2018
b8b4299
compiles and runs
VinInn Dec 11, 2018
0c55b56
line fit is wrong
VinInn Dec 11, 2018
811cb63
fix stupid bug
VinInn Dec 11, 2018
47d28a7
remove default arg
VinInn Dec 11, 2018
ff31eaf
debug occupancy
VinInn Dec 12, 2018
5ee726d
faster clustering
VinInn Dec 12, 2018
31584e9
apply to vertex as well
VinInn Dec 12, 2018
002a576
clean assert
VinInn Dec 13, 2018
49df121
fix missing Free
VinInn Dec 14, 2018
2078b24
Merged gpuTracksFastRFit from repository VinInn with cms-merge-topic
VinInn Dec 17, 2018
5aeaba2
Fix setting the data pointer of error SimpleVector
makortel Dec 19, 2018
5be8fe9
Merged fixPixelErrors from repository makortel with cms-merge-topic
VinInn Dec 20, 2018
c46e716
silenced initchk
VinInn Dec 20, 2018
fa4a912
Merged GPUFastTracksOptFix from repository VinInn with cms-merge-topic
VinInn Dec 20, 2018
8af902a
add NN to clustering
VinInn Dec 22, 2018
705d218
Merged GPUFastTracksNNClus from repository VinInn with cms-merge-topic
VinInn Dec 22, 2018
c190bd2
revert to use topology
VinInn Dec 22, 2018
048af11
parallelize inner loop
VinInn Dec 22, 2018
3e1a505
Merged GPUFastTracksNNClus from repository VinInn with cms-merge-topic
VinInn Dec 22, 2018
940fe95
use 2D grid instead of hand-made stride
VinInn Dec 26, 2018
8f5d217
use 2D grid instead of hand-made stride
VinInn Dec 26, 2018
1c2f268
widen cell_connect as well
VinInn Dec 27, 2018
7067416
Full workflow from raw data to pixel tracks and vertices on GPUs (#216)
VinInn Jan 8, 2019
e15a883
Merge branch 'CMSSW_10_4_X_Patatrack' into GPUFastTracksNNClus
fwyzard Jan 8, 2019
8fff3ec
Merge branch 'CMSSW_10_4_X_Patatrack' into GPUFastTracksNNClus
fwyzard Jan 9, 2019
78 changes: 40 additions & 38 deletions RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
@@ -1,8 +1,6 @@
#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h

// #define CLUS_LIMIT_LOOP

#include <cstdint>
#include <cstdio>

@@ -87,8 +85,8 @@ namespace gpuClustering {
__syncthreads();

assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId)));
assert(msize-firstPixel<maxPixInModule);

#ifdef GPU_DEBUG
__shared__ uint32_t totGood;
@@ -122,20 +120,16 @@ namespace gpuClustering {
hist.fill(y[i],i-firstPixel);
}

#ifdef CLUS_LIMIT_LOOP
// assume that we can cover the whole module with up to 10 blockDim.x-wide iterations
constexpr int maxiter = 10;
if (threadIdx.x==0) {
assert((hist.size()/ blockDim.x) <= maxiter);
}
uint16_t const * jmax[maxiter];
// nearest neighbour
uint16_t nn[maxiter][5];
uint8_t nnn[maxiter]; // number of nn
for (int k = 0; k < maxiter; ++k)
jmax[k] = hist.end();
#endif

__shared__ int nloops;
nloops=0;

nnn[k] = 0;

__syncthreads(); // for hit filling!

@@ -144,7 +138,7 @@
__shared__ uint32_t n40,n60;
n40=n60=0;
__syncthreads();
for (auto j=threadIdx.x; j<Hist::nbins(); j+=blockDim.x) {
if(hist.size(j)>60) atomicAdd(&n60,1);
if(hist.size(j)>40) atomicAdd(&n40,1);
}
@@ -156,11 +150,31 @@
__syncthreads();
#endif

// fill NN
for (int j=threadIdx.x, k = 0; j<hist.size(); j+=blockDim.x, ++k) {
auto p = hist.begin()+j;
auto i = *p + firstPixel;
assert (id[i] != InvId);
assert(id[i] == thisModuleId); // same module
int be = Hist::bin(y[i]+1);
auto e = hist.end(be);
++p;
for (;p<e;++p) {
auto m = (*p)+firstPixel;
assert(m!=i);
if (std::abs(int(x[m]) - int(x[i])) > 1) continue;
auto l = nnn[k]++;
assert(l<5);
nn[k][l]=*p;
}
}

// for each pixel, look at all the pixels until the end of the module;
// when two valid pixels within +/- 1 in x or y are found, set their id to the minimum;
after the loop, all the pixels in each cluster should have the id equal to the lowest
// pixel in the cluster ( clus[i] == i ).
bool more = true;
int nloops=0;
while (__syncthreads_or(more)) {
if (1==nloops%2) {
for (int j=threadIdx.x, k = 0; j<hist.size(); j+=blockDim.x, ++k) {
@@ -175,45 +189,33 @@
for (int j=threadIdx.x, k = 0; j<hist.size(); j+=blockDim.x, ++k) {
auto p = hist.begin()+j;
auto i = *p + firstPixel;
assert (id[i] != InvId);
assert(id[i] == thisModuleId); // same module
#ifdef CLUS_LIMIT_LOOP
auto jm = jmax[k];
jmax[k] = p + 1;
#endif
int be = Hist::bin(y[i]+1);
auto e = hist.end(be);
#ifdef CLUS_LIMIT_LOOP
e = std::min(e,jm);
#endif
// loop to columns
auto loop = [&](uint16_t const * kk) {
auto m = (*kk)+firstPixel;
for (int kk=0; kk<nnn[k]; ++kk) {
auto l = nn[k][kk];
auto m = l+firstPixel;
assert(m!=i);
if (std::abs(int(x[m]) - int(x[i])) > 1) return;
// if (std::abs(int(y[m]) - int(y[i])) > 1) return; // bin size is 1
auto old = atomicMin(&clusterId[m], clusterId[i]);
if (old != clusterId[i]) {
// end the loop only if no changes were applied
more = true;
}
atomicMin(&clusterId[i], old);
#ifdef CLUS_LIMIT_LOOP
// update the loop boundary for the next iteration
jmax[k] = std::max(kk + 1,jmax[k]);
#endif
};
++p;
for (;p<e;++p) loop(p);
} // nnloop
} // pixel loop
}
if (threadIdx.x==0) ++nloops;
}
++nloops;
} // end while

#ifdef GPU_DEBUG
{
__shared__ int n0;
if (threadIdx.x == 0) n0=nloops;
__syncthreads();
auto ok = n0==nloops;
assert(__syncthreads_and(ok));
if (thisModuleId % 100 == 1)
if (threadIdx.x == 0)
printf("# loops %d\n",nloops);
}
#endif

__shared__ unsigned int foundClusters;
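The diff above replaces the CLUS_LIMIT_LOOP machinery with a precomputed nearest-neighbour list: each pixel caches up to five neighbours (the nn/nnn arrays), and the while loop then propagates the minimum cluster id along those links with atomicMin until a full pass makes no change, so every pixel ends up holding the smallest id of its connected component. A standalone toy version of that propagation idea (hypothetical kernel and flat data layout, not the PR's actual structures; single block, matching the one-module-per-block kernel):

#include <cstdio>
#include <cuda_runtime.h>

// Toy min-propagation clustering, single block; nn holds up to 5 neighbours
// per pixel (flat [i*5+k] layout), nnn the neighbour counts.
__global__ void toyClusterKernel(int* clusterId, const int* nn, const int* nnn, int n) {
  bool more = true;
  while (__syncthreads_or(more)) {   // iterate until no thread changed anything
    more = false;
    for (int i = threadIdx.x; i < n; i += blockDim.x) {
      for (int k = 0; k < nnn[i]; ++k) {
        int m = nn[i * 5 + k];                        // neighbour of pixel i
        int old = atomicMin(&clusterId[m], clusterId[i]);
        if (old != clusterId[i]) more = true;         // something changed: go again
        atomicMin(&clusterId[i], old);                // propagate the other way too
      }
    }
  }
  // on exit clusterId[i] is the smallest id in i's connected component
}

int main() {
  // four pixels: chain 0-1-2 plus isolated 3; expect cluster ids 0 0 0 3
  int h_clus[4] = {0, 1, 2, 3};
  int h_nn[20] = {0}; h_nn[0 * 5] = 1; h_nn[1 * 5] = 2; // links 0->1, 1->2
  int h_nnn[4] = {1, 1, 0, 0};
  int *d_clus, *d_nn, *d_nnn;
  cudaMalloc(&d_clus, sizeof(h_clus));
  cudaMalloc(&d_nn, sizeof(h_nn));
  cudaMalloc(&d_nnn, sizeof(h_nnn));
  cudaMemcpy(d_clus, h_clus, sizeof(h_clus), cudaMemcpyHostToDevice);
  cudaMemcpy(d_nn, h_nn, sizeof(h_nn), cudaMemcpyHostToDevice);
  cudaMemcpy(d_nnn, h_nnn, sizeof(h_nnn), cudaMemcpyHostToDevice);
  toyClusterKernel<<<1, 32>>>(d_clus, d_nn, d_nnn, 4);
  cudaMemcpy(h_clus, d_clus, sizeof(h_clus), cudaMemcpyDeviceToHost);
  printf("%d %d %d %d\n", h_clus[0], h_clus[1], h_clus[2], h_clus[3]); // 0 0 0 3
  return 0;
}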
2 changes: 1 addition & 1 deletion RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu
@@ -232,7 +232,7 @@ int main(void)
cuda::memory::copy(d_y.get(), h_y.get(), size16);
cuda::memory::copy(d_adc.get(), h_adc.get(), size16);
// Launch CUDA Kernels
int threadsPerBlock = (kkk==5) ? 512 : ((kkk==3) ? 64 : 256);
int threadsPerBlock = (kkk==5) ? 512 : ((kkk==3) ? 128 : 256);
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout
<< "CUDA countModules kernel launch with " << blocksPerGrid
RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.cu
@@ -144,17 +144,19 @@ kernel_connect(AtomicPairCounter * apc1, AtomicPairCounter * apc2, // just to z
constexpr auto hardCurvCut = 1.f/(0.35f * 87.f); // FIXME VI tune
constexpr auto ptmin = 0.9f; // FIXME original "tune"

auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x;
auto cellIndex = threadIdx.y + blockIdx.y * blockDim.y;
auto first = threadIdx.x;
auto stride = blockDim.x;

if (0==cellIndex) { (*apc1)=0; (*apc2)=0; }// ready for next kernel
if (0==(cellIndex+first)) { (*apc1)=0; (*apc2)=0; }// ready for next kernel

if (cellIndex >= (*nCells) ) return;
auto const & thisCell = cells[cellIndex];
if (thisCell.theDoubletId<0) return;
auto innerHitId = thisCell.get_inner_hit_id();
auto numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size();
auto vi = isOuterHitOfCell[innerHitId].data();
for (auto j = 0; j < numberOfPossibleNeighbors; ++j) {
for (auto j = first; j < numberOfPossibleNeighbors; j+=stride) {
auto otherCell = __ldg(vi+j);
if (cells[otherCell].theDoubletId<0) continue;
if (thisCell.check_alignment(hh,
@@ -172,6 +174,8 @@
unsigned int minHitsPerNtuplet)
{

// recursive: not obvious to widen

auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x;
if (cellIndex >= (*nCells) ) return;
auto &thisCell = cells[cellIndex];
@@ -246,23 +250,27 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms....
assert(nhits <= PixelGPUConstants::maxNumberOfHits);

if (earlyFishbone_) {
auto blockSize = 128;
auto blockSize = 64;
auto stride = 4;
auto numberOfBlocks = (nhits + blockSize - 1)/blockSize;
numberOfBlocks *=stride;

fishbone<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
dim3 blks(1,numberOfBlocks,1);
Review comment:

here and later, and in the kernel code: do we expect any differences using

dim3 blks(1,numberOfBlocks,1);
dim3 thrs(stride,blockSize,1);

or

dim3 blks(numberOfBlocks,1,1);
dim3 thrs(blockSize,stride,1);

assuming the .x and .y are swapped accordingly inside the kernels?

Review comment:

In fact, do we expect any performance difference using

kernel<<<(1, blocks, 1), (stride, size, 1)>>>(...);

or

kernel<<<blocks, size*stride>>>(..., stride);

?

Author reply (VinInn):

Thanks for splitting the PR.

Answer to the first question: according to the CUDA documentation and examples, "x" runs faster than "y", so swapping "x" with "y" would NOT achieve the desired result of having the inner loop run in contiguous CUDA threads. The current implementation is, in my intention, equivalent to the hand-made one in terms of thread assignment.

Second question: IN PRINCIPLE the two approaches should be fully equivalent; the 2D grid is clearly more CUDA-style, and does not require percolating the stride through the call chain. I should have coded it directly using the 2D grid. IN PRACTICE I cannot exclude a different overhead between the two implementations. I have simple unit tests/examples:

https://github.com/VinInn/ctest/blob/master/cuda/combiHM.cu
https://github.com/VinInn/ctest/blob/master/cuda/combiXY.cu

The hand-made one seems a bit faster.

My opinion is that the 2D grid is the way to code it in CUDA: it is surely easier to understand and maintain (it is like, in C, using 1D arrays and computing the offset by hand instead of using a 2D array...). We could investigate with CUDA/nvcc experts; not sure we would get anywhere.
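To make the two variants under discussion concrete, here is a minimal sketch (hypothetical kernel names, not code from this PR) of the same outer/inner loop written both ways. Since x is the fastest-varying index, consecutive threadIdx.x values handle consecutive inner iterations in both versions, which is the thread assignment the reply above argues for:

// 2D-grid version: y indexes the outer element, x strides over the inner loop.
__global__ void process2D(int nOuter, int innerSize) {
  int iy = threadIdx.y + blockIdx.y * blockDim.y;    // outer index
  if (iy >= nOuter) return;
  for (int j = threadIdx.x; j < innerSize; j += blockDim.x) {
    // ... work on (iy, j) ...
  }
}

// Hand-made-stride version: one flat index, with the stride passed explicitly.
__global__ void processFlat(int nOuter, int innerSize, int stride) {
  int ldx = threadIdx.x + blockIdx.x * blockDim.x;
  int iy = ldx / stride;                             // outer index
  int first = ldx - iy * stride;                     // lane within the stride
  if (iy >= nOuter) return;
  for (int j = first; j < innerSize; j += stride) {
    // ... same work ...
  }
}

// Equivalent launches for stride = 4 and blockSize outer elements per block:
//   dim3 blks(1, numberOfBlocks, 1), thrs(4, blockSize, 1);
//   process2D<<<blks, thrs>>>(nOuter, innerSize);
//   processFlat<<<numberOfBlocks, blockSize * 4>>>(nOuter, innerSize, 4);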

dim3 thrs(stride,blockSize,1);
fishbone<<<blks,thrs, 0, cudaStream>>>(
hh.gpu_d,
device_theCells_, device_nCells_,
device_isOuterHitOfCell_,
nhits, stride, false
nhits, false
);
cudaCheck(cudaGetLastError());
}

auto blockSize = 64;
auto numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1)/blockSize;
kernel_connect<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
auto stride = 4;
dim3 blks(1,numberOfBlocks,1);
dim3 thrs(stride,blockSize,1);

kernel_connect<<<blks, thrs, 0, cudaStream>>>(
gpu_.apc_d, device_hitToTuple_apc_, // needed only to be reset, ready for next kernel
hh.gpu_d,
device_theCells_, device_nCells_,
Expand All @@ -282,14 +290,16 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms....
cudautils::finalizeBulk<<<numberOfBlocks, blockSize, 0, cudaStream>>>(gpu_.apc_d,gpu_.tuples_d);

if (lateFishbone_) {
auto stride=4;
numberOfBlocks = (nhits + blockSize - 1)/blockSize;
numberOfBlocks *=stride;
fishbone<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
auto blockSize = 64;
auto stride = 4;
auto numberOfBlocks = (nhits + blockSize - 1)/blockSize;
dim3 blks(1,numberOfBlocks,1);
dim3 thrs(stride,blockSize,1);
fishbone<<<blks,thrs, 0, cudaStream>>>(
hh.gpu_d,
device_theCells_, device_nCells_,
device_isOuterHitOfCell_,
nhits, stride, true
nhits, true
);
cudaCheck(cudaGetLastError());
}
@@ -312,9 +322,13 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms....
void CAHitQuadrupletGeneratorKernels::buildDoublets(HitsOnCPU const & hh, cudaStream_t stream) {
auto nhits = hh.nHits;

int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize;
int stride=4;
int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize/stride;
int blocks = (3 * nhits + threadsPerBlock - 1) / threadsPerBlock;
gpuPixelDoublets::getDoubletsFromHisto<<<blocks, threadsPerBlock, 0, stream>>>(device_theCells_, device_nCells_, hh.gpu_d, device_isOuterHitOfCell_);
dim3 blks(1,blocks,1);
dim3 thrs(stride,threadsPerBlock,1);
gpuPixelDoublets::getDoubletsFromHisto<<<blks, thrs, 0, stream>>>(
device_theCells_, device_nCells_, hh.gpu_d, device_isOuterHitOfCell_);
cudaCheck(cudaGetLastError());
}

@@ -330,4 +344,3 @@ void CAHitQuadrupletGeneratorKernels::classifyTuples(HitsOnCPU const & hh, Tuple
kernel_fastDuplicateRemover<<<numberOfBlocks, blockSize, 0, cudaStream>>>(device_theCells_, device_nCells_,tuples.tuples_d,tuples.helix_fit_results_d, tuples.quality_d);

}

19 changes: 9 additions & 10 deletions RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h
@@ -26,7 +26,7 @@ namespace gpuPixelDoublets {
GPUCACell * cells, uint32_t const * __restrict__ nCells,
GPUCACell::OuterHitOfCell const * __restrict__ isOuterHitOfCell,
uint32_t nHits,
uint32_t stride, bool checkTrack) {
bool checkTrack) {

constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit;

@@ -35,13 +35,12 @@
uint8_t const * __restrict__ layerp = hh.phase1TopologyLayer_d;
auto layer = [&](uint16_t id) { return __ldg(layerp+id/phase1PixelTopology::maxModuleStride);};

auto ldx = threadIdx.x + blockIdx.x * blockDim.x;
auto idx = ldx/stride;
auto first = ldx - idx*stride;
assert(first<stride);
// x runs faster...
auto idy = threadIdx.y + blockIdx.y * blockDim.y;
auto first = threadIdx.x;

if (idx>=nHits) return;
auto const & vc = isOuterHitOfCell[idx];
if (idy>=nHits) return;
auto const & vc = isOuterHitOfCell[idy];
auto s = vc.size();
if (s<2) return;
// if aligned, kill one of the two.
Expand All @@ -66,8 +65,8 @@ namespace gpuPixelDoublets {
++sg;
}
if (sg<2) return;
// here we parallelize
for (uint32_t ic=first; ic<sg-1; ic+=stride) {
// here we parallelize
for (uint32_t ic=first; ic<sg-1; ic+=blockDim.x) {
auto & ci = cells[cc[ic]];
for (auto jc=ic+1; jc<sg; ++jc) {
auto & cj = cells[cc[jc]];
@@ -90,4 +89,4 @@

}

#endif
#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuFishbone_h
15 changes: 10 additions & 5 deletions RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
@@ -34,7 +34,8 @@ namespace gpuPixelDoublets {
int16_t const * __restrict__ phicuts,
float const * __restrict__ minz,
float const * __restrict__ maxz,
float const * __restrict__ maxr)
float const * __restrict__ maxr
)
{
auto layerSize = [=](uint8_t li) { return offsets[li+1]-offsets[li]; };

@@ -50,8 +51,11 @@
}
auto ntot = innerLayerCumulativeSize[nPairs-1];

auto idx = blockIdx.x * blockDim.x + threadIdx.x;
for (auto j = idx; j < ntot; j += blockDim.x * gridDim.x) {
// x runs faster
auto idy = blockIdx.y * blockDim.y + threadIdx.y;
auto first = threadIdx.x;
auto stride = blockDim.x;
for (auto j = idy; j < ntot; j += blockDim.y * gridDim.y ) {

uint32_t pairLayerId=0;
while (j >= innerLayerCumulativeSize[pairLayerId++]);
@@ -115,7 +119,8 @@
nmin += hist.size(kk+hoff);
auto const * __restrict__ p = hist.begin(kk+hoff);
auto const * __restrict__ e = hist.end(kk+hoff);
for (;p < e; ++p) {
p+=first;
for (;p < e; p+=stride) {
auto oi=__ldg(p);
assert(oi>=offsets[outer]);
assert(oi<offsets[outer+1]);
@@ -139,7 +144,7 @@
} // loop in block...
}

constexpr auto getDoubletsFromHistoMaxBlockSize = 64;
constexpr auto getDoubletsFromHistoMaxBlockSize = 64; // for both x and y
constexpr auto getDoubletsFromHistoMinBlocksPerMP = 16;

__global__
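One detail of getDoubletsFromHisto worth spelling out: each flat work index j is mapped back to its layer pair by scanning the cumulative pair sizes, while (j >= innerLayerCumulativeSize[pairLayerId++]);. A host-side sketch of that decoding with made-up sizes (the local-index line is an assumption, reconstructed from the visible kernel lines):

#include <cstdio>

int main() {
  // hypothetical sizes of three layer pairs, and their cumulative sums,
  // built the same way as in the kernel's first-thread initialization
  const int nPairs = 3;
  int layerSize[nPairs] = {5, 3, 4};
  int innerLayerCumulativeSize[nPairs];
  innerLayerCumulativeSize[0] = layerSize[0];
  for (int i = 1; i < nPairs; ++i)
    innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i - 1] + layerSize[i];
  int ntot = innerLayerCumulativeSize[nPairs - 1];   // 12 work items in total

  for (int j = 0; j < ntot; ++j) {
    // same decoding as the kernel: advance past every cumulative size <= j
    int pairLayerId = 0;
    while (j >= innerLayerCumulativeSize[pairLayerId++]) ;
    --pairLayerId;                                   // undo the final increment
    // local index within the pair (assumed form of the elided kernel lines)
    int i = j - (pairLayerId == 0 ? 0 : innerLayerCumulativeSize[pairLayerId - 1]);
    printf("j=%2d -> pair %d, local index %d\n", j, pairLayerId, i);
  }
  return 0;
}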