diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h index dba9fe12f5492..5c21a39302d70 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h @@ -1,8 +1,6 @@ #ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h #define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h -// #define CLUS_LIMIT_LOOP - #include #include @@ -87,8 +85,8 @@ namespace gpuClustering { __syncthreads(); assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId))); - assert(msize-firstPixel60) atomicAdd(&n60,1); if(hist.size(j)>40) atomicAdd(&n40,1); } @@ -156,11 +150,31 @@ namespace gpuClustering { __syncthreads(); #endif + // fill NN + for (int j=threadIdx.x, k = 0; j 1) continue; + auto l = nnn[k]++; + assert(l<5); + nn[k][l]=*p; + } + } + // for each pixel, look at all the pixels until the end of the module; // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum; // after the loop, all the pixel in each cluster should have the id equeal to the lowest // pixel in the cluster ( clus[i] == i ). bool more = true; + int nloops=0; while (__syncthreads_or(more)) { if (1==nloops%2) { for (int j=threadIdx.x, k = 0; j 1) return; - // if (std::abs(int(y[m]) - int(y[i])) > 1) return; // binssize is 1 auto old = atomicMin(&clusterId[m], clusterId[i]); if (old != clusterId[i]) { // end the loop only if no changes were applied more = true; } atomicMin(&clusterId[i], old); -#ifdef CLUS_LIMIT_LOOP - // update the loop boundary for the next iteration - jmax[k] = std::max(kk + 1,jmax[k]); -#endif - }; - ++p; - for (;p= (*nCells) ) return; auto const & thisCell = cells[cellIndex]; @@ -154,7 +156,7 @@ kernel_connect(AtomicPairCounter * apc1, AtomicPairCounter * apc2, // just to z auto innerHitId = thisCell.get_inner_hit_id(); auto numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size(); auto vi = isOuterHitOfCell[innerHitId].data(); - for (auto j = 0; j < numberOfPossibleNeighbors; ++j) { + for (auto j = first; j < numberOfPossibleNeighbors; j+=stride) { auto otherCell = __ldg(vi+j); if (cells[otherCell].theDoubletId<0) continue; if (thisCell.check_alignment(hh, @@ -172,6 +174,8 @@ void kernel_find_ntuplets( unsigned int minHitsPerNtuplet) { + // recursive: not obvious to widen + auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x; if (cellIndex >= (*nCells) ) return; auto &thisCell = cells[cellIndex]; @@ -246,23 +250,27 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms.... assert(nhits <= PixelGPUConstants::maxNumberOfHits); if (earlyFishbone_) { - auto blockSize = 128; + auto blockSize = 64; auto stride = 4; auto numberOfBlocks = (nhits + blockSize - 1)/blockSize; - numberOfBlocks *=stride; - - fishbone<<>>( + dim3 blks(1,numberOfBlocks,1); + dim3 thrs(stride,blockSize,1); + fishbone<<>>( hh.gpu_d, device_theCells_, device_nCells_, device_isOuterHitOfCell_, - nhits, stride, false + nhits, false ); cudaCheck(cudaGetLastError()); } auto blockSize = 64; auto numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1)/blockSize; - kernel_connect<<>>( + auto stride = 4; + dim3 blks(1,numberOfBlocks,1); + dim3 thrs(stride,blockSize,1); + + kernel_connect<<>>( gpu_.apc_d, device_hitToTuple_apc_, // needed only to be reset, ready for next kernel hh.gpu_d, device_theCells_, device_nCells_, @@ -282,14 +290,16 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms.... cudautils::finalizeBulk<<>>(gpu_.apc_d,gpu_.tuples_d); if (lateFishbone_) { - auto stride=4; - numberOfBlocks = (nhits + blockSize - 1)/blockSize; - numberOfBlocks *=stride; - fishbone<<>>( + auto blockSize = 64; + auto stride = 4; + auto numberOfBlocks = (nhits + blockSize - 1)/blockSize; + dim3 blks(1,numberOfBlocks,1); + dim3 thrs(stride,blockSize,1); + fishbone<<>>( hh.gpu_d, device_theCells_, device_nCells_, device_isOuterHitOfCell_, - nhits, stride, true + nhits, true ); cudaCheck(cudaGetLastError()); } @@ -312,9 +322,13 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms.... void CAHitQuadrupletGeneratorKernels::buildDoublets(HitsOnCPU const & hh, cudaStream_t stream) { auto nhits = hh.nHits; - int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize; + int stride=4; + int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize/stride; int blocks = (3 * nhits + threadsPerBlock - 1) / threadsPerBlock; - gpuPixelDoublets::getDoubletsFromHisto<<>>(device_theCells_, device_nCells_, hh.gpu_d, device_isOuterHitOfCell_); + dim3 blks(1,blocks,1); + dim3 thrs(stride,threadsPerBlock,1); + gpuPixelDoublets::getDoubletsFromHisto<<>>( + device_theCells_, device_nCells_, hh.gpu_d, device_isOuterHitOfCell_); cudaCheck(cudaGetLastError()); } @@ -330,4 +344,3 @@ void CAHitQuadrupletGeneratorKernels::classifyTuples(HitsOnCPU const & hh, Tuple kernel_fastDuplicateRemover<<>>(device_theCells_, device_nCells_,tuples.tuples_d,tuples.helix_fit_results_d, tuples.quality_d); } - diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h index 717cbf777fcdb..796241eaf50ff 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h @@ -26,7 +26,7 @@ namespace gpuPixelDoublets { GPUCACell * cells, uint32_t const * __restrict__ nCells, GPUCACell::OuterHitOfCell const * __restrict__ isOuterHitOfCell, uint32_t nHits, - uint32_t stride, bool checkTrack) { + bool checkTrack) { constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit; @@ -35,13 +35,12 @@ namespace gpuPixelDoublets { uint8_t const * __restrict__ layerp = hh.phase1TopologyLayer_d; auto layer = [&](uint16_t id) { return __ldg(layerp+id/phase1PixelTopology::maxModuleStride);}; - auto ldx = threadIdx.x + blockIdx.x * blockDim.x; - auto idx = ldx/stride; - auto first = ldx - idx*stride; - assert(first=nHits) return; - auto const & vc = isOuterHitOfCell[idx]; + if (idy>=nHits) return; + auto const & vc = isOuterHitOfCell[idy]; auto s = vc.size(); if (s<2) return; // if alligned kill one of the two. @@ -66,8 +65,8 @@ namespace gpuPixelDoublets { ++sg; } if (sg<2) return; - // here we parallelize - for (uint32_t ic=first; ic= innerLayerCumulativeSize[pairLayerId++]); @@ -115,7 +119,8 @@ namespace gpuPixelDoublets { nmin += hist.size(kk+hoff); auto const * __restrict__ p = hist.begin(kk+hoff); auto const * __restrict__ e = hist.end(kk+hoff); - for (;p < e; ++p) { + p+=first; + for (;p < e; p+=stride) { auto oi=__ldg(p); assert(oi>=offsets[outer]); assert(oi