diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu
index 0baa2ba87508b..eb6a7587534f5 100644
--- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu
+++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu
@@ -3598,6 +3598,7 @@ namespace PFClusterCudaHCAL {
       }
     }
     __syncthreads();
+
    if (threadIdx.x == 0) {
      *pcrhFracSize = totalSeedFracOffset;
      //printf("At the end of topoClusterContraction, found *pcrhFracSize = %d\n", *pcrhFracSize);
@@ -3747,8 +3748,11 @@ namespace PFClusterCudaHCAL {
      else
        pfrh_edgeMask[idx] = 0;
    }
+    __syncthreads();//!!
 
    do {
+      __syncthreads();//!!
+
      if (threadIdx.x == 0) {
        notDone = false;
      }
@@ -3780,6 +3784,9 @@ namespace PFClusterCudaHCAL {
          }
        }
      }
+
+      __syncthreads();//!!
+
      if (threadIdx.x == 0)
        iter++;
 
@@ -3793,7 +3800,6 @@ namespace PFClusterCudaHCAL {
      if (threadIdx.x == 0) {
        notDone = false;
      }
 
-      __syncthreads();
 
      // Even linking
@@ -3804,7 +3810,6 @@ namespace PFClusterCudaHCAL {
          pfrh_parent[i] = (int)max(i, pfrh_edgeList[idx]);
        }
      }
 
-      __syncthreads();
 
      // edgeParent
@@ -3825,6 +3830,8 @@ namespace PFClusterCudaHCAL {
        }
      }
 
+      __syncthreads();//!!
+
      if (threadIdx.x == 0)
        iter++;
 
@@ -3832,13 +3839,10 @@ namespace PFClusterCudaHCAL {
    } while (notDone);
 
-    *topoIter = iter;
-#ifdef DEBUG_GPU_HCAL
-// if (threadIdx.x == 0) {
-// printf("*** Topo clustering converged in %d iterations ***\n", iter);
-// }
-// __syncthreads();
-#endif
+    __syncthreads();//!!
+
+    if (threadIdx.x == 0)
+      *topoIter = iter;
 
  }
 
  __device__ __forceinline__ void sortSwap(int* toSort, int a, int b) {
@@ -3898,6 +3902,7 @@ namespace PFClusterCudaHCAL {
  }
 
  __device__ __forceinline__ int scan1Inclusive(int idata, volatile int* s_Data, int size) {
+    assert(size == 32);
    int pos = 2 * threadIdx.x - (threadIdx.x & (size - 1));
    s_Data[pos] = 0;
    pos += size;
@@ -3905,7 +3910,9 @@ namespace PFClusterCudaHCAL {
    for (int offset = 1; offset < size; offset <<= 1) {
      int t = s_Data[pos] + s_Data[pos - offset];
+      __syncwarp();
      s_Data[pos] = t;
+      __syncwarp();
    }
 
    return s_Data[pos];
  }
@@ -4395,18 +4402,9 @@ namespace PFClusterCudaHCAL {
                                 ::PFClustering::HCAL::ScratchDataGPU& scratchGPU,
                                 float (&timer)[8]) {
 
-#ifdef DEBUG_GPU_HCAL
-    cudaProfilerStart();
-    cudaEvent_t start, stop;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
-    cudaEventRecord(start, cudaStream);
-#endif
-
    int nRH = inputPFRecHits.size;
 
    // Combined seeding & topo clustering thresholds, array initialization
-
    seedingTopoThreshKernel_HCAL<<<(nRH + 31) / 32, 64, 0, cudaStream>>>(nRH,
                                                                         inputPFRecHits.pfrh_energy.get(),
                                                                         inputPFRecHits.pfrh_x.get(),
@@ -4426,23 +4424,6 @@ namespace PFClusterCudaHCAL {
                                                                         outputGPU.topoSeedList.get(),
                                                                         outputGPU.pfc_iter.get());
-    cudaCheck(cudaStreamSynchronize(cudaStream));
-
-#ifdef DEBUG_GPU_HCAL
-    cudaEventRecord(stop, cudaStream);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&timer[0], start, stop);
-    cudaEventRecord(start, cudaStream);
-#endif
-
-    // prepareTopoInputsSerial<<<1, 1, 4 * (8+4) * sizeof(int), cudaStream>>>(
-    //     nRH,
-    //     outputGPU.nEdges.get(),
-    //     outputGPU.pfrh_passTopoThresh.get(),
-    //     inputPFRecHits.pfrh_neighbours.get(),
-    //     scratchGPU.pfrh_edgeId.get(),
-    //     scratchGPU.pfrh_edgeList.get());
-
    // Topo clustering
    // Fill edgeId, edgeList arrays with rechit neighbors
    // Has a bug when using more than 128 threads..
 
@@ -4452,34 +4433,6 @@ namespace PFClusterCudaHCAL {
                                                                    inputPFRecHits.pfrh_neighbours.get(),
                                                                    scratchGPU.pfrh_edgeId.get(),
                                                                    scratchGPU.pfrh_edgeList.get());
-    cudaCheck(cudaStreamSynchronize(cudaStream));
-
-    // prepareTopoInputs<<<1, 256, 256 * (8+4) * sizeof(int), cudaStream>>>(
-    //     nRH,
-    //     outputGPU.nEdges.get(),
-    //     outputGPU.pfrh_passTopoThresh.get(),
-    //     inputPFRecHits.pfrh_neighbours.get(),
-    //     scratchGPU.pfrh_edgeId.get(),
-    //     scratchGPU.pfrh_edgeList.get());
-
-#ifdef DEBUG_GPU_HCAL
-    cudaEventRecord(stop, cudaStream);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&timer[4], start, stop);
-    //printf("\nprepareTopoInputs took %f ms\n", timer[4]);
-
-    compareEdgeArrays<<<1, 1, 0, cudaStream>>>(outputGPU.nEdges.get(),
-                                               scratchGPU.pfrh_edgeId.get(),
-                                               scratchGPU.pfrh_edgeList.get(),
-                                               nEdges,
-                                               inputGPU.pfrh_edgeId.get(),
-                                               inputGPU.pfrh_edgeList.get(),
-                                               nRH,
-                                               inputGPU.pfNeighFourInd.get(),
-                                               inputPFRecHits.pfrh_neighbours.get());
-
-    cudaEventRecord(start, cudaStream);
-#endif
 
    // Topo clustering
    topoClusterLinking<<<1, 512, 0, cudaStream>>>(nRH,
@@ -4488,10 +4441,8 @@ namespace PFClusterCudaHCAL {
                                                  scratchGPU.pfrh_edgeId.get(),
                                                  scratchGPU.pfrh_edgeList.get(),
                                                  scratchGPU.pfrh_edgeMask.get(),
-                                                  //inputGPU.pfrh_edgeMask.get(),
                                                  outputGPU.pfrh_passTopoThresh.get(),
                                                  outputGPU.topoIter.get());
-    cudaCheck(cudaStreamSynchronize(cudaStream));
 
    topoClusterContraction<<<1, 512, 0, cudaStream>>>(nRH,
                                                      outputGPU.pfrh_topoId.get(),
@@ -4506,13 +4457,6 @@ namespace PFClusterCudaHCAL {
                                                      outputGPU.pcrh_frac.get(),
                                                      outputGPU.pcrhFracSize.get());
 
-#ifdef DEBUG_GPU_HCAL
-    cudaEventRecord(stop, cudaStream);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&timer[1], start, stop);
-    cudaEventRecord(start, cudaStream);
-#endif
-
    dim3 grid((nRH + 31) / 32, (nRH + 31) / 32);
    dim3 block(32, 32);
 
@@ -4525,13 +4469,6 @@ namespace PFClusterCudaHCAL {
                                               scratchGPU.rhcount.get(),
                                               outputGPU.pcrh_fracInd.get());
 
-#ifdef DEBUG_GPU_HCAL
-    cudaEventRecord(stop, cudaStream);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&timer[2], start, stop);
-    cudaEventRecord(start, cudaStream);
-#endif
-
    hcalFastCluster_selection<<<grid, block, 0, cudaStream>>>(nRH,
                                                              inputPFRecHits.pfrh_x.get(),
                                                              inputPFRecHits.pfrh_y.get(),
@@ -4555,11 +4492,5 @@ namespace PFClusterCudaHCAL {
                                                              inputGPU.pfc_prevPos4.get(),
                                                              inputGPU.pfc_energy.get(),
                                                              outputGPU.pfc_iter.get());
-#ifdef DEBUG_GPU_HCAL
-    cudaEventRecord(stop, cudaStream);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&timer[3], start, stop);
-    cudaProfilerStop();
-#endif
  }
 } // namespace PFClusterCudaHCAL
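
Editor's note on the synchronization changes (a summary plus a sketch, not part of the patch):

* In topoClusterLinking, the added __syncthreads() barriers (marked //!!) appear to ensure that all threads in the block agree on the shared loop state (notDone, iter, and the parent/edge arrays) before each linking pass reads it again, and *topoIter is now written by thread 0 only after a final barrier.
* In scan1Inclusive, assert(size == 32) records that the scan is warp-sized, and the __syncwarp() pair separates the shared-memory read and write phases of each scan step. On Volta and later GPUs, warp lanes do not execute in lockstep, so without the fences one lane could overwrite s_Data while another lane is still reading the previous round's value.
* The remaining hunks drop the DEBUG_GPU_HCAL event-timing/profiling scaffolding and the intermediate cudaCheck(cudaStreamSynchronize(...)) calls, so the kernel sequence now runs on the stream without host-side synchronization between launches.

A minimal self-contained illustration of the warp-scan pattern follows. The kernel and buffer names (warpScanDemo, s, data) are illustrative only, and the 2 * tid - (tid & 31) padding trick of scan1Inclusive is simplified to an explicit zero-padded lower half:

  #include <cstdio>

  // Inclusive Hillis-Steele scan across one warp of 32 threads, using the
  // same read / __syncwarp / write / __syncwarp shape as the patched
  // scan1Inclusive.
  __global__ void warpScanDemo(int* data) {
    __shared__ volatile int s[64];
    int tid = threadIdx.x;   // 0..31, a single warp
    s[tid] = 0;              // zero padding absorbs out-of-range reads
    s[tid + 32] = data[tid];
    for (int offset = 1; offset < 32; offset <<= 1) {
      int t = s[tid + 32] + s[tid + 32 - offset];  // read phase
      __syncwarp();  // every lane finished reading before anyone writes
      s[tid + 32] = t;                             // write phase
      __syncwarp();  // every lane finished writing before the next read
    }
    data[tid] = s[tid + 32];
  }

  int main() {
    int h[32], *d;
    for (int i = 0; i < 32; ++i) h[i] = 1;
    cudaMalloc(&d, sizeof(h));
    cudaMemcpy(d, h, sizeof(h), cudaMemcpyHostToDevice);
    warpScanDemo<<<1, 32>>>(d);
    cudaMemcpy(h, d, sizeof(h), cudaMemcpyDeviceToHost);
    printf("h[31] = %d\n", h[31]);  // inclusive scan of 32 ones -> 32
    cudaFree(d);
    return 0;
  }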