From 5f1a79861214abc25ffe14c5d562577c709a13ee Mon Sep 17 00:00:00 2001 From: Marino Missiroli Date: Fri, 30 Sep 2022 00:05:55 +0200 Subject: [PATCH 1/2] more kernel improvements to avoid race conditions --- .../PFClusterProducer/plugins/PFClusterCudaHCAL.cu | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu index 0baa2ba87508b..f6f8d0a0ef02e 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu +++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu @@ -3832,13 +3832,8 @@ namespace PFClusterCudaHCAL { } while (notDone); - *topoIter = iter; -#ifdef DEBUG_GPU_HCAL -// if (threadIdx.x == 0) { -// printf("*** Topo clustering converged in %d iterations ***\n", iter); -// } -// __syncthreads(); -#endif + if (threadIdx.x == 0) + *topoIter = iter; } __device__ __forceinline__ void sortSwap(int* toSort, int a, int b) { @@ -3898,6 +3893,7 @@ namespace PFClusterCudaHCAL { } __device__ __forceinline__ int scan1Inclusive(int idata, volatile int* s_Data, int size) { + assert(size == 32); int pos = 2 * threadIdx.x - (threadIdx.x & (size - 1)); s_Data[pos] = 0; pos += size; @@ -3905,7 +3901,9 @@ namespace PFClusterCudaHCAL { for (int offset = 1; offset < size; offset <<= 1) { int t = s_Data[pos] + s_Data[pos - offset]; + __syncwarp(); s_Data[pos] = t; + __syncwarp(); } return s_Data[pos]; From 8189a9f0f0806abe63b55303ca09d98d61f51129 Mon Sep 17 00:00:00 2001 From: Marino Missiroli Date: Fri, 30 Sep 2022 01:01:41 +0200 Subject: [PATCH 2/2] more syncthreads calls in topoClusterLinking kernel --- .../plugins/PFClusterCudaHCAL.cu | 89 +++---------------- 1 file changed, 11 insertions(+), 78 deletions(-) diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu index f6f8d0a0ef02e..eb6a7587534f5 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu +++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu @@ -3598,6 +3598,7 @@ namespace PFClusterCudaHCAL { } } __syncthreads(); + if (threadIdx.x == 0) { *pcrhFracSize = totalSeedFracOffset; //printf("At the end of topoClusterContraction, found *pcrhFracSize = %d\n", *pcrhFracSize); @@ -3747,8 +3748,11 @@ namespace PFClusterCudaHCAL { else pfrh_edgeMask[idx] = 0; } + __syncthreads();//!! do { + __syncthreads();//!! + if (threadIdx.x == 0) { notDone = false; } @@ -3780,6 +3784,9 @@ namespace PFClusterCudaHCAL { } } } + + __syncthreads();//!! + if (threadIdx.x == 0) iter++; @@ -3793,7 +3800,6 @@ namespace PFClusterCudaHCAL { if (threadIdx.x == 0) { notDone = false; } - __syncthreads(); // Even linking @@ -3804,7 +3810,6 @@ namespace PFClusterCudaHCAL { pfrh_parent[i] = (int)max(i, pfrh_edgeList[idx]); } } - __syncthreads(); // edgeParent @@ -3825,6 +3830,8 @@ namespace PFClusterCudaHCAL { } } + __syncthreads();//!! + if (threadIdx.x == 0) iter++; @@ -3832,6 +3839,8 @@ namespace PFClusterCudaHCAL { } while (notDone); + __syncthreads();//!! + if (threadIdx.x == 0) *topoIter = iter; } @@ -4393,18 +4402,9 @@ namespace PFClusterCudaHCAL { ::PFClustering::HCAL::ScratchDataGPU& scratchGPU, float (&timer)[8]) { -#ifdef DEBUG_GPU_HCAL - cudaProfilerStart(); - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - cudaEventRecord(start, cudaStream); -#endif - int nRH = inputPFRecHits.size; // Combined seeding & topo clustering thresholds, array initialization - seedingTopoThreshKernel_HCAL<<<(nRH + 31) / 32, 64, 0, cudaStream>>>(nRH, inputPFRecHits.pfrh_energy.get(), inputPFRecHits.pfrh_x.get(), @@ -4424,23 +4424,6 @@ namespace PFClusterCudaHCAL { outputGPU.topoSeedList.get(), outputGPU.pfc_iter.get()); - cudaCheck(cudaStreamSynchronize(cudaStream)); - -#ifdef DEBUG_GPU_HCAL - cudaEventRecord(stop, cudaStream); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&timer[0], start, stop); - cudaEventRecord(start, cudaStream); -#endif - - // prepareTopoInputsSerial<<<1, 1, 4 * (8+4) * sizeof(int), cudaStream>>>( - // nRH, - // outputGPU.nEdges.get(), - // outputGPU.pfrh_passTopoThresh.get(), - // inputPFRecHits.pfrh_neighbours.get(), - // scratchGPU.pfrh_edgeId.get(), - // scratchGPU.pfrh_edgeList.get()); - // Topo clustering // Fill edgeId, edgeList arrays with rechit neighbors // Has a bug when using more than 128 threads.. @@ -4450,34 +4433,6 @@ namespace PFClusterCudaHCAL { inputPFRecHits.pfrh_neighbours.get(), scratchGPU.pfrh_edgeId.get(), scratchGPU.pfrh_edgeList.get()); - cudaCheck(cudaStreamSynchronize(cudaStream)); - - // prepareTopoInputs<<<1, 256, 256 * (8+4) * sizeof(int), cudaStream>>>( - // nRH, - // outputGPU.nEdges.get(), - // outputGPU.pfrh_passTopoThresh.get(), - // inputPFRecHits.pfrh_neighbours.get(), - // scratchGPU.pfrh_edgeId.get(), - // scratchGPU.pfrh_edgeList.get()); - -#ifdef DEBUG_GPU_HCAL - cudaEventRecord(stop, cudaStream); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&timer[4], start, stop); - //printf("\nprepareTopoInputs took %f ms\n", timer[4]); - - compareEdgeArrays<<<1, 1, 0, cudaStream>>>(outputGPU.nEdges.get(), - scratchGPU.pfrh_edgeId.get(), - scratchGPU.pfrh_edgeList.get(), - nEdges, - inputGPU.pfrh_edgeId.get(), - inputGPU.pfrh_edgeList.get(), - nRH, - inputGPU.pfNeighFourInd.get(), - inputPFRecHits.pfrh_neighbours.get()); - - cudaEventRecord(start, cudaStream); -#endif // Topo clustering topoClusterLinking<<<1, 512, 0, cudaStream>>>(nRH, @@ -4486,10 +4441,8 @@ namespace PFClusterCudaHCAL { scratchGPU.pfrh_edgeId.get(), scratchGPU.pfrh_edgeList.get(), scratchGPU.pfrh_edgeMask.get(), - //inputGPU.pfrh_edgeMask.get(), outputGPU.pfrh_passTopoThresh.get(), outputGPU.topoIter.get()); - cudaCheck(cudaStreamSynchronize(cudaStream)); topoClusterContraction<<<1, 512, 0, cudaStream>>>(nRH, outputGPU.pfrh_topoId.get(), @@ -4504,13 +4457,6 @@ namespace PFClusterCudaHCAL { outputGPU.pcrh_frac.get(), outputGPU.pcrhFracSize.get()); -#ifdef DEBUG_GPU_HCAL - cudaEventRecord(stop, cudaStream); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&timer[1], start, stop); - cudaEventRecord(start, cudaStream); -#endif - dim3 grid((nRH + 31) / 32, (nRH + 31) / 32); dim3 block(32, 32); @@ -4523,13 +4469,6 @@ namespace PFClusterCudaHCAL { scratchGPU.rhcount.get(), outputGPU.pcrh_fracInd.get()); -#ifdef DEBUG_GPU_HCAL - cudaEventRecord(stop, cudaStream); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&timer[2], start, stop); - cudaEventRecord(start, cudaStream); -#endif - hcalFastCluster_selection<<>>(nRH, inputPFRecHits.pfrh_x.get(), inputPFRecHits.pfrh_y.get(), @@ -4553,11 +4492,5 @@ namespace PFClusterCudaHCAL { inputGPU.pfc_prevPos4.get(), inputGPU.pfc_energy.get(), outputGPU.pfc_iter.get()); -#ifdef DEBUG_GPU_HCAL - cudaEventRecord(stop, cudaStream); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&timer[3], start, stop); - cudaProfilerStop(); -#endif } } // namespace PFClusterCudaHCAL