From 5f1a79861214abc25ffe14c5d562577c709a13ee Mon Sep 17 00:00:00 2001
From: Marino Missiroli <m.missiroli@cern.ch>
Date: Fri, 30 Sep 2022 00:05:55 +0200
Subject: [PATCH 1/2] more kernel improvements to avoid race conditions

---
 .../PFClusterProducer/plugins/PFClusterCudaHCAL.cu   | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu
index 0baa2ba87508b..f6f8d0a0ef02e 100644
--- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu
+++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu
@@ -3832,13 +3832,8 @@ namespace PFClusterCudaHCAL {
 
     } while (notDone);
 
-    *topoIter = iter;
-#ifdef DEBUG_GPU_HCAL
-//    if (threadIdx.x == 0) {
-//        printf("*** Topo clustering converged in %d iterations ***\n", iter);
-//    }
-//    __syncthreads();
-#endif
+    if (threadIdx.x == 0)
+      *topoIter = iter;
   }
 
   __device__ __forceinline__ void sortSwap(int* toSort, int a, int b) {
@@ -3898,6 +3893,7 @@ namespace PFClusterCudaHCAL {
   }
 
   __device__ __forceinline__ int scan1Inclusive(int idata, volatile int* s_Data, int size) {
+    assert(size == 32);
     int pos = 2 * threadIdx.x - (threadIdx.x & (size - 1));
     s_Data[pos] = 0;
     pos += size;
@@ -3905,7 +3901,9 @@ namespace PFClusterCudaHCAL {
 
     for (int offset = 1; offset < size; offset <<= 1) {
       int t = s_Data[pos] + s_Data[pos - offset];
+      __syncwarp();
       s_Data[pos] = t;
+      __syncwarp();
     }
 
     return s_Data[pos];

From 8189a9f0f0806abe63b55303ca09d98d61f51129 Mon Sep 17 00:00:00 2001
From: Marino Missiroli <m.missiroli@cern.ch>
Date: Fri, 30 Sep 2022 01:01:41 +0200
Subject: [PATCH 2/2] more syncthreads calls in topoClusterLinking kernel

---
 .../plugins/PFClusterCudaHCAL.cu              | 89 +++----------------
 1 file changed, 11 insertions(+), 78 deletions(-)

diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu
index f6f8d0a0ef02e..eb6a7587534f5 100644
--- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu
+++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu
@@ -3598,6 +3598,7 @@ namespace PFClusterCudaHCAL {
       }
     }
     __syncthreads();
+
     if (threadIdx.x == 0) {
       *pcrhFracSize = totalSeedFracOffset;
       //printf("At the end of topoClusterContraction, found *pcrhFracSize = %d\n", *pcrhFracSize);
@@ -3747,8 +3748,11 @@ namespace PFClusterCudaHCAL {
       else
         pfrh_edgeMask[idx] = 0;
     }
+    __syncthreads();//!!
 
     do {
+      __syncthreads();//!!
+
       if (threadIdx.x == 0) {
         notDone = false;
       }
@@ -3780,6 +3784,9 @@ namespace PFClusterCudaHCAL {
           }
         }
       }
+
+      __syncthreads();//!!
+
       if (threadIdx.x == 0)
         iter++;
 
@@ -3793,7 +3800,6 @@ namespace PFClusterCudaHCAL {
       if (threadIdx.x == 0) {
         notDone = false;
       }
-
       __syncthreads();
 
       // Even linking
@@ -3804,7 +3810,6 @@ namespace PFClusterCudaHCAL {
           pfrh_parent[i] = (int)max(i, pfrh_edgeList[idx]);
         }
       }
-
       __syncthreads();
 
       // edgeParent
@@ -3825,6 +3830,8 @@ namespace PFClusterCudaHCAL {
         }
       }
 
+      __syncthreads();//!!
+
       if (threadIdx.x == 0)
         iter++;
 
@@ -3832,6 +3839,8 @@ namespace PFClusterCudaHCAL {
 
     } while (notDone);
 
+    __syncthreads();//!!
+
     if (threadIdx.x == 0)
       *topoIter = iter;
   }
@@ -4393,18 +4402,9 @@ namespace PFClusterCudaHCAL {
       ::PFClustering::HCAL::ScratchDataGPU& scratchGPU,
       float (&timer)[8]) {
 
-#ifdef DEBUG_GPU_HCAL
-    cudaProfilerStart();
-    cudaEvent_t start, stop;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
-    cudaEventRecord(start, cudaStream);
-#endif
-
     int nRH = inputPFRecHits.size;
 
     // Combined seeding & topo clustering thresholds, array initialization
-
     seedingTopoThreshKernel_HCAL<<<(nRH + 31) / 32, 64, 0, cudaStream>>>(nRH,
                                                                          inputPFRecHits.pfrh_energy.get(),
                                                                          inputPFRecHits.pfrh_x.get(),
@@ -4424,23 +4424,6 @@ namespace PFClusterCudaHCAL {
                                                                          outputGPU.topoSeedList.get(),
                                                                          outputGPU.pfc_iter.get());
 
-    cudaCheck(cudaStreamSynchronize(cudaStream));
-
-#ifdef DEBUG_GPU_HCAL
-    cudaEventRecord(stop, cudaStream);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&timer[0], start, stop);
-    cudaEventRecord(start, cudaStream);
-#endif
-
-    //    prepareTopoInputsSerial<<<1, 1, 4 * (8+4) * sizeof(int), cudaStream>>>(
-    //        nRH,
-    //        outputGPU.nEdges.get(),
-    //        outputGPU.pfrh_passTopoThresh.get(),
-    //        inputPFRecHits.pfrh_neighbours.get(),
-    //        scratchGPU.pfrh_edgeId.get(),
-    //        scratchGPU.pfrh_edgeList.get());
-
     // Topo clustering
     // Fill edgeId, edgeList arrays with rechit neighbors
     // Has a bug when using more than 128 threads..
@@ -4450,34 +4433,6 @@ namespace PFClusterCudaHCAL {
                                                                            inputPFRecHits.pfrh_neighbours.get(),
                                                                            scratchGPU.pfrh_edgeId.get(),
                                                                            scratchGPU.pfrh_edgeList.get());
-    cudaCheck(cudaStreamSynchronize(cudaStream));
-
-    //    prepareTopoInputs<<<1, 256, 256 * (8+4) * sizeof(int), cudaStream>>>(
-    //        nRH,
-    //        outputGPU.nEdges.get(),
-    //        outputGPU.pfrh_passTopoThresh.get(),
-    //        inputPFRecHits.pfrh_neighbours.get(),
-    //        scratchGPU.pfrh_edgeId.get(),
-    //        scratchGPU.pfrh_edgeList.get());
-
-#ifdef DEBUG_GPU_HCAL
-    cudaEventRecord(stop, cudaStream);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&timer[4], start, stop);
-    //printf("\nprepareTopoInputs took %f ms\n", timer[4]);
-
-    compareEdgeArrays<<<1, 1, 0, cudaStream>>>(outputGPU.nEdges.get(),
-                                               scratchGPU.pfrh_edgeId.get(),
-                                               scratchGPU.pfrh_edgeList.get(),
-                                               nEdges,
-                                               inputGPU.pfrh_edgeId.get(),
-                                               inputGPU.pfrh_edgeList.get(),
-                                               nRH,
-                                               inputGPU.pfNeighFourInd.get(),
-                                               inputPFRecHits.pfrh_neighbours.get());
-
-    cudaEventRecord(start, cudaStream);
-#endif
 
     // Topo clustering
     topoClusterLinking<<<1, 512, 0, cudaStream>>>(nRH,
@@ -4486,10 +4441,8 @@ namespace PFClusterCudaHCAL {
                                                   scratchGPU.pfrh_edgeId.get(),
                                                   scratchGPU.pfrh_edgeList.get(),
                                                   scratchGPU.pfrh_edgeMask.get(),
-                                                  //inputGPU.pfrh_edgeMask.get(),
                                                   outputGPU.pfrh_passTopoThresh.get(),
                                                   outputGPU.topoIter.get());
-    cudaCheck(cudaStreamSynchronize(cudaStream));
 
     topoClusterContraction<<<1, 512, 0, cudaStream>>>(nRH,
                                                       outputGPU.pfrh_topoId.get(),
@@ -4504,13 +4457,6 @@ namespace PFClusterCudaHCAL {
                                                       outputGPU.pcrh_frac.get(),
                                                       outputGPU.pcrhFracSize.get());
 
-#ifdef DEBUG_GPU_HCAL
-    cudaEventRecord(stop, cudaStream);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&timer[1], start, stop);
-    cudaEventRecord(start, cudaStream);
-#endif
-
     dim3 grid((nRH + 31) / 32, (nRH + 31) / 32);
     dim3 block(32, 32);
 
@@ -4523,13 +4469,6 @@ namespace PFClusterCudaHCAL {
                                                  scratchGPU.rhcount.get(),
                                                  outputGPU.pcrh_fracInd.get());
 
-#ifdef DEBUG_GPU_HCAL
-    cudaEventRecord(stop, cudaStream);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&timer[2], start, stop);
-    cudaEventRecord(start, cudaStream);
-#endif
-
     hcalFastCluster_selection<<<nRH, 256, 0, cudaStream>>>(nRH,
                                                            inputPFRecHits.pfrh_x.get(),
                                                            inputPFRecHits.pfrh_y.get(),
@@ -4553,11 +4492,5 @@ namespace PFClusterCudaHCAL {
                                                            inputGPU.pfc_prevPos4.get(),
                                                            inputGPU.pfc_energy.get(),
                                                            outputGPU.pfc_iter.get());
-#ifdef DEBUG_GPU_HCAL
-    cudaEventRecord(stop, cudaStream);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&timer[3], start, stop);
-    cudaProfilerStop();
-#endif
   }
 }  // namespace PFClusterCudaHCAL