Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 16 additions & 85 deletions RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3598,6 +3598,7 @@ namespace PFClusterCudaHCAL {
}
}
__syncthreads();

if (threadIdx.x == 0) {
*pcrhFracSize = totalSeedFracOffset;
//printf("At the end of topoClusterContraction, found *pcrhFracSize = %d\n", *pcrhFracSize);
Expand Down Expand Up @@ -3747,8 +3748,11 @@ namespace PFClusterCudaHCAL {
else
pfrh_edgeMask[idx] = 0;
}
__syncthreads();//!!

do {
__syncthreads();//!!

if (threadIdx.x == 0) {
notDone = false;
}
Expand Down Expand Up @@ -3780,6 +3784,9 @@ namespace PFClusterCudaHCAL {
}
}
}

__syncthreads();//!!

if (threadIdx.x == 0)
iter++;

Expand All @@ -3793,7 +3800,6 @@ namespace PFClusterCudaHCAL {
if (threadIdx.x == 0) {
notDone = false;
}

__syncthreads();

// Even linking
Expand All @@ -3804,7 +3810,6 @@ namespace PFClusterCudaHCAL {
pfrh_parent[i] = (int)max(i, pfrh_edgeList[idx]);
}
}

__syncthreads();

// edgeParent
Expand All @@ -3825,20 +3830,19 @@ namespace PFClusterCudaHCAL {
}
}

__syncthreads();//!!

if (threadIdx.x == 0)
iter++;

__syncthreads();

} while (notDone);

*topoIter = iter;
#ifdef DEBUG_GPU_HCAL
// if (threadIdx.x == 0) {
// printf("*** Topo clustering converged in %d iterations ***\n", iter);
// }
// __syncthreads();
#endif
__syncthreads();//!!

if (threadIdx.x == 0)
*topoIter = iter;
}

__device__ __forceinline__ void sortSwap(int* toSort, int a, int b) {
Expand Down Expand Up @@ -3898,14 +3902,17 @@ namespace PFClusterCudaHCAL {
}

__device__ __forceinline__ int scan1Inclusive(int idata, volatile int* s_Data, int size) {
assert(size == 32);
int pos = 2 * threadIdx.x - (threadIdx.x & (size - 1));
s_Data[pos] = 0;
pos += size;
s_Data[pos] = idata;

for (int offset = 1; offset < size; offset <<= 1) {
int t = s_Data[pos] + s_Data[pos - offset];
__syncwarp();
s_Data[pos] = t;
__syncwarp();
}

return s_Data[pos];
Expand Down Expand Up @@ -4395,18 +4402,9 @@ namespace PFClusterCudaHCAL {
::PFClustering::HCAL::ScratchDataGPU& scratchGPU,
float (&timer)[8]) {

#ifdef DEBUG_GPU_HCAL
cudaProfilerStart();
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, cudaStream);
#endif

int nRH = inputPFRecHits.size;

// Combined seeding & topo clustering thresholds, array initialization

seedingTopoThreshKernel_HCAL<<<(nRH + 31) / 32, 64, 0, cudaStream>>>(nRH,
inputPFRecHits.pfrh_energy.get(),
inputPFRecHits.pfrh_x.get(),
Expand All @@ -4426,23 +4424,6 @@ namespace PFClusterCudaHCAL {
outputGPU.topoSeedList.get(),
outputGPU.pfc_iter.get());

cudaCheck(cudaStreamSynchronize(cudaStream));

#ifdef DEBUG_GPU_HCAL
cudaEventRecord(stop, cudaStream);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timer[0], start, stop);
cudaEventRecord(start, cudaStream);
#endif

// prepareTopoInputsSerial<<<1, 1, 4 * (8+4) * sizeof(int), cudaStream>>>(
// nRH,
// outputGPU.nEdges.get(),
// outputGPU.pfrh_passTopoThresh.get(),
// inputPFRecHits.pfrh_neighbours.get(),
// scratchGPU.pfrh_edgeId.get(),
// scratchGPU.pfrh_edgeList.get());

// Topo clustering
// Fill edgeId, edgeList arrays with rechit neighbors
// Has a bug when using more than 128 threads..
Expand All @@ -4452,34 +4433,6 @@ namespace PFClusterCudaHCAL {
inputPFRecHits.pfrh_neighbours.get(),
scratchGPU.pfrh_edgeId.get(),
scratchGPU.pfrh_edgeList.get());
cudaCheck(cudaStreamSynchronize(cudaStream));

// prepareTopoInputs<<<1, 256, 256 * (8+4) * sizeof(int), cudaStream>>>(
// nRH,
// outputGPU.nEdges.get(),
// outputGPU.pfrh_passTopoThresh.get(),
// inputPFRecHits.pfrh_neighbours.get(),
// scratchGPU.pfrh_edgeId.get(),
// scratchGPU.pfrh_edgeList.get());

#ifdef DEBUG_GPU_HCAL
cudaEventRecord(stop, cudaStream);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timer[4], start, stop);
//printf("\nprepareTopoInputs took %f ms\n", timer[4]);

compareEdgeArrays<<<1, 1, 0, cudaStream>>>(outputGPU.nEdges.get(),
scratchGPU.pfrh_edgeId.get(),
scratchGPU.pfrh_edgeList.get(),
nEdges,
inputGPU.pfrh_edgeId.get(),
inputGPU.pfrh_edgeList.get(),
nRH,
inputGPU.pfNeighFourInd.get(),
inputPFRecHits.pfrh_neighbours.get());

cudaEventRecord(start, cudaStream);
#endif

// Topo clustering
topoClusterLinking<<<1, 512, 0, cudaStream>>>(nRH,
Expand All @@ -4488,10 +4441,8 @@ namespace PFClusterCudaHCAL {
scratchGPU.pfrh_edgeId.get(),
scratchGPU.pfrh_edgeList.get(),
scratchGPU.pfrh_edgeMask.get(),
//inputGPU.pfrh_edgeMask.get(),
outputGPU.pfrh_passTopoThresh.get(),
outputGPU.topoIter.get());
cudaCheck(cudaStreamSynchronize(cudaStream));

topoClusterContraction<<<1, 512, 0, cudaStream>>>(nRH,
outputGPU.pfrh_topoId.get(),
Expand All @@ -4506,13 +4457,6 @@ namespace PFClusterCudaHCAL {
outputGPU.pcrh_frac.get(),
outputGPU.pcrhFracSize.get());

#ifdef DEBUG_GPU_HCAL
cudaEventRecord(stop, cudaStream);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timer[1], start, stop);
cudaEventRecord(start, cudaStream);
#endif

dim3 grid((nRH + 31) / 32, (nRH + 31) / 32);
dim3 block(32, 32);

Expand All @@ -4525,13 +4469,6 @@ namespace PFClusterCudaHCAL {
scratchGPU.rhcount.get(),
outputGPU.pcrh_fracInd.get());

#ifdef DEBUG_GPU_HCAL
cudaEventRecord(stop, cudaStream);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timer[2], start, stop);
cudaEventRecord(start, cudaStream);
#endif

hcalFastCluster_selection<<<nRH, 256, 0, cudaStream>>>(nRH,
inputPFRecHits.pfrh_x.get(),
inputPFRecHits.pfrh_y.get(),
Expand All @@ -4555,11 +4492,5 @@ namespace PFClusterCudaHCAL {
inputGPU.pfc_prevPos4.get(),
inputGPU.pfc_energy.get(),
outputGPU.pfc_iter.get());
#ifdef DEBUG_GPU_HCAL
cudaEventRecord(stop, cudaStream);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timer[3], start, stop);
cudaProfilerStop();
#endif
}
} // namespace PFClusterCudaHCAL