From ebe462be46df63494d57b03f3f298825dad0fcc7 Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Wed, 12 Sep 2018 09:47:46 +0200
Subject: [PATCH 01/50] Add new concurrent vertex algo (cms-patatrack#158)

Add a new concurrent vertex algorithm based on DBSCAN (that reuses parts of the pixel cluster algorithm).
It still needs to be tuned to reach at least the performance of the current DivisiveClustering algorithm used at the HLT.
---
 .../python/RecoPixelVertexing_cff.py          |   7 +-
 .../python/PixelVertexes_cfi.py               |   3 +
 .../PixelVertexFinding/test/BuildFile.xml     |  34 ++-
 .../test/gpuVertexFinder_t.cu                 | 207 ++++++++++++++++++
 4 files changed, 238 insertions(+), 13 deletions(-)
 create mode 100644 RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
diff --git a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
index 34ee6fadb04de..5f541dd19a412 100644
--- a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
+++ b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
@@ -4,7 +4,6 @@
 #
 # for STARTUP ONLY use try and use Offline 3D PV from pixelTracks, with adaptive vertex
 #
-#from RecoPixelVertexing.PixelVertexFinding.PixelVertexes_cff import *
-from RecoVertex.PrimaryVertexProducer.OfflinePixel3DPrimaryVertices_cfi import *
-recopixelvertexingTask = cms.Task(pixelTracksTask,pixelVertices)
-recopixelvertexing = cms.Sequence(recopixelvertexingTask)
+from RecoPixelVertexing.PixelVertexFinding.PixelVertexes_cff import *
+# from RecoVertex.PrimaryVertexProducer.OfflinePixel3DPrimaryVertices_cfi import *
+recopixelvertexing = cms.Sequence(pixelTracksSequence*pixelVertices)
diff --git a/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py b/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py
index 77a9f367b9d9b..ea9e4b1e4e037 100644
--- a/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py
+++ b/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py
@@ -20,3 +20,6 @@
 )
 
 
+from Configuration.ProcessModifiers.gpu_cff import gpu
+from RecoPixelVertexing.PixelVertexFinding.pixelVertexHeterogeneousProducer_cfi import pixelVertexHeterogeneousProducer as _pixelVertexHeterogeneousProducer
+gpu.toReplaceWith(pixelVertices, _pixelVertexHeterogeneousProducer)
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
index 0f4f4dee63832..ad1f03999fbea 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
+++ b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
@@ -1,9 +1,25 @@
-<use name="boost"/>
-<use name="root"/>
-<use name="FWCore/Framework"/>
-<use name="FWCore/ParameterSet"/>
-<use name="MagneticField/Records"/>
-<use name="MagneticField/Engine"/>
-<use name="TrackingTools/TransientTrack"/>
-<use name="RecoVertex/KalmanVertexFit"/>
-<use name="SimDataFormats/Track"/>
+<use   name="boost"/>
+<use   name="root"/>
+<use   name="FWCore/Framework"/>
+<use   name="FWCore/PluginManager"/>
+<use   name="FWCore/ParameterSet"/>
+<use   name="Geometry/Records"/>
+<use   name="Geometry/CommonDetUnit"/>
+<use   name="Geometry/TrackerGeometryBuilder"/>
+<use   name="CommonTools/Clustering1D"/>
+<use   name="DataFormats/TrackerRecHit2D"/>
+<use   name="RecoTracker/TkHitPairs"/>
+<use   name="RecoTracker/TkTrackingRegions"/>
+<use   name="RecoPixelVertexing/PixelTriplets"/>
+<use   name="RecoPixelVertexing/PixelTrackFitting"/>
+<use   name="MagneticField/Records"/>
+<use   name="MagneticField/Engine"/>
+<use   name="TrackingTools/TransientTrack"/>
+<use   name="RecoVertex/KalmanVertexFit"/>
+<use   name="SimDataFormats/Track"/>
+
+<bin file="gpuVertexFinder_t.cu" name="gpuVertexFinder_t">
+  <use name="cuda"/>
+  <use name="cuda-api-wrappers"/>
+  <flags CXXFLAGS="-g"/>
+</bin>
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
new file mode 100644
index 0000000000000..f47c4362503ae
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
@@ -0,0 +1,207 @@
+#include<random>
+#include<vector>
+#include<cstdint>
+#include<cmath>
+
+#include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h"
+using namespace  gpuVertexFinder;
+#include <cuda/api_wrappers.h>
+
+
+struct Event {
+  std::vector<float> zvert;
+  std::vector<uint16_t>  itrack;
+  std::vector<float> ztrack;
+  std::vector<float> eztrack;
+  std::vector<uint16_t> ivert;
+};
+
+struct ClusterGenerator {
+
+  explicit ClusterGenerator(float nvert, float ntrack) :
+    rgen(-13.,13), errgen(0.005,0.025), clusGen(nvert), trackGen(ntrack), gauss(0.,1.)
+  {}
+
+  void operator()(Event & ev) {
+
+    int nclus = clusGen(reng);
+    ev.zvert.resize(nclus);
+    ev.itrack.resize(nclus);
+    for (auto & z : ev.zvert) { 
+       z = 3.5f*gauss(reng);
+    }
+
+    ev.ztrack.clear(); 
+    ev.eztrack.clear();
+    ev.ivert.clear();
+    for (int iv=0; iv<nclus; ++iv) {
+      auto nt = trackGen(reng);
+      ev.itrack[nclus] = nt;
+      for (int it=0; it<nt; ++it) {
+       auto err = errgen(reng); // reality is not flat....
+       ev.ztrack.push_back(ev.zvert[iv]+err*gauss(reng));
+       ev.eztrack.push_back(err*err);
+       ev.ivert.push_back(iv);
+      }
+    }
+    // add noise
+    auto nt = 2*trackGen(reng);
+    for (int it=0; it<nt; ++it) {
+      auto err = 0.03f;
+      ev.ztrack.push_back(rgen(reng));
+      ev.eztrack.push_back(err*err);
+      ev.ivert.push_back(9999);
+    }
+
+  }
+
+  std::mt19937 reng;
+  std::uniform_real_distribution<float> rgen;
+  std::uniform_real_distribution<float> errgen;
+  std::poisson_distribution<int> clusGen;
+  std::poisson_distribution<int> trackGen;
+  std::normal_distribution<float> gauss;
+
+
+};
+
+
+#include<iostream>
+
+int main() {
+
+  if (cuda::device::count() == 0) {
+    std::cerr << "No CUDA devices on this system" << "\n";
+    exit(EXIT_FAILURE);
+  }
+
+  auto current_device = cuda::device::current::get();
+
+  auto zt_d = cuda::memory::device::make_unique<float[]>(current_device, 64000);
+  auto ezt2_d = cuda::memory::device::make_unique<float[]>(current_device, 64000);
+  auto zv_d = cuda::memory::device::make_unique<float[]>(current_device, 256);
+  auto wv_d = cuda::memory::device::make_unique<float[]>(current_device, 256);
+  auto chi2_d = cuda::memory::device::make_unique<float[]>(current_device, 256);
+
+  auto izt_d = cuda::memory::device::make_unique<int8_t[]>(current_device, 64000);
+  auto nn_d = cuda::memory::device::make_unique<int32_t[]>(current_device, 64000);
+  auto iv_d = cuda::memory::device::make_unique<int32_t[]>(current_device, 64000);
+
+  auto nv_d = cuda::memory::device::make_unique<uint32_t[]>(current_device, 1);
+ 
+  auto onGPU_d = cuda::memory::device::make_unique<OnGPU[]>(current_device, 1);
+
+  OnGPU onGPU;
+
+  onGPU.zt = zt_d.get();
+  onGPU.ezt2 = ezt2_d.get();
+  onGPU.zv = zv_d.get();
+  onGPU.wv = wv_d.get();
+  onGPU.chi2 = chi2_d.get();
+  onGPU.nv = nv_d.get();
+  onGPU.izt = izt_d.get();
+  onGPU.nn = nn_d.get();
+  onGPU.iv = iv_d.get();
+
+
+  cuda::memory::copy(onGPU_d.get(), &onGPU, sizeof(OnGPU));
+
+
+  Event  ev;
+
+  for (int nav=30;nav<80;nav+=20){ 
+
+  ClusterGenerator gen(nav,10);
+
+  for (int i=8; i<20; ++i) {
+
+  auto  kk=i/4;  // M param
+
+  gen(ev);
+  
+  std::cout << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl;
+
+  cuda::memory::copy(onGPU.zt,ev.ztrack.data(),sizeof(float)*ev.ztrack.size());
+  cuda::memory::copy(onGPU.ezt2,ev.eztrack.data(),sizeof(float)*ev.eztrack.size());
+
+  float eps = 0.1f;
+  
+  std::cout << "M eps " << kk << ' ' << eps << std::endl;
+  
+  if ( (i%4) == 0 )
+    cuda::launch(clusterTracks,
+		 { 1, 1024 },
+		 ev.ztrack.size(), onGPU_d.get(),kk,eps,
+		 0.02f,12.0f
+		 );
+  
+  if ( (i%4) == 1 )
+    cuda::launch(clusterTracks,
+		 { 1, 1024 },
+		 ev.ztrack.size(), onGPU_d.get(),kk,eps,
+		 0.02f,9.0f
+		 );
+  
+  if ( (i%4) == 2 )
+    cuda::launch(clusterTracks,
+		 { 1, 1024 },
+		 ev.ztrack.size(), onGPU_d.get(),kk,eps,
+		 0.01f,9.0f
+		 );
+  
+  if ( (i%4) == 3 )
+    cuda::launch(clusterTracks,
+		 { 1, 1024 },
+		 ev.ztrack.size(), onGPU_d.get(),kk,0.7f*eps,
+		 0.01f,9.0f
+		 );
+  
+
+
+  uint32_t nv;
+  cuda::memory::copy(&nv, onGPU.nv, sizeof(uint32_t));
+  float zv[nv];
+  float	wv[nv];
+  float	chi2[nv];
+  int32_t nn[nv];
+  cuda::memory::copy(&zv, onGPU.zv, nv*sizeof(float));
+  cuda::memory::copy(&wv, onGPU.wv, nv*sizeof(float));
+  cuda::memory::copy(&chi2, onGPU.chi2, nv*sizeof(float));
+  cuda::memory::copy(&nn, onGPU.nn, nv*sizeof(int32_t));
+  for (auto j=0U; j<nv; ++j) if (nn[j]>0) chi2[j]/=float(nn[j]); 
+   
+  {
+    auto mx = std::minmax_element(wv,wv+nv);
+    std::cout << "min max error " << 1./std::sqrt(*mx.first) << ' ' <<  1./std::sqrt(*mx.second) << std::endl;
+  }
+  {
+    auto mx = std::minmax_element(chi2,chi2+nv);
+    std::cout << "min max chi2 " << *mx.first << ' ' <<  *mx.second << std::endl;
+  }
+  
+
+  float dd[nv];
+  uint32_t ii=0;
+  for (auto zr : zv) {
+   auto md=500.0f;
+   for (auto zs : ev.ztrack) { 
+     auto d = std::abs(zr-zs);
+     md = std::min(d,md);
+   }
+   dd[ii++] = md;
+  }
+  assert(ii==nv);
+  if (i==6) {
+    for (auto d:dd) std::cout << d << ' ';
+    std::cout << std::endl;
+  }
+  auto mx = std::minmax_element(dd,dd+nv);
+  float rms=0;
+  for (auto d:dd) rms+=d*d; rms = std::sqrt(rms)/(nv-1);
+  std::cout << "min max rms " << *mx.first << ' ' << *mx.second << ' ' << rms << std::endl;
+
+  } // loop on events
+  } // lopp on ave vert
+  
+  return 0;
+}

From 2ac2da97a20f08c68cb8bfd1e36fb34f1c936ff7 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Wed, 26 Sep 2018 16:11:42 +0200
Subject: [PATCH 02/50] Disable vertex reconstruction on GPU
 (cms-patatrack#177)

On K40c and P100 we get a segmentation violation in
PixelVertexHeterogeneousProducer::produceGPUCuda(...).
---
 .../Configuration/python/RecoPixelVertexing_cff.py            | 4 ++--
 .../PixelVertexFinding/python/PixelVertexes_cfi.py            | 3 ---
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
index 5f541dd19a412..6ba1498313cb7 100644
--- a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
+++ b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
@@ -4,6 +4,6 @@
 #
 # for STARTUP ONLY use try and use Offline 3D PV from pixelTracks, with adaptive vertex
 #
-from RecoPixelVertexing.PixelVertexFinding.PixelVertexes_cff import *
-# from RecoVertex.PrimaryVertexProducer.OfflinePixel3DPrimaryVertices_cfi import *
+#from RecoPixelVertexing.PixelVertexFinding.PixelVertexes_cff import *
+from RecoVertex.PrimaryVertexProducer.OfflinePixel3DPrimaryVertices_cfi import *
 recopixelvertexing = cms.Sequence(pixelTracksSequence*pixelVertices)
diff --git a/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py b/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py
index ea9e4b1e4e037..77a9f367b9d9b 100644
--- a/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py
+++ b/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py
@@ -20,6 +20,3 @@
 )
 
 
-from Configuration.ProcessModifiers.gpu_cff import gpu
-from RecoPixelVertexing.PixelVertexFinding.pixelVertexHeterogeneousProducer_cfi import pixelVertexHeterogeneousProducer as _pixelVertexHeterogeneousProducer
-gpu.toReplaceWith(pixelVertices, _pixelVertexHeterogeneousProducer)

From c0d30e4c89072128ad35afadec46e71480b31edc Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Wed, 26 Sep 2018 17:53:25 +0200
Subject: [PATCH 03/50] Introduce Cluster Charge Cut, optimize Histogram
 (bucket sorting) (cms-patatrack#171)

---
 .../test/gpuVertexFinder_t.cu                 | 49 +++++++++++++++----
 1 file changed, 40 insertions(+), 9 deletions(-)

diff --git a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
index f47c4362503ae..a92c116702231 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
+++ b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
@@ -13,13 +13,14 @@ struct Event {
   std::vector<uint16_t>  itrack;
   std::vector<float> ztrack;
   std::vector<float> eztrack;
+  std::vector<float> pttrack;
   std::vector<uint16_t> ivert;
 };
 
 struct ClusterGenerator {
 
   explicit ClusterGenerator(float nvert, float ntrack) :
-    rgen(-13.,13), errgen(0.005,0.025), clusGen(nvert), trackGen(ntrack), gauss(0.,1.)
+    rgen(-13.,13), errgen(0.005,0.025), clusGen(nvert), trackGen(ntrack), gauss(0.,1.), ptGen(1.)
   {}
 
   void operator()(Event & ev) {
@@ -42,6 +43,8 @@ struct ClusterGenerator {
        ev.ztrack.push_back(ev.zvert[iv]+err*gauss(reng));
        ev.eztrack.push_back(err*err);
        ev.ivert.push_back(iv);
+       ev.pttrack.push_back( (iv==5? 1.f:0.5f) + ptGen(reng) );
+       ev.pttrack.back()*=ev.pttrack.back();
       }
     }
     // add noise
@@ -51,6 +54,8 @@ struct ClusterGenerator {
       ev.ztrack.push_back(rgen(reng));
       ev.eztrack.push_back(err*err);
       ev.ivert.push_back(9999);
+      ev.pttrack.push_back( 0.5f + ptGen(reng) );
+      ev.pttrack.back()*=ev.pttrack.back();
     }
 
   }
@@ -61,7 +66,7 @@ struct ClusterGenerator {
   std::poisson_distribution<int> clusGen;
   std::poisson_distribution<int> trackGen;
   std::normal_distribution<float> gauss;
-
+  std::exponential_distribution<float> ptGen;
 
 };
 
@@ -79,11 +84,14 @@ int main() {
 
   auto zt_d = cuda::memory::device::make_unique<float[]>(current_device, 64000);
   auto ezt2_d = cuda::memory::device::make_unique<float[]>(current_device, 64000);
+  auto ptt2_d = cuda::memory::device::make_unique<float[]>(current_device, 64000);
   auto zv_d = cuda::memory::device::make_unique<float[]>(current_device, 256);
   auto wv_d = cuda::memory::device::make_unique<float[]>(current_device, 256);
   auto chi2_d = cuda::memory::device::make_unique<float[]>(current_device, 256);
+  auto ptv2_d = cuda::memory::device::make_unique<float[]>(current_device, 256);
+  auto ind_d = cuda::memory::device::make_unique<uint16_t[]>(current_device, 256);
 
-  auto izt_d = cuda::memory::device::make_unique<int8_t[]>(current_device, 64000);
+  auto izt_d = cuda::memory::device::make_unique<uint8_t[]>(current_device, 64000);
   auto nn_d = cuda::memory::device::make_unique<int32_t[]>(current_device, 64000);
   auto iv_d = cuda::memory::device::make_unique<int32_t[]>(current_device, 64000);
 
@@ -95,9 +103,12 @@ int main() {
 
   onGPU.zt = zt_d.get();
   onGPU.ezt2 = ezt2_d.get();
+  onGPU.ptt2 = ptt2_d.get();
   onGPU.zv = zv_d.get();
   onGPU.wv = wv_d.get();
   onGPU.chi2 = chi2_d.get();
+  onGPU.ptv2 = ptv2_d.get();
+  onGPU.sortInd = ind_d.get();
   onGPU.nv = nv_d.get();
   onGPU.izt = izt_d.get();
   onGPU.nn = nn_d.get();
@@ -123,6 +134,7 @@ int main() {
 
   cuda::memory::copy(onGPU.zt,ev.ztrack.data(),sizeof(float)*ev.ztrack.size());
   cuda::memory::copy(onGPU.ezt2,ev.eztrack.data(),sizeof(float)*ev.eztrack.size());
+  cuda::memory::copy(onGPU.ptt2,ev.pttrack.data(),sizeof(float)*ev.eztrack.size());
 
   float eps = 0.1f;
   
@@ -130,44 +142,58 @@ int main() {
   
   if ( (i%4) == 0 )
     cuda::launch(clusterTracks,
-		 { 1, 1024 },
+		 { 1, 512+256 },
 		 ev.ztrack.size(), onGPU_d.get(),kk,eps,
 		 0.02f,12.0f
 		 );
   
   if ( (i%4) == 1 )
     cuda::launch(clusterTracks,
-		 { 1, 1024 },
+		 { 1, 512+256 },
 		 ev.ztrack.size(), onGPU_d.get(),kk,eps,
 		 0.02f,9.0f
 		 );
   
   if ( (i%4) == 2 )
     cuda::launch(clusterTracks,
-		 { 1, 1024 },
+		 { 1, 512+256 },
 		 ev.ztrack.size(), onGPU_d.get(),kk,eps,
 		 0.01f,9.0f
 		 );
   
   if ( (i%4) == 3 )
     cuda::launch(clusterTracks,
-		 { 1, 1024 },
+		 { 1, 512+256 },
 		 ev.ztrack.size(), onGPU_d.get(),kk,0.7f*eps,
 		 0.01f,9.0f
 		 );
   
-
+  cudaDeviceSynchronize();
+  cuda::launch(sortByPt2,
+               { 1, 256 },
+               ev.ztrack.size(), onGPU_d.get()
+              );
 
   uint32_t nv;
   cuda::memory::copy(&nv, onGPU.nv, sizeof(uint32_t));
+
+  if (nv==0) {
+    std::cout << "NO VERTICES???" << std::endl;
+    continue;
+  }
+
   float zv[nv];
   float	wv[nv];
   float	chi2[nv];
+  float ptv2[nv];
   int32_t nn[nv];
+  uint16_t ind[nv];
   cuda::memory::copy(&zv, onGPU.zv, nv*sizeof(float));
   cuda::memory::copy(&wv, onGPU.wv, nv*sizeof(float));
   cuda::memory::copy(&chi2, onGPU.chi2, nv*sizeof(float));
+  cuda::memory::copy(&ptv2, onGPU.ptv2, nv*sizeof(float));
   cuda::memory::copy(&nn, onGPU.nn, nv*sizeof(int32_t));
+  cuda::memory::copy(&ind, onGPU.sortInd, nv*sizeof(uint16_t));
   for (auto j=0U; j<nv; ++j) if (nn[j]>0) chi2[j]/=float(nn[j]); 
    
   {
@@ -178,7 +204,12 @@ int main() {
     auto mx = std::minmax_element(chi2,chi2+nv);
     std::cout << "min max chi2 " << *mx.first << ' ' <<  *mx.second << std::endl;
   }
-  
+  {
+    auto mx = std::minmax_element(ptv2,ptv2+nv);
+    std::cout << "min max ptv2 " << *mx.first << ' ' <<  *mx.second << std::endl;
+    std::cout << "min max ptv2 " << ptv2[ind[0]] << ' ' <<  ptv2[ind[nv-1]] << " at "  << ind[0] << ' ' << ind[nv-1] << std::endl;
+
+  }  
 
   float dd[nv];
   uint32_t ii=0;

From 7fb964f3f504c6c0f9c2550ae51943d5fe681b23 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Wed, 28 Nov 2018 18:06:28 +0100
Subject: [PATCH 04/50] Migrate tracker local reconstruction and pixel tracking
 to Tasks (backport #25163) (cms-patatrack#202)

Backport "Migrate tracker local reconstruction and pixel tracking to Tasks" (#25163) to the Patatrack branch:
  - migrate RecoLocalTracker_cff to Tasks;
  - migrate RecoPixelVertexing_cff to Tasks;
  - keep the sequences to avoid a massive migration (for now).
---
 .../Configuration/python/RecoPixelVertexing_cff.py             | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
index 6ba1498313cb7..34ee6fadb04de 100644
--- a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
+++ b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
@@ -6,4 +6,5 @@
 #
 #from RecoPixelVertexing.PixelVertexFinding.PixelVertexes_cff import *
 from RecoVertex.PrimaryVertexProducer.OfflinePixel3DPrimaryVertices_cfi import *
-recopixelvertexing = cms.Sequence(pixelTracksSequence*pixelVertices)
+recopixelvertexingTask = cms.Task(pixelTracksTask,pixelVertices)
+recopixelvertexing = cms.Sequence(recopixelvertexingTask)

From b9029aa79747e26a21fda2fc82052c49da110f8f Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Tue, 8 Jan 2019 18:29:08 +0100
Subject: [PATCH 05/50] Full workflow from raw data to pixel tracks and
 vertices on GPUs (cms-patatrack#216)

Port and optimise the full workflow from pixel raw data to pixel tracks and vertices to GPUs.
Clean the pixel n-tuplets with the "fishbone" algorithm (only on GPUs).

Other changes:
  - recover the Riemann fit updates lost during the merge with CMSSW 10.4.x;
  - speed up clustering and track fitting;
  - minor bug fix to avoid trivial regression with the optimized fit.
---
 .../python/RecoPixelVertexing_cff.py          |   4 +-
 .../python/PixelVertexes_cfi.py               |   3 +
 .../PixelVertexFinding/test/BuildFile.xml     |   1 +
 .../test/gpuVertexFinder_t.cu                 | 112 ++++++++++++++----
 4 files changed, 92 insertions(+), 28 deletions(-)

diff --git a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
index 34ee6fadb04de..e784b53b7ce1f 100644
--- a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
+++ b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
@@ -4,7 +4,7 @@
 #
 # for STARTUP ONLY use try and use Offline 3D PV from pixelTracks, with adaptive vertex
 #
-#from RecoPixelVertexing.PixelVertexFinding.PixelVertexes_cff import *
-from RecoVertex.PrimaryVertexProducer.OfflinePixel3DPrimaryVertices_cfi import *
+from RecoPixelVertexing.PixelVertexFinding.PixelVertexes_cff import *
+#from RecoVertex.PrimaryVertexProducer.OfflinePixel3DPrimaryVertices_cfi import *
 recopixelvertexingTask = cms.Task(pixelTracksTask,pixelVertices)
 recopixelvertexing = cms.Sequence(recopixelvertexingTask)
diff --git a/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py b/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py
index 77a9f367b9d9b..ea9e4b1e4e037 100644
--- a/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py
+++ b/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py
@@ -20,3 +20,6 @@
 )
 
 
+from Configuration.ProcessModifiers.gpu_cff import gpu
+from RecoPixelVertexing.PixelVertexFinding.pixelVertexHeterogeneousProducer_cfi import pixelVertexHeterogeneousProducer as _pixelVertexHeterogeneousProducer
+gpu.toReplaceWith(pixelVertices, _pixelVertexHeterogeneousProducer)
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
index ad1f03999fbea..dc3b98f8456a5 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
+++ b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
@@ -21,5 +21,6 @@
 <bin file="gpuVertexFinder_t.cu" name="gpuVertexFinder_t">
   <use name="cuda"/>
   <use name="cuda-api-wrappers"/>
+  <flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
   <flags CXXFLAGS="-g"/>
 </bin>
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
index a92c116702231..d1f508ca98798 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
+++ b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
@@ -4,6 +4,11 @@
 #include<cmath>
 
 #include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h"
+#include "RecoPixelVertexing/PixelVertexFinding/src/gpuFitVertices.h"
+#include "RecoPixelVertexing/PixelVertexFinding/src/gpuSortByPt2.h"
+#include "RecoPixelVertexing/PixelVertexFinding/src/gpuSplitVertices.h"
+
+
 using namespace  gpuVertexFinder;
 #include <cuda/api_wrappers.h>
 
@@ -81,7 +86,8 @@ int main() {
   }
 
   auto current_device = cuda::device::current::get();
-
+  
+  auto ntrks_d = cuda::memory::device::make_unique<uint32_t[]>(current_device, 1);
   auto zt_d = cuda::memory::device::make_unique<float[]>(current_device, 64000);
   auto ezt2_d = cuda::memory::device::make_unique<float[]>(current_device, 64000);
   auto ptt2_d = cuda::memory::device::make_unique<float[]>(current_device, 64000);
@@ -96,11 +102,13 @@ int main() {
   auto iv_d = cuda::memory::device::make_unique<int32_t[]>(current_device, 64000);
 
   auto nv_d = cuda::memory::device::make_unique<uint32_t[]>(current_device, 1);
+  auto nv2_d = cuda::memory::device::make_unique<uint32_t[]>(current_device, 1);
  
   auto onGPU_d = cuda::memory::device::make_unique<OnGPU[]>(current_device, 1);
 
   OnGPU onGPU;
 
+  onGPU.ntrks = ntrks_d.get();
   onGPU.zt = zt_d.get();
   onGPU.ezt2 = ezt2_d.get();
   onGPU.ptt2 = ptt2_d.get();
@@ -109,7 +117,8 @@ int main() {
   onGPU.chi2 = chi2_d.get();
   onGPU.ptv2 = ptv2_d.get();
   onGPU.sortInd = ind_d.get();
-  onGPU.nv = nv_d.get();
+  onGPU.nvFinal = nv_d.get();
+  onGPU.nvIntermediate = nv2_d.get();
   onGPU.izt = izt_d.get();
   onGPU.nn = nn_d.get();
   onGPU.iv = iv_d.get();
@@ -131,7 +140,8 @@ int main() {
   gen(ev);
   
   std::cout << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl;
-
+  auto nt = ev.ztrack.size();
+  cuda::memory::copy(onGPU.ntrks,&nt,sizeof(uint32_t));
   cuda::memory::copy(onGPU.zt,ev.ztrack.data(),sizeof(float)*ev.ztrack.size());
   cuda::memory::copy(onGPU.ezt2,ev.eztrack.data(),sizeof(float)*ev.eztrack.size());
   cuda::memory::copy(onGPU.ptt2,ev.pttrack.data(),sizeof(float)*ev.eztrack.size());
@@ -143,51 +153,101 @@ int main() {
   if ( (i%4) == 0 )
     cuda::launch(clusterTracks,
 		 { 1, 512+256 },
-		 ev.ztrack.size(), onGPU_d.get(),kk,eps,
+		 onGPU_d.get(),kk,eps,
 		 0.02f,12.0f
 		 );
   
   if ( (i%4) == 1 )
     cuda::launch(clusterTracks,
 		 { 1, 512+256 },
-		 ev.ztrack.size(), onGPU_d.get(),kk,eps,
+		 onGPU_d.get(),kk,eps,
 		 0.02f,9.0f
 		 );
   
   if ( (i%4) == 2 )
     cuda::launch(clusterTracks,
 		 { 1, 512+256 },
-		 ev.ztrack.size(), onGPU_d.get(),kk,eps,
+		 onGPU_d.get(),kk,eps,
 		 0.01f,9.0f
 		 );
   
   if ( (i%4) == 3 )
     cuda::launch(clusterTracks,
 		 { 1, 512+256 },
-		 ev.ztrack.size(), onGPU_d.get(),kk,0.7f*eps,
+		 onGPU_d.get(),kk,0.7f*eps,
 		 0.01f,9.0f
 		 );
-  
+  cudaCheck(cudaGetLastError());
   cudaDeviceSynchronize();
+
+  cuda::launch(fitVertices, 
+               { 1,1024-256 },
+               onGPU_d.get(),50.f
+              );
+  cudaCheck(cudaGetLastError());
+
+  uint32_t nv;
+  cuda::memory::copy(&nv, onGPU.nvFinal, sizeof(uint32_t));
+  if (nv==0) {
+    std::cout << "NO VERTICES???" << std::endl;
+    continue;
+  }
+  float chi2[2*nv];  // make space for splitting...
+  float zv[2*nv];
+  float wv[2*nv];
+  float ptv2[2*nv];
+  int32_t nn[2*nv];
+  uint16_t ind[2*nv];
+
+  cuda::memory::copy(&nn, onGPU.nn, nv*sizeof(int32_t));
+  cuda::memory::copy(&chi2, onGPU.chi2, nv*sizeof(float));
+  for (auto j=0U; j<nv; ++j) if (nn[j]>0) chi2[j]/=float(nn[j]);
+  {
+    auto mx = std::minmax_element(chi2,chi2+nv);
+    std::cout << "after fit min max chi2 " << nv << " " << *mx.first << ' ' <<  *mx.second << std::endl;
+  }
+
+  cuda::launch(fitVertices,
+               { 1,1024-256 },
+               onGPU_d.get(), 50.f
+              );
+  cuda::memory::copy(&nv, onGPU.nvFinal, sizeof(uint32_t));
+  cuda::memory::copy(&nn, onGPU.nn, nv*sizeof(int32_t));
+  cuda::memory::copy(&chi2, onGPU.chi2, nv*sizeof(float));
+  for (auto j=0U; j<nv; ++j) if (nn[j]>0) chi2[j]/=float(nn[j]);
+  {
+    auto mx = std::minmax_element(chi2,chi2+nv);
+    std::cout << "before splitting min max chi2 " << nv << " " << *mx.first << ' ' <<  *mx.second << std::endl;
+  }
+
+  cuda::launch(splitVertices,
+               { 1024, 64 },
+               onGPU_d.get(),
+               9.f
+              );
+ cuda::memory::copy(&nv, onGPU.nvIntermediate, sizeof(uint32_t));
+ std::cout << "after split " << nv << std::endl; 
+
+  cuda::launch(fitVertices,
+               { 1,1024-256 },
+               onGPU_d.get(),5000.f
+              );
+  cudaCheck(cudaGetLastError());
+
+
   cuda::launch(sortByPt2,
                { 1, 256 },
-               ev.ztrack.size(), onGPU_d.get()
+               onGPU_d.get()
               );
 
-  uint32_t nv;
-  cuda::memory::copy(&nv, onGPU.nv, sizeof(uint32_t));
+  cuda::memory::copy(&nv, onGPU.nvFinal, sizeof(uint32_t));
 
   if (nv==0) {
     std::cout << "NO VERTICES???" << std::endl;
     continue;
   }
 
-  float zv[nv];
-  float	wv[nv];
-  float	chi2[nv];
-  float ptv2[nv];
-  int32_t nn[nv];
-  uint16_t ind[nv];
+
   cuda::memory::copy(&zv, onGPU.zv, nv*sizeof(float));
   cuda::memory::copy(&wv, onGPU.wv, nv*sizeof(float));
   cuda::memory::copy(&chi2, onGPU.chi2, nv*sizeof(float));
@@ -195,15 +255,16 @@ int main() {
   cuda::memory::copy(&nn, onGPU.nn, nv*sizeof(int32_t));
   cuda::memory::copy(&ind, onGPU.sortInd, nv*sizeof(uint16_t));
   for (auto j=0U; j<nv; ++j) if (nn[j]>0) chi2[j]/=float(nn[j]); 
-   
   {
-    auto mx = std::minmax_element(wv,wv+nv);
-    std::cout << "min max error " << 1./std::sqrt(*mx.first) << ' ' <<  1./std::sqrt(*mx.second) << std::endl;
+    auto mx = std::minmax_element(chi2,chi2+nv);
+    std::cout << "min max chi2 " << nv << " " << *mx.first << ' ' <<  *mx.second << std::endl;
   }
+
   {
-    auto mx = std::minmax_element(chi2,chi2+nv);
-    std::cout << "min max chi2 " << *mx.first << ' ' <<  *mx.second << std::endl;
+    auto mx = std::minmax_element(wv,wv+nv);
+    std::cout << "min max error " << 1./std::sqrt(*mx.first) << ' ' <<  1./std::sqrt(*mx.second) << std::endl;
   }
+
   {
     auto mx = std::minmax_element(ptv2,ptv2+nv);
     std::cout << "min max ptv2 " << *mx.first << ' ' <<  *mx.second << std::endl;
@@ -212,16 +273,15 @@ int main() {
   }  
 
   float dd[nv];
-  uint32_t ii=0;
-  for (auto zr : zv) {
+  for (auto kv=0U; kv<nv; ++kv) {
+   auto zr = zv[kv];
    auto md=500.0f;
    for (auto zs : ev.ztrack) { 
      auto d = std::abs(zr-zs);
      md = std::min(d,md);
    }
-   dd[ii++] = md;
+   dd[kv] = md;
   }
-  assert(ii==nv);
   if (i==6) {
     for (auto d:dd) std::cout << d << ' ';
     std::cout << std::endl;

From bc773904e705ee5988d97a8a0592563f4071eee5 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Thu, 17 Jan 2019 15:50:11 +0100
Subject: [PATCH 06/50] Skip CUDA-related tests if no GPU is present
 (cms-patatrack#252)

Make unit tests that require a CUDA device skip the test and exit
successfully if the CUDA runtime is not available, or no CUDA devices
are available.
---
 .../PixelVertexFinding/test/BuildFile.xml     | 39 ++++++++++---------
 .../test/gpuVertexFinder_t.cu                 | 25 +++++-------
 2 files changed, 30 insertions(+), 34 deletions(-)

diff --git a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
index dc3b98f8456a5..6ab97d3171d8e 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
+++ b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
@@ -1,22 +1,23 @@
-<use   name="boost"/>
-<use   name="root"/>
-<use   name="FWCore/Framework"/>
-<use   name="FWCore/PluginManager"/>
-<use   name="FWCore/ParameterSet"/>
-<use   name="Geometry/Records"/>
-<use   name="Geometry/CommonDetUnit"/>
-<use   name="Geometry/TrackerGeometryBuilder"/>
-<use   name="CommonTools/Clustering1D"/>
-<use   name="DataFormats/TrackerRecHit2D"/>
-<use   name="RecoTracker/TkHitPairs"/>
-<use   name="RecoTracker/TkTrackingRegions"/>
-<use   name="RecoPixelVertexing/PixelTriplets"/>
-<use   name="RecoPixelVertexing/PixelTrackFitting"/>
-<use   name="MagneticField/Records"/>
-<use   name="MagneticField/Engine"/>
-<use   name="TrackingTools/TransientTrack"/>
-<use   name="RecoVertex/KalmanVertexFit"/>
-<use   name="SimDataFormats/Track"/>
+<use name="boost"/>
+<use name="root"/>
+<use name="CommonTools/Clustering1D"/>
+<use name="DataFormats/TrackerRecHit2D"/>
+<use name="FWCore/Framework"/>
+<use name="FWCore/ParameterSet"/>
+<use name="FWCore/PluginManager"/>
+<use name="Geometry/CommonDetUnit"/>
+<use name="Geometry/Records"/>
+<use name="Geometry/TrackerGeometryBuilder"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+<use name="MagneticField/Engine"/>
+<use name="MagneticField/Records"/>
+<use name="RecoPixelVertexing/PixelTrackFitting"/>
+<use name="RecoPixelVertexing/PixelTriplets"/>
+<use name="RecoTracker/TkHitPairs"/>
+<use name="RecoTracker/TkTrackingRegions"/>
+<use name="RecoVertex/KalmanVertexFit"/>
+<use name="SimDataFormats/Track"/>
+<use name="TrackingTools/TransientTrack"/>
 
 <bin file="gpuVertexFinder_t.cu" name="gpuVertexFinder_t">
   <use name="cuda"/>
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
index d1f508ca98798..84334917052ed 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
+++ b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
@@ -1,17 +1,18 @@
-#include<random>
-#include<vector>
-#include<cstdint>
-#include<cmath>
+#include <cmath>
+#include <cstdint>
+#include <iostream>
+#include <random>
+#include <vector>
 
+#include <cuda/api_wrappers.h>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
 #include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h"
 #include "RecoPixelVertexing/PixelVertexFinding/src/gpuFitVertices.h"
 #include "RecoPixelVertexing/PixelVertexFinding/src/gpuSortByPt2.h"
 #include "RecoPixelVertexing/PixelVertexFinding/src/gpuSplitVertices.h"
 
-
-using namespace  gpuVertexFinder;
-#include <cuda/api_wrappers.h>
-
+using namespace gpuVertexFinder;
 
 struct Event {
   std::vector<float> zvert;
@@ -76,14 +77,8 @@ struct ClusterGenerator {
 };
 
 
-#include<iostream>
-
 int main() {
-
-  if (cuda::device::count() == 0) {
-    std::cerr << "No CUDA devices on this system" << "\n";
-    exit(EXIT_FAILURE);
-  }
+  exitSansCUDADevices();
 
   auto current_device = cuda::device::current::get();
   

From 2e22e5d6e24d3669244a3d1b35df3bbc3f20f2d6 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Fri, 15 Mar 2019 09:26:12 -0500
Subject: [PATCH 07/50] Various updates to pixel track/vertex DQM and MTV
 (cms-patatrack#285)

* Add DQM for pixel vertices

* Add pT>0.9GeV pixel track collections to MTV

* Add dzPV0p1, Pt0to1, Pt1 variants of pixel track DQM
---
 .../python/pixelVertexResolutionClient_cfi.py |   7 +
 .../python/DQMOffline_SecondStep_cff.py       | 222 +++++--------
 .../Configuration/python/DQMOffline_cff.py    | 301 +++++++-----------
 .../RecoB/python/PixelVertexMonitor_cff.py    |   7 +
 4 files changed, 215 insertions(+), 322 deletions(-)
 create mode 100644 DQM/TrackingMonitorClient/python/pixelVertexResolutionClient_cfi.py
 create mode 100644 DQMOffline/RecoB/python/PixelVertexMonitor_cff.py

diff --git a/DQM/TrackingMonitorClient/python/pixelVertexResolutionClient_cfi.py b/DQM/TrackingMonitorClient/python/pixelVertexResolutionClient_cfi.py
new file mode 100644
index 0000000000000..2558e88d26012
--- /dev/null
+++ b/DQM/TrackingMonitorClient/python/pixelVertexResolutionClient_cfi.py
@@ -0,0 +1,7 @@
+import FWCore.ParameterSet.Config as cms
+
+from DQM.TrackingMonitorClient.primaryVertexResolutionClient_cfi import primaryVertexResolutionClient as _primaryVertexResolutionClient
+
+pixelVertexResolutionClient = _primaryVertexResolutionClient.clone(
+    subDirs = ["OfflinePixelPV/Resolution/*"]
+)
diff --git a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
index 368b328632fd8..35c6082146f68 100644
--- a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
@@ -1,196 +1,135 @@
 import FWCore.ParameterSet.Config as cms
 
+from CondTools.DQM.DQMReferenceHistogramRootFileEventSetupAnalyzer_cfi import *
 from DQMServices.Components.DQMMessageLoggerClient_cff import *
+from DQMServices.Components.DQMDcsInfoClient_cfi import *
 from DQMServices.Components.DQMFastTimerServiceClient_cfi import *
 
 from DQMOffline.Ecal.ecal_dqm_client_offline_cff import *
-from DQM.EcalPreshowerMonitorClient.es_dqm_client_offline_cff import *
 from DQM.SiStripMonitorClient.SiStripClientConfig_Tier0_cff import *
 from DQM.SiPixelCommon.SiPixelOfflineDQM_client_cff import *
 from DQM.DTMonitorClient.dtDQMOfflineClients_cff import *
 from DQM.RPCMonitorClient.RPCTier0Client_cff import *
 from DQM.CSCMonitorModule.csc_dqm_offlineclient_collisions_cff import *
-from DQMOffline.Muon.gem_dqm_offline_client_cff import *
-from DQMOffline.Hcal.HcalDQMOfflinePostProcessor_cff import *
-from DQM.HcalTasks.OfflineHarvestingSequence_pp import *
+from DQM.EcalPreshowerMonitorClient.es_dqm_client_offline_cff import *
+from DQM.BeamMonitor.AlcaBeamMonitorClient_cff import *
 from DQMServices.Components.DQMFEDIntegrityClient_cff import *
+from Validation.RecoTau.DQMSequences_cfi import *
+from DQMOffline.Hcal.HcalDQMOfflinePostProcessor_cff import *
 from DQMOffline.L1Trigger.L1TriggerDqmOffline_cff import *
-from DQM.SiTrackerPhase2.Phase2TrackerDQMHarvesting_cff import *
-
-DQMNone = cms.Sequence()
-
-DQMOffline_SecondStepEcal = cms.Sequence( ecal_dqm_client_offline *
-					  es_dqm_client_offline )
-
-DQMOffline_SecondStepTrackerStrip = cms.Sequence( SiStripOfflineDQMClient )
-
-DQMOffline_SecondStepTrackerPixel = cms.Sequence( PixelOfflineDQMClientNoDataCertification )
+from DQM.HcalTasks.OfflineHarvestingSequence_pp import *
+from PhysicsTools.NanoAOD.nanoDQM_cff import *
 
-DQMOffline_SecondStepMuonDPG = cms.Sequence( dtClients *
+DQMOffline_SecondStep_PreDPG = cms.Sequence( dqmDcsInfoClient *
+                                             ecal_dqm_client_offline *
+                                             SiStripOfflineDQMClient *
+                                             PixelOfflineDQMClientNoDataCertification *
+                                             dtClients *
                                              rpcTier0Client *
-                                             cscOfflineCollisionsClients )
-
-from Configuration.Eras.Modifier_run3_GEM_cff import run3_GEM
-_run3_GEM_DQMOffline_SecondStepMuonDPG = DQMOffline_SecondStepMuonDPG.copy()
-_run3_GEM_DQMOffline_SecondStepMuonDPG += gemClients
-run3_GEM.toReplaceWith(DQMOffline_SecondStepMuonDPG, _run3_GEM_DQMOffline_SecondStepMuonDPG)
-
-DQMOffline_SecondStepHcal = cms.Sequence( hcalOfflineHarvesting )
-
-DQMOffline_SecondStepHcal2 = cms.Sequence(  HcalDQMOfflinePostProcessor )
-
-DQMOffline_SecondStepFED = cms.Sequence( dqmFEDIntegrityClient )
-
-DQMOffline_SecondStepL1T = cms.Sequence( l1TriggerDqmOfflineClient )
-
-DQMOffline_SecondStep_PreDPG = cms.Sequence( 
-                                             DQMOffline_SecondStepEcal *
-                                             DQMOffline_SecondStepTrackerStrip *
-					     DQMOffline_SecondStepTrackerPixel *
-                                             DQMOffline_SecondStepMuonDPG *
-					     DQMOffline_SecondStepHcal *
-					     DQMOffline_SecondStepHcal2 *
-                                             DQMOffline_SecondStepFED *
-					     DQMOffline_SecondStepL1T )
-
-DQMOffline_SecondStepDPG = cms.Sequence(
+                                             cscOfflineCollisionsClients *
+                                             es_dqm_client_offline *
+                                             hcalOfflineHarvesting *
+                                             HcalDQMOfflinePostProcessor *
+                                             dqmFEDIntegrityClient *
+                                             l1TriggerDqmOfflineClient )
+
+DQMOffline_SecondStepDPG = cms.Sequence( dqmRefHistoRootFileGetter *
                                          DQMOffline_SecondStep_PreDPG *
                                          DQMMessageLoggerClientSeq )
 
-
-from DQM.TrackingMonitorClient.TrackingClientConfig_Tier0_cff import *
 from DQMOffline.Muon.muonQualityTests_cff import *
 from DQMOffline.EGamma.egammaPostProcessing_cff import *
 from DQMOffline.Trigger.DQMOffline_Trigger_Client_cff import *
 from DQMOffline.Trigger.DQMOffline_HLT_Client_cff import *
 from DQMOffline.RecoB.dqmCollector_cff import *
-from DQM.BeamMonitor.AlcaBeamMonitorClient_cff import *
 from DQMOffline.JetMET.SusyPostProcessor_cff import *
+from DQMOffline.JetMET.dataCertificationJetMET_cff import *
+from DQM.TrackingMonitorClient.TrackingClientConfig_Tier0_cff import *
+from DQM.TrackingMonitorClient.pixelTrackingEffFromHitPattern_cff import *
+from DQM.TrackingMonitorClient.pixelVertexResolutionClient_cfi import *
+from DQM.SiOuterTracker.OuterTrackerClientConfig_cff import *
 
-DQMOffline_SecondStepTracking = cms.Sequence ( TrackingOfflineDQMClient )
-
-DQMOffline_SecondStepMUO = cms.Sequence ( muonQualityTests )
-
-DQMOffline_SecondStepEGamma = cms.Sequence( egammaPostProcessing )
-
-DQMOffline_SecondStepTrigger = cms.Sequence( triggerOfflineDQMClient *
-						hltOfflineDQMClient )
-
-DQMOffline_SecondStepBTag = cms.Sequence( bTagCollectorSequenceDATA )
-
-DQMOffline_SecondStepBeam = cms.Sequence( alcaBeamMonitorClient )
-
-DQMOffline_SecondStepJetMET = cms.Sequence( SusyPostProcessorSequence )
-
-DQMOffline_SecondStep_PrePOG = cms.Sequence( DQMOffline_SecondStepTracking *
-                                             DQMOffline_SecondStepMUO *
-                                             DQMOffline_SecondStepEGamma *
-                                             DQMOffline_SecondStepTrigger *
-                                             DQMOffline_SecondStepBTag *
-                                             DQMOffline_SecondStepBeam *
-                                             DQMOffline_SecondStepJetMET )
-
-DQMOffline_SecondStepPOG = cms.Sequence(
+DQMOffline_SecondStep_PrePOG = cms.Sequence( TrackingOfflineDQMClient *
+                                             muonQualityTests *
+                                             egammaPostProcessing *
+                                             triggerOfflineDQMClient *
+                                             hltOfflineDQMClient *
+                                             bTagCollectorSequenceDATA *
+                                             alcaBeamMonitorClient *
+                                             SusyPostProcessorSequence *
+                                             runTauEff)
+from Configuration.Eras.Modifier_phase1Pixel_cff import phase1Pixel
+
+DQMOffline_SecondStepPOG = cms.Sequence( dqmRefHistoRootFileGetter *
                                          DQMOffline_SecondStep_PrePOG *
                                          DQMMessageLoggerClientSeq )
 
 
 HLTMonitoringClient = cms.Sequence(trackingMonitorClientHLT * trackingForDisplacedJetMonitorClientHLT)
 HLTMonitoringClientPA= cms.Sequence(trackingMonitorClientHLT * PAtrackingMonitorClientHLT)
-
-DQMOffline_SecondStep = cms.Sequence(
+DQMOffline_SecondStep = cms.Sequence( dqmRefHistoRootFileGetter *
                                       DQMOffline_SecondStep_PreDPG *
                                       DQMOffline_SecondStep_PrePOG *
                                       HLTMonitoringClient *
                                       DQMMessageLoggerClientSeq *
                                       dqmFastTimerServiceClient)
 
-DQMOffline_SecondStep_ExtraHLT = cms.Sequence( hltOfflineDQMClientExtra )
+DQMOffline_SecondStep_ExtraHLT = cms.Sequence(
+    hltOfflineDQMClientExtra
+)
 
 DQMOffline_SecondStep_FakeHLT = cms.Sequence( DQMOffline_SecondStep )
 DQMOffline_SecondStep_FakeHLT.remove( HLTMonitoringClient )
-DQMOffline_SecondStep_FakeHLT.remove( DQMOffline_SecondStepTrigger )
 
 DQMOffline_SecondStep_PrePOGMC = cms.Sequence( bTagCollectorSequenceDATA )
 
-DQMOffline_SecondStepPOGMC = cms.Sequence( DQMOffline_SecondStep_PrePOGMC *
+DQMOffline_SecondStepPOGMC = cms.Sequence( dqmRefHistoRootFileGetter *
+                                           DQMOffline_SecondStep_PrePOGMC *
                                            DQMMessageLoggerClientSeq )
 
-# Harvest
-from DQMOffline.JetMET.dataCertificationJetMET_cff import *
-from DQM.SiOuterTracker.OuterTrackerClientConfig_cff import *
-from DQM.CTPPS.ctppsDQM_cff import *
-from Validation.RecoTau.DQMSequences_cfi import *
-from DQM.TrackingMonitorClient.pixelTrackingEffFromHitPattern_cff import *
-
-DQMHarvestTrackerStrip = cms.Sequence ( SiStripOfflineDQMClient )
-
-DQMHarvestTrackerPixel = cms.Sequence ( PixelOfflineDQMClientNoDataCertification )
-
-DQMHarvestTrack = cms.Sequence ( TrackingOfflineDQMClient )
-
-DQMHarvestTrigger = cms.Sequence ( triggerOfflineDQMClient *
-				    hltOfflineDQMClient )
-
-DQMHarvestFED = cms.Sequence ( dqmFEDIntegrityClient )
-
-DQMHarvestBeam = cms.Sequence ( alcaBeamMonitorClient )
-
-DQMHarvestTAU = cms.Sequence ( runTauEff )
-
-DQMHarvestL1T = cms.Sequence( l1TriggerDqmOfflineClient )
-
-DQMHarvestL1TEgamma = cms.Sequence( l1TriggerEgDqmOfflineClient )
-
-DQMHarvestL1TMuon = cms.Sequence( l1TriggerMuonDqmOfflineClient )
-
-DQMHarvestCommon = cms.Sequence( DQMMessageLoggerClientSeq *
-                                 DQMHarvestTrackerStrip *
-                                 DQMHarvestTrack *
-                                 DQMHarvestTrackerPixel *
-				 DQMHarvestTrigger *
-                                 DQMHarvestFED *
-                                 DQMHarvestBeam *
-                                 DQMHarvestTAU *
+DQMHarvestCommon = cms.Sequence( dqmRefHistoRootFileGetter *
+                                 DQMMessageLoggerClientSeq *
+                                 dqmDcsInfoClient *
+                                 SiStripOfflineDQMClient *
+                                 TrackingOfflineDQMClient *
+                                 PixelOfflineDQMClientNoDataCertification *
+                                 triggerOfflineDQMClient *
+                                 hltOfflineDQMClient *
+                                 dqmFEDIntegrityClient *
+                                 alcaBeamMonitorClient *
+                                 runTauEff *
                                  dqmFastTimerServiceClient
                                 )
-
-DQMHarvestCommonFakeHLT = cms.Sequence( DQMHarvestCommon )
-DQMHarvestCommonFakeHLT.remove( DQMHarvestTrigger )
-
-DQMHarvestCommonSiStripZeroBias = cms.Sequence(
+DQMHarvestCommonSiStripZeroBias = cms.Sequence(dqmRefHistoRootFileGetter *
                                                DQMMessageLoggerClientSeq *
-                                               DQMHarvestTrackerStrip *
-                                               DQMHarvestTrack *
-                                               DQMHarvestTrackerPixel *
-                                               DQMHarvestTrigger *
-                                               DQMHarvestL1T *
-                                               DQMHarvestFED *
-                                               DQMHarvestBeam *
+                                               dqmDcsInfoClient *
+                                               SiStripOfflineDQMClient *
+                                               TrackingOfflineDQMClient *
+                                               PixelOfflineDQMClientNoDataCertification *
+                                               triggerOfflineDQMClient *
+                                               hltOfflineDQMClient *
+                                               l1TriggerDqmOfflineClient *
+                                               dqmFEDIntegrityClient *
+                                               alcaBeamMonitorClient *
+                                               runTauEff  *
                                                dqmFastTimerServiceClient
                                                )
 
-DQMHarvestCommonSiStripZeroBiasFakeHLT = cms.Sequence( DQMHarvestCommonSiStripZeroBias )
-DQMHarvestCommonSiStripZeroBiasFakeHLT.remove( DQMHarvestTrigger )
-
 DQMHarvestTracking = cms.Sequence( TrackingOfflineDQMClient *
                                    dqmFastTimerServiceClient )
 
-DQMHarvestTrackingZeroBias = cms.Sequence( TrackingOfflineDQMClientZeroBias *
-                                           dqmFastTimerServiceClient )
+DQMHarvestPixelTracking = cms.Sequence( pixelTrackingEffFromHitPattern *
+                                        pixelVertexResolutionClient )
 
-DQMHarvestPixelTracking = cms.Sequence( pixelTrackingEffFromHitPattern )
-
-DQMHarvestOuterTracker = cms.Sequence(
+DQMHarvestOuterTracker = cms.Sequence( dqmRefHistoRootFileGetter *
+                                 dqmDcsInfoClient *
                                  OuterTrackerClient *
                                  dqmFEDIntegrityClient *
                                  DQMMessageLoggerClientSeq *
                                  dqmFastTimerServiceClient
                                  )
-DQMHarvestTrackerPhase2 = cms.Sequence(trackerphase2DQMHarvesting)
-
 
-DQMHarvestCTPPS = cms.Sequence( ctppsDQMOfflineHarvest )
+DQMHarvestLumi = cms.Sequence()
 
 DQMHarvestMuon = cms.Sequence( dtClients *
                                rpcTier0Client *
@@ -198,17 +137,11 @@
                                muonQualityTests
                                )
 
-_run3_GEM_DQMHarvestMuon = DQMHarvestMuon.copy()
-_run3_GEM_DQMHarvestMuon += gemClients
-run3_GEM.toReplaceWith(DQMHarvestMuon, _run3_GEM_DQMHarvestMuon)
-
 DQMHarvestEcal = cms.Sequence( ecal_dqm_client_offline *
                                 es_dqm_client_offline
                               )
 
-DQMHarvestHcal = cms.Sequence( hcalOfflineHarvesting )
-
-DQMHarvestHcal2 = cms.Sequence( HcalDQMOfflinePostProcessor )
+DQMHarvestHcal = cms.Sequence(hcalOfflineHarvesting)
 
 DQMHarvestJetMET = cms.Sequence( SusyPostProcessorSequence )
 
@@ -216,9 +149,12 @@
 
 DQMHarvestBTag = cms.Sequence( bTagCollectorSequenceDATA )
 
-from PhysicsTools.NanoAOD.nanoDQM_cff import *
-from Validation.RecoParticleFlow.DQMForPF_MiniAOD_cff import *
-
-DQMHarvestMiniAOD = cms.Sequence( dataCertificationJetMETSequence * muonQualityTests_miniAOD * DQMHarvestPF )
+DQMHarvestMiniAOD = cms.Sequence( dataCertificationJetMETSequence * muonQualityTests_miniAOD)
 DQMHarvestNanoAOD = cms.Sequence( nanoHarvest )
 
+# L1 trigger sequences
+DQMHarvestL1TMonitoring = cms.Sequence( l1TriggerDqmOfflineClient )
+
+DQMHarvestL1TEgamma = cms.Sequence( l1TriggerEgDqmOfflineClient )
+
+DQMHarvestL1TMuon = cms.Sequence( l1TriggerMuonDqmOfflineClient )
diff --git a/DQMOffline/Configuration/python/DQMOffline_cff.py b/DQMOffline/Configuration/python/DQMOffline_cff.py
index 2001c22352a48..5cb9af6d3a960 100644
--- a/DQMOffline/Configuration/python/DQMOffline_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_cff.py
@@ -1,264 +1,207 @@
 import FWCore.ParameterSet.Config as cms
 
 from DQMServices.Components.DQMMessageLogger_cfi import *
-from DQMServices.Components.DQMProvInfo_cfi import *
+from DQMServices.Components.DQMDcsInfo_cfi import *
 from DQMServices.Components.DQMFastTimerService_cff import *
 
-from DQMOffline.L1Trigger.L1TriggerDqmOffline_cff import *
 from DQMOffline.Ecal.ecal_dqm_source_offline_cff import *
-from DQM.EcalPreshowerMonitorModule.es_dqm_source_offline_cff import *
 from DQM.HcalTasks.OfflineSourceSequence_pp import *
-from DQMOffline.Hcal.HcalDQMOfflineSequence_cff import *
 from DQM.SiStripMonitorClient.SiStripSourceConfigTier0_cff import *
 from DQM.SiPixelCommon.SiPixelOfflineDQM_source_cff import *
 from DQM.DTMonitorModule.dtDQMOfflineSources_cff import *
 from DQM.RPCMonitorClient.RPCTier0Source_cff import *
 from DQM.CSCMonitorModule.csc_dqm_sourceclient_offline_cff import *
-from DQMOffline.Muon.gem_dqm_offline_source_cff import *
+from DQM.EcalPreshowerMonitorModule.es_dqm_source_offline_cff import *
+from DQM.BeamMonitor.AlcaBeamMonitor_cff import *
 from DQM.CastorMonitor.castor_dqm_sourceclient_offline_cff import *
+from Validation.RecoTau.DQMSequences_cfi import *
+from DQMOffline.Hcal.HcalDQMOfflineSequence_cff import *
+from DQMOffline.L1Trigger.L1TriggerDqmOffline_cff import *
 from DQM.CTPPS.ctppsDQM_cff import *
-from DQM.SiTrackerPhase2.Phase2TrackerDQMFirstStep_cff import *
 
 DQMNone = cms.Sequence()
 
-DQMMessageLoggerSeq = cms.Sequence( DQMMessageLogger )
-
-dqmProvInfo.runType = "pp_run"
-dqmProvInfo.dcsRecord = cms.untracked.InputTag("onlineMetaDataDigis")
-DQMOfflineDCS = cms.Sequence( dqmProvInfo )
-
-# L1 trigger sequences
-DQMOfflineL1T = cms.Sequence( l1TriggerDqmOffline ) # L1 emulator is run within this sequence for real data
-
-DQMOfflineL1TEgamma = cms.Sequence( l1TriggerEgDqmOffline )
-
-DQMOfflineL1TMuon = cms.Sequence( l1TriggerMuonDqmOffline )
-
-#DPGs
-DQMOfflineEcalOnly = cms.Sequence(
-    ecalOnly_dqm_source_offline +
-    es_dqm_source_offline )
-
-DQMOfflineEcal = cms.Sequence(
-    ecal_dqm_source_offline +
-    es_dqm_source_offline )
-
-#offline version of the online DQM: used in validation/certification
-DQMOfflineHcal = cms.Sequence( hcalOfflineSourceSequence )
-
-# offline DQM: used in Release validation
-DQMOfflineHcal2 = cms.Sequence( HcalDQMOfflineSequence )
-
-DQMOfflineHcalOnly = cms.Sequence( hcalOnlyOfflineSourceSequence )
-
-DQMOfflineHcal2Only = cms.Sequence( RecHitsDQMOffline )
-
-DQMOfflineTrackerStrip = cms.Sequence( SiStripDQMTier0 )
-
-DQMOfflineTrackerPixel = cms.Sequence( 	siPixelOfflineDQM_source )
-
-DQMOfflineMuonDPG = cms.Sequence( dtSources *
-                                  rpcTier0Source *
-                                  cscSources )
-
-from Configuration.Eras.Modifier_run3_GEM_cff import run3_GEM
-_run3_GEM_DQMOfflineMuonDPG = DQMOfflineMuonDPG.copy()
-_run3_GEM_DQMOfflineMuonDPG += gemSources
-run3_GEM.toReplaceWith(DQMOfflineMuonDPG, _run3_GEM_DQMOfflineMuonDPG)
-
-DQMOfflineCASTOR = cms.Sequence( castorSources )
-
-DQMOfflineCTPPS = cms.Sequence( ctppsDQMOfflineSource )
-
-DQMOfflinePreDPG = cms.Sequence( DQMOfflineDCS *
-				 DQMOfflineL1T *
-                                 DQMOfflineEcal *
-                                 DQMOfflineHcal *
-				 DQMOfflineHcal2 *
-                                 DQMOfflineTrackerStrip *
-				 DQMOfflineTrackerPixel *
-				 DQMOfflineMuonDPG *
-                                 DQMOfflineCASTOR *
-                                 DQMOfflineCTPPS )
+DQMOfflinePreDPG = cms.Sequence( dqmDcsInfo *
+                                 l1TriggerDqmOffline * # L1 emulator is run within this sequence for real data
+                                 ecal_dqm_source_offline *
+                                 hcalOfflineSourceSequence *
+                                 SiStripDQMTier0 *
+                                 siPixelOfflineDQM_source *
+                                 dtSources *
+                                 rpcTier0Source *
+                                 cscSources *
+                                 es_dqm_source_offline *
+                                 castorSources *
+                                 HcalDQMOfflineSequence )
 
 DQMOfflineDPG = cms.Sequence( DQMOfflinePreDPG *
                               DQMMessageLogger )
 
-from DQM.TrackingMonitorSource.TrackingSourceConfig_Tier0_cff import *
-from DQMOffline.RecoB.PrimaryVertexMonitor_cff import *
-from DQM.TrackingMonitor.trackingRecoMaterialAnalyzer_cfi import materialDumperAnalyzer
 from DQMOffline.Muon.muonMonitors_cff import *
 from DQMOffline.JetMET.jetMETDQMOfflineSource_cff import *
 from DQMOffline.EGamma.egammaDQMOffline_cff import *
 from DQMOffline.Trigger.DQMOffline_Trigger_cff import *
+from DQMOffline.RecoB.PrimaryVertexMonitor_cff import *
 from DQMOffline.RecoB.dqmAnalyzer_cff import *
-from DQM.BeamMonitor.AlcaBeamMonitor_cff import *
+from DQMOffline.Lumi.ZCounting_cff import *
 from DQM.Physics.DQMPhysics_cff import *
-
-DQMOfflineVertex = cms.Sequence( pvMonitor )
-
+from DQM.Physics.DQMTopMiniAOD_cff import *
+from Validation.RecoTau.DQMSequences_cfi import *
+from DQM.TrackingMonitorSource.TrackingSourceConfig_Tier0_cff import *
+from DQM.TrackingMonitorSource.pixelTracksMonitoring_cff import *
+from DQMOffline.RecoB.PixelVertexMonitor_cff import *
+from DQM.SiOuterTracker.OuterTrackerSourceConfig_cff import *
+# miniAOD DQM validation
+from Validation.RecoParticleFlow.miniAODDQM_cff import *
+from DQM.TrackingMonitor.tracksDQMMiniAOD_cff import * 
+from DQM.TrackingMonitor.trackingRecoMaterialAnalyzer_cfi import materialDumperAnalyzer
 materialDumperAnalyzer.usePV = True
-DQMOfflineTracking = cms.Sequence( TrackingDQMSourceTier0 *
-                                   DQMOfflineVertex *
-                                   materialDumperAnalyzer )
-
-DQMOfflineMUO = cms.Sequence(muonMonitors)
-muonRecoAnalyzer.doMVA =         cms.bool( True )
-muonRecoAnalyzer_miniAOD.doMVA = cms.bool( True )
-
-DQMOfflineJetMET = cms.Sequence( jetMETDQMOfflineSource )
-
-DQMOfflineEGamma = cms.Sequence( egammaDQMOffline )
-
-DQMOfflineTrigger = cms.Sequence( triggerOfflineDQMSource )
-
-DQMOfflineBTag = cms.Sequence( bTagPlotsDATA )
-
-DQMOfflineBeam = cms.Sequence( alcaBeamMonitor )
-
-DQMOfflinePhysics = cms.Sequence( dqmPhysics )
-
-DQMOfflinePrePOG = cms.Sequence( DQMOfflineTracking *
-                                 DQMOfflineMUO *
-                                 DQMOfflineJetMET *
-                                 DQMOfflineEGamma *
-                                 DQMOfflineTrigger *
-                                 DQMOfflineBTag *
-                                 DQMOfflineBeam *
-                                 DQMOfflinePhysics )
 
+DQMOfflinePrePOG = cms.Sequence( TrackingDQMSourceTier0 *
+                                 muonMonitors *
+                                 jetMETDQMOfflineSource *
+                                 egammaDQMOffline *
+                                 triggerOfflineDQMSource *
+                                 pvMonitor *
+                                 materialDumperAnalyzer *
+                                 bTagPlotsDATA *
+                                 alcaBeamMonitor *
+                                 dqmPhysics *
+                                 produceDenoms *
+                                 pfTauRunDQMValidation)
+from Configuration.Eras.Modifier_phase1Pixel_cff import phase1Pixel
 
 DQMOfflinePOG = cms.Sequence( DQMOfflinePrePOG *
                               DQMMessageLogger )
 
 HLTMonitoring = cms.Sequence( OfflineHLTMonitoring )
 HLTMonitoringPA = cms.Sequence( OfflineHLTMonitoringPA )
-
-# Data
 DQMOffline = cms.Sequence( DQMOfflinePreDPG *
                            DQMOfflinePrePOG *
                            HLTMonitoring *
+                           # dqmFastTimerServiceLuminosity *
                            DQMMessageLogger )
 
-DQMOfflineExtraHLT = cms.Sequence( offlineValidationHLTSource )
+_ctpps_2016_DQMOffline = DQMOffline.copy()
+_ctpps_2016_DQMOffline *= ctppsDQM
+from Configuration.Eras.Modifier_ctpps_2016_cff import ctpps_2016
+ctpps_2016.toReplaceWith(DQMOffline, _ctpps_2016_DQMOffline)
+
+_ctpps_2016_DQMOffline = DQMOffline.copy()
+#_ctpps_2016_DQMOffline *= ctppsDQM
+ctpps_2016.toReplaceWith(DQMOffline, _ctpps_2016_DQMOffline)
+
+DQMOfflineExtraHLT = cms.Sequence(
+    offlineValidationHLTSource
+)
 
 
 DQMOfflineFakeHLT = cms.Sequence( DQMOffline )
 DQMOfflineFakeHLT.remove( HLTMonitoring )
-DQMOfflineFakeHLT.remove( DQMOfflineTrigger )
 
-#MC
-DQMOfflinePrePOGMC = cms.Sequence( DQMOfflineVertex *
-                                   DQMOfflineBTag *
-                                   DQMOfflinePhysics )
+DQMOfflinePrePOGMC = cms.Sequence( pvMonitor *
+                                   bTagPlotsDATA *
+                                   dqmPhysics )
 
 DQMOfflinePOGMC = cms.Sequence( DQMOfflinePrePOGMC *
                                 DQMMessageLogger )
 
-#DQMOfflineCommon
-from DQM.TrackingMonitorSource.pixelTracksMonitoring_cff import *
-from DQM.SiOuterTracker.OuterTrackerSourceConfig_cff import *
-from Validation.RecoTau.DQMSequences_cfi import *
-
-DQMOfflinePixelTracking = cms.Sequence( pixelTracksMonitoring )
+DQMOfflinePhysics = cms.Sequence( dqmPhysics )
 
-DQMOuterTracker = cms.Sequence( DQMOfflineDCS *
-                                OuterTrackerSource *
-                                DQMMessageLogger *
-                                DQMOfflinePhysics *
-                                DQMOfflineVertex 
-                                )
 
-DQMOfflineTrackerPhase2 = cms.Sequence( trackerphase2DQMSource )
 
-DQMOfflineTAU = cms.Sequence( produceDenomsData *
-				pfTauRunDQMValidation )
+DQMOfflineTracking = cms.Sequence( TrackingDQMSourceTier0Common *
+                                   pvMonitor *
+                                   materialDumperAnalyzer
+                                 )
 
-DQMOfflineTrackerStripCommon = cms.Sequence( SiStripDQMTier0Common )
+DQMOfflinePixelTracking = cms.Sequence( pixelTracksMonitoring +
+                                        pixelPVMonitor )
 
-DQMOfflineTrackerPixel = cms.Sequence( siPixelOfflineDQM_source )
+DQMOuterTracker = cms.Sequence( dqmDcsInfo *
+                                OuterTrackerSource *
+                                DQMMessageLogger *
+                                dqmPhysics *
+                                pvMonitor *
+                                produceDenoms
+                                )
 
-DQMOfflineCommon = cms.Sequence( DQMOfflineDCS *
+DQMOfflineCommon = cms.Sequence( dqmDcsInfo *
                                  DQMMessageLogger *
-				 DQMOfflineTrackerStrip * 
-				 DQMOfflineTrackerPixel *
+                                 SiStripDQMTier0Common *
+                                 siPixelOfflineDQM_source *
                                  DQMOfflineTracking *
-                                 DQMOfflineTrigger *
-                                 DQMOfflineBeam *
-                                 DQMOfflineCASTOR *
-                                 DQMOfflinePhysics *
-				 DQMOfflineTAU
+                                 triggerOfflineDQMSource *
+                                 alcaBeamMonitor *
+                                 castorSources *
+                                 dqmPhysics *
+                                 produceDenoms *
+                                 pfTauRunDQMValidation
                                 )
-
-DQMOfflineCommonFakeHLT = cms.Sequence( DQMOfflineCommon )
-DQMOfflineCommonFakeHLT.remove( DQMOfflineTrigger )
-
-#MinBias/ZeroBias
-DQMOfflineTrackerStripMinBias = cms.Sequence( SiStripDQMTier0MinBias )
-
-DQMOfflineTrackingMinBias = cms.Sequence( TrackingDQMSourceTier0MinBias *
-                                   DQMOfflineVertex *
-                                   materialDumperAnalyzer )
-
-
-DQMOfflineCommonSiStripZeroBias = cms.Sequence( DQMOfflineDCS *
+DQMOfflineCommonSiStripZeroBias = cms.Sequence( dqmDcsInfo *
                                  DQMMessageLogger *
-				 DQMOfflineTrackerStripMinBias *
-				 DQMOfflineTrackerPixel *
-                                 DQMOfflineL1T *
-                                 DQMOfflineTrigger *
-                                 DQMOfflineBeam *
-                                 DQMOfflineCASTOR *
-                                 DQMOfflinePhysics *
-				 DQMOfflineTrackingMinBias
+                                 SiStripDQMTier0MinBias *
+                                 TrackingDQMSourceTier0MinBias *
+                                 siPixelOfflineDQM_source *
+                                 l1TriggerDqmOffline *
+                                 triggerOfflineDQMSource *
+                                 alcaBeamMonitor *
+                                 castorSources *
+                                 dqmPhysics *
+                                 pvMonitor *
+                                 materialDumperAnalyzer *
+                                 produceDenoms *
+                                 pfTauRunDQMValidation
                                  )
-
-DQMOfflineCommonSiStripZeroBiasFakeHLT = cms.Sequence( DQMOfflineCommonSiStripZeroBias )
-DQMOfflineCommonSiStripZeroBiasFakeHLT.remove( DQMOfflineTrigger )
-
-#Other definitons
-from DQMOffline.Lumi.ZCounting_cff import *
-
 DQMOfflineLumi = cms.Sequence ( zcounting )
 
+muonRecoAnalyzer.doMVA =         cms.bool( True )
+muonRecoAnalyzer_miniAOD.doMVA = cms.bool( True )
+
 DQMOfflineMuon = cms.Sequence( dtSources *
                                rpcTier0Source *
                                cscSources *
                                muonMonitors
                               )
 
-_run3_GEM_DQMOfflineMuon = DQMOfflineMuon.copy()
-_run3_GEM_DQMOfflineMuon += gemSources
-run3_GEM.toReplaceWith(DQMOfflineMuon, _run3_GEM_DQMOfflineMuon)
+DQMOfflineHcal = cms.Sequence( hcalOfflineSourceSequence )
 
-#Taus not created in pp conditions for HI
-from Configuration.ProcessModifiers.pp_on_AA_cff import pp_on_AA
-_DQMOfflineTAU = cms.Sequence()
-pp_on_AA.toReplaceWith(DQMOfflineTAU, _DQMOfflineTAU)
+DQMOfflineEcal = cms.Sequence( ecal_dqm_source_offline *
+                               es_dqm_source_offline
+                             )
+DQMOfflineJetMET = cms.Sequence( jetMETDQMOfflineSource )
 
+DQMOfflineEGamma = cms.Sequence( egammaDQMOffline )
+
+DQMOfflineBTag = cms.Sequence( bTagPlotsDATA )
 
-# miniAOD DQM validation
-from Validation.RecoParticleFlow.miniAODDQM_cff import * # On MiniAOD vs RECO
-from Validation.RecoParticleFlow.DQMForPF_MiniAOD_cff import * # MiniAOD PF variables
-from DQM.TrackingMonitor.tracksDQMMiniAOD_cff import *
 from DQMOffline.Muon.miniAOD_cff import *
-from DQM.Physics.DQMTopMiniAOD_cff import *
 
-DQMOfflineMiniAOD = cms.Sequence(jetMETDQMOfflineRedoProductsMiniAOD*muonMonitors_miniAOD*MuonMiniAOD*DQMOfflinePF)
+DQMOfflineMiniAOD = cms.Sequence(jetMETDQMOfflineRedoProductsMiniAOD*muonMonitors_miniAOD*MuonMiniAOD)
 
 #Post sequences are automatically placed in the EndPath by ConfigBuilder if PAT is run.
 #miniAOD DQM sequences need to access the filter results.
 
+
 PostDQMOfflineMiniAOD = cms.Sequence(miniAODDQMSequence*jetMETDQMOfflineSourceMiniAOD*tracksDQMMiniAOD*topPhysicsminiAOD)
 PostDQMOffline = cms.Sequence()
 
-from Configuration.Eras.Modifier_run3_HB_cff import run3_HB
-run3_HB.toReplaceWith( PostDQMOfflineMiniAOD, PostDQMOfflineMiniAOD.copyAndExclude([
-    pfMetDQMAnalyzerMiniAOD, pfPuppiMetDQMAnalyzerMiniAOD # No hcalnoise (yet)
+from Configuration.Eras.Modifier_phase2_hcal_cff import phase2_hcal
+phase2_hcal.toReplaceWith( PostDQMOfflineMiniAOD, PostDQMOfflineMiniAOD.copyAndExclude([
+    pfMetDQMAnalyzerMiniAOD, pfPuppiMetDQMAnalyzerMiniAOD # No hcalnoise yet
 ]))
 
+from Configuration.Eras.Modifier_pp_on_AA_2018_cff import pp_on_AA_2018
+_pfTauRunDQMValidation = cms.Sequence()
+pp_on_AA_2018.toReplaceWith(pfTauRunDQMValidation, _pfTauRunDQMValidation)
+
 from PhysicsTools.NanoAOD.nanoDQM_cff import nanoDQM
 DQMOfflineNanoAOD = cms.Sequence(nanoDQM)
 #PostDQMOfflineNanoAOD = cms.Sequence(nanoDQM)
-from PhysicsTools.NanoAOD.nanogenDQM_cff import nanogenDQM
-DQMOfflineNanoGen = cms.Sequence(nanogenDQM)
+
+# L1 trigger sequences
+DQMOfflineL1TMonitoring = cms.Sequence( l1TriggerDqmOffline ) # L1 emulator is run within this sequence for real data
+
+DQMOfflineL1TEgamma = cms.Sequence( l1TriggerEgDqmOffline )
+
+DQMOfflineL1TMuon = cms.Sequence( l1TriggerMuonDqmOffline )
diff --git a/DQMOffline/RecoB/python/PixelVertexMonitor_cff.py b/DQMOffline/RecoB/python/PixelVertexMonitor_cff.py
new file mode 100644
index 0000000000000..3c2e3d7d6700e
--- /dev/null
+++ b/DQMOffline/RecoB/python/PixelVertexMonitor_cff.py
@@ -0,0 +1,7 @@
+import FWCore.ParameterSet.Config as cms
+
+from DQMOffline.RecoB.PrimaryVertexMonitor_cff import pvMonitor as _pvMonitor
+pixelPVMonitor = _pvMonitor.clone(
+    TopFolderName = "OfflinePixelPV",
+    vertexLabel = "pixelVertices",
+)

From 93ebb23a36b253c7c145f1ac43f01bd8f9b88e8b Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Thu, 14 Feb 2019 17:29:35 +0100
Subject: [PATCH 08/50] Rework the GPU pixel track clusterizer and vertex
 finder (cms-patatrack#338)

Add two alternative (faster) track clusterizers: one based on DBSCAN,
and one "by density"; use the latter by default.

Allow all pixel triplets, but protect the vertex from triplets.

Use a larger-than-needed nearest neighbours array to allow for possible
duplicate pixels, as a pixel can appear more than once in the same event.

Use a separate workspace for temporary data.
---
 .../PixelVertexFinding/test/BuildFile.xml     |  25 +-
 .../PixelVertexFinding/test/VertexFinder_t.h  | 360 ++++++++++++++++++
 .../test/cpuVertexFinder_t.cpp                |   1 +
 .../test/gpuVertexFinder_t.cu                 | 294 +-------------
 4 files changed, 386 insertions(+), 294 deletions(-)
 create mode 100644 RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
 create mode 100644 RecoPixelVertexing/PixelVertexFinding/test/cpuVertexFinder_t.cpp

diff --git a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
index 6ab97d3171d8e..dbb25529e884e 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
+++ b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
@@ -19,9 +19,32 @@
 <use name="SimDataFormats/Track"/>
 <use name="TrackingTools/TransientTrack"/>
 
-<bin file="gpuVertexFinder_t.cu" name="gpuVertexFinder_t">
+<bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderByDensity_t">
   <use name="cuda"/>
   <use name="cuda-api-wrappers"/>
   <flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
   <flags CXXFLAGS="-g"/>
 </bin>
+
+<bin file="cpuVertexFinder_t.cpp" name="cpuVertexFinderByDensity_t">
+  <flags CXXFLAGS="-g  -DGPU_DEBUG"/>
+</bin>
+
+<bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderDBSCAN_t">
+  <use name="cuda"/>
+  <use name="cuda-api-wrappers"/>
+  <flags CUDA_FLAGS="-g -DGPU_DEBUG -DUSE_DBSCAN"/>
+  <flags CXXFLAGS="-g"/>
+</bin>
+
+<bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderIterative_t">
+  <use name="cuda"/>
+  <use name="cuda-api-wrappers"/>
+  <flags CUDA_FLAGS="-g -DGPU_DEBUG -DUSE_ITERATIVE"/>
+  <flags CXXFLAGS="-g"/>
+</bin>
+
+<bin file="cpuVertexFinder_t.cpp" name="cpuVertexFinderIterative_t">
+  <flags CXXFLAGS="-g -DGPU_DEBUG -DUSE_ITERATIVE"/>
+</bin>
+
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
new file mode 100644
index 0000000000000..130af51e9b83c
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
@@ -0,0 +1,360 @@
+#include <cmath>
+#include <cstdint>
+#include <iostream>
+#include <random>
+#include <vector>
+
+#include <cuda/api_wrappers.h>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
+#ifdef USE_DBSCAN
+#include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksDBSCAN.h"
+#define CLUSTERIZE clusterTracksDBSCAN
+#elif USE_ITERATIVE
+#include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksIterative.h"
+#define CLUSTERIZE clusterTracksIterative
+#else
+#include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksByDensity.h"
+#define CLUSTERIZE clusterTracksByDensity
+#endif
+
+#include "RecoPixelVertexing/PixelVertexFinding/src/gpuFitVertices.h"
+#include "RecoPixelVertexing/PixelVertexFinding/src/gpuSortByPt2.h"
+#include "RecoPixelVertexing/PixelVertexFinding/src/gpuSplitVertices.h"
+
+using namespace gpuVertexFinder;
+
+struct Event {
+  std::vector<float> zvert;
+  std::vector<uint16_t>  itrack;
+  std::vector<float> ztrack;
+  std::vector<float> eztrack;
+  std::vector<float> pttrack;
+  std::vector<uint16_t> ivert;
+};
+
+struct ClusterGenerator {
+
+  explicit ClusterGenerator(float nvert, float ntrack) :
+    rgen(-13.,13), errgen(0.005,0.025), clusGen(nvert), trackGen(ntrack), gauss(0.,1.), ptGen(1.)
+  {}
+
+  void operator()(Event & ev) {
+
+    int nclus = clusGen(reng);
+    ev.zvert.resize(nclus);
+    ev.itrack.resize(nclus);
+    for (auto & z : ev.zvert) { 
+       z = 3.5f*gauss(reng);
+    }
+
+    ev.ztrack.clear(); 
+    ev.eztrack.clear();
+    ev.ivert.clear();
+    for (int iv=0; iv<nclus; ++iv) {
+      auto nt = trackGen(reng);
+      ev.itrack[nclus] = nt;
+      for (int it=0; it<nt; ++it) {
+       auto err = errgen(reng); // reality is not flat....
+       ev.ztrack.push_back(ev.zvert[iv]+err*gauss(reng));
+       ev.eztrack.push_back(err*err);
+       ev.ivert.push_back(iv);
+       ev.pttrack.push_back( (iv==5? 1.f:0.5f) + ptGen(reng) );
+       ev.pttrack.back()*=ev.pttrack.back();
+      }
+    }
+    // add noise
+    auto nt = 2*trackGen(reng);
+    for (int it=0; it<nt; ++it) {
+      auto err = 0.03f;
+      ev.ztrack.push_back(rgen(reng));
+      ev.eztrack.push_back(err*err);
+      ev.ivert.push_back(9999);
+      ev.pttrack.push_back( 0.5f + ptGen(reng) );
+      ev.pttrack.back()*=ev.pttrack.back();
+    }
+
+  }
+
+  std::mt19937 reng;
+  std::uniform_real_distribution<float> rgen;
+  std::uniform_real_distribution<float> errgen;
+  std::poisson_distribution<int> clusGen;
+  std::poisson_distribution<int> trackGen;
+  std::normal_distribution<float> gauss;
+  std::exponential_distribution<float> ptGen;
+
+};
+
+// a macro SORRY
+#define LOC_ONGPU(M) ((char*)(onGPU_d.get()) +offsetof(ZVertices,M))
+#define LOC_WS(M) ((char*)(ws_d.get()) +offsetof(WorkSpace,M))
+
+__global__
+void print(ZVertices const * pdata, WorkSpace const * pws) {
+  auto const & __restrict__ data = *pdata;
+  auto const & __restrict__ ws = *pws;
+  printf("nt,nv %d %d,%d\n",ws.ntrks,data.nvFinal,ws.nvIntermediate); 
+
+}
+
+int main() {
+
+#ifdef __CUDACC__
+  exitSansCUDADevices();
+
+  auto current_device = cuda::device::current::get();
+
+  auto onGPU_d = cuda::memory::device::make_unique<ZVertices[]>(current_device, 1);
+  auto ws_d = cuda::memory::device::make_unique<WorkSpace[]>(current_device, 1);
+#else
+  auto onGPU_d = std::make_unique<ZVertices>();
+  auto ws_d = std::make_unique<WorkSpace>();
+#endif
+
+  Event  ev;
+
+  float eps = 0.1f;
+  std::array<float,3> par{{eps, 0.01f,9.0f}};
+  for (int nav=30;nav<80;nav+=20){ 
+
+  ClusterGenerator gen(nav,10);
+
+  for (int i=8; i<20; ++i) {
+
+  auto  kk=i/4;  // M param
+
+  gen(ev);
+  
+#ifdef __CUDACC__
+  init<<<1,1,0,0>>>(onGPU_d.get(),ws_d.get());
+#else
+  onGPU_d->init();ws_d->init();
+#endif
+
+  std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl;
+  auto nt = ev.ztrack.size();
+#ifdef __CUDACC__
+  cuda::memory::copy(LOC_WS(ntrks),&nt,sizeof(uint32_t));
+  cuda::memory::copy(LOC_WS(zt),ev.ztrack.data(),sizeof(float)*ev.ztrack.size());
+  cuda::memory::copy(LOC_WS(ezt2),ev.eztrack.data(),sizeof(float)*ev.eztrack.size());
+  cuda::memory::copy(LOC_WS(ptt2),ev.pttrack.data(),sizeof(float)*ev.eztrack.size());
+#else
+  ::memcpy(LOC_WS(ntrks),&nt,sizeof(uint32_t));  
+  ::memcpy(LOC_WS(zt),ev.ztrack.data(),sizeof(float)*ev.ztrack.size());
+  ::memcpy(LOC_WS(ezt2),ev.eztrack.data(),sizeof(float)*ev.eztrack.size());
+  ::memcpy(LOC_WS(ptt2),ev.pttrack.data(),sizeof(float)*ev.eztrack.size());
+#endif
+
+  std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i%4) << std::endl;
+  
+  if ( (i%4) == 0 ) par = {{eps, 0.02f,12.0f}};
+  if ( (i%4) == 1 ) par = {{eps, 0.02f,9.0f}};
+  if ( (i%4) == 2 ) par = {{eps, 0.01f,9.0f}};
+  if ( (i%4) == 3 ) par = {{0.7f*eps, 0.01f,9.0f}};
+
+  uint32_t nv=0;
+#ifdef __CUDACC__
+  print<<<1,1,0,0>>>(onGPU_d.get(),ws_d.get());
+  cudaCheck(cudaGetLastError());
+  cudaDeviceSynchronize();
+
+  cuda::launch(CLUSTERIZE,
+		 { 1, 512+256 },
+		 onGPU_d.get(),ws_d.get(),kk,par[0],
+		 par[1],par[2]
+		 );
+  print<<<1,1,0,0>>>(onGPU_d.get(),ws_d.get());
+
+  cudaCheck(cudaGetLastError());
+  cudaDeviceSynchronize();
+
+
+  cuda::launch(fitVertices, 
+               { 1,1024-256 },
+               onGPU_d.get(),ws_d.get(),50.f
+              );
+  cudaCheck(cudaGetLastError());
+  cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t));
+
+#else
+  print(onGPU_d.get(),ws_d.get());
+  CLUSTERIZE(
+                 onGPU_d.get(),ws_d.get(),kk,par[0],
+                 par[1],par[2]
+                 );
+  print(onGPU_d.get(),ws_d.get());
+  fitVertices(
+               onGPU_d.get(),ws_d.get(),50.f
+              );
+  nv = onGPU_d->nvFinal;
+#endif
+
+  if (nv==0) {
+    std::cout << "NO VERTICES???" << std::endl;
+    continue;
+  }
+
+  float * zv = nullptr;
+  float * wv = nullptr;
+  float * ptv2 = nullptr;
+  int32_t * nn = nullptr;
+  uint16_t * ind = nullptr;
+
+  // keep chi2 separated...
+  float chi2[2*nv];  // make space for splitting...
+
+#ifdef __CUDACC__
+  float hzv[2*nv];
+  float hwv[2*nv];
+  float hptv2[2*nv];
+  int32_t hnn[2*nv];
+  uint16_t hind[2*nv];
+
+  zv = hzv;
+  wv = hwv;
+  ptv2 = hptv2;
+  nn = hnn;
+  ind = hind;
+#else
+  zv = onGPU_d->zv;
+  wv = onGPU_d->wv;
+  ptv2 = onGPU_d->ptv2;
+  nn = onGPU_d->ndof;
+  ind = onGPU_d->sortInd;
+#endif
+
+#ifdef __CUDACC__
+  cuda::memory::copy(nn, LOC_ONGPU(ndof), nv*sizeof(int32_t));
+  cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv*sizeof(float));
+#else
+  memcpy(chi2, LOC_ONGPU(chi2), nv*sizeof(float));
+#endif
+
+  for (auto j=0U; j<nv; ++j) if (nn[j]>0) chi2[j]/=float(nn[j]);
+  {
+    auto mx = std::minmax_element(chi2,chi2+nv);
+    std::cout << "after fit nv, min max chi2 " << nv << " " << *mx.first << ' ' <<  *mx.second << std::endl;
+  }
+
+#ifdef __CUDACC__
+  cuda::launch(fitVertices,
+               { 1,1024-256 },
+               onGPU_d.get(),ws_d.get(), 50.f
+              );
+  cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t));
+  cuda::memory::copy(nn, LOC_ONGPU(ndof), nv*sizeof(int32_t));
+  cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv*sizeof(float));
+#else
+  fitVertices(
+               onGPU_d.get(),ws_d.get(),50.f
+              );
+  nv = onGPU_d->nvFinal;
+  memcpy(chi2, LOC_ONGPU(chi2), nv*sizeof(float));
+#endif
+
+  for (auto j=0U; j<nv; ++j) if (nn[j]>0) chi2[j]/=float(nn[j]);
+  {
+    auto mx = std::minmax_element(chi2,chi2+nv);
+    std::cout << "before splitting nv, min max chi2 " << nv << " " << *mx.first << ' ' <<  *mx.second << std::endl;
+  }
+
+#ifdef __CUDACC__
+  // one vertex per block!!!
+  cuda::launch(splitVertices,
+               { 1024, 64 },
+               onGPU_d.get(),ws_d.get(),
+               9.f
+              );
+  cuda::memory::copy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t));
+#else
+  gridDim.x = 1024; // nv ????
+  assert(blockIdx.x==0);
+  for(;blockIdx.x<gridDim.x; ++blockIdx.x)
+    splitVertices(
+               onGPU_d.get(),ws_d.get(),
+               9.f
+              );
+   resetGrid();
+   nv = ws_d->nvIntermediate;
+#endif
+ std::cout << "after split " << nv << std::endl; 
+
+
+#ifdef __CUDACC__
+  cuda::launch(fitVertices,
+               { 1,1024-256 },
+               onGPU_d.get(),ws_d.get(),5000.f
+              );
+  cudaCheck(cudaGetLastError());
+
+  cuda::launch(sortByPt2,
+               { 1, 256 },
+               onGPU_d.get(),ws_d.get()
+              );
+  cudaCheck(cudaGetLastError());
+  cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t));
+#else
+  fitVertices(onGPU_d.get(),ws_d.get(),5000.f);
+  sortByPt2(onGPU_d.get(),ws_d.get());
+  nv = onGPU_d->nvFinal;
+  memcpy(chi2, LOC_ONGPU(chi2), nv*sizeof(float));
+#endif
+
+
+  if (nv==0) {
+    std::cout << "NO VERTICES???" << std::endl;
+    continue;
+  }
+
+#ifdef __CUDACC__
+  cuda::memory::copy(zv, LOC_ONGPU(zv), nv*sizeof(float));
+  cuda::memory::copy(wv, LOC_ONGPU(wv), nv*sizeof(float));
+  cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv*sizeof(float));
+  cuda::memory::copy(ptv2, LOC_ONGPU(ptv2), nv*sizeof(float));
+  cuda::memory::copy(nn, LOC_ONGPU(ndof), nv*sizeof(int32_t));
+  cuda::memory::copy(ind, LOC_ONGPU(sortInd), nv*sizeof(uint16_t));
+#endif
+  for (auto j=0U; j<nv; ++j) if (nn[j]>0) chi2[j]/=float(nn[j]); 
+  {
+    auto mx = std::minmax_element(chi2,chi2+nv);
+    std::cout << "nv, min max chi2 " << nv << " " << *mx.first << ' ' <<  *mx.second << std::endl;
+  }
+
+  {
+    auto mx = std::minmax_element(wv,wv+nv);
+    std::cout << "min max error " << 1./std::sqrt(*mx.first) << ' ' <<  1./std::sqrt(*mx.second) << std::endl;
+  }
+
+  {
+    auto mx = std::minmax_element(ptv2,ptv2+nv);
+    std::cout << "min max ptv2 " << *mx.first << ' ' <<  *mx.second << std::endl;
+    std::cout << "min max ptv2 " << ptv2[ind[0]] << ' ' <<  ptv2[ind[nv-1]] << " at "  << ind[0] << ' ' << ind[nv-1] << std::endl;
+
+  }  
+
+  float dd[nv];
+  for (auto kv=0U; kv<nv; ++kv) {
+   auto zr = zv[kv];
+   auto md=500.0f;
+   for (auto zs : ev.ztrack) { 
+     auto d = std::abs(zr-zs);
+     md = std::min(d,md);
+   }
+   dd[kv] = md;
+  }
+  if (i==6) {
+    for (auto d:dd) std::cout << d << ' ';
+    std::cout << std::endl;
+  }
+  auto mx = std::minmax_element(dd,dd+nv);
+  float rms=0;
+  for (auto d:dd) rms+=d*d; 
+  rms = std::sqrt(rms)/(nv-1);
+  std::cout << "min max rms " << *mx.first << ' ' << *mx.second << ' ' << rms << std::endl;
+
+  } // loop on events
+  } // lopp on ave vert
+  
+  return 0;
+}
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/cpuVertexFinder_t.cpp b/RecoPixelVertexing/PixelVertexFinding/test/cpuVertexFinder_t.cpp
new file mode 100644
index 0000000000000..a7906fe0d03f5
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/test/cpuVertexFinder_t.cpp
@@ -0,0 +1 @@
+#include "VertexFinder_t.h"
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
index 84334917052ed..a7906fe0d03f5 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
+++ b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
@@ -1,293 +1 @@
-#include <cmath>
-#include <cstdint>
-#include <iostream>
-#include <random>
-#include <vector>
-
-#include <cuda/api_wrappers.h>
-
-#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
-#include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h"
-#include "RecoPixelVertexing/PixelVertexFinding/src/gpuFitVertices.h"
-#include "RecoPixelVertexing/PixelVertexFinding/src/gpuSortByPt2.h"
-#include "RecoPixelVertexing/PixelVertexFinding/src/gpuSplitVertices.h"
-
-using namespace gpuVertexFinder;
-
-struct Event {
-  std::vector<float> zvert;
-  std::vector<uint16_t>  itrack;
-  std::vector<float> ztrack;
-  std::vector<float> eztrack;
-  std::vector<float> pttrack;
-  std::vector<uint16_t> ivert;
-};
-
-struct ClusterGenerator {
-
-  explicit ClusterGenerator(float nvert, float ntrack) :
-    rgen(-13.,13), errgen(0.005,0.025), clusGen(nvert), trackGen(ntrack), gauss(0.,1.), ptGen(1.)
-  {}
-
-  void operator()(Event & ev) {
-
-    int nclus = clusGen(reng);
-    ev.zvert.resize(nclus);
-    ev.itrack.resize(nclus);
-    for (auto & z : ev.zvert) { 
-       z = 3.5f*gauss(reng);
-    }
-
-    ev.ztrack.clear(); 
-    ev.eztrack.clear();
-    ev.ivert.clear();
-    for (int iv=0; iv<nclus; ++iv) {
-      auto nt = trackGen(reng);
-      ev.itrack[nclus] = nt;
-      for (int it=0; it<nt; ++it) {
-       auto err = errgen(reng); // reality is not flat....
-       ev.ztrack.push_back(ev.zvert[iv]+err*gauss(reng));
-       ev.eztrack.push_back(err*err);
-       ev.ivert.push_back(iv);
-       ev.pttrack.push_back( (iv==5? 1.f:0.5f) + ptGen(reng) );
-       ev.pttrack.back()*=ev.pttrack.back();
-      }
-    }
-    // add noise
-    auto nt = 2*trackGen(reng);
-    for (int it=0; it<nt; ++it) {
-      auto err = 0.03f;
-      ev.ztrack.push_back(rgen(reng));
-      ev.eztrack.push_back(err*err);
-      ev.ivert.push_back(9999);
-      ev.pttrack.push_back( 0.5f + ptGen(reng) );
-      ev.pttrack.back()*=ev.pttrack.back();
-    }
-
-  }
-
-  std::mt19937 reng;
-  std::uniform_real_distribution<float> rgen;
-  std::uniform_real_distribution<float> errgen;
-  std::poisson_distribution<int> clusGen;
-  std::poisson_distribution<int> trackGen;
-  std::normal_distribution<float> gauss;
-  std::exponential_distribution<float> ptGen;
-
-};
-
-
-int main() {
-  exitSansCUDADevices();
-
-  auto current_device = cuda::device::current::get();
-  
-  auto ntrks_d = cuda::memory::device::make_unique<uint32_t[]>(current_device, 1);
-  auto zt_d = cuda::memory::device::make_unique<float[]>(current_device, 64000);
-  auto ezt2_d = cuda::memory::device::make_unique<float[]>(current_device, 64000);
-  auto ptt2_d = cuda::memory::device::make_unique<float[]>(current_device, 64000);
-  auto zv_d = cuda::memory::device::make_unique<float[]>(current_device, 256);
-  auto wv_d = cuda::memory::device::make_unique<float[]>(current_device, 256);
-  auto chi2_d = cuda::memory::device::make_unique<float[]>(current_device, 256);
-  auto ptv2_d = cuda::memory::device::make_unique<float[]>(current_device, 256);
-  auto ind_d = cuda::memory::device::make_unique<uint16_t[]>(current_device, 256);
-
-  auto izt_d = cuda::memory::device::make_unique<uint8_t[]>(current_device, 64000);
-  auto nn_d = cuda::memory::device::make_unique<int32_t[]>(current_device, 64000);
-  auto iv_d = cuda::memory::device::make_unique<int32_t[]>(current_device, 64000);
-
-  auto nv_d = cuda::memory::device::make_unique<uint32_t[]>(current_device, 1);
-  auto nv2_d = cuda::memory::device::make_unique<uint32_t[]>(current_device, 1);
- 
-  auto onGPU_d = cuda::memory::device::make_unique<OnGPU[]>(current_device, 1);
-
-  OnGPU onGPU;
-
-  onGPU.ntrks = ntrks_d.get();
-  onGPU.zt = zt_d.get();
-  onGPU.ezt2 = ezt2_d.get();
-  onGPU.ptt2 = ptt2_d.get();
-  onGPU.zv = zv_d.get();
-  onGPU.wv = wv_d.get();
-  onGPU.chi2 = chi2_d.get();
-  onGPU.ptv2 = ptv2_d.get();
-  onGPU.sortInd = ind_d.get();
-  onGPU.nvFinal = nv_d.get();
-  onGPU.nvIntermediate = nv2_d.get();
-  onGPU.izt = izt_d.get();
-  onGPU.nn = nn_d.get();
-  onGPU.iv = iv_d.get();
-
-
-  cuda::memory::copy(onGPU_d.get(), &onGPU, sizeof(OnGPU));
-
-
-  Event  ev;
-
-  for (int nav=30;nav<80;nav+=20){ 
-
-  ClusterGenerator gen(nav,10);
-
-  for (int i=8; i<20; ++i) {
-
-  auto  kk=i/4;  // M param
-
-  gen(ev);
-  
-  std::cout << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl;
-  auto nt = ev.ztrack.size();
-  cuda::memory::copy(onGPU.ntrks,&nt,sizeof(uint32_t));
-  cuda::memory::copy(onGPU.zt,ev.ztrack.data(),sizeof(float)*ev.ztrack.size());
-  cuda::memory::copy(onGPU.ezt2,ev.eztrack.data(),sizeof(float)*ev.eztrack.size());
-  cuda::memory::copy(onGPU.ptt2,ev.pttrack.data(),sizeof(float)*ev.eztrack.size());
-
-  float eps = 0.1f;
-  
-  std::cout << "M eps " << kk << ' ' << eps << std::endl;
-  
-  if ( (i%4) == 0 )
-    cuda::launch(clusterTracks,
-		 { 1, 512+256 },
-		 onGPU_d.get(),kk,eps,
-		 0.02f,12.0f
-		 );
-  
-  if ( (i%4) == 1 )
-    cuda::launch(clusterTracks,
-		 { 1, 512+256 },
-		 onGPU_d.get(),kk,eps,
-		 0.02f,9.0f
-		 );
-  
-  if ( (i%4) == 2 )
-    cuda::launch(clusterTracks,
-		 { 1, 512+256 },
-		 onGPU_d.get(),kk,eps,
-		 0.01f,9.0f
-		 );
-  
-  if ( (i%4) == 3 )
-    cuda::launch(clusterTracks,
-		 { 1, 512+256 },
-		 onGPU_d.get(),kk,0.7f*eps,
-		 0.01f,9.0f
-		 );
-  cudaCheck(cudaGetLastError());
-  cudaDeviceSynchronize();
-
-  cuda::launch(fitVertices, 
-               { 1,1024-256 },
-               onGPU_d.get(),50.f
-              );
-  cudaCheck(cudaGetLastError());
-
-  uint32_t nv;
-  cuda::memory::copy(&nv, onGPU.nvFinal, sizeof(uint32_t));
-  if (nv==0) {
-    std::cout << "NO VERTICES???" << std::endl;
-    continue;
-  }
-  float chi2[2*nv];  // make space for splitting...
-  float zv[2*nv];
-  float wv[2*nv];
-  float ptv2[2*nv];
-  int32_t nn[2*nv];
-  uint16_t ind[2*nv];
-
-  cuda::memory::copy(&nn, onGPU.nn, nv*sizeof(int32_t));
-  cuda::memory::copy(&chi2, onGPU.chi2, nv*sizeof(float));
-  for (auto j=0U; j<nv; ++j) if (nn[j]>0) chi2[j]/=float(nn[j]);
-  {
-    auto mx = std::minmax_element(chi2,chi2+nv);
-    std::cout << "after fit min max chi2 " << nv << " " << *mx.first << ' ' <<  *mx.second << std::endl;
-  }
-
-  cuda::launch(fitVertices,
-               { 1,1024-256 },
-               onGPU_d.get(), 50.f
-              );
-  cuda::memory::copy(&nv, onGPU.nvFinal, sizeof(uint32_t));
-  cuda::memory::copy(&nn, onGPU.nn, nv*sizeof(int32_t));
-  cuda::memory::copy(&chi2, onGPU.chi2, nv*sizeof(float));
-  for (auto j=0U; j<nv; ++j) if (nn[j]>0) chi2[j]/=float(nn[j]);
-  {
-    auto mx = std::minmax_element(chi2,chi2+nv);
-    std::cout << "before splitting min max chi2 " << nv << " " << *mx.first << ' ' <<  *mx.second << std::endl;
-  }
-
-  cuda::launch(splitVertices,
-               { 1024, 64 },
-               onGPU_d.get(),
-               9.f
-              );
- cuda::memory::copy(&nv, onGPU.nvIntermediate, sizeof(uint32_t));
- std::cout << "after split " << nv << std::endl; 
-
-  cuda::launch(fitVertices,
-               { 1,1024-256 },
-               onGPU_d.get(),5000.f
-              );
-  cudaCheck(cudaGetLastError());
-
-
-  cuda::launch(sortByPt2,
-               { 1, 256 },
-               onGPU_d.get()
-              );
-
-  cuda::memory::copy(&nv, onGPU.nvFinal, sizeof(uint32_t));
-
-  if (nv==0) {
-    std::cout << "NO VERTICES???" << std::endl;
-    continue;
-  }
-
-
-  cuda::memory::copy(&zv, onGPU.zv, nv*sizeof(float));
-  cuda::memory::copy(&wv, onGPU.wv, nv*sizeof(float));
-  cuda::memory::copy(&chi2, onGPU.chi2, nv*sizeof(float));
-  cuda::memory::copy(&ptv2, onGPU.ptv2, nv*sizeof(float));
-  cuda::memory::copy(&nn, onGPU.nn, nv*sizeof(int32_t));
-  cuda::memory::copy(&ind, onGPU.sortInd, nv*sizeof(uint16_t));
-  for (auto j=0U; j<nv; ++j) if (nn[j]>0) chi2[j]/=float(nn[j]); 
-  {
-    auto mx = std::minmax_element(chi2,chi2+nv);
-    std::cout << "min max chi2 " << nv << " " << *mx.first << ' ' <<  *mx.second << std::endl;
-  }
-
-  {
-    auto mx = std::minmax_element(wv,wv+nv);
-    std::cout << "min max error " << 1./std::sqrt(*mx.first) << ' ' <<  1./std::sqrt(*mx.second) << std::endl;
-  }
-
-  {
-    auto mx = std::minmax_element(ptv2,ptv2+nv);
-    std::cout << "min max ptv2 " << *mx.first << ' ' <<  *mx.second << std::endl;
-    std::cout << "min max ptv2 " << ptv2[ind[0]] << ' ' <<  ptv2[ind[nv-1]] << " at "  << ind[0] << ' ' << ind[nv-1] << std::endl;
-
-  }  
-
-  float dd[nv];
-  for (auto kv=0U; kv<nv; ++kv) {
-   auto zr = zv[kv];
-   auto md=500.0f;
-   for (auto zs : ev.ztrack) { 
-     auto d = std::abs(zr-zs);
-     md = std::min(d,md);
-   }
-   dd[kv] = md;
-  }
-  if (i==6) {
-    for (auto d:dd) std::cout << d << ' ';
-    std::cout << std::endl;
-  }
-  auto mx = std::minmax_element(dd,dd+nv);
-  float rms=0;
-  for (auto d:dd) rms+=d*d; rms = std::sqrt(rms)/(nv-1);
-  std::cout << "min max rms " << *mx.first << ' ' << *mx.second << ' ' << rms << std::endl;
-
-  } // loop on events
-  } // lopp on ave vert
-  
-  return 0;
-}
+#include "VertexFinder_t.h"

From efab2dbbdc1c6d9e004874aa00366a756d26e198 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Tue, 14 May 2019 23:31:50 +0200
Subject: [PATCH 09/50] Clean up by clang-format (cms-patatrack#338)

---
 .../PixelVertexFinding/test/VertexFinder_t.h  | 423 ++++++++----------
 1 file changed, 196 insertions(+), 227 deletions(-)

diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
index 130af51e9b83c..8fe08c4964fde 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
+++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
@@ -17,7 +17,6 @@
 #include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksByDensity.h"
 #define CLUSTERIZE clusterTracksByDensity
 #endif
-
 #include "RecoPixelVertexing/PixelVertexFinding/src/gpuFitVertices.h"
 #include "RecoPixelVertexing/PixelVertexFinding/src/gpuSortByPt2.h"
 #include "RecoPixelVertexing/PixelVertexFinding/src/gpuSplitVertices.h"
@@ -26,7 +25,7 @@ using namespace gpuVertexFinder;
 
 struct Event {
   std::vector<float> zvert;
-  std::vector<uint16_t>  itrack;
+  std::vector<uint16_t> itrack;
   std::vector<float> ztrack;
   std::vector<float> eztrack;
   std::vector<float> pttrack;
@@ -34,46 +33,42 @@ struct Event {
 };
 
 struct ClusterGenerator {
+  explicit ClusterGenerator(float nvert, float ntrack)
+      : rgen(-13., 13), errgen(0.005, 0.025), clusGen(nvert), trackGen(ntrack), gauss(0., 1.), ptGen(1.) {}
 
-  explicit ClusterGenerator(float nvert, float ntrack) :
-    rgen(-13.,13), errgen(0.005,0.025), clusGen(nvert), trackGen(ntrack), gauss(0.,1.), ptGen(1.)
-  {}
-
-  void operator()(Event & ev) {
-
+  void operator()(Event& ev) {
     int nclus = clusGen(reng);
     ev.zvert.resize(nclus);
     ev.itrack.resize(nclus);
-    for (auto & z : ev.zvert) { 
-       z = 3.5f*gauss(reng);
+    for (auto& z : ev.zvert) {
+      z = 3.5f * gauss(reng);
     }
 
-    ev.ztrack.clear(); 
+    ev.ztrack.clear();
     ev.eztrack.clear();
     ev.ivert.clear();
-    for (int iv=0; iv<nclus; ++iv) {
+    for (int iv = 0; iv < nclus; ++iv) {
       auto nt = trackGen(reng);
       ev.itrack[nclus] = nt;
-      for (int it=0; it<nt; ++it) {
-       auto err = errgen(reng); // reality is not flat....
-       ev.ztrack.push_back(ev.zvert[iv]+err*gauss(reng));
-       ev.eztrack.push_back(err*err);
-       ev.ivert.push_back(iv);
-       ev.pttrack.push_back( (iv==5? 1.f:0.5f) + ptGen(reng) );
-       ev.pttrack.back()*=ev.pttrack.back();
+      for (int it = 0; it < nt; ++it) {
+        auto err = errgen(reng);  // reality is not flat....
+        ev.ztrack.push_back(ev.zvert[iv] + err * gauss(reng));
+        ev.eztrack.push_back(err * err);
+        ev.ivert.push_back(iv);
+        ev.pttrack.push_back((iv == 5 ? 1.f : 0.5f) + ptGen(reng));
+        ev.pttrack.back() *= ev.pttrack.back();
       }
     }
     // add noise
-    auto nt = 2*trackGen(reng);
-    for (int it=0; it<nt; ++it) {
+    auto nt = 2 * trackGen(reng);
+    for (int it = 0; it < nt; ++it) {
       auto err = 0.03f;
       ev.ztrack.push_back(rgen(reng));
-      ev.eztrack.push_back(err*err);
+      ev.eztrack.push_back(err * err);
       ev.ivert.push_back(9999);
-      ev.pttrack.push_back( 0.5f + ptGen(reng) );
-      ev.pttrack.back()*=ev.pttrack.back();
+      ev.pttrack.push_back(0.5f + ptGen(reng));
+      ev.pttrack.back() *= ev.pttrack.back();
     }
-
   }
 
   std::mt19937 reng;
@@ -83,23 +78,19 @@ struct ClusterGenerator {
   std::poisson_distribution<int> trackGen;
   std::normal_distribution<float> gauss;
   std::exponential_distribution<float> ptGen;
-
 };
 
 // a macro SORRY
-#define LOC_ONGPU(M) ((char*)(onGPU_d.get()) +offsetof(ZVertices,M))
-#define LOC_WS(M) ((char*)(ws_d.get()) +offsetof(WorkSpace,M))
-
-__global__
-void print(ZVertices const * pdata, WorkSpace const * pws) {
-  auto const & __restrict__ data = *pdata;
-  auto const & __restrict__ ws = *pws;
-  printf("nt,nv %d %d,%d\n",ws.ntrks,data.nvFinal,ws.nvIntermediate); 
+#define LOC_ONGPU(M) ((char*)(onGPU_d.get()) + offsetof(ZVertices, M))
+#define LOC_WS(M) ((char*)(ws_d.get()) + offsetof(WorkSpace, M))
 
+__global__ void print(ZVertices const* pdata, WorkSpace const* pws) {
+  auto const& __restrict__ data = *pdata;
+  auto const& __restrict__ ws = *pws;
+  printf("nt,nv %d %d,%d\n", ws.ntrks, data.nvFinal, ws.nvIntermediate);
 }
 
 int main() {
-
 #ifdef __CUDACC__
   exitSansCUDADevices();
 
@@ -112,249 +103,227 @@ int main() {
   auto ws_d = std::make_unique<WorkSpace>();
 #endif
 
-  Event  ev;
+  Event ev;
 
   float eps = 0.1f;
-  std::array<float,3> par{{eps, 0.01f,9.0f}};
-  for (int nav=30;nav<80;nav+=20){ 
+  std::array<float, 3> par{{eps, 0.01f, 9.0f}};
+  for (int nav = 30; nav < 80; nav += 20) {
+    ClusterGenerator gen(nav, 10);
 
-  ClusterGenerator gen(nav,10);
+    for (int i = 8; i < 20; ++i) {
+      auto kk = i / 4;  // M param
 
-  for (int i=8; i<20; ++i) {
+      gen(ev);
 
-  auto  kk=i/4;  // M param
-
-  gen(ev);
-  
 #ifdef __CUDACC__
-  init<<<1,1,0,0>>>(onGPU_d.get(),ws_d.get());
+      init<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
 #else
-  onGPU_d->init();ws_d->init();
+      onGPU_d->init();
+      ws_d->init();
 #endif
 
-  std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl;
-  auto nt = ev.ztrack.size();
+      std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl;
+      auto nt = ev.ztrack.size();
 #ifdef __CUDACC__
-  cuda::memory::copy(LOC_WS(ntrks),&nt,sizeof(uint32_t));
-  cuda::memory::copy(LOC_WS(zt),ev.ztrack.data(),sizeof(float)*ev.ztrack.size());
-  cuda::memory::copy(LOC_WS(ezt2),ev.eztrack.data(),sizeof(float)*ev.eztrack.size());
-  cuda::memory::copy(LOC_WS(ptt2),ev.pttrack.data(),sizeof(float)*ev.eztrack.size());
+      cuda::memory::copy(LOC_WS(ntrks), &nt, sizeof(uint32_t));
+      cuda::memory::copy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size());
+      cuda::memory::copy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size());
+      cuda::memory::copy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size());
 #else
-  ::memcpy(LOC_WS(ntrks),&nt,sizeof(uint32_t));  
-  ::memcpy(LOC_WS(zt),ev.ztrack.data(),sizeof(float)*ev.ztrack.size());
-  ::memcpy(LOC_WS(ezt2),ev.eztrack.data(),sizeof(float)*ev.eztrack.size());
-  ::memcpy(LOC_WS(ptt2),ev.pttrack.data(),sizeof(float)*ev.eztrack.size());
+      ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t));
+      ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size());
+      ::memcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size());
+      ::memcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size());
 #endif
 
-  std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i%4) << std::endl;
-  
-  if ( (i%4) == 0 ) par = {{eps, 0.02f,12.0f}};
-  if ( (i%4) == 1 ) par = {{eps, 0.02f,9.0f}};
-  if ( (i%4) == 2 ) par = {{eps, 0.01f,9.0f}};
-  if ( (i%4) == 3 ) par = {{0.7f*eps, 0.01f,9.0f}};
+      std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i % 4) << std::endl;
 
-  uint32_t nv=0;
-#ifdef __CUDACC__
-  print<<<1,1,0,0>>>(onGPU_d.get(),ws_d.get());
-  cudaCheck(cudaGetLastError());
-  cudaDeviceSynchronize();
+      if ((i % 4) == 0)
+        par = {{eps, 0.02f, 12.0f}};
+      if ((i % 4) == 1)
+        par = {{eps, 0.02f, 9.0f}};
+      if ((i % 4) == 2)
+        par = {{eps, 0.01f, 9.0f}};
+      if ((i % 4) == 3)
+        par = {{0.7f * eps, 0.01f, 9.0f}};
 
-  cuda::launch(CLUSTERIZE,
-		 { 1, 512+256 },
-		 onGPU_d.get(),ws_d.get(),kk,par[0],
-		 par[1],par[2]
-		 );
-  print<<<1,1,0,0>>>(onGPU_d.get(),ws_d.get());
+      uint32_t nv = 0;
+#ifdef __CUDACC__
+      print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
+      cudaCheck(cudaGetLastError());
+      cudaDeviceSynchronize();
 
-  cudaCheck(cudaGetLastError());
-  cudaDeviceSynchronize();
+      cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
+      print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
 
+      cudaCheck(cudaGetLastError());
+      cudaDeviceSynchronize();
 
-  cuda::launch(fitVertices, 
-               { 1,1024-256 },
-               onGPU_d.get(),ws_d.get(),50.f
-              );
-  cudaCheck(cudaGetLastError());
-  cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t));
+      cuda::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
+      cudaCheck(cudaGetLastError());
+      cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t));
 
 #else
-  print(onGPU_d.get(),ws_d.get());
-  CLUSTERIZE(
-                 onGPU_d.get(),ws_d.get(),kk,par[0],
-                 par[1],par[2]
-                 );
-  print(onGPU_d.get(),ws_d.get());
-  fitVertices(
-               onGPU_d.get(),ws_d.get(),50.f
-              );
-  nv = onGPU_d->nvFinal;
+      print(onGPU_d.get(), ws_d.get());
+      CLUSTERIZE(onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
+      print(onGPU_d.get(), ws_d.get());
+      fitVertices(onGPU_d.get(), ws_d.get(), 50.f);
+      nv = onGPU_d->nvFinal;
 #endif
 
-  if (nv==0) {
-    std::cout << "NO VERTICES???" << std::endl;
-    continue;
-  }
+      if (nv == 0) {
+        std::cout << "NO VERTICES???" << std::endl;
+        continue;
+      }
 
-  float * zv = nullptr;
-  float * wv = nullptr;
-  float * ptv2 = nullptr;
-  int32_t * nn = nullptr;
-  uint16_t * ind = nullptr;
+      float* zv = nullptr;
+      float* wv = nullptr;
+      float* ptv2 = nullptr;
+      int32_t* nn = nullptr;
+      uint16_t* ind = nullptr;
 
-  // keep chi2 separated...
-  float chi2[2*nv];  // make space for splitting...
+      // keep chi2 separated...
+      float chi2[2 * nv];  // make space for splitting...
 
 #ifdef __CUDACC__
-  float hzv[2*nv];
-  float hwv[2*nv];
-  float hptv2[2*nv];
-  int32_t hnn[2*nv];
-  uint16_t hind[2*nv];
-
-  zv = hzv;
-  wv = hwv;
-  ptv2 = hptv2;
-  nn = hnn;
-  ind = hind;
+      float hzv[2 * nv];
+      float hwv[2 * nv];
+      float hptv2[2 * nv];
+      int32_t hnn[2 * nv];
+      uint16_t hind[2 * nv];
+
+      zv = hzv;
+      wv = hwv;
+      ptv2 = hptv2;
+      nn = hnn;
+      ind = hind;
 #else
-  zv = onGPU_d->zv;
-  wv = onGPU_d->wv;
-  ptv2 = onGPU_d->ptv2;
-  nn = onGPU_d->ndof;
-  ind = onGPU_d->sortInd;
+      zv = onGPU_d->zv;
+      wv = onGPU_d->wv;
+      ptv2 = onGPU_d->ptv2;
+      nn = onGPU_d->ndof;
+      ind = onGPU_d->sortInd;
 #endif
 
 #ifdef __CUDACC__
-  cuda::memory::copy(nn, LOC_ONGPU(ndof), nv*sizeof(int32_t));
-  cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv*sizeof(float));
+      cuda::memory::copy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t));
+      cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
 #else
-  memcpy(chi2, LOC_ONGPU(chi2), nv*sizeof(float));
+      memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
 #endif
 
-  for (auto j=0U; j<nv; ++j) if (nn[j]>0) chi2[j]/=float(nn[j]);
-  {
-    auto mx = std::minmax_element(chi2,chi2+nv);
-    std::cout << "after fit nv, min max chi2 " << nv << " " << *mx.first << ' ' <<  *mx.second << std::endl;
-  }
+      for (auto j = 0U; j < nv; ++j)
+        if (nn[j] > 0)
+          chi2[j] /= float(nn[j]);
+      {
+        auto mx = std::minmax_element(chi2, chi2 + nv);
+        std::cout << "after fit nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl;
+      }
 
 #ifdef __CUDACC__
-  cuda::launch(fitVertices,
-               { 1,1024-256 },
-               onGPU_d.get(),ws_d.get(), 50.f
-              );
-  cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t));
-  cuda::memory::copy(nn, LOC_ONGPU(ndof), nv*sizeof(int32_t));
-  cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv*sizeof(float));
+      cuda::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
+      cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t));
+      cuda::memory::copy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t));
+      cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
 #else
-  fitVertices(
-               onGPU_d.get(),ws_d.get(),50.f
-              );
-  nv = onGPU_d->nvFinal;
-  memcpy(chi2, LOC_ONGPU(chi2), nv*sizeof(float));
+      fitVertices(onGPU_d.get(), ws_d.get(), 50.f);
+      nv = onGPU_d->nvFinal;
+      memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
 #endif
 
-  for (auto j=0U; j<nv; ++j) if (nn[j]>0) chi2[j]/=float(nn[j]);
-  {
-    auto mx = std::minmax_element(chi2,chi2+nv);
-    std::cout << "before splitting nv, min max chi2 " << nv << " " << *mx.first << ' ' <<  *mx.second << std::endl;
-  }
+      for (auto j = 0U; j < nv; ++j)
+        if (nn[j] > 0)
+          chi2[j] /= float(nn[j]);
+      {
+        auto mx = std::minmax_element(chi2, chi2 + nv);
+        std::cout << "before splitting nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl;
+      }
 
 #ifdef __CUDACC__
-  // one vertex per block!!!
-  cuda::launch(splitVertices,
-               { 1024, 64 },
-               onGPU_d.get(),ws_d.get(),
-               9.f
-              );
-  cuda::memory::copy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t));
+      // one vertex per block!!!
+      cuda::launch(splitVertices, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
+      cuda::memory::copy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t));
 #else
-  gridDim.x = 1024; // nv ????
-  assert(blockIdx.x==0);
-  for(;blockIdx.x<gridDim.x; ++blockIdx.x)
-    splitVertices(
-               onGPU_d.get(),ws_d.get(),
-               9.f
-              );
-   resetGrid();
-   nv = ws_d->nvIntermediate;
+      gridDim.x = 1024;  // nv ????
+      assert(blockIdx.x == 0);
+      for (; blockIdx.x < gridDim.x; ++blockIdx.x)
+        splitVertices(onGPU_d.get(), ws_d.get(), 9.f);
+      resetGrid();
+      nv = ws_d->nvIntermediate;
 #endif
- std::cout << "after split " << nv << std::endl; 
-
+      std::cout << "after split " << nv << std::endl;
 
 #ifdef __CUDACC__
-  cuda::launch(fitVertices,
-               { 1,1024-256 },
-               onGPU_d.get(),ws_d.get(),5000.f
-              );
-  cudaCheck(cudaGetLastError());
-
-  cuda::launch(sortByPt2,
-               { 1, 256 },
-               onGPU_d.get(),ws_d.get()
-              );
-  cudaCheck(cudaGetLastError());
-  cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t));
+      cuda::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
+      cudaCheck(cudaGetLastError());
+
+      cuda::launch(sortByPt2, {1, 256}, onGPU_d.get(), ws_d.get());
+      cudaCheck(cudaGetLastError());
+      cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t));
 #else
-  fitVertices(onGPU_d.get(),ws_d.get(),5000.f);
-  sortByPt2(onGPU_d.get(),ws_d.get());
-  nv = onGPU_d->nvFinal;
-  memcpy(chi2, LOC_ONGPU(chi2), nv*sizeof(float));
+      fitVertices(onGPU_d.get(), ws_d.get(), 5000.f);
+      sortByPt2(onGPU_d.get(), ws_d.get());
+      nv = onGPU_d->nvFinal;
+      memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
 #endif
 
-
-  if (nv==0) {
-    std::cout << "NO VERTICES???" << std::endl;
-    continue;
-  }
+      if (nv == 0) {
+        std::cout << "NO VERTICES???" << std::endl;
+        continue;
+      }
 
 #ifdef __CUDACC__
-  cuda::memory::copy(zv, LOC_ONGPU(zv), nv*sizeof(float));
-  cuda::memory::copy(wv, LOC_ONGPU(wv), nv*sizeof(float));
-  cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv*sizeof(float));
-  cuda::memory::copy(ptv2, LOC_ONGPU(ptv2), nv*sizeof(float));
-  cuda::memory::copy(nn, LOC_ONGPU(ndof), nv*sizeof(int32_t));
-  cuda::memory::copy(ind, LOC_ONGPU(sortInd), nv*sizeof(uint16_t));
+      cuda::memory::copy(zv, LOC_ONGPU(zv), nv * sizeof(float));
+      cuda::memory::copy(wv, LOC_ONGPU(wv), nv * sizeof(float));
+      cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+      cuda::memory::copy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float));
+      cuda::memory::copy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t));
+      cuda::memory::copy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t));
 #endif
-  for (auto j=0U; j<nv; ++j) if (nn[j]>0) chi2[j]/=float(nn[j]); 
-  {
-    auto mx = std::minmax_element(chi2,chi2+nv);
-    std::cout << "nv, min max chi2 " << nv << " " << *mx.first << ' ' <<  *mx.second << std::endl;
-  }
+      for (auto j = 0U; j < nv; ++j)
+        if (nn[j] > 0)
+          chi2[j] /= float(nn[j]);
+      {
+        auto mx = std::minmax_element(chi2, chi2 + nv);
+        std::cout << "nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl;
+      }
 
-  {
-    auto mx = std::minmax_element(wv,wv+nv);
-    std::cout << "min max error " << 1./std::sqrt(*mx.first) << ' ' <<  1./std::sqrt(*mx.second) << std::endl;
-  }
+      {
+        auto mx = std::minmax_element(wv, wv + nv);
+        std::cout << "min max error " << 1. / std::sqrt(*mx.first) << ' ' << 1. / std::sqrt(*mx.second) << std::endl;
+      }
+
+      {
+        auto mx = std::minmax_element(ptv2, ptv2 + nv);
+        std::cout << "min max ptv2 " << *mx.first << ' ' << *mx.second << std::endl;
+        std::cout << "min max ptv2 " << ptv2[ind[0]] << ' ' << ptv2[ind[nv - 1]] << " at " << ind[0] << ' '
+                  << ind[nv - 1] << std::endl;
+      }
+
+      float dd[nv];
+      for (auto kv = 0U; kv < nv; ++kv) {
+        auto zr = zv[kv];
+        auto md = 500.0f;
+        for (auto zs : ev.ztrack) {
+          auto d = std::abs(zr - zs);
+          md = std::min(d, md);
+        }
+        dd[kv] = md;
+      }
+      if (i == 6) {
+        for (auto d : dd)
+          std::cout << d << ' ';
+        std::cout << std::endl;
+      }
+      auto mx = std::minmax_element(dd, dd + nv);
+      float rms = 0;
+      for (auto d : dd)
+        rms += d * d;
+      rms = std::sqrt(rms) / (nv - 1);
+      std::cout << "min max rms " << *mx.first << ' ' << *mx.second << ' ' << rms << std::endl;
+
+    }  // loop on events
+  }    // lopp on ave vert
 
-  {
-    auto mx = std::minmax_element(ptv2,ptv2+nv);
-    std::cout << "min max ptv2 " << *mx.first << ' ' <<  *mx.second << std::endl;
-    std::cout << "min max ptv2 " << ptv2[ind[0]] << ' ' <<  ptv2[ind[nv-1]] << " at "  << ind[0] << ' ' << ind[nv-1] << std::endl;
-
-  }  
-
-  float dd[nv];
-  for (auto kv=0U; kv<nv; ++kv) {
-   auto zr = zv[kv];
-   auto md=500.0f;
-   for (auto zs : ev.ztrack) { 
-     auto d = std::abs(zr-zs);
-     md = std::min(d,md);
-   }
-   dd[kv] = md;
-  }
-  if (i==6) {
-    for (auto d:dd) std::cout << d << ' ';
-    std::cout << std::endl;
-  }
-  auto mx = std::minmax_element(dd,dd+nv);
-  float rms=0;
-  for (auto d:dd) rms+=d*d; 
-  rms = std::sqrt(rms)/(nv-1);
-  std::cout << "min max rms " << *mx.first << ' ' << *mx.second << ' ' << rms << std::endl;
-
-  } // loop on events
-  } // lopp on ave vert
-  
   return 0;
 }

From 5a55ac5abd030b9496dfcfba76dc9115279d1656 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Wed, 15 May 2019 14:11:42 +0200
Subject: [PATCH 10/50] Synchronise with CMSSW_10_6_0

---
 .../python/DQMOffline_SecondStep_cff.py           | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
index 35c6082146f68..325aa34117745 100644
--- a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
@@ -1,6 +1,5 @@
 import FWCore.ParameterSet.Config as cms
 
-from CondTools.DQM.DQMReferenceHistogramRootFileEventSetupAnalyzer_cfi import *
 from DQMServices.Components.DQMMessageLoggerClient_cff import *
 from DQMServices.Components.DQMDcsInfoClient_cfi import *
 from DQMServices.Components.DQMFastTimerServiceClient_cfi import *
@@ -33,7 +32,7 @@
                                              dqmFEDIntegrityClient *
                                              l1TriggerDqmOfflineClient )
 
-DQMOffline_SecondStepDPG = cms.Sequence( dqmRefHistoRootFileGetter *
+DQMOffline_SecondStepDPG = cms.Sequence(
                                          DQMOffline_SecondStep_PreDPG *
                                          DQMMessageLoggerClientSeq )
 
@@ -60,14 +59,14 @@
                                              runTauEff)
 from Configuration.Eras.Modifier_phase1Pixel_cff import phase1Pixel
 
-DQMOffline_SecondStepPOG = cms.Sequence( dqmRefHistoRootFileGetter *
+DQMOffline_SecondStepPOG = cms.Sequence(
                                          DQMOffline_SecondStep_PrePOG *
                                          DQMMessageLoggerClientSeq )
 
 
 HLTMonitoringClient = cms.Sequence(trackingMonitorClientHLT * trackingForDisplacedJetMonitorClientHLT)
 HLTMonitoringClientPA= cms.Sequence(trackingMonitorClientHLT * PAtrackingMonitorClientHLT)
-DQMOffline_SecondStep = cms.Sequence( dqmRefHistoRootFileGetter *
+DQMOffline_SecondStep = cms.Sequence(
                                       DQMOffline_SecondStep_PreDPG *
                                       DQMOffline_SecondStep_PrePOG *
                                       HLTMonitoringClient *
@@ -83,11 +82,11 @@
 
 DQMOffline_SecondStep_PrePOGMC = cms.Sequence( bTagCollectorSequenceDATA )
 
-DQMOffline_SecondStepPOGMC = cms.Sequence( dqmRefHistoRootFileGetter *
+DQMOffline_SecondStepPOGMC = cms.Sequence(
                                            DQMOffline_SecondStep_PrePOGMC *
                                            DQMMessageLoggerClientSeq )
 
-DQMHarvestCommon = cms.Sequence( dqmRefHistoRootFileGetter *
+DQMHarvestCommon = cms.Sequence(
                                  DQMMessageLoggerClientSeq *
                                  dqmDcsInfoClient *
                                  SiStripOfflineDQMClient *
@@ -100,7 +99,7 @@
                                  runTauEff *
                                  dqmFastTimerServiceClient
                                 )
-DQMHarvestCommonSiStripZeroBias = cms.Sequence(dqmRefHistoRootFileGetter *
+DQMHarvestCommonSiStripZeroBias = cms.Sequence(
                                                DQMMessageLoggerClientSeq *
                                                dqmDcsInfoClient *
                                                SiStripOfflineDQMClient *
@@ -121,7 +120,7 @@
 DQMHarvestPixelTracking = cms.Sequence( pixelTrackingEffFromHitPattern *
                                         pixelVertexResolutionClient )
 
-DQMHarvestOuterTracker = cms.Sequence( dqmRefHistoRootFileGetter *
+DQMHarvestOuterTracker = cms.Sequence(
                                  dqmDcsInfoClient *
                                  OuterTrackerClient *
                                  dqmFEDIntegrityClient *

From a56cd9afc514a5c65c8af29ca678b15d14390e74 Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Fri, 5 Jul 2019 11:59:12 +0200
Subject: [PATCH 11/50] Port the whole pixel workflow to new heterogeneous
 framework (cms-patatrack#384)

  - port the whole pixel workflow to the new heterogeneous framework
  - implement a legacy cluster to SoA converter for the pixel RecHits
  - update the vertex producer to run on CPU as well as GPU
---
 CUDADataFormats/Vertex/BuildFile.xml          | 10 ++
 .../Vertex/interface/ZVertexHeterogeneous.h   | 15 +++
 CUDADataFormats/Vertex/interface/ZVertexSoA.h | 29 ++++++
 CUDADataFormats/Vertex/src/classes.h          |  8 ++
 CUDADataFormats/Vertex/src/classes_def.xml    |  6 ++
 .../python/RecoPixelVertexing_cff.py          | 15 +++
 .../plugins/PixelTrackDumpCUDA.cc             | 94 +++++++++++++++++++
 .../python/PixelVertexes_cfi.py               |  5 -
 8 files changed, 177 insertions(+), 5 deletions(-)
 create mode 100644 CUDADataFormats/Vertex/BuildFile.xml
 create mode 100644 CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
 create mode 100644 CUDADataFormats/Vertex/interface/ZVertexSoA.h
 create mode 100644 CUDADataFormats/Vertex/src/classes.h
 create mode 100644 CUDADataFormats/Vertex/src/classes_def.xml
 create mode 100644 RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc

diff --git a/CUDADataFormats/Vertex/BuildFile.xml b/CUDADataFormats/Vertex/BuildFile.xml
new file mode 100644
index 0000000000000..521ea8fe29753
--- /dev/null
+++ b/CUDADataFormats/Vertex/BuildFile.xml
@@ -0,0 +1,10 @@
+<use name="cuda-api-wrappers"/>
+<use name="rootcore"/>
+<use name="FWCore/ServiceRegistry"/>
+<use name="FWCore/ParameterSetReader"/>
+<use name="HeterogeneousCore/CUDAServices"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+<use name="eigen"/>
+<export>
+    <lib name="1"/>
+</export>
diff --git a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
new file mode 100644
index 0000000000000..611831b53b6af
--- /dev/null
+++ b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
@@ -0,0 +1,15 @@
+#ifndef CUDADataFormatsVertexZVertexHeterogeneous_H
+#define CUDADataFormatsVertexZVertexHeterogeneous_H
+
+#include "CUDADataFormats/Vertex/interface/ZVertexSoA.h"
+#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+
+
+using ZVertexHeterogeneous = HeterogeneousSoA<ZVertexSoA>;
+#ifndef __CUDACC__
+#include "CUDADataFormats/Common/interface/CUDAProduct.h"
+using ZVertexCUDAProduct =  CUDAProduct<ZVertexHeterogeneous>;
+#endif
+
+#endif
diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoA.h b/CUDADataFormats/Vertex/interface/ZVertexSoA.h
new file mode 100644
index 0000000000000..bbb54ec55cae7
--- /dev/null
+++ b/CUDADataFormats/Vertex/interface/ZVertexSoA.h
@@ -0,0 +1,29 @@
+#ifndef CUDADataFormatsVertexZVertexSoA_H
+#define CUDADataFormatsVertexZVertexSoA_H
+
+#include<cstdint>
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
+
+
+// SOA for vertices
+// These vertices are clusterized and fitted only along the beam line (z)
+// to obtain their global coordinate the beam spot position shall be added (eventually correcting for the beam angle as well)
+struct ZVertexSoA {
+    static constexpr uint32_t MAXTRACKS = 32*1024;
+    static constexpr uint32_t MAXVTX = 1024;
+
+    int16_t idv[MAXTRACKS];    // vertex index for each associated (original) track  (-1 == not associate)
+    float zv[MAXVTX];          // output z-posistion of found vertices
+    float wv[MAXVTX];          // output weight (1/error^2) on the above
+    float chi2[MAXVTX];        // vertices chi2
+    float ptv2[MAXVTX];        // vertices pt^2
+    int32_t ndof[MAXVTX];      // vertices number of dof (reused as workspace for the number of nearest neighbours)
+    uint16_t sortInd[MAXVTX];  // sorted index (by pt2)  ascending
+    uint32_t nvFinal;          // the number of vertices
+
+    __host__ __device__ void init() { nvFinal = 0; }
+
+};
+
+#endif // CUDADataFormatsVertexZVertexSoA.H
+
diff --git a/CUDADataFormats/Vertex/src/classes.h b/CUDADataFormats/Vertex/src/classes.h
new file mode 100644
index 0000000000000..12b1828741d08
--- /dev/null
+++ b/CUDADataFormats/Vertex/src/classes.h
@@ -0,0 +1,8 @@
+#ifndef CUDADataFormats__src_classes_h
+#define CUDADataFormats__src_classes_h
+
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "DataFormats/Common/interface/Wrapper.h"
+
+#endif  
diff --git a/CUDADataFormats/Vertex/src/classes_def.xml b/CUDADataFormats/Vertex/src/classes_def.xml
new file mode 100644
index 0000000000000..c43814eb03def
--- /dev/null
+++ b/CUDADataFormats/Vertex/src/classes_def.xml
@@ -0,0 +1,6 @@
+<lcgdict>
+  <class name="CUDAProduct<ZVertexHeterogeneous>" persistent="false"/>
+  <class name="edm::Wrapper<ZVertexCUDAProduct>" persistent="false"/>
+  <class name="ZVertexHeterogeneous" persistent="false"/>
+  <class name="edm::Wrapper<ZVertexHeterogeneous>" persistent="false"/>
+</lcgdict>
diff --git a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
index e784b53b7ce1f..e1cd387360698 100644
--- a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
+++ b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
@@ -8,3 +8,18 @@
 #from RecoVertex.PrimaryVertexProducer.OfflinePixel3DPrimaryVertices_cfi import *
 recopixelvertexingTask = cms.Task(pixelTracksTask,pixelVertices)
 recopixelvertexing = cms.Sequence(recopixelvertexingTask)
+
+from Configuration.ProcessModifiers.gpu_cff import gpu
+
+from RecoPixelVertexing.PixelVertexFinding.pixelVertexCUDA_cfi import pixelVertexCUDA
+from RecoPixelVertexing.PixelVertexFinding.pixelVertexSoA_cfi import pixelVertexSoA
+from RecoPixelVertexing.PixelVertexFinding.pixelVertexFromSoA_cfi import pixelVertexFromSoA as _pixelVertexFromSoA
+
+_pixelVertexingCUDATask = cms.Task(pixelTracksTask,pixelVertexCUDA,pixelVertexSoA,pixelVertices)
+
+# pixelVertexSoAonCPU = pixelVertexCUDA.clone()
+# pixelVertexSoAonCPU.onGPU = False;
+
+gpu.toReplaceWith(pixelVertices,_pixelVertexFromSoA)
+gpu.toReplaceWith(recopixelvertexingTask,_pixelVertexingCUDATask)
+
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
new file mode 100644
index 0000000000000..025e7abd99cf1
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
@@ -0,0 +1,94 @@
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDAnalyzer.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/RunningAverage.h"
+#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
+#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h"
+
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h"
+
+
+class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> {
+public:
+  explicit PixelTrackDumpCUDA(const edm::ParameterSet& iConfig);
+  ~PixelTrackDumpCUDA() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  void analyze(edm::StreamID streamID, edm::Event const & iEvent, const edm::EventSetup& iSetup) const override;
+  const bool m_onGPU;
+  edm::EDGetTokenT<CUDAProduct<PixelTrackHeterogeneous>> tokenGPUTrack_;
+  edm::EDGetTokenT<CUDAProduct<ZVertexHeterogeneous>> tokenGPUVertex_;
+  edm::EDGetTokenT<PixelTrackHeterogeneous> tokenSoATrack_;
+  edm::EDGetTokenT<ZVertexHeterogeneous> tokenSoAVertex_;
+
+
+};
+
+PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig) :
+  m_onGPU(iConfig.getParameter<bool>("onGPU")) {
+  if (m_onGPU) {
+    tokenGPUTrack_ = consumes<CUDAProduct<PixelTrackHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
+    tokenGPUVertex_ = consumes<CUDAProduct<ZVertexHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
+  } else {
+    tokenSoATrack_ = consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
+    tokenSoAVertex_ = consumes<ZVertexHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
+  }
+}
+
+void PixelTrackDumpCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription desc;
+
+   desc.add<bool>("onGPU",true);
+   desc.add<edm::InputTag>("pixelTrackSrc", edm::InputTag("caHitNtupletCUDA"));
+   desc.add<edm::InputTag>("pixelVertexSrc", edm::InputTag("pixelVertexCUDA"));
+   descriptions.add("pixelTrackDumpCUDA", desc);
+}
+
+void PixelTrackDumpCUDA::analyze(edm::StreamID streamID, edm::Event const & iEvent, const edm::EventSetup& iSetup) const {
+  if (m_onGPU) {
+
+    auto const & hTracks = iEvent.get(tokenGPUTrack_);
+    CUDAScopedContextProduce ctx{hTracks};
+
+    auto const& tracks = ctx.get(hTracks);
+    auto const * tsoa = tracks.get();
+    assert(tsoa);
+
+    auto const& vertices = ctx.get(iEvent.get(tokenGPUVertex_));
+    auto const * vsoa = vertices.get();
+    assert(vsoa);
+
+  } else {
+    auto const * tsoa = iEvent.get(tokenSoATrack_).get();
+    assert(tsoa);
+
+    auto const * vsoa = iEvent.get(tokenSoAVertex_).get();
+    assert(vsoa);
+
+  }
+
+}
+
+
+DEFINE_FWK_MODULE(PixelTrackDumpCUDA);
+
diff --git a/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py b/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py
index ea9e4b1e4e037..903c2a894ff86 100644
--- a/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py
+++ b/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py
@@ -18,8 +18,3 @@
        refToPSet_ = cms.string('pvClusterComparer')
     )
 )
-
-
-from Configuration.ProcessModifiers.gpu_cff import gpu
-from RecoPixelVertexing.PixelVertexFinding.pixelVertexHeterogeneousProducer_cfi import pixelVertexHeterogeneousProducer as _pixelVertexHeterogeneousProducer
-gpu.toReplaceWith(pixelVertices, _pixelVertexHeterogeneousProducer)

From 9dc2a212efc16c7ce12b6589ceb47b415281c234 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Tue, 10 Sep 2019 16:03:58 -0500
Subject: [PATCH 12/50] Move event and stream caches, and caching allocators
 out from CUDAService (cms-patatrack#364)

To reduce dependencies on edm::Service, and to make CUDAService less
of a collection of everything, split off from it:
  - the CUDAEventCache
  - the CUDAStreamCache
  - the caching allocators

Other changes:
  - clean up unnecessary use of CUDAService
  - fix maxCachedFraction, add debug printouts
  - add make_*_unique_uninitialized that avoid the static_assert
---
 CUDADataFormats/Vertex/BuildFile.xml                          | 4 +---
 .../PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc           | 2 --
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/CUDADataFormats/Vertex/BuildFile.xml b/CUDADataFormats/Vertex/BuildFile.xml
index 521ea8fe29753..2aa4baedb0bb2 100644
--- a/CUDADataFormats/Vertex/BuildFile.xml
+++ b/CUDADataFormats/Vertex/BuildFile.xml
@@ -1,8 +1,6 @@
 <use name="cuda-api-wrappers"/>
 <use name="rootcore"/>
-<use name="FWCore/ServiceRegistry"/>
-<use name="FWCore/ParameterSetReader"/>
-<use name="HeterogeneousCore/CUDAServices"/>
+<use name="DataFormats/Common"/>
 <use name="HeterogeneousCore/CUDAUtilities"/>
 <use name="eigen"/>
 <export>
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
index 025e7abd99cf1..bebadb2f13094 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
@@ -13,12 +13,10 @@
 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
 #include "FWCore/Utilities/interface/InputTag.h"
 #include "FWCore/PluginManager/interface/ModuleDef.h"
-#include "FWCore/ServiceRegistry/interface/Service.h"
 #include "FWCore/Utilities/interface/EDGetToken.h"
 #include "FWCore/Utilities/interface/RunningAverage.h"
 #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
 #include "HeterogeneousCore/CUDACore/interface/GPUCuda.h"
-#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
 #include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h"
 
 #include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"

From 9c4e8fb6058eaec43491346faa33a03882d01c53 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Thu, 12 Sep 2019 00:22:05 +0200
Subject: [PATCH 13/50] Apply clang-format style formatting

---
 .../Vertex/interface/ZVertexHeterogeneous.h   |  3 +-
 CUDADataFormats/Vertex/interface/ZVertexSoA.h | 33 +++++++--------
 CUDADataFormats/Vertex/src/classes.h          |  2 +-
 .../plugins/PixelTrackDumpCUDA.cc             | 42 +++++++++----------
 4 files changed, 36 insertions(+), 44 deletions(-)

diff --git a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
index 611831b53b6af..d12ed5f3d98de 100644
--- a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
+++ b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
@@ -5,11 +5,10 @@
 #include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
 #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
 
-
 using ZVertexHeterogeneous = HeterogeneousSoA<ZVertexSoA>;
 #ifndef __CUDACC__
 #include "CUDADataFormats/Common/interface/CUDAProduct.h"
-using ZVertexCUDAProduct =  CUDAProduct<ZVertexHeterogeneous>;
+using ZVertexCUDAProduct = CUDAProduct<ZVertexHeterogeneous>;
 #endif
 
 #endif
diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoA.h b/CUDADataFormats/Vertex/interface/ZVertexSoA.h
index bbb54ec55cae7..cd1f8aea4e340 100644
--- a/CUDADataFormats/Vertex/interface/ZVertexSoA.h
+++ b/CUDADataFormats/Vertex/interface/ZVertexSoA.h
@@ -1,29 +1,26 @@
 #ifndef CUDADataFormatsVertexZVertexSoA_H
 #define CUDADataFormatsVertexZVertexSoA_H
 
-#include<cstdint>
+#include <cstdint>
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
 
-
 // SOA for vertices
 // These vertices are clusterized and fitted only along the beam line (z)
 // to obtain their global coordinate the beam spot position shall be added (eventually correcting for the beam angle as well)
 struct ZVertexSoA {
-    static constexpr uint32_t MAXTRACKS = 32*1024;
-    static constexpr uint32_t MAXVTX = 1024;
-
-    int16_t idv[MAXTRACKS];    // vertex index for each associated (original) track  (-1 == not associate)
-    float zv[MAXVTX];          // output z-posistion of found vertices
-    float wv[MAXVTX];          // output weight (1/error^2) on the above
-    float chi2[MAXVTX];        // vertices chi2
-    float ptv2[MAXVTX];        // vertices pt^2
-    int32_t ndof[MAXVTX];      // vertices number of dof (reused as workspace for the number of nearest neighbours)
-    uint16_t sortInd[MAXVTX];  // sorted index (by pt2)  ascending
-    uint32_t nvFinal;          // the number of vertices
-
-    __host__ __device__ void init() { nvFinal = 0; }
-
+  static constexpr uint32_t MAXTRACKS = 32 * 1024;
+  static constexpr uint32_t MAXVTX = 1024;
+
+  int16_t idv[MAXTRACKS];    // vertex index for each associated (original) track  (-1 == not associate)
+  float zv[MAXVTX];          // output z-posistion of found vertices
+  float wv[MAXVTX];          // output weight (1/error^2) on the above
+  float chi2[MAXVTX];        // vertices chi2
+  float ptv2[MAXVTX];        // vertices pt^2
+  int32_t ndof[MAXVTX];      // vertices number of dof (reused as workspace for the number of nearest neighbours)
+  uint16_t sortInd[MAXVTX];  // sorted index (by pt2)  ascending
+  uint32_t nvFinal;          // the number of vertices
+
+  __host__ __device__ void init() { nvFinal = 0; }
 };
 
-#endif // CUDADataFormatsVertexZVertexSoA.H
-
+#endif  // CUDADataFormatsVertexZVertexSoA.H
diff --git a/CUDADataFormats/Vertex/src/classes.h b/CUDADataFormats/Vertex/src/classes.h
index 12b1828741d08..f1144d1e3014e 100644
--- a/CUDADataFormats/Vertex/src/classes.h
+++ b/CUDADataFormats/Vertex/src/classes.h
@@ -5,4 +5,4 @@
 #include "CUDADataFormats/Common/interface/CUDAProduct.h"
 #include "DataFormats/Common/interface/Wrapper.h"
 
-#endif  
+#endif
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
index bebadb2f13094..3f4013c196f07 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
@@ -23,7 +23,6 @@
 #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h"
 
-
 class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> {
 public:
   explicit PixelTrackDumpCUDA(const edm::ParameterSet& iConfig);
@@ -32,21 +31,21 @@ class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> {
   static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
 
 private:
-  void analyze(edm::StreamID streamID, edm::Event const & iEvent, const edm::EventSetup& iSetup) const override;
+  void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override;
   const bool m_onGPU;
   edm::EDGetTokenT<CUDAProduct<PixelTrackHeterogeneous>> tokenGPUTrack_;
   edm::EDGetTokenT<CUDAProduct<ZVertexHeterogeneous>> tokenGPUVertex_;
   edm::EDGetTokenT<PixelTrackHeterogeneous> tokenSoATrack_;
   edm::EDGetTokenT<ZVertexHeterogeneous> tokenSoAVertex_;
-
-
 };
 
-PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig) :
-  m_onGPU(iConfig.getParameter<bool>("onGPU")) {
+PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig)
+    : m_onGPU(iConfig.getParameter<bool>("onGPU")) {
   if (m_onGPU) {
-    tokenGPUTrack_ = consumes<CUDAProduct<PixelTrackHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
-    tokenGPUVertex_ = consumes<CUDAProduct<ZVertexHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
+    tokenGPUTrack_ =
+        consumes<CUDAProduct<PixelTrackHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
+    tokenGPUVertex_ =
+        consumes<CUDAProduct<ZVertexHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
   } else {
     tokenSoATrack_ = consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
     tokenSoAVertex_ = consumes<ZVertexHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
@@ -56,37 +55,34 @@ PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig) :
 void PixelTrackDumpCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
   edm::ParameterSetDescription desc;
 
-   desc.add<bool>("onGPU",true);
-   desc.add<edm::InputTag>("pixelTrackSrc", edm::InputTag("caHitNtupletCUDA"));
-   desc.add<edm::InputTag>("pixelVertexSrc", edm::InputTag("pixelVertexCUDA"));
-   descriptions.add("pixelTrackDumpCUDA", desc);
+  desc.add<bool>("onGPU", true);
+  desc.add<edm::InputTag>("pixelTrackSrc", edm::InputTag("caHitNtupletCUDA"));
+  desc.add<edm::InputTag>("pixelVertexSrc", edm::InputTag("pixelVertexCUDA"));
+  descriptions.add("pixelTrackDumpCUDA", desc);
 }
 
-void PixelTrackDumpCUDA::analyze(edm::StreamID streamID, edm::Event const & iEvent, const edm::EventSetup& iSetup) const {
+void PixelTrackDumpCUDA::analyze(edm::StreamID streamID,
+                                 edm::Event const& iEvent,
+                                 const edm::EventSetup& iSetup) const {
   if (m_onGPU) {
-
-    auto const & hTracks = iEvent.get(tokenGPUTrack_);
+    auto const& hTracks = iEvent.get(tokenGPUTrack_);
     CUDAScopedContextProduce ctx{hTracks};
 
     auto const& tracks = ctx.get(hTracks);
-    auto const * tsoa = tracks.get();
+    auto const* tsoa = tracks.get();
     assert(tsoa);
 
     auto const& vertices = ctx.get(iEvent.get(tokenGPUVertex_));
-    auto const * vsoa = vertices.get();
+    auto const* vsoa = vertices.get();
     assert(vsoa);
 
   } else {
-    auto const * tsoa = iEvent.get(tokenSoATrack_).get();
+    auto const* tsoa = iEvent.get(tokenSoATrack_).get();
     assert(tsoa);
 
-    auto const * vsoa = iEvent.get(tokenSoAVertex_).get();
+    auto const* vsoa = iEvent.get(tokenSoAVertex_).get();
     assert(vsoa);
-
   }
-
 }
 
-
 DEFINE_FWK_MODULE(PixelTrackDumpCUDA);
-

From a6a7eaa7ad80c816a0d79ff3105424228cc50fa2 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Thu, 12 Sep 2019 05:45:55 +0200
Subject: [PATCH 14/50] Synchronise with CMSSW_11_0_0_pre7

---
 .../python/DQMOffline_SecondStep_cff.py       | 13 +++++++-
 .../Configuration/python/DQMOffline_cff.py    | 33 +++++++++++--------
 2 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
index 325aa34117745..7150ef165106f 100644
--- a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
@@ -18,6 +18,7 @@
 from DQMOffline.L1Trigger.L1TriggerDqmOffline_cff import *
 from DQM.HcalTasks.OfflineHarvestingSequence_pp import *
 from PhysicsTools.NanoAOD.nanoDQM_cff import *
+from Validation.RecoParticleFlow.DQMForPF_MiniAOD_cff import *
 
 DQMOffline_SecondStep_PreDPG = cms.Sequence( dqmDcsInfoClient *
                                              ecal_dqm_client_offline *
@@ -99,6 +100,11 @@
                                  runTauEff *
                                  dqmFastTimerServiceClient
                                 )
+
+DQMHarvestCommonFakeHLT = cms.Sequence( DQMHarvestCommon )
+DQMHarvestCommonFakeHLT.remove( triggerOfflineDQMClient )
+DQMHarvestCommonFakeHLT.remove( hltOfflineDQMClient )
+
 DQMHarvestCommonSiStripZeroBias = cms.Sequence(
                                                DQMMessageLoggerClientSeq *
                                                dqmDcsInfoClient *
@@ -113,6 +119,9 @@
                                                runTauEff  *
                                                dqmFastTimerServiceClient
                                                )
+DQMHarvestCommonSiStripZeroBiasFakeHLT = cms.Sequence( DQMHarvestCommonSiStripZeroBias )
+DQMHarvestCommonSiStripZeroBiasFakeHLT.remove( triggerOfflineDQMClient )
+DQMHarvestCommonSiStripZeroBiasFakeHLT.remove( hltOfflineDQMClient )
 
 DQMHarvestTracking = cms.Sequence( TrackingOfflineDQMClient *
                                    dqmFastTimerServiceClient )
@@ -130,6 +139,8 @@
 
 DQMHarvestLumi = cms.Sequence()
 
+DQMHarvestCTPPS = cms.Sequence()
+
 DQMHarvestMuon = cms.Sequence( dtClients *
                                rpcTier0Client *
                                cscOfflineCollisionsClients *
@@ -148,7 +159,7 @@
 
 DQMHarvestBTag = cms.Sequence( bTagCollectorSequenceDATA )
 
-DQMHarvestMiniAOD = cms.Sequence( dataCertificationJetMETSequence * muonQualityTests_miniAOD)
+DQMHarvestMiniAOD = cms.Sequence( dataCertificationJetMETSequence * muonQualityTests_miniAOD * DQMHarvestPF )
 DQMHarvestNanoAOD = cms.Sequence( nanoHarvest )
 
 # L1 trigger sequences
diff --git a/DQMOffline/Configuration/python/DQMOffline_cff.py b/DQMOffline/Configuration/python/DQMOffline_cff.py
index 5cb9af6d3a960..80ba153d9f835 100644
--- a/DQMOffline/Configuration/python/DQMOffline_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_cff.py
@@ -32,6 +32,7 @@
                                  cscSources *
                                  es_dqm_source_offline *
                                  castorSources *
+                                 ctppsDQM *
                                  HcalDQMOfflineSequence )
 
 DQMOfflineDPG = cms.Sequence( DQMOfflinePreDPG *
@@ -52,7 +53,8 @@
 from DQMOffline.RecoB.PixelVertexMonitor_cff import *
 from DQM.SiOuterTracker.OuterTrackerSourceConfig_cff import *
 # miniAOD DQM validation
-from Validation.RecoParticleFlow.miniAODDQM_cff import *
+from Validation.RecoParticleFlow.miniAODDQM_cff import * # On MiniAOD vs RECO
+from Validation.RecoParticleFlow.DQMForPF_MiniAOD_cff import * # MiniAOD PF variables
 from DQM.TrackingMonitor.tracksDQMMiniAOD_cff import * 
 from DQM.TrackingMonitor.trackingRecoMaterialAnalyzer_cfi import materialDumperAnalyzer
 materialDumperAnalyzer.usePV = True
@@ -67,7 +69,7 @@
                                  bTagPlotsDATA *
                                  alcaBeamMonitor *
                                  dqmPhysics *
-                                 produceDenoms *
+                                 produceDenomsData *
                                  pfTauRunDQMValidation)
 from Configuration.Eras.Modifier_phase1Pixel_cff import phase1Pixel
 
@@ -76,20 +78,14 @@
 
 HLTMonitoring = cms.Sequence( OfflineHLTMonitoring )
 HLTMonitoringPA = cms.Sequence( OfflineHLTMonitoringPA )
+
 DQMOffline = cms.Sequence( DQMOfflinePreDPG *
                            DQMOfflinePrePOG *
                            HLTMonitoring *
                            # dqmFastTimerServiceLuminosity *
                            DQMMessageLogger )
 
-_ctpps_2016_DQMOffline = DQMOffline.copy()
-_ctpps_2016_DQMOffline *= ctppsDQM
-from Configuration.Eras.Modifier_ctpps_2016_cff import ctpps_2016
-ctpps_2016.toReplaceWith(DQMOffline, _ctpps_2016_DQMOffline)
-
-_ctpps_2016_DQMOffline = DQMOffline.copy()
-#_ctpps_2016_DQMOffline *= ctppsDQM
-ctpps_2016.toReplaceWith(DQMOffline, _ctpps_2016_DQMOffline)
+DQMOfflineCTPPS = cms.Sequence( ctppsDQM ) 
 
 DQMOfflineExtraHLT = cms.Sequence(
     offlineValidationHLTSource
@@ -98,6 +94,7 @@
 
 DQMOfflineFakeHLT = cms.Sequence( DQMOffline )
 DQMOfflineFakeHLT.remove( HLTMonitoring )
+DQMOfflineFakeHLT.remove( triggerOfflineDQMSource )
 
 DQMOfflinePrePOGMC = cms.Sequence( pvMonitor *
                                    bTagPlotsDATA *
@@ -123,7 +120,7 @@
                                 DQMMessageLogger *
                                 dqmPhysics *
                                 pvMonitor *
-                                produceDenoms
+                                produceDenomsData
                                 )
 
 DQMOfflineCommon = cms.Sequence( dqmDcsInfo *
@@ -135,9 +132,13 @@
                                  alcaBeamMonitor *
                                  castorSources *
                                  dqmPhysics *
-                                 produceDenoms *
+                                 produceDenomsData *
                                  pfTauRunDQMValidation
                                 )
+
+DQMOfflineCommonFakeHLT = cms.Sequence( DQMOfflineCommon )
+DQMOfflineCommonFakeHLT.remove( triggerOfflineDQMSource )
+
 DQMOfflineCommonSiStripZeroBias = cms.Sequence( dqmDcsInfo *
                                  DQMMessageLogger *
                                  SiStripDQMTier0MinBias *
@@ -150,9 +151,13 @@
                                  dqmPhysics *
                                  pvMonitor *
                                  materialDumperAnalyzer *
-                                 produceDenoms *
+                                 produceDenomsData *
                                  pfTauRunDQMValidation
                                  )
+
+DQMOfflineCommonSiStripZeroBiasFakeHLT = cms.Sequence( DQMOfflineCommonSiStripZeroBias )
+DQMOfflineCommonSiStripZeroBiasFakeHLT.remove( triggerOfflineDQMSource )
+
 DQMOfflineLumi = cms.Sequence ( zcounting )
 
 muonRecoAnalyzer.doMVA =         cms.bool( True )
@@ -177,7 +182,7 @@
 
 from DQMOffline.Muon.miniAOD_cff import *
 
-DQMOfflineMiniAOD = cms.Sequence(jetMETDQMOfflineRedoProductsMiniAOD*muonMonitors_miniAOD*MuonMiniAOD)
+DQMOfflineMiniAOD = cms.Sequence(jetMETDQMOfflineRedoProductsMiniAOD*muonMonitors_miniAOD*MuonMiniAOD*DQMOfflinePF)
 
 #Post sequences are automatically placed in the EndPath by ConfigBuilder if PAT is run.
 #miniAOD DQM sequences need to access the filter results.

From 87bf94d3296a82f008c53a77b8f858cd977cbb52 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Thu, 24 Oct 2019 16:41:47 +0200
Subject: [PATCH 15/50] Implement library-only wrappers for launching CUDA
 kernels (cms-patatrack#390)

Implement a wrapper to launch a CUDA kernel without using the
non-standard CUDA <<<...>>> syntax, based on the cudaLaunchKernel
library function.

Implement a similar wrapper for cudaLaunchCooperativeKernel.

Migrate code base from cuda::launch to cudautils::launch.
---
 .../PixelVertexFinding/test/VertexFinder_t.h        | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
index 8fe08c4964fde..0df7af362ac0d 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
+++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
@@ -7,6 +7,7 @@
 #include <cuda/api_wrappers.h>
 
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 #ifdef USE_DBSCAN
 #include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksDBSCAN.h"
 #define CLUSTERIZE clusterTracksDBSCAN
@@ -153,13 +154,13 @@ int main() {
       cudaCheck(cudaGetLastError());
       cudaDeviceSynchronize();
 
-      cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
+      cudautils::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
       print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
 
       cudaCheck(cudaGetLastError());
       cudaDeviceSynchronize();
 
-      cuda::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
+      cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
       cudaCheck(cudaGetLastError());
       cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t));
 
@@ -221,7 +222,7 @@ int main() {
       }
 
 #ifdef __CUDACC__
-      cuda::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
+      cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
       cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t));
       cuda::memory::copy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t));
       cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
@@ -241,7 +242,7 @@ int main() {
 
 #ifdef __CUDACC__
       // one vertex per block!!!
-      cuda::launch(splitVertices, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
+      cudautils::launch(splitVertices, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
       cuda::memory::copy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t));
 #else
       gridDim.x = 1024;  // nv ????
@@ -254,10 +255,10 @@ int main() {
       std::cout << "after split " << nv << std::endl;
 
 #ifdef __CUDACC__
-      cuda::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
+      cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
       cudaCheck(cudaGetLastError());
 
-      cuda::launch(sortByPt2, {1, 256}, onGPU_d.get(), ws_d.get());
+      cudautils::launch(sortByPt2, {1, 256}, onGPU_d.get(), ws_d.get());
       cudaCheck(cudaGetLastError());
       cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t));
 #else

From 06c607a21aabb763ea8217c23e7abaabdac91317 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Sat, 26 Oct 2019 13:57:43 -0500
Subject: [PATCH 16/50] Replace use of API wrapper stream and event with plain
 CUDA, part 1 (cms-patatrack#389)

Replace cuda::stream_t<> with cudaStream_t in client code
Replace cuda::event_t with cudaEvent_t in the client code
Clean up BuildFiles
---
 CUDADataFormats/Vertex/BuildFile.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CUDADataFormats/Vertex/BuildFile.xml b/CUDADataFormats/Vertex/BuildFile.xml
index 2aa4baedb0bb2..bf606ba2330e1 100644
--- a/CUDADataFormats/Vertex/BuildFile.xml
+++ b/CUDADataFormats/Vertex/BuildFile.xml
@@ -1,4 +1,4 @@
-<use name="cuda-api-wrappers"/>
+<use name="cuda"/>
 <use name="rootcore"/>
 <use name="DataFormats/Common"/>
 <use name="HeterogeneousCore/CUDAUtilities"/>

From 7ef38e5c4cf7434823ce6690e7e4449e5beed63a Mon Sep 17 00:00:00 2001
From: waredjeb <39335169+waredjeb@users.noreply.github.com>
Date: Tue, 29 Oct 2019 07:09:04 +0100
Subject: [PATCH 17/50] Replace CUDA API wrapper memory operations with native
 CUDA calls (cms-patatrack#395)

---
 .../PixelVertexFinding/test/VertexFinder_t.h  | 37 ++++++++++---------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
index 0df7af362ac0d..14263ed7b3d18 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
+++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
@@ -6,6 +6,7 @@
 
 #include <cuda/api_wrappers.h>
 
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 #ifdef USE_DBSCAN
@@ -126,10 +127,10 @@ int main() {
       std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl;
       auto nt = ev.ztrack.size();
 #ifdef __CUDACC__
-      cuda::memory::copy(LOC_WS(ntrks), &nt, sizeof(uint32_t));
-      cuda::memory::copy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size());
-      cuda::memory::copy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size());
-      cuda::memory::copy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size());
+      cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice));
+      cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice));
+      cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
+      cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
 #else
       ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t));
       ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size());
@@ -162,7 +163,7 @@ int main() {
 
       cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
       cudaCheck(cudaGetLastError());
-      cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t));
+      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 
 #else
       print(onGPU_d.get(), ws_d.get());
@@ -207,8 +208,8 @@ int main() {
 #endif
 
 #ifdef __CUDACC__
-      cuda::memory::copy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t));
-      cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+      cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
 #else
       memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
 #endif
@@ -223,9 +224,9 @@ int main() {
 
 #ifdef __CUDACC__
       cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
-      cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t));
-      cuda::memory::copy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t));
-      cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
 #else
       fitVertices(onGPU_d.get(), ws_d.get(), 50.f);
       nv = onGPU_d->nvFinal;
@@ -243,7 +244,7 @@ int main() {
 #ifdef __CUDACC__
       // one vertex per block!!!
       cudautils::launch(splitVertices, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
-      cuda::memory::copy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t));
+      cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
       gridDim.x = 1024;  // nv ????
       assert(blockIdx.x == 0);
@@ -260,7 +261,7 @@ int main() {
 
       cudautils::launch(sortByPt2, {1, 256}, onGPU_d.get(), ws_d.get());
       cudaCheck(cudaGetLastError());
-      cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t));
+      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
       fitVertices(onGPU_d.get(), ws_d.get(), 5000.f);
       sortByPt2(onGPU_d.get(), ws_d.get());
@@ -274,12 +275,12 @@ int main() {
       }
 
 #ifdef __CUDACC__
-      cuda::memory::copy(zv, LOC_ONGPU(zv), nv * sizeof(float));
-      cuda::memory::copy(wv, LOC_ONGPU(wv), nv * sizeof(float));
-      cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
-      cuda::memory::copy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float));
-      cuda::memory::copy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t));
-      cuda::memory::copy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t));
+      cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost));
 #endif
       for (auto j = 0U; j < nv; ++j)
         if (nn[j] > 0)

From 45504107643f7c397fd9e84af783c985294c026d Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Tue, 29 Oct 2019 05:10:07 -0500
Subject: [PATCH 18/50] Synchronize event in the CUDAProductBase destructor
 (cms-patatrack#391)

Otherwise there are possibilities for weird races, e.g. combination of
non-ExternalWork producers, consumed-but-not-read CUDAProducts, CUDA
streams executing work later than expected (= on the next event).
---
 CUDADataFormats/Vertex/BuildFile.xml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CUDADataFormats/Vertex/BuildFile.xml b/CUDADataFormats/Vertex/BuildFile.xml
index bf606ba2330e1..e3f9a0910bbd8 100644
--- a/CUDADataFormats/Vertex/BuildFile.xml
+++ b/CUDADataFormats/Vertex/BuildFile.xml
@@ -1,5 +1,6 @@
 <use name="cuda"/>
 <use name="rootcore"/>
+<use name="CUDADataFormats/Common"/>
 <use name="DataFormats/Common"/>
 <use name="HeterogeneousCore/CUDAUtilities"/>
 <use name="eigen"/>

From f51ed637ca14fc3f053ba4b6a7c90d360cc78dc7 Mon Sep 17 00:00:00 2001
From: waredjeb <39335169+waredjeb@users.noreply.github.com>
Date: Thu, 31 Oct 2019 11:54:07 +0100
Subject: [PATCH 19/50] Replace use of CUDA API wrapper unique_ptrs with
 CUDAUtilities unique_ptrs (cms-patatrack#396)

Replace cuda::memory::device::make_unique() calls with cudautils::make_device_unique()
Replace cuda::memory::host::make_unique() with cudautils::make_host_unique()
---
 RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
index 14263ed7b3d18..2f545d121a177 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
+++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
@@ -98,8 +98,8 @@ int main() {
 
   auto current_device = cuda::device::current::get();
 
-  auto onGPU_d = cuda::memory::device::make_unique<ZVertices[]>(current_device, 1);
-  auto ws_d = cuda::memory::device::make_unique<WorkSpace[]>(current_device, 1);
+  auto onGPU_d = cudautils::make_device_unique<ZVertices[]>(1, nullptr);
+  auto ws_d = cudautils::make_device_unique<WorkSpace[]>(1, nullptr);
 #else
   auto onGPU_d = std::make_unique<ZVertices>();
   auto ws_d = std::make_unique<WorkSpace>();

From f47b6891bc9cb0cf166a2f21f1096f879e405079 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 4 Nov 2019 11:48:34 +0100
Subject: [PATCH 20/50] Synchronise with CMSSW_11_0_0_pre11

---
 DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
index 7150ef165106f..4e386c0e14e7e 100644
--- a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
@@ -19,6 +19,7 @@
 from DQM.HcalTasks.OfflineHarvestingSequence_pp import *
 from PhysicsTools.NanoAOD.nanoDQM_cff import *
 from Validation.RecoParticleFlow.DQMForPF_MiniAOD_cff import *
+from DQM.CTPPS.ctppsDQM_cff import *
 
 DQMOffline_SecondStep_PreDPG = cms.Sequence( dqmDcsInfoClient *
                                              ecal_dqm_client_offline *
@@ -139,7 +140,7 @@
 
 DQMHarvestLumi = cms.Sequence()
 
-DQMHarvestCTPPS = cms.Sequence()
+DQMHarvestCTPPS = cms.Sequence(ctppsDQMHarvest)
 
 DQMHarvestMuon = cms.Sequence( dtClients *
                                rpcTier0Client *

From 1ab5beb34014dbccaed0313ec6be63500a176d05 Mon Sep 17 00:00:00 2001
From: waredjeb <39335169+waredjeb@users.noreply.github.com>
Date: Tue, 26 Nov 2019 18:41:27 +0100
Subject: [PATCH 21/50] Replace cuda::device operations with native CUDA calls
 (cms-patatrack#408)

Replaces the usage of cuda::device::count(), cuda::device::get(), cuda::device::set() and cuda::device::current::get() with native CUDA calls.
---
 RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml    | 3 ---
 RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h | 4 ----
 2 files changed, 7 deletions(-)

diff --git a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
index dbb25529e884e..119bd5f04b4a9 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
+++ b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
@@ -21,7 +21,6 @@
 
 <bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderByDensity_t">
   <use name="cuda"/>
-  <use name="cuda-api-wrappers"/>
   <flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
   <flags CXXFLAGS="-g"/>
 </bin>
@@ -32,14 +31,12 @@
 
 <bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderDBSCAN_t">
   <use name="cuda"/>
-  <use name="cuda-api-wrappers"/>
   <flags CUDA_FLAGS="-g -DGPU_DEBUG -DUSE_DBSCAN"/>
   <flags CXXFLAGS="-g"/>
 </bin>
 
 <bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderIterative_t">
   <use name="cuda"/>
-  <use name="cuda-api-wrappers"/>
   <flags CUDA_FLAGS="-g -DGPU_DEBUG -DUSE_ITERATIVE"/>
   <flags CXXFLAGS="-g"/>
 </bin>
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
index 2f545d121a177..977858d0bf08d 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
+++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
@@ -4,8 +4,6 @@
 #include <random>
 #include <vector>
 
-#include <cuda/api_wrappers.h>
-
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
@@ -96,8 +94,6 @@ int main() {
 #ifdef __CUDACC__
   exitSansCUDADevices();
 
-  auto current_device = cuda::device::current::get();
-
   auto onGPU_d = cudautils::make_device_unique<ZVertices[]>(1, nullptr);
   auto ws_d = cudautils::make_device_unique<WorkSpace[]>(1, nullptr);
 #else

From 4c02d33fe7d87dd6ab1e087954977ce32700b708 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Wed, 27 Nov 2019 15:17:05 +0100
Subject: [PATCH 22/50] Drop obsolete heterogeneous framework
 (cms-patatrack#416)

---
 .../PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc  | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
index 3f4013c196f07..cd143fb3aab2c 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
@@ -1,28 +1,26 @@
 #include <cuda_runtime.h>
 
 #include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
 #include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
 #include "FWCore/Framework/interface/ESHandle.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
 #include "FWCore/Framework/interface/MakerMacros.h"
 #include "FWCore/Framework/interface/global/EDAnalyzer.h"
-#include "FWCore/Framework/interface/ConsumesCollector.h"
 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
-#include "FWCore/Utilities/interface/InputTag.h"
 #include "FWCore/PluginManager/interface/ModuleDef.h"
 #include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
 #include "FWCore/Utilities/interface/RunningAverage.h"
 #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
-#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h"
 #include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h"
 
-#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
-#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
-#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h"
-
 class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> {
 public:
   explicit PixelTrackDumpCUDA(const edm::ParameterSet& iConfig);

From 62b58ad9dc8ffa104d4cf751bdb00f9c5cf8f9e4 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Thu, 28 Nov 2019 14:15:13 +0100
Subject: [PATCH 23/50] Synchronise with CMSSW_11_0_0_pre12

---
 .../python/DQMOffline_SecondStep_cff.py       | 191 ++++++++-----
 .../Configuration/python/DQMOffline_cff.py    | 255 ++++++++++--------
 2 files changed, 259 insertions(+), 187 deletions(-)

diff --git a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
index 4e386c0e14e7e..5b2f0a250c5c3 100644
--- a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
@@ -5,61 +5,84 @@
 from DQMServices.Components.DQMFastTimerServiceClient_cfi import *
 
 from DQMOffline.Ecal.ecal_dqm_client_offline_cff import *
+from DQM.EcalPreshowerMonitorClient.es_dqm_client_offline_cff import *
 from DQM.SiStripMonitorClient.SiStripClientConfig_Tier0_cff import *
 from DQM.SiPixelCommon.SiPixelOfflineDQM_client_cff import *
 from DQM.DTMonitorClient.dtDQMOfflineClients_cff import *
 from DQM.RPCMonitorClient.RPCTier0Client_cff import *
 from DQM.CSCMonitorModule.csc_dqm_offlineclient_collisions_cff import *
-from DQM.EcalPreshowerMonitorClient.es_dqm_client_offline_cff import *
-from DQM.BeamMonitor.AlcaBeamMonitorClient_cff import *
-from DQMServices.Components.DQMFEDIntegrityClient_cff import *
-from Validation.RecoTau.DQMSequences_cfi import *
 from DQMOffline.Hcal.HcalDQMOfflinePostProcessor_cff import *
-from DQMOffline.L1Trigger.L1TriggerDqmOffline_cff import *
 from DQM.HcalTasks.OfflineHarvestingSequence_pp import *
-from PhysicsTools.NanoAOD.nanoDQM_cff import *
-from Validation.RecoParticleFlow.DQMForPF_MiniAOD_cff import *
-from DQM.CTPPS.ctppsDQM_cff import *
+from DQMServices.Components.DQMFEDIntegrityClient_cff import *
+from DQMOffline.L1Trigger.L1TriggerDqmOffline_cff import *
+
+DQMOffline_SecondStepDCS = cms.Sequence( dqmDcsInfoClient )
+
+DQMOffline_SecondStepEcal = cms.Sequence( ecal_dqm_client_offline *
+					  es_dqm_client_offline )
+
+DQMOffline_SecondStepTrackerStrip = cms.Sequence( SiStripOfflineDQMClient )
 
-DQMOffline_SecondStep_PreDPG = cms.Sequence( dqmDcsInfoClient *
-                                             ecal_dqm_client_offline *
-                                             SiStripOfflineDQMClient *
-                                             PixelOfflineDQMClientNoDataCertification *
-                                             dtClients *
+DQMOffline_SecondStepTrackerPixel = cms.Sequence( PixelOfflineDQMClientNoDataCertification )
+
+DQMOffline_SecondStepMuonDPG = cms.Sequence( dtClients *
                                              rpcTier0Client *
-                                             cscOfflineCollisionsClients *
-                                             es_dqm_client_offline *
-                                             hcalOfflineHarvesting *
-                                             HcalDQMOfflinePostProcessor *
-                                             dqmFEDIntegrityClient *
-                                             l1TriggerDqmOfflineClient )
+                                             cscOfflineCollisionsClients )
+
+DQMOffline_SecondStepHcal = cms.Sequence( hcalOfflineHarvesting )
+
+DQMOffline_SecondStepHcal2 = cms.Sequence(  HcalDQMOfflinePostProcessor )
+
+DQMOffline_SecondStepFED = cms.Sequence( dqmFEDIntegrityClient )
+
+DQMOffline_SecondStepL1T = cms.Sequence( l1TriggerDqmOfflineClient )
+
+DQMOffline_SecondStep_PreDPG = cms.Sequence( DQMOffline_SecondStepDCS *
+                                             DQMOffline_SecondStepEcal *
+                                             DQMOffline_SecondStepTrackerStrip *
+					     DQMOffline_SecondStepTrackerPixel *
+                                             DQMOffline_SecondStepMuonDPG *
+					     DQMOffline_SecondStepHcal *
+					     DQMOffline_SecondStepHcal2 *
+                                             DQMOffline_SecondStepFED *
+					     DQMOffline_SecondStepL1T )
 
 DQMOffline_SecondStepDPG = cms.Sequence(
                                          DQMOffline_SecondStep_PreDPG *
                                          DQMMessageLoggerClientSeq )
 
+
+from DQM.TrackingMonitorClient.TrackingClientConfig_Tier0_cff import *
 from DQMOffline.Muon.muonQualityTests_cff import *
 from DQMOffline.EGamma.egammaPostProcessing_cff import *
 from DQMOffline.Trigger.DQMOffline_Trigger_Client_cff import *
 from DQMOffline.Trigger.DQMOffline_HLT_Client_cff import *
 from DQMOffline.RecoB.dqmCollector_cff import *
+from DQM.BeamMonitor.AlcaBeamMonitorClient_cff import *
 from DQMOffline.JetMET.SusyPostProcessor_cff import *
-from DQMOffline.JetMET.dataCertificationJetMET_cff import *
-from DQM.TrackingMonitorClient.TrackingClientConfig_Tier0_cff import *
-from DQM.TrackingMonitorClient.pixelTrackingEffFromHitPattern_cff import *
-from DQM.TrackingMonitorClient.pixelVertexResolutionClient_cfi import *
-from DQM.SiOuterTracker.OuterTrackerClientConfig_cff import *
 
-DQMOffline_SecondStep_PrePOG = cms.Sequence( TrackingOfflineDQMClient *
-                                             muonQualityTests *
-                                             egammaPostProcessing *
-                                             triggerOfflineDQMClient *
-                                             hltOfflineDQMClient *
-                                             bTagCollectorSequenceDATA *
-                                             alcaBeamMonitorClient *
-                                             SusyPostProcessorSequence *
-                                             runTauEff)
-from Configuration.Eras.Modifier_phase1Pixel_cff import phase1Pixel
+DQMOffline_SecondStepTracking = cms.Sequence ( TrackingOfflineDQMClient )
+
+DQMOffline_SecondStepMUO = cms.Sequence ( muonQualityTests )
+
+DQMOffline_SecondStepEGamma = cms.Sequence( egammaPostProcessing )
+
+DQMOffline_SecondStepTrigger = cms.Sequence( triggerOfflineDQMClient *
+						hltOfflineDQMClient )
+
+DQMOffline_SecondStepBTag = cms.Sequence( bTagCollectorSequenceDATA )
+
+DQMOffline_SecondStepBeam = cms.Sequence( alcaBeamMonitorClient )
+
+DQMOffline_SecondStepJetMET = cms.Sequence( SusyPostProcessorSequence )
+
+DQMOffline_SecondStep_PrePOG = cms.Sequence( DQMOffline_SecondStepTracking *
+                                             DQMOffline_SecondStepMUO *
+                                             DQMOffline_SecondStepEGamma *
+                                             DQMOffline_SecondStepTrigger *
+                                             DQMOffline_SecondStepBTag *
+                                             DQMOffline_SecondStepBeam *
+                                             DQMOffline_SecondStepJetMET )
 
 DQMOffline_SecondStepPOG = cms.Sequence(
                                          DQMOffline_SecondStep_PrePOG *
@@ -68,6 +91,7 @@
 
 HLTMonitoringClient = cms.Sequence(trackingMonitorClientHLT * trackingForDisplacedJetMonitorClientHLT)
 HLTMonitoringClientPA= cms.Sequence(trackingMonitorClientHLT * PAtrackingMonitorClientHLT)
+
 DQMOffline_SecondStep = cms.Sequence(
                                       DQMOffline_SecondStep_PreDPG *
                                       DQMOffline_SecondStep_PrePOG *
@@ -75,54 +99,78 @@
                                       DQMMessageLoggerClientSeq *
                                       dqmFastTimerServiceClient)
 
-DQMOffline_SecondStep_ExtraHLT = cms.Sequence(
-    hltOfflineDQMClientExtra
-)
+DQMOffline_SecondStep_ExtraHLT = cms.Sequence( hltOfflineDQMClientExtra )
 
 DQMOffline_SecondStep_FakeHLT = cms.Sequence( DQMOffline_SecondStep )
 DQMOffline_SecondStep_FakeHLT.remove( HLTMonitoringClient )
+DQMOffline_SecondStep_FakeHLT.remove( DQMOffline_SecondStepTrigger )
 
 DQMOffline_SecondStep_PrePOGMC = cms.Sequence( bTagCollectorSequenceDATA )
 
-DQMOffline_SecondStepPOGMC = cms.Sequence(
-                                           DQMOffline_SecondStep_PrePOGMC *
+DQMOffline_SecondStepPOGMC = cms.Sequence( DQMOffline_SecondStep_PrePOGMC *
                                            DQMMessageLoggerClientSeq )
 
-DQMHarvestCommon = cms.Sequence(
-                                 DQMMessageLoggerClientSeq *
-                                 dqmDcsInfoClient *
-                                 SiStripOfflineDQMClient *
-                                 TrackingOfflineDQMClient *
-                                 PixelOfflineDQMClientNoDataCertification *
-                                 triggerOfflineDQMClient *
-                                 hltOfflineDQMClient *
-                                 dqmFEDIntegrityClient *
-                                 alcaBeamMonitorClient *
-                                 runTauEff *
+# Harvest
+from DQMOffline.JetMET.dataCertificationJetMET_cff import *
+from DQM.SiOuterTracker.OuterTrackerClientConfig_cff import *
+from DQM.CTPPS.ctppsDQM_cff import *
+from Validation.RecoTau.DQMSequences_cfi import *
+from DQM.TrackingMonitorClient.pixelTrackingEffFromHitPattern_cff import *
+from DQM.TrackingMonitorClient.pixelVertexResolutionClient_cfi import *
+
+DQMHarvestDCS = cms.Sequence ( dqmDcsInfoClient )
+
+DQMHarvestTrackerStrip = cms.Sequence ( SiStripOfflineDQMClient )
+
+DQMHarvestTrackerPixel = cms.Sequence ( PixelOfflineDQMClientNoDataCertification )
+
+DQMHarvestTrack = cms.Sequence ( TrackingOfflineDQMClient )
+
+DQMHarvestTrigger = cms.Sequence ( triggerOfflineDQMClient *
+				    hltOfflineDQMClient )
+
+DQMHarvestFED = cms.Sequence ( dqmFEDIntegrityClient )
+
+DQMHarvestBeam = cms.Sequence ( alcaBeamMonitorClient )
+
+DQMHarvestTAU = cms.Sequence ( runTauEff )
+
+DQMHarvestL1T = cms.Sequence( l1TriggerDqmOfflineClient )
+
+DQMHarvestL1TEgamma = cms.Sequence( l1TriggerEgDqmOfflineClient )
+
+DQMHarvestL1TMuon = cms.Sequence( l1TriggerMuonDqmOfflineClient )
+
+DQMHarvestCommon = cms.Sequence( DQMMessageLoggerClientSeq *
+                                 DQMHarvestDCS *
+                                 DQMHarvestTrackerStrip *
+                                 DQMHarvestTrack *
+                                 DQMHarvestTrackerPixel *
+				 DQMHarvestTrigger *
+                                 DQMHarvestFED *
+                                 DQMHarvestBeam *
+                                 DQMHarvestTAU *
                                  dqmFastTimerServiceClient
                                 )
 
 DQMHarvestCommonFakeHLT = cms.Sequence( DQMHarvestCommon )
-DQMHarvestCommonFakeHLT.remove( triggerOfflineDQMClient )
-DQMHarvestCommonFakeHLT.remove( hltOfflineDQMClient )
+DQMHarvestCommonFakeHLT.remove( DQMHarvestTrigger )
 
 DQMHarvestCommonSiStripZeroBias = cms.Sequence(
                                                DQMMessageLoggerClientSeq *
-                                               dqmDcsInfoClient *
-                                               SiStripOfflineDQMClient *
-                                               TrackingOfflineDQMClient *
-                                               PixelOfflineDQMClientNoDataCertification *
-                                               triggerOfflineDQMClient *
-                                               hltOfflineDQMClient *
-                                               l1TriggerDqmOfflineClient *
-                                               dqmFEDIntegrityClient *
-                                               alcaBeamMonitorClient *
-                                               runTauEff  *
+                                               DQMHarvestDCS *
+                                               DQMHarvestTrackerStrip *
+                                               DQMHarvestTrack *
+                                               DQMHarvestTrackerPixel *
+                                               DQMHarvestTrigger *
+                                               DQMHarvestL1T *
+                                               DQMHarvestFED *
+                                               DQMHarvestBeam *
                                                dqmFastTimerServiceClient
                                                )
+
 DQMHarvestCommonSiStripZeroBiasFakeHLT = cms.Sequence( DQMHarvestCommonSiStripZeroBias )
-DQMHarvestCommonSiStripZeroBiasFakeHLT.remove( triggerOfflineDQMClient )
-DQMHarvestCommonSiStripZeroBiasFakeHLT.remove( hltOfflineDQMClient )
+DQMHarvestCommonSiStripZeroBiasFakeHLT.remove( DQMHarvestTrigger )
 
 DQMHarvestTracking = cms.Sequence( TrackingOfflineDQMClient *
                                    dqmFastTimerServiceClient )
@@ -138,8 +186,6 @@
                                  dqmFastTimerServiceClient
                                  )
 
-DQMHarvestLumi = cms.Sequence()
-
 DQMHarvestCTPPS = cms.Sequence(ctppsDQMHarvest)
 
 DQMHarvestMuon = cms.Sequence( dtClients *
@@ -152,7 +198,9 @@
                                 es_dqm_client_offline
                               )
 
-DQMHarvestHcal = cms.Sequence(hcalOfflineHarvesting)
+DQMHarvestHcal = cms.Sequence( hcalOfflineHarvesting )
+
+DQMHarvestHcal2 = cms.Sequence( HcalDQMOfflinePostProcessor )
 
 DQMHarvestJetMET = cms.Sequence( SusyPostProcessorSequence )
 
@@ -160,12 +208,9 @@
 
 DQMHarvestBTag = cms.Sequence( bTagCollectorSequenceDATA )
 
+from PhysicsTools.NanoAOD.nanoDQM_cff import *
+from Validation.RecoParticleFlow.DQMForPF_MiniAOD_cff import *
+
 DQMHarvestMiniAOD = cms.Sequence( dataCertificationJetMETSequence * muonQualityTests_miniAOD * DQMHarvestPF )
 DQMHarvestNanoAOD = cms.Sequence( nanoHarvest )
 
-# L1 trigger sequences
-DQMHarvestL1TMonitoring = cms.Sequence( l1TriggerDqmOfflineClient )
-
-DQMHarvestL1TEgamma = cms.Sequence( l1TriggerEgDqmOfflineClient )
-
-DQMHarvestL1TMuon = cms.Sequence( l1TriggerMuonDqmOfflineClient )
diff --git a/DQMOffline/Configuration/python/DQMOffline_cff.py b/DQMOffline/Configuration/python/DQMOffline_cff.py
index 80ba153d9f835..b8afa5327ef77 100644
--- a/DQMOffline/Configuration/python/DQMOffline_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_cff.py
@@ -4,74 +4,109 @@
 from DQMServices.Components.DQMDcsInfo_cfi import *
 from DQMServices.Components.DQMFastTimerService_cff import *
 
+from DQMOffline.L1Trigger.L1TriggerDqmOffline_cff import *
 from DQMOffline.Ecal.ecal_dqm_source_offline_cff import *
+from DQM.EcalPreshowerMonitorModule.es_dqm_source_offline_cff import *
 from DQM.HcalTasks.OfflineSourceSequence_pp import *
+from DQMOffline.Hcal.HcalDQMOfflineSequence_cff import *
 from DQM.SiStripMonitorClient.SiStripSourceConfigTier0_cff import *
 from DQM.SiPixelCommon.SiPixelOfflineDQM_source_cff import *
 from DQM.DTMonitorModule.dtDQMOfflineSources_cff import *
 from DQM.RPCMonitorClient.RPCTier0Source_cff import *
 from DQM.CSCMonitorModule.csc_dqm_sourceclient_offline_cff import *
-from DQM.EcalPreshowerMonitorModule.es_dqm_source_offline_cff import *
-from DQM.BeamMonitor.AlcaBeamMonitor_cff import *
 from DQM.CastorMonitor.castor_dqm_sourceclient_offline_cff import *
-from Validation.RecoTau.DQMSequences_cfi import *
-from DQMOffline.Hcal.HcalDQMOfflineSequence_cff import *
-from DQMOffline.L1Trigger.L1TriggerDqmOffline_cff import *
 from DQM.CTPPS.ctppsDQM_cff import *
 
 DQMNone = cms.Sequence()
 
-DQMOfflinePreDPG = cms.Sequence( dqmDcsInfo *
-                                 l1TriggerDqmOffline * # L1 emulator is run within this sequence for real data
-                                 ecal_dqm_source_offline *
-                                 hcalOfflineSourceSequence *
-                                 SiStripDQMTier0 *
-                                 siPixelOfflineDQM_source *
-                                 dtSources *
-                                 rpcTier0Source *
-                                 cscSources *
-                                 es_dqm_source_offline *
-                                 castorSources *
-                                 ctppsDQM *
-                                 HcalDQMOfflineSequence )
+DQMMessageLoggerSeq = cms.Sequence( DQMMessageLogger )
+
+DQMOfflineDCS = cms.Sequence( dqmDcsInfo )
+
+# L1 trigger sequences
+DQMOfflineL1T = cms.Sequence( l1TriggerDqmOffline ) # L1 emulator is run within this sequence for real data
+
+DQMOfflineL1TEgamma = cms.Sequence( l1TriggerEgDqmOffline )
+
+DQMOfflineL1TMuon = cms.Sequence( l1TriggerMuonDqmOffline )
+
+#DPGs
+DQMOfflineEcal = cms.Sequence( ecal_dqm_source_offline *
+				es_dqm_source_offline )
+
+DQMOfflineHcal = cms.Sequence( hcalOfflineSourceSequence )
+
+DQMOfflineHcal2 = cms.Sequence( HcalDQMOfflineSequence )
+
+DQMOfflineTrackerStrip = cms.Sequence( SiStripDQMTier0 )
+
+DQMOfflineTrackerPixel = cms.Sequence( 	siPixelOfflineDQM_source )
+
+DQMOfflineMuonDPG = cms.Sequence( dtSources *
+                                  rpcTier0Source *
+                                  cscSources )
+
+DQMOfflineCASTOR = cms.Sequence( castorSources )
+
+DQMOfflineCTPPS = cms.Sequence( ctppsDQM )
+
+DQMOfflinePreDPG = cms.Sequence( DQMOfflineDCS *
+				 DQMOfflineL1T *
+                                 DQMOfflineEcal *
+                                 DQMOfflineHcal *
+				 DQMOfflineHcal2 *
+                                 DQMOfflineTrackerStrip *
+				 DQMOfflineTrackerPixel *
+				 DQMOfflineMuonDPG *
+                                 DQMOfflineCASTOR *
+                                 DQMOfflineCTPPS )
 
 DQMOfflineDPG = cms.Sequence( DQMOfflinePreDPG *
                               DQMMessageLogger )
 
+from DQM.TrackingMonitorSource.TrackingSourceConfig_Tier0_cff import *
+from DQMOffline.RecoB.PrimaryVertexMonitor_cff import *
+from DQM.TrackingMonitor.trackingRecoMaterialAnalyzer_cfi import materialDumperAnalyzer
 from DQMOffline.Muon.muonMonitors_cff import *
 from DQMOffline.JetMET.jetMETDQMOfflineSource_cff import *
 from DQMOffline.EGamma.egammaDQMOffline_cff import *
 from DQMOffline.Trigger.DQMOffline_Trigger_cff import *
-from DQMOffline.RecoB.PrimaryVertexMonitor_cff import *
 from DQMOffline.RecoB.dqmAnalyzer_cff import *
-from DQMOffline.Lumi.ZCounting_cff import *
+from DQM.BeamMonitor.AlcaBeamMonitor_cff import *
 from DQM.Physics.DQMPhysics_cff import *
-from DQM.Physics.DQMTopMiniAOD_cff import *
-from Validation.RecoTau.DQMSequences_cfi import *
-from DQM.TrackingMonitorSource.TrackingSourceConfig_Tier0_cff import *
-from DQM.TrackingMonitorSource.pixelTracksMonitoring_cff import *
-from DQMOffline.RecoB.PixelVertexMonitor_cff import *
-from DQM.SiOuterTracker.OuterTrackerSourceConfig_cff import *
-# miniAOD DQM validation
-from Validation.RecoParticleFlow.miniAODDQM_cff import * # On MiniAOD vs RECO
-from Validation.RecoParticleFlow.DQMForPF_MiniAOD_cff import * # MiniAOD PF variables
-from DQM.TrackingMonitor.tracksDQMMiniAOD_cff import * 
-from DQM.TrackingMonitor.trackingRecoMaterialAnalyzer_cfi import materialDumperAnalyzer
+
+DQMOfflineVertex = cms.Sequence( pvMonitor )
+
 materialDumperAnalyzer.usePV = True
+DQMOfflineTracking = cms.Sequence( TrackingDQMSourceTier0 *
+                                   DQMOfflineVertex *
+                                   materialDumperAnalyzer )
+
+DQMOfflineMUO = cms.Sequence(muonMonitors)
+muonRecoAnalyzer.doMVA =         cms.bool( True )
+muonRecoAnalyzer_miniAOD.doMVA = cms.bool( True )
+
+DQMOfflineJetMET = cms.Sequence( jetMETDQMOfflineSource )
+
+DQMOfflineEGamma = cms.Sequence( egammaDQMOffline )
+
+DQMOfflineTrigger = cms.Sequence( triggerOfflineDQMSource )
+
+DQMOfflineBTag = cms.Sequence( bTagPlotsDATA )
+
+DQMOfflineBeam = cms.Sequence( alcaBeamMonitor )
+
+DQMOfflinePhysics = cms.Sequence( dqmPhysics )
+
+DQMOfflinePrePOG = cms.Sequence( DQMOfflineTracking *
+                                 DQMOfflineMUO *
+                                 DQMOfflineJetMET *
+                                 DQMOfflineEGamma *
+                                 DQMOfflineTrigger *
+                                 DQMOfflineBTag *
+                                 DQMOfflineBeam *
+                                 DQMOfflinePhysics )
 
-DQMOfflinePrePOG = cms.Sequence( TrackingDQMSourceTier0 *
-                                 muonMonitors *
-                                 jetMETDQMOfflineSource *
-                                 egammaDQMOffline *
-                                 triggerOfflineDQMSource *
-                                 pvMonitor *
-                                 materialDumperAnalyzer *
-                                 bTagPlotsDATA *
-                                 alcaBeamMonitor *
-                                 dqmPhysics *
-                                 produceDenomsData *
-                                 pfTauRunDQMValidation)
-from Configuration.Eras.Modifier_phase1Pixel_cff import phase1Pixel
 
 DQMOfflinePOG = cms.Sequence( DQMOfflinePrePOG *
                               DQMMessageLogger )
@@ -79,89 +114,92 @@
 HLTMonitoring = cms.Sequence( OfflineHLTMonitoring )
 HLTMonitoringPA = cms.Sequence( OfflineHLTMonitoringPA )
 
+# Data
 DQMOffline = cms.Sequence( DQMOfflinePreDPG *
                            DQMOfflinePrePOG *
                            HLTMonitoring *
-                           # dqmFastTimerServiceLuminosity *
                            DQMMessageLogger )
 
-DQMOfflineCTPPS = cms.Sequence( ctppsDQM ) 
-
-DQMOfflineExtraHLT = cms.Sequence(
-    offlineValidationHLTSource
-)
+DQMOfflineExtraHLT = cms.Sequence( offlineValidationHLTSource )
 
 
 DQMOfflineFakeHLT = cms.Sequence( DQMOffline )
 DQMOfflineFakeHLT.remove( HLTMonitoring )
-DQMOfflineFakeHLT.remove( triggerOfflineDQMSource )
+DQMOfflineFakeHLT.remove( DQMOfflineTrigger )
 
-DQMOfflinePrePOGMC = cms.Sequence( pvMonitor *
-                                   bTagPlotsDATA *
-                                   dqmPhysics )
+#MC
+DQMOfflinePrePOGMC = cms.Sequence( DQMOfflineVertex *
+                                   DQMOfflineBTag *
+                                   DQMOfflinePhysics )
 
 DQMOfflinePOGMC = cms.Sequence( DQMOfflinePrePOGMC *
                                 DQMMessageLogger )
 
-DQMOfflinePhysics = cms.Sequence( dqmPhysics )
-
-
-
-DQMOfflineTracking = cms.Sequence( TrackingDQMSourceTier0Common *
-                                   pvMonitor *
-                                   materialDumperAnalyzer
-                                 )
+#DQMOfflineCommon
+from DQM.TrackingMonitorSource.pixelTracksMonitoring_cff import *
+from DQMOffline.RecoB.PixelVertexMonitor_cff import *
+from DQM.SiOuterTracker.OuterTrackerSourceConfig_cff import *
+from Validation.RecoTau.DQMSequences_cfi import *
 
-DQMOfflinePixelTracking = cms.Sequence( pixelTracksMonitoring +
+DQMOfflinePixelTracking = cms.Sequence( pixelTracksMonitoring *
                                         pixelPVMonitor )
 
-DQMOuterTracker = cms.Sequence( dqmDcsInfo *
+DQMOuterTracker = cms.Sequence( DQMOfflineDCS *
                                 OuterTrackerSource *
                                 DQMMessageLogger *
-                                dqmPhysics *
-                                pvMonitor *
-                                produceDenomsData
+                                DQMOfflinePhysics *
+                                DQMOfflineVertex 
                                 )
 
-DQMOfflineCommon = cms.Sequence( dqmDcsInfo *
+DQMOfflineTAU = cms.Sequence( produceDenomsData *
+				pfTauRunDQMValidation )
+
+DQMOfflineTrackerStrip = cms.Sequence( SiStripDQMTier0Common )
+
+DQMOfflineTrackerPixel = cms.Sequence( siPixelOfflineDQM_source )
+
+DQMOfflineCommon = cms.Sequence( DQMOfflineDCS *
                                  DQMMessageLogger *
-                                 SiStripDQMTier0Common *
-                                 siPixelOfflineDQM_source *
+				 DQMOfflineTrackerStrip * 
+				 DQMOfflineTrackerPixel *
                                  DQMOfflineTracking *
-                                 triggerOfflineDQMSource *
-                                 alcaBeamMonitor *
-                                 castorSources *
-                                 dqmPhysics *
-                                 produceDenomsData *
-                                 pfTauRunDQMValidation
+                                 DQMOfflineTrigger *
+                                 DQMOfflineBeam *
+                                 DQMOfflineCASTOR *
+                                 DQMOfflinePhysics *
+				 DQMOfflineTAU
                                 )
 
 DQMOfflineCommonFakeHLT = cms.Sequence( DQMOfflineCommon )
-DQMOfflineCommonFakeHLT.remove( triggerOfflineDQMSource )
+DQMOfflineCommonFakeHLT.remove( DQMOfflineTrigger )
+
+#MinBias/ZeroBias
+DQMOfflineTrackerStripMinBias = cms.Sequence( SiStripDQMTier0MinBias )
 
-DQMOfflineCommonSiStripZeroBias = cms.Sequence( dqmDcsInfo *
+DQMOfflineTrackingMinBias = cms.Sequence( TrackingDQMSourceTier0MinBias *
+                                   DQMOfflineVertex *
+                                   materialDumperAnalyzer )
+
+
+DQMOfflineCommonSiStripZeroBias = cms.Sequence( DQMOfflineDCS *
                                  DQMMessageLogger *
-                                 SiStripDQMTier0MinBias *
-                                 TrackingDQMSourceTier0MinBias *
-                                 siPixelOfflineDQM_source *
-                                 l1TriggerDqmOffline *
-                                 triggerOfflineDQMSource *
-                                 alcaBeamMonitor *
-                                 castorSources *
-                                 dqmPhysics *
-                                 pvMonitor *
-                                 materialDumperAnalyzer *
-                                 produceDenomsData *
-                                 pfTauRunDQMValidation
+				 DQMOfflineTrackerStripMinBias *
+				 DQMOfflineTrackerPixel *
+                                 DQMOfflineL1T *
+                                 DQMOfflineTrigger *
+                                 DQMOfflineBeam *
+                                 DQMOfflineCASTOR *
+                                 DQMOfflinePhysics *
+				 DQMOfflineTrackingMinBias
                                  )
 
 DQMOfflineCommonSiStripZeroBiasFakeHLT = cms.Sequence( DQMOfflineCommonSiStripZeroBias )
-DQMOfflineCommonSiStripZeroBiasFakeHLT.remove( triggerOfflineDQMSource )
+DQMOfflineCommonSiStripZeroBiasFakeHLT.remove( DQMOfflineTrigger )
 
-DQMOfflineLumi = cms.Sequence ( zcounting )
+#Other definitions
+from DQMOffline.Lumi.ZCounting_cff import *
 
-muonRecoAnalyzer.doMVA =         cms.bool( True )
-muonRecoAnalyzer_miniAOD.doMVA = cms.bool( True )
+DQMOfflineLumi = cms.Sequence ( zcounting )
 
 DQMOfflineMuon = cms.Sequence( dtSources *
                                rpcTier0Source *
@@ -169,25 +207,24 @@
                                muonMonitors
                               )
 
-DQMOfflineHcal = cms.Sequence( hcalOfflineSourceSequence )
-
-DQMOfflineEcal = cms.Sequence( ecal_dqm_source_offline *
-                               es_dqm_source_offline
-                             )
-DQMOfflineJetMET = cms.Sequence( jetMETDQMOfflineSource )
-
-DQMOfflineEGamma = cms.Sequence( egammaDQMOffline )
+#Taus not created in pp conditions for HI
+from Configuration.Eras.Modifier_pp_on_AA_2018_cff import pp_on_AA_2018
+_DQMOfflineTAU = cms.Sequence()
+pp_on_AA_2018.toReplaceWith(DQMOfflineTAU, _DQMOfflineTAU)
 
-DQMOfflineBTag = cms.Sequence( bTagPlotsDATA )
 
+# miniAOD DQM validation
+from Validation.RecoParticleFlow.miniAODDQM_cff import * # On MiniAOD vs RECO
+from Validation.RecoParticleFlow.DQMForPF_MiniAOD_cff import * # MiniAOD PF variables
+from DQM.TrackingMonitor.tracksDQMMiniAOD_cff import *
 from DQMOffline.Muon.miniAOD_cff import *
+from DQM.Physics.DQMTopMiniAOD_cff import *
 
 DQMOfflineMiniAOD = cms.Sequence(jetMETDQMOfflineRedoProductsMiniAOD*muonMonitors_miniAOD*MuonMiniAOD*DQMOfflinePF)
 
 #Post sequences are automatically placed in the EndPath by ConfigBuilder if PAT is run.
 #miniAOD DQM sequences need to access the filter results.
 
-
 PostDQMOfflineMiniAOD = cms.Sequence(miniAODDQMSequence*jetMETDQMOfflineSourceMiniAOD*tracksDQMMiniAOD*topPhysicsminiAOD)
 PostDQMOffline = cms.Sequence()
 
@@ -196,17 +233,7 @@
     pfMetDQMAnalyzerMiniAOD, pfPuppiMetDQMAnalyzerMiniAOD # No hcalnoise yet
 ]))
 
-from Configuration.Eras.Modifier_pp_on_AA_2018_cff import pp_on_AA_2018
-_pfTauRunDQMValidation = cms.Sequence()
-pp_on_AA_2018.toReplaceWith(pfTauRunDQMValidation, _pfTauRunDQMValidation)
-
 from PhysicsTools.NanoAOD.nanoDQM_cff import nanoDQM
 DQMOfflineNanoAOD = cms.Sequence(nanoDQM)
 #PostDQMOfflineNanoAOD = cms.Sequence(nanoDQM)
 
-# L1 trigger sequences
-DQMOfflineL1TMonitoring = cms.Sequence( l1TriggerDqmOffline ) # L1 emulator is run within this sequence for real data
-
-DQMOfflineL1TEgamma = cms.Sequence( l1TriggerEgDqmOffline )
-
-DQMOfflineL1TMuon = cms.Sequence( l1TriggerMuonDqmOffline )

From ebace29cc2271f310248d25e69875c206efa77a8 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Sat, 30 Nov 2019 07:38:48 +0100
Subject: [PATCH 24/50] Synchronise with CMSSW_11_0_0_pre13

---
 .../Configuration/python/DQMOffline_SecondStep_cff.py       | 3 +++
 DQMOffline/Configuration/python/DQMOffline_cff.py           | 6 ++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
index 5b2f0a250c5c3..a4d8e88aa9a40 100644
--- a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
@@ -15,6 +15,7 @@
 from DQM.HcalTasks.OfflineHarvestingSequence_pp import *
 from DQMServices.Components.DQMFEDIntegrityClient_cff import *
 from DQMOffline.L1Trigger.L1TriggerDqmOffline_cff import *
+from DQM.SiTrackerPhase2.Phase2TrackerDQMHarvesting_cff import *
 
 DQMOffline_SecondStepDCS = cms.Sequence( dqmDcsInfoClient )
 
@@ -185,6 +186,8 @@
                                  DQMMessageLoggerClientSeq *
                                  dqmFastTimerServiceClient
                                  )
+DQMHarvestTrackerPhase2 = cms.Sequence(trackerphase2DQMHarvesting)
+
 
 DQMHarvestCTPPS = cms.Sequence(ctppsDQMHarvest)
 
diff --git a/DQMOffline/Configuration/python/DQMOffline_cff.py b/DQMOffline/Configuration/python/DQMOffline_cff.py
index b8afa5327ef77..a54a84f0d04dd 100644
--- a/DQMOffline/Configuration/python/DQMOffline_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_cff.py
@@ -16,6 +16,7 @@
 from DQM.CSCMonitorModule.csc_dqm_sourceclient_offline_cff import *
 from DQM.CastorMonitor.castor_dqm_sourceclient_offline_cff import *
 from DQM.CTPPS.ctppsDQM_cff import *
+from DQM.SiTrackerPhase2.Phase2TrackerDQMFirstStep_cff import *
 
 DQMNone = cms.Sequence()
 
@@ -151,10 +152,12 @@
                                 DQMOfflineVertex 
                                 )
 
+DQMOfflineTrackerPhase2 = cms.Sequence( trackerphase2DQMSource )
+
 DQMOfflineTAU = cms.Sequence( produceDenomsData *
 				pfTauRunDQMValidation )
 
-DQMOfflineTrackerStrip = cms.Sequence( SiStripDQMTier0Common )
+DQMOfflineTrackerStripCommon = cms.Sequence( SiStripDQMTier0Common )
 
 DQMOfflineTrackerPixel = cms.Sequence( siPixelOfflineDQM_source )
 
@@ -236,4 +239,3 @@
 from PhysicsTools.NanoAOD.nanoDQM_cff import nanoDQM
 DQMOfflineNanoAOD = cms.Sequence(nanoDQM)
 #PostDQMOfflineNanoAOD = cms.Sequence(nanoDQM)
-

From 27ac879e7d2df40f52427a8ef071ecc804340674 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 2 Dec 2019 14:43:35 +0100
Subject: [PATCH 25/50] Rename exitSansCUDADevices to requireCUDADevices
 (cms-patatrack#423)

---
 RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
index 977858d0bf08d..9b55ba59daab2 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
+++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
@@ -5,7 +5,7 @@
 #include <vector>
 
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 #ifdef USE_DBSCAN
 #include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksDBSCAN.h"
@@ -92,7 +92,7 @@ __global__ void print(ZVertices const* pdata, WorkSpace const* pws) {
 
 int main() {
 #ifdef __CUDACC__
-  exitSansCUDADevices();
+  requireCUDADevices();
 
   auto onGPU_d = cudautils::make_device_unique<ZVertices[]>(1, nullptr);
   auto ws_d = cudautils::make_device_unique<WorkSpace[]>(1, nullptr);

From 012df145e2935521cd29d1cdd29c5cf89c05636c Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Tue, 3 Dec 2019 17:39:10 +0100
Subject: [PATCH 26/50] Fix possible memory corruption in gpuVertexFinder
 (cms-patatrack#419)

---
 CUDADataFormats/Vertex/interface/ZVertexSoA.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoA.h b/CUDADataFormats/Vertex/interface/ZVertexSoA.h
index cd1f8aea4e340..5f8a7f65843f1 100644
--- a/CUDADataFormats/Vertex/interface/ZVertexSoA.h
+++ b/CUDADataFormats/Vertex/interface/ZVertexSoA.h
@@ -8,7 +8,7 @@
 // These vertices are clusterized and fitted only along the beam line (z)
 // to obtain their global coordinate the beam spot position shall be added (eventually correcting for the beam angle as well)
 struct ZVertexSoA {
-  static constexpr uint32_t MAXTRACKS = 32 * 1024;
+  static constexpr uint32_t MAXTRACKS = 16 * 1024;
   static constexpr uint32_t MAXVTX = 1024;
 
   int16_t idv[MAXTRACKS];    // vertex index for each associated (original) track  (-1 == not associate)
@@ -16,7 +16,7 @@ struct ZVertexSoA {
   float wv[MAXVTX];          // output weight (1/error^2) on the above
   float chi2[MAXVTX];        // vertices chi2
   float ptv2[MAXVTX];        // vertices pt^2
-  int32_t ndof[MAXVTX];      // vertices number of dof (reused as workspace for the number of nearest neighbours)
+  int32_t ndof[MAXTRACKS];   // vertices number of dof (reused as workspace for the number of nearest neighbours FIXME)
   uint16_t sortInd[MAXVTX];  // sorted index (by pt2)  ascending
   uint32_t nvFinal;          // the number of vertices
 

From e459d8ce09d39a633eebd6f19015eb4f388fe6e3 Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Tue, 3 Dec 2019 17:43:04 +0100
Subject: [PATCH 27/50] Implement GPU vertex finder with a single kernel
 (cms-patatrack#413)

---
 .../PixelVertexFinding/test/BuildFile.xml     |  7 ++-
 .../PixelVertexFinding/test/VertexFinder_t.h  | 44 +++++++++++++++----
 2 files changed, 41 insertions(+), 10 deletions(-)

diff --git a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
index 119bd5f04b4a9..95a572e68ce5e 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
+++ b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
@@ -19,6 +19,12 @@
 <use name="SimDataFormats/Track"/>
 <use name="TrackingTools/TransientTrack"/>
 
+<bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderOneKernel_t">
+  <use name="cuda"/>
+  <flags CUDA_FLAGS="-g -DGPU_DEBUG -DONE_KERNEL"/>
+  <flags CXXFLAGS="-g"/>
+</bin>
+
 <bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderByDensity_t">
   <use name="cuda"/>
   <flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
@@ -44,4 +50,3 @@
 <bin file="cpuVertexFinder_t.cpp" name="cpuVertexFinderIterative_t">
   <flags CXXFLAGS="-g -DGPU_DEBUG -DUSE_ITERATIVE"/>
 </bin>
-
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
index 9b55ba59daab2..d32a611402e61 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
+++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
@@ -15,12 +15,34 @@
 #define CLUSTERIZE clusterTracksIterative
 #else
 #include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksByDensity.h"
-#define CLUSTERIZE clusterTracksByDensity
+#define CLUSTERIZE clusterTracksByDensityKernel
 #endif
 #include "RecoPixelVertexing/PixelVertexFinding/src/gpuFitVertices.h"
 #include "RecoPixelVertexing/PixelVertexFinding/src/gpuSortByPt2.h"
 #include "RecoPixelVertexing/PixelVertexFinding/src/gpuSplitVertices.h"
 
+#ifdef ONE_KERNEL
+#ifdef __CUDACC__
+  __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,
+                                           gpuVertexFinder::WorkSpace* pws,
+                                           int minT,    // min number of neighbours to be "seed"
+                                           float eps,     // max absolute distance to cluster
+                                        float errmax,  // max error to be "seed"
+                                           float chi2max  // max normalized distance to cluster,
+  ) {
+    clusterTracksByDensity(pdata,pws,minT,eps,errmax,chi2max);
+    __syncthreads();
+    fitVertices(pdata,pws, 50.);
+    __syncthreads();
+    splitVertices(pdata,pws, 9.f);    
+    __syncthreads();
+    fitVertices(pdata,pws, 5000.);
+    __syncthreads();
+    sortByPt2(pdata,pws);
+  }
+#endif
+#endif
+
 using namespace gpuVertexFinder;
 
 struct Event {
@@ -151,13 +173,17 @@ int main() {
       cudaCheck(cudaGetLastError());
       cudaDeviceSynchronize();
 
+#ifdef ONE_KERNEL
+      cudautils::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
+#else
       cudautils::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
+#endif
       print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
 
       cudaCheck(cudaGetLastError());
       cudaDeviceSynchronize();
 
-      cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
+      cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
       cudaCheck(cudaGetLastError());
       cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 
@@ -219,7 +245,7 @@ int main() {
       }
 
 #ifdef __CUDACC__
-      cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
+      cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
       cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
       cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
       cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
@@ -239,23 +265,23 @@ int main() {
 
 #ifdef __CUDACC__
       // one vertex per block!!!
-      cudautils::launch(splitVertices, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
+ //     cudautils::launch(splitVerticesKernel, {1, 256}, onGPU_d.get(), ws_d.get(), 9.f);
+      cudautils::launch(splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
       cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
-      gridDim.x = 1024;  // nv ????
+      gridDim.x = 1;
       assert(blockIdx.x == 0);
-      for (; blockIdx.x < gridDim.x; ++blockIdx.x)
-        splitVertices(onGPU_d.get(), ws_d.get(), 9.f);
+      splitVertices(onGPU_d.get(), ws_d.get(), 9.f);
       resetGrid();
       nv = ws_d->nvIntermediate;
 #endif
       std::cout << "after split " << nv << std::endl;
 
 #ifdef __CUDACC__
-      cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
+      cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
       cudaCheck(cudaGetLastError());
 
-      cudautils::launch(sortByPt2, {1, 256}, onGPU_d.get(), ws_d.get());
+      cudautils::launch(sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get());
       cudaCheck(cudaGetLastError());
       cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else

From 749793a09e3b40d8f15efc425db2c8f0858bdec7 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Tue, 3 Dec 2019 18:13:21 +0100
Subject: [PATCH 28/50] Apply code-format fixes (cms-patatrack#427)

---
 .../PixelVertexFinding/test/VertexFinder_t.h  | 35 +++++++++----------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
index d32a611402e61..5261069a6b283 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
+++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
@@ -23,23 +23,23 @@
 
 #ifdef ONE_KERNEL
 #ifdef __CUDACC__
-  __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,
-                                           gpuVertexFinder::WorkSpace* pws,
-                                           int minT,    // min number of neighbours to be "seed"
-                                           float eps,     // max absolute distance to cluster
-                                        float errmax,  // max error to be "seed"
-                                           float chi2max  // max normalized distance to cluster,
-  ) {
-    clusterTracksByDensity(pdata,pws,minT,eps,errmax,chi2max);
-    __syncthreads();
-    fitVertices(pdata,pws, 50.);
-    __syncthreads();
-    splitVertices(pdata,pws, 9.f);    
-    __syncthreads();
-    fitVertices(pdata,pws, 5000.);
-    __syncthreads();
-    sortByPt2(pdata,pws);
-  }
+__global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,
+                                      gpuVertexFinder::WorkSpace* pws,
+                                      int minT,      // min number of neighbours to be "seed"
+                                      float eps,     // max absolute distance to cluster
+                                      float errmax,  // max error to be "seed"
+                                      float chi2max  // max normalized distance to cluster,
+) {
+  clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
+  __syncthreads();
+  fitVertices(pdata, pws, 50.);
+  __syncthreads();
+  splitVertices(pdata, pws, 9.f);
+  __syncthreads();
+  fitVertices(pdata, pws, 5000.);
+  __syncthreads();
+  sortByPt2(pdata, pws);
+}
 #endif
 #endif
 
@@ -265,7 +265,6 @@ int main() {
 
 #ifdef __CUDACC__
       // one vertex per block!!!
- //     cudautils::launch(splitVerticesKernel, {1, 256}, onGPU_d.get(), ws_d.get(), 9.f);
       cudautils::launch(splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
       cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else

From 08cdb55088e731ed709bfd61c916af6ad555240f Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Fri, 17 Jan 2020 09:10:53 -0600
Subject: [PATCH 29/50] Implement changes from the CUDA framework review
 (cms-patatrack#429)

Rename the cudautils namespace to cms::cuda or cms::cudatest, and drop the CUDA prefix from the symbols defined there.

Always record and query the CUDA event, to minimize need for error checking in CUDAScopedContextProduce destructor.

Add comments to highlight the pieces in CachingDeviceAllocator that have been changed wrt. cub.

Various other updates and cleanups:
  - enable CUDA for compute capability 3.5.
  - clean up CUDAService, CUDA tests and plugins.
  - add CUDA existence protections to BuildFiles.
  - mark thread-safe static variables with CMS_THREAD_SAFE.
---
 .../Vertex/interface/ZVertexHeterogeneous.h   |  4 ++--
 CUDADataFormats/Vertex/src/classes.h          |  2 +-
 CUDADataFormats/Vertex/src/classes_def.xml    |  2 +-
 .../plugins/PixelTrackDumpCUDA.cc             | 14 ++++++------
 .../PixelVertexFinding/test/VertexFinder_t.h  | 22 +++++++++----------
 5 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
index d12ed5f3d98de..aacfddc6fe7e2 100644
--- a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
+++ b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
@@ -7,8 +7,8 @@
 
 using ZVertexHeterogeneous = HeterogeneousSoA<ZVertexSoA>;
 #ifndef __CUDACC__
-#include "CUDADataFormats/Common/interface/CUDAProduct.h"
-using ZVertexCUDAProduct = CUDAProduct<ZVertexHeterogeneous>;
+#include "CUDADataFormats/Common/interface/Product.h"
+using ZVertexCUDAProduct = cms::cuda::Product<ZVertexHeterogeneous>;
 #endif
 
 #endif
diff --git a/CUDADataFormats/Vertex/src/classes.h b/CUDADataFormats/Vertex/src/classes.h
index f1144d1e3014e..e7fea871f7d39 100644
--- a/CUDADataFormats/Vertex/src/classes.h
+++ b/CUDADataFormats/Vertex/src/classes.h
@@ -2,7 +2,7 @@
 #define CUDADataFormats__src_classes_h
 
 #include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
-#include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "CUDADataFormats/Common/interface/Product.h"
 #include "DataFormats/Common/interface/Wrapper.h"
 
 #endif
diff --git a/CUDADataFormats/Vertex/src/classes_def.xml b/CUDADataFormats/Vertex/src/classes_def.xml
index c43814eb03def..ea633080af9af 100644
--- a/CUDADataFormats/Vertex/src/classes_def.xml
+++ b/CUDADataFormats/Vertex/src/classes_def.xml
@@ -1,5 +1,5 @@
 <lcgdict>
-  <class name="CUDAProduct<ZVertexHeterogeneous>" persistent="false"/>
+  <class name="cms::cuda::Product<ZVertexHeterogeneous>" persistent="false"/>
   <class name="edm::Wrapper<ZVertexCUDAProduct>" persistent="false"/>
   <class name="ZVertexHeterogeneous" persistent="false"/>
   <class name="edm::Wrapper<ZVertexHeterogeneous>" persistent="false"/>
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
index cd143fb3aab2c..04faf570c3fcc 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
@@ -1,6 +1,6 @@
 #include <cuda_runtime.h>
 
-#include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "CUDADataFormats/Common/interface/Product.h"
 #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h"
 #include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
@@ -18,7 +18,7 @@
 #include "FWCore/Utilities/interface/EDGetToken.h"
 #include "FWCore/Utilities/interface/InputTag.h"
 #include "FWCore/Utilities/interface/RunningAverage.h"
-#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
 #include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h"
 
 class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> {
@@ -31,8 +31,8 @@ class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> {
 private:
   void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override;
   const bool m_onGPU;
-  edm::EDGetTokenT<CUDAProduct<PixelTrackHeterogeneous>> tokenGPUTrack_;
-  edm::EDGetTokenT<CUDAProduct<ZVertexHeterogeneous>> tokenGPUVertex_;
+  edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenGPUTrack_;
+  edm::EDGetTokenT<cms::cuda::Product<ZVertexHeterogeneous>> tokenGPUVertex_;
   edm::EDGetTokenT<PixelTrackHeterogeneous> tokenSoATrack_;
   edm::EDGetTokenT<ZVertexHeterogeneous> tokenSoAVertex_;
 };
@@ -41,9 +41,9 @@ PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig)
     : m_onGPU(iConfig.getParameter<bool>("onGPU")) {
   if (m_onGPU) {
     tokenGPUTrack_ =
-        consumes<CUDAProduct<PixelTrackHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
+        consumes<cms::cuda::Product<PixelTrackHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
     tokenGPUVertex_ =
-        consumes<CUDAProduct<ZVertexHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
+        consumes<cms::cuda::Product<ZVertexHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
   } else {
     tokenSoATrack_ = consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
     tokenSoAVertex_ = consumes<ZVertexHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
@@ -64,7 +64,7 @@ void PixelTrackDumpCUDA::analyze(edm::StreamID streamID,
                                  const edm::EventSetup& iSetup) const {
   if (m_onGPU) {
     auto const& hTracks = iEvent.get(tokenGPUTrack_);
-    CUDAScopedContextProduce ctx{hTracks};
+    cms::cuda::ScopedContextProduce ctx{hTracks};
 
     auto const& tracks = ctx.get(hTracks);
     auto const* tsoa = tracks.get();
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
index 5261069a6b283..5b7a1b6eadd0c 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
+++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
@@ -5,7 +5,7 @@
 #include <vector>
 
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 #ifdef USE_DBSCAN
 #include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksDBSCAN.h"
@@ -114,10 +114,10 @@ __global__ void print(ZVertices const* pdata, WorkSpace const* pws) {
 
 int main() {
 #ifdef __CUDACC__
-  requireCUDADevices();
+  cms::cudatest::requireDevices();
 
-  auto onGPU_d = cudautils::make_device_unique<ZVertices[]>(1, nullptr);
-  auto ws_d = cudautils::make_device_unique<WorkSpace[]>(1, nullptr);
+  auto onGPU_d = cms::cuda::make_device_unique<ZVertices[]>(1, nullptr);
+  auto ws_d = cms::cuda::make_device_unique<WorkSpace[]>(1, nullptr);
 #else
   auto onGPU_d = std::make_unique<ZVertices>();
   auto ws_d = std::make_unique<WorkSpace>();
@@ -174,16 +174,16 @@ int main() {
       cudaDeviceSynchronize();
 
 #ifdef ONE_KERNEL
-      cudautils::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
+      cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
 #else
-      cudautils::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
+      cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
 #endif
       print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
 
       cudaCheck(cudaGetLastError());
       cudaDeviceSynchronize();
 
-      cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
+      cms::cuda::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
       cudaCheck(cudaGetLastError());
       cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 
@@ -245,7 +245,7 @@ int main() {
       }
 
 #ifdef __CUDACC__
-      cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
+      cms::cuda::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
       cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
       cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
       cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
@@ -265,7 +265,7 @@ int main() {
 
 #ifdef __CUDACC__
       // one vertex per block!!!
-      cudautils::launch(splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
+      cms::cuda::launch(splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
       cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
       gridDim.x = 1;
@@ -277,10 +277,10 @@ int main() {
       std::cout << "after split " << nv << std::endl;
 
 #ifdef __CUDACC__
-      cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
+      cms::cuda::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
       cudaCheck(cudaGetLastError());
 
-      cudautils::launch(sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get());
+      cms::cuda::launch(sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get());
       cudaCheck(cudaGetLastError());
       cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else

From 94e9ef881fa509e35d6d4da9095aae95cf88dc2f Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 27 Jan 2020 12:17:14 +0100
Subject: [PATCH 30/50] Synchronise with CMSSW_11_1_0_pre2

Major changes:
  - restructure the RecoPixelVertexing/PixelVertexFinding package;
  - update the interface of PixelCPEFast.
---
 .../python/DQMOffline_SecondStep_cff.py       |   2 +-
 .../Configuration/python/DQMOffline_cff.py    |   2 +-
 .../PixelVertexFinding/plugins/BuildFile.xml  |   5 +-
 .../plugins/PixelVertexCollectionTrimmer.cc   |   2 +-
 .../plugins/PixelVertexProducerCUDA.cc        | 125 +++++++++
 .../plugins/PixelVertexProducerFromSoA.cc     | 175 +++++++++++++
 .../plugins/PixelVertexSoAFromCUDA.cc         |  65 +++++
 .../plugins/gpuClusterTracksByDensity.h       | 234 +++++++++++++++++
 .../plugins/gpuClusterTracksDBSCAN.h          | 242 ++++++++++++++++++
 .../plugins/gpuClusterTracksIterative.h       | 213 +++++++++++++++
 .../plugins/gpuFitVertices.h                  | 113 ++++++++
 .../PixelVertexFinding/plugins/gpuSortByPt2.h |  73 ++++++
 .../plugins/gpuSplitVertices.h                | 139 ++++++++++
 .../plugins/gpuVertexFinder.cc                |   1 +
 .../plugins/gpuVertexFinder.cu                |   1 +
 .../plugins/gpuVertexFinder.h                 |  83 ++++++
 .../plugins/gpuVertexFinderImpl.h             | 173 +++++++++++++
 .../PixelVertexFinding/test/VertexFinder_t.h  |  12 +-
 18 files changed, 1650 insertions(+), 10 deletions(-)
 create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
 create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc
 create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc
 create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h
 create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h
 create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h
 create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h
 create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h
 create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h
 create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
 create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cu
 create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
 create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h

diff --git a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
index a4d8e88aa9a40..ad00af3abdf74 100644
--- a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
@@ -189,7 +189,7 @@
 DQMHarvestTrackerPhase2 = cms.Sequence(trackerphase2DQMHarvesting)
 
 
-DQMHarvestCTPPS = cms.Sequence(ctppsDQMHarvest)
+DQMHarvestCTPPS = cms.Sequence( ctppsDQMOfflineHarvest )
 
 DQMHarvestMuon = cms.Sequence( dtClients *
                                rpcTier0Client *
diff --git a/DQMOffline/Configuration/python/DQMOffline_cff.py b/DQMOffline/Configuration/python/DQMOffline_cff.py
index a54a84f0d04dd..ca5dc2ac34a40 100644
--- a/DQMOffline/Configuration/python/DQMOffline_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_cff.py
@@ -49,7 +49,7 @@
 
 DQMOfflineCASTOR = cms.Sequence( castorSources )
 
-DQMOfflineCTPPS = cms.Sequence( ctppsDQM )
+DQMOfflineCTPPS = cms.Sequence( ctppsDQMOfflineSource )
 
 DQMOfflinePreDPG = cms.Sequence( DQMOfflineDCS *
 				 DQMOfflineL1T *
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/plugins/BuildFile.xml
index 427799cb122b5..99b91b2587bcf 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/BuildFile.xml
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/BuildFile.xml
@@ -1,3 +1,4 @@
+<use name="CUDADataFormats/Vertex"/>
 <use name="CommonTools/Clustering1D"/>
 <use name="DataFormats/BeamSpot"/>
 <use name="DataFormats/GeometryCommonDetAlgo"/>
@@ -15,10 +16,12 @@
 <use name="FWCore/Utilities"/>
 <use name="Geometry/Records"/>
 <use name="Geometry/TrackerGeometryBuilder"/>
+<use name="HeterogeneousCore/CUDACore"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
 <use name="RecoLocalTracker/ClusterParameterEstimator"/>
 <use name="RecoLocalTracker/Records"/>
 <use name="RecoPixelVertexing/PixelVertexFinding"/>
 <use name="SimDataFormats/PileupSummaryInfo"/>
-<library file="*.cc" name="RecoPixelVertexingPixelVertexFindingPlugins">
+<library file="*.cc *.cu" name="RecoPixelVertexingPixelVertexFindingPlugins">
   <flags EDM_PLUGIN="1"/>
 </library>
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexCollectionTrimmer.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexCollectionTrimmer.cc
index c6747707ada73..91dfc0393b432 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexCollectionTrimmer.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexCollectionTrimmer.cc
@@ -3,7 +3,7 @@
 // Package: RecoPixelVertexing/PixelVertexFinding
 // Class: PixelVertexCollectionTrimmer
 //
-/**\class PixelVertexCollectionTrimmer PixelVertexCollectionTrimmer.cc RecoPixelVertexing/PixelVertexFinding/src/PixelVertexCollectionTrimmer.cc
+/**\class PixelVertexCollectionTrimmer PixelVertexCollectionTrimmer.cc RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexCollectionTrimmer.cc
 
 Description: [one line class summary]
 
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
new file mode 100644
index 0000000000000..e9054dbf17c53
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
@@ -0,0 +1,125 @@
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDProducer.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/RunningAverage.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+
+#include "gpuVertexFinder.h"
+
+class PixelVertexProducerCUDA : public edm::global::EDProducer<> {
+public:
+  explicit PixelVertexProducerCUDA(const edm::ParameterSet& iConfig);
+  ~PixelVertexProducerCUDA() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override;
+
+  bool m_OnGPU;
+
+  edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenGPUTrack_;
+  edm::EDPutTokenT<ZVertexCUDAProduct> tokenGPUVertex_;
+  edm::EDGetTokenT<PixelTrackHeterogeneous> tokenCPUTrack_;
+  edm::EDPutTokenT<ZVertexHeterogeneous> tokenCPUVertex_;
+
+  const gpuVertexFinder::Producer m_gpuAlgo;
+
+  // Tracking cuts before sending tracks to vertex algo
+  const float m_ptMin;
+};
+
+PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf)
+    : m_OnGPU(conf.getParameter<bool>("onGPU")),
+      m_gpuAlgo(conf.getParameter<bool>("oneKernel"),
+                conf.getParameter<bool>("useDensity"),
+                conf.getParameter<bool>("useDBSCAN"),
+                conf.getParameter<bool>("useIterative"),
+                conf.getParameter<int>("minT"),
+                conf.getParameter<double>("eps"),
+                conf.getParameter<double>("errmax"),
+                conf.getParameter<double>("chi2max")),
+      m_ptMin(conf.getParameter<double>("PtMin"))  // 0.5 GeV
+{
+  if (m_OnGPU) {
+    tokenGPUTrack_ =
+        consumes<cms::cuda::Product<PixelTrackHeterogeneous>>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
+    tokenGPUVertex_ = produces<ZVertexCUDAProduct>();
+  } else {
+    tokenCPUTrack_ = consumes<PixelTrackHeterogeneous>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
+    tokenCPUVertex_ = produces<ZVertexHeterogeneous>();
+  }
+}
+
+void PixelVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription desc;
+
+  // Only one of these three algos can be used at once.
+  // Maybe this should become a Plugin Factory
+  desc.add<bool>("onGPU", true);
+  desc.add<bool>("oneKernel", true);
+  desc.add<bool>("useDensity", true);
+  desc.add<bool>("useDBSCAN", false);
+  desc.add<bool>("useIterative", false);
+
+  desc.add<int>("minT", 2);          // min number of neighbours to be "core"
+  desc.add<double>("eps", 0.07);     // max absolute distance to cluster
+  desc.add<double>("errmax", 0.01);  // max error to be "seed"
+  desc.add<double>("chi2max", 9.);   // max normalized distance to cluster
+
+  desc.add<double>("PtMin", 0.5);
+  desc.add<edm::InputTag>("pixelTrackSrc", edm::InputTag("caHitNtupletCUDA"));
+
+  auto label = "pixelVertexCUDA";
+  descriptions.add(label, desc);
+}
+
+void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const {
+  if (m_OnGPU) {
+    edm::Handle<cms::cuda::Product<PixelTrackHeterogeneous>> hTracks;
+    iEvent.getByToken(tokenGPUTrack_, hTracks);
+
+    cms::cuda::ScopedContextProduce ctx{*hTracks};
+    auto const* tracks = ctx.get(*hTracks).get();
+
+    assert(tracks);
+
+    ctx.emplace(iEvent, tokenGPUVertex_, m_gpuAlgo.makeAsync(ctx.stream(), tracks, m_ptMin));
+
+  } else {
+    auto const* tracks = iEvent.get(tokenCPUTrack_).get();
+    assert(tracks);
+
+    /*
+    auto const & tsoa = *tracks;
+    auto maxTracks = tsoa.stride();
+    std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl;
+
+    int32_t nt = 0;
+    for (int32_t it = 0; it < maxTracks; ++it) {
+      auto nHits = tsoa.nHits(it);
+      assert(nHits==int(tsoa.hitIndices.size(it)));
+      if (nHits == 0) break;  // this is a guard: maybe we need to move to nTracks...
+      nt++;
+    }
+    std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl;
+    */
+
+    iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make(tracks, m_ptMin));
+  }
+}
+
+DEFINE_FWK_MODULE(PixelVertexProducerCUDA);
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc
new file mode 100644
index 0000000000000..e642e3fd734f9
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc
@@ -0,0 +1,175 @@
+#include "DataFormats/BeamSpot/interface/BeamSpot.h"
+#include "DataFormats/Common/interface/OrphanHandle.h"
+#include "DataFormats/TrackReco/interface/Track.h"
+#include "DataFormats/TrackReco/interface/TrackExtra.h"
+#include "DataFormats/TrackReco/interface/TrackFwd.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDProducer.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "Geometry/Records/interface/TrackerTopologyRcd.h"
+#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h"
+
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+
+#include "DataFormats/VertexReco/interface/Vertex.h"
+#include "DataFormats/VertexReco/interface/VertexFwd.h"
+
+class PixelVertexProducerFromSoA : public edm::global::EDProducer<> {  // ZVertexHeterogeneous -> VertexCollection
+public:
+  using IndToEdm = std::vector<uint16_t>;  // produce() uses its entries as indices into the track collection
+
+  explicit PixelVertexProducerFromSoA(const edm::ParameterSet &iConfig);
+  ~PixelVertexProducerFromSoA() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions &descriptions);
+
+private:
+  void produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const override;
+  // input tokens, initialised in the constructor from the configuration parameter named on each line
+  edm::EDGetTokenT<ZVertexHeterogeneous> tokenVertex_;   // "src"
+  edm::EDGetTokenT<reco::BeamSpot> tokenBeamSpot_;       // "beamSpot"
+  edm::EDGetTokenT<reco::TrackCollection> tokenTracks_;  // "TrackCollection"
+  edm::EDGetTokenT<IndToEdm> tokenIndToEdm_;             // "TrackCollection" as well
+};
+
+PixelVertexProducerFromSoA::PixelVertexProducerFromSoA(const edm::ParameterSet &conf)
+    : tokenVertex_(consumes<ZVertexHeterogeneous>(conf.getParameter<edm::InputTag>("src"))),
+      tokenBeamSpot_(consumes<reco::BeamSpot>(conf.getParameter<edm::InputTag>("beamSpot"))),
+      tokenTracks_(consumes<reco::TrackCollection>(conf.getParameter<edm::InputTag>("TrackCollection"))),
+      tokenIndToEdm_(consumes<IndToEdm>(conf.getParameter<edm::InputTag>("TrackCollection"))) {
+  produces<reco::VertexCollection>();  // the two track-related tokens above share the "TrackCollection" tag
+}
+
+void PixelVertexProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions &descriptions) {
+  edm::ParameterSetDescription desc;
+  // default input tags for the three parameters read by the constructor
+  desc.add<edm::InputTag>("TrackCollection", edm::InputTag("pixelTracks"));
+  desc.add<edm::InputTag>("beamSpot", edm::InputTag("offlineBeamSpot"));
+  desc.add<edm::InputTag>("src", edm::InputTag("pixelVertexSoA"));
+  // the description is registered under the label "pixelVertexFromSoA"
+  descriptions.add("pixelVertexFromSoA", desc);
+}
+
+void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &) const {
+  auto vertexes = std::make_unique<reco::VertexCollection>();
+  // Converts the vertex SoA into reco::Vertex objects carrying track references; see the fallback at the end.
+  edm::Handle<reco::TrackCollection> trackCollection;
+  iEvent.getByToken(tokenTracks_, trackCollection);
+  auto const &tracks = *(trackCollection.product());
+  edm::Handle<IndToEdm> indToEdmH;
+  iEvent.getByToken(tokenIndToEdm_, indToEdmH);
+  auto const &indToEdm = *indToEdmH;
+  // the beam spot is optional: without it x0, y0, z0 and the two slopes stay 0
+  edm::Handle<reco::BeamSpot> bsHandle;
+  iEvent.getByToken(tokenBeamSpot_, bsHandle);
+
+  float x0 = 0, y0 = 0, z0 = 0, dxdz = 0, dydz = 0;
+  std::vector<int32_t> itrk;
+  if (!bsHandle.isValid()) {
+    edm::LogWarning("PixelVertexProducer") << "No beamspot found. returning vertexes with (0,0,Z) ";
+  } else {
+    const reco::BeamSpot &bs = *bsHandle;
+    x0 = bs.x0();
+    y0 = bs.y0();
+    z0 = bs.z0();
+    dxdz = bs.dxdz();
+    dydz = bs.dydz();
+  }
+
+  auto const &soa = *(iEvent.get(tokenVertex_).get());
+
+  int nv = soa.nvFinal;
+
+  // std::cout << "converting " << nv << " vertices " << " from " << indToEdm.size() << " tracks" << std::endl;
+  // vertices are visited from the last sortInd entry to the first; those with fewer than 2 tracks are dropped
+  std::set<uint16_t> uind;  // for verifying index consistency (filled here, not checked yet)
+  for (int j = nv - 1; j >= 0; --j) {
+    auto i = soa.sortInd[j];  // on gpu sorted in ascending order....
+    assert(i < nv);
+    uind.insert(i);
+    assert(itrk.empty());
+    auto z = soa.zv[i];
+    auto x = x0 + dxdz * z;
+    auto y = y0 + dydz * z;
+    z += z0;
+    reco::Vertex::Error err;
+    err(2, 2) = 1.f / soa.wv[i];
+    err(2, 2) *= 2.;  // artificially inflate error
+    // collect the indices k with soa.idv[k] == i (linear scan per vertex, no intention to be efficient)
+    for (auto k = 0U; k < indToEdm.size(); ++k) {
+      if (soa.idv[k] == int16_t(i))
+        itrk.push_back(k);
+    }
+    auto nt = itrk.size();
+    if (nt == 0) {
+      edm::LogWarning("PixelVertexProducer") << "vertex " << i << " with no tracks...";
+      continue;
+    }
+    if (nt < 2) {
+      itrk.clear();
+      continue;
+    }  // remove outliers
+    (*vertexes).emplace_back(reco::Vertex::Point(x, y, z), err, soa.chi2[i], soa.ndof[i], nt);
+    auto &v = (*vertexes).back();
+    for (auto it : itrk) {
+      assert(it < int(indToEdm.size()));
+      auto k = indToEdm[it];
+      if (k >= tracks.size()) {  // k == size() is already out of range for the TrackRef built below
+        edm::LogWarning("PixelVertexProducer") << "oops track " << it << " does not exists on CPU " << k;
+        continue;
+      }
+      auto tk = reco::TrackRef(trackCollection, k);
+      v.add(reco::TrackBaseRef(tk));
+    }
+    itrk.clear();
+  }
+
+  LogDebug("PixelVertexProducer") << ": Found " << vertexes->size() << " vertexes\n";
+  for (unsigned int i = 0; i < vertexes->size(); ++i) {
+    LogDebug("PixelVertexProducer") << "Vertex number " << i << " has " << (*vertexes)[i].tracksSize()
+                                    << " tracks with a position of " << (*vertexes)[i].z() << " +- "
+                                    << std::sqrt((*vertexes)[i].covariance(2, 2));
+  }
+
+  // legacy logic: with no vertex but a valid beam spot, store a single vertex at the beam-spot position
+  if (vertexes->empty() && bsHandle.isValid()) {
+    const reco::BeamSpot &bs = *bsHandle;
+
+    GlobalError bse(bs.rotatedCovariance3D());
+    if ((bse.cxx() <= 0.) || (bse.cyy() <= 0.) || (bse.czz() <= 0.)) {
+      AlgebraicSymMatrix33 we;
+      we(0, 0) = 10000;
+      we(1, 1) = 10000;
+      we(2, 2) = 10000;
+      vertexes->push_back(reco::Vertex(bs.position(), we, 0., 0., 0));
+
+      edm::LogInfo("PixelVertexProducer") << "No vertices found. Beamspot with invalid errors " << bse.matrix()
+                                          << "\nWill put Vertex derived from dummy-fake BeamSpot into Event.\n"
+                                          << (*vertexes)[0].x() << "\n"
+                                          << (*vertexes)[0].y() << "\n"
+                                          << (*vertexes)[0].z() << "\n";
+    } else {
+      vertexes->push_back(reco::Vertex(bs.position(), bs.rotatedCovariance3D(), 0., 0., 0));
+
+      edm::LogInfo("PixelVertexProducer") << "No vertices found. Will put Vertex derived from BeamSpot into Event:\n"
+                                          << (*vertexes)[0].x() << "\n"
+                                          << (*vertexes)[0].y() << "\n"
+                                          << (*vertexes)[0].z() << "\n";
+    }
+  } else if (vertexes->empty() && !bsHandle.isValid()) {
+    edm::LogWarning("PixelVertexProducer") << "No beamspot and no vertex found. No vertex returned.";
+  }
+
+  iEvent.put(std::move(vertexes));
+}
+
+DEFINE_FWK_MODULE(PixelVertexProducerFromSoA);
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc
new file mode 100644
index 0000000000000..0cadf24580cf7
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc
@@ -0,0 +1,65 @@
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/Common/interface/HostProduct.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+
+class PixelVertexSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  explicit PixelVertexSoAFromCUDA(const edm::ParameterSet& iConfig);
+  ~PixelVertexSoAFromCUDA() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  void acquire(edm::Event const& iEvent,
+               edm::EventSetup const& iSetup,
+               edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+  void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
+  // input: vertices wrapped in a cms::cuda::Product; output: the same vertex type without the wrapper
+  edm::EDGetTokenT<cms::cuda::Product<ZVertexHeterogeneous>> tokenCUDA_;
+  edm::EDPutTokenT<ZVertexHeterogeneous> tokenSOA_;
+  // host buffer assigned in acquire() and moved into the event in produce()
+  cms::cuda::host::unique_ptr<ZVertexSoA> m_soa;
+};
+
+PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(const edm::ParameterSet& iConfig)
+    : tokenCUDA_(consumes<cms::cuda::Product<ZVertexHeterogeneous>>(iConfig.getParameter<edm::InputTag>("src"))),
+      tokenSOA_(produces<ZVertexHeterogeneous>()) {}  // the only configuration parameter read is "src"
+
+void PixelVertexSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription desc;
+  // single parameter "src" (default tag "pixelVertexCUDA"); description registered as "pixelVertexSoA"
+  desc.add<edm::InputTag>("src", edm::InputTag("pixelVertexCUDA"));
+  descriptions.add("pixelVertexSoA", desc);
+}
+
+void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent,
+                                     edm::EventSetup const& iSetup,
+                                     edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+  auto const& inputDataWrapped = iEvent.get(tokenCUDA_);
+  cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
+  auto const& inputData = ctx.get(inputDataWrapped);
+  // toHostAsync is given ctx.stream(); the host buffer it returns is kept in m_soa for produce()
+  m_soa = inputData.toHostAsync(ctx.stream());
+}
+
+void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
+  // No copies: the host buffer stored in m_soa by acquire() is moved into the output product
+  iEvent.emplace(tokenSOA_, ZVertexHeterogeneous(std::move(m_soa)));
+}
+
+DEFINE_FWK_MODULE(PixelVertexSoAFromCUDA);
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h
new file mode 100644
index 0000000000000..871b09599c903
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h
@@ -0,0 +1,234 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksByDensity_h
+#define RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksByDensity_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+#include "gpuVertexFinder.h"
+
+namespace gpuVertexFinder {
+
+  // this algo does not really scale as it works in a single block...
+  // enough for <10K tracks we have
+  //
+  // based on the Rodriguez & Laio algorithm
+  //
+  __device__ __forceinline__ void clusterTracksByDensity(gpuVertexFinder::ZVertices* pdata,
+                                                         gpuVertexFinder::WorkSpace* pws,
+                                                         int minT,      // min number of neighbours to be "seed"
+                                                         float eps,     // max absolute distance to cluster
+                                                         float errmax,  // max error to be "seed"
+                                                         float chi2max  // max normalized distance to cluster
+  ) {  // clusters the ws.ntrks tracks along z; the per-track cluster index is left in ws.iv
+    using namespace gpuVertexFinder;
+    constexpr bool verbose = false;  // in principle the compiler should optimize out if false
+
+    if (verbose && 0 == threadIdx.x)
+      printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);
+
+    auto er2mx = errmax * errmax;
+
+    auto& __restrict__ data = *pdata;
+    auto& __restrict__ ws = *pws;
+    auto nt = ws.ntrks;
+    float const* __restrict__ zt = ws.zt;
+    float const* __restrict__ ezt2 = ws.ezt2;
+
+    uint32_t& nvFinal = data.nvFinal;
+    uint32_t& nvIntermediate = ws.nvIntermediate;
+
+    uint8_t* __restrict__ izt = ws.izt;
+    int32_t* __restrict__ nn = data.ndof;  // data.ndof is used here to hold the neighbour count of each track
+    int32_t* __restrict__ iv = ws.iv;
+
+    assert(pdata);
+    assert(zt);
+
+    using Hist = HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
+    __shared__ Hist hist;
+    __shared__ typename Hist::Counter hws[32];
+    for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
+      hist.off[j] = 0;
+    }
+    __syncthreads();
+
+    if (verbose && 0 == threadIdx.x)
+      printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);
+
+    assert(nt <= hist.capacity());
+
+    // fill hist  (bin shall be wider than "eps")
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      assert(i < ZVertices::MAXTRACKS);
+      int iz = int(zt[i] * 10.);  // valid if eps<=0.1
+      // iz = std::clamp(iz, INT8_MIN, INT8_MAX);  // sorry c++17 only
+      iz = std::min(std::max(iz, INT8_MIN), INT8_MAX);
+      izt[i] = iz - INT8_MIN;
+      assert(iz - INT8_MIN >= 0);
+      assert(iz - INT8_MIN < 256);
+      hist.count(izt[i]);
+      iv[i] = i;
+      nn[i] = 0;
+    }
+    __syncthreads();
+    if (threadIdx.x < 32)
+      hws[threadIdx.x] = 0;  // used by prefix scan...
+    __syncthreads();
+    hist.finalize(hws);
+    __syncthreads();
+    assert(hist.size() == nt);
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      hist.fill(izt[i], uint16_t(i));
+    }
+    __syncthreads();
+
+    // count neighbours (only for tracks whose squared error does not exceed errmax squared)
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (ezt2[i] > er2mx)
+        continue;
+      auto loop = [&](uint32_t j) {
+        if (i == j)
+          return;
+        auto dist = std::abs(zt[i] - zt[j]);
+        if (dist > eps)
+          return;
+        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+          return;
+        nn[i]++;
+      };
+
+      forEachInBins(hist, izt[i], 1, loop);
+    }
+
+    __syncthreads();
+
+    // find closest above me .... (we ignore the possibility of two j at same distance from i)
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      float mdist = eps;
+      auto loop = [&](uint32_t j) {
+        if (nn[j] < nn[i])
+          return;
+        if (nn[j] == nn[i] && zt[j] >= zt[i])
+          return;  // if equal use natural order...
+        auto dist = std::abs(zt[i] - zt[j]);
+        if (dist > mdist)
+          return;
+        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+          return;  // (break natural order???)
+        mdist = dist;
+        iv[i] = j;  // assign to cluster (better be unique??)
+      };
+      forEachInBins(hist, izt[i], 1, loop);
+    }
+
+    __syncthreads();
+
+#ifdef GPU_DEBUG
+    //  mini verification
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (iv[i] != int(i))
+        assert(iv[iv[i]] != int(i));
+    }
+    __syncthreads();
+#endif
+
+    // consolidate graph (percolate index of seed)
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      auto m = iv[i];
+      while (m != iv[m])
+        m = iv[m];
+      iv[i] = m;
+    }
+
+#ifdef GPU_DEBUG
+    __syncthreads();
+    //  mini verification
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (iv[i] != int(i))
+        assert(iv[iv[i]] != int(i));
+    }
+#endif
+
+#ifdef GPU_DEBUG
+    // and verify that we did not split any cluster...
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      auto minJ = i;
+      auto mdist = eps;
+      auto loop = [&](uint32_t j) {
+        if (nn[j] < nn[i])
+          return;
+        if (nn[j] == nn[i] && zt[j] >= zt[i])
+          return;  // if equal use natural order...
+        auto dist = std::abs(zt[i] - zt[j]);
+        if (dist > mdist)
+          return;
+        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+          return;
+        mdist = dist;
+        minJ = j;
+      };
+      forEachInBins(hist, izt[i], 1, loop);
+      // should belong to the same cluster...
+      assert(iv[i] == iv[minJ]);
+      assert(nn[i] <= nn[iv[i]]);
+    }
+    __syncthreads();
+#endif
+
+    __shared__ unsigned int foundClusters;
+    foundClusters = 0;
+    __syncthreads();
+
+    // find the number of different clusters, identified by a track with iv[i] == i and at least minT neighbours;
+    // mark these tracks with a negative id.
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (iv[i] == int(i)) {
+        if (nn[i] >= minT) {
+          auto old = atomicInc(&foundClusters, 0xffffffff);
+          iv[i] = -(old + 1);
+        } else {  // noise
+          iv[i] = -9998;
+        }
+      }
+    }
+    __syncthreads();
+
+    assert(foundClusters < ZVertices::MAXVTX);
+
+    // propagate the negative id to all the tracks in the cluster.
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (iv[i] >= 0) {
+        // mark each track in a cluster with the same id as the first one
+        iv[i] = iv[iv[i]];
+      }
+    }
+    __syncthreads();
+
+    // adjust the cluster id to be a non-negative value starting from 0 (the noise marker -9998 becomes 9997)
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      iv[i] = -iv[i] - 1;
+    }
+    // every thread stores the same cluster count in both counters
+    nvIntermediate = nvFinal = foundClusters;
+
+    if (verbose && 0 == threadIdx.x)
+      printf("found %d proto vertices\n", foundClusters);
+  }
+
+  __global__ void clusterTracksByDensityKernel(gpuVertexFinder::ZVertices* pdata,
+                                               gpuVertexFinder::WorkSpace* pws,
+                                               int minT,      // min number of neighbours to be "seed"
+                                               float eps,     // max absolute distance to cluster
+                                               float errmax,  // max error to be "seed"
+                                               float chi2max  // max normalized distance to cluster
+  ) {
+    clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);  // kernel entry point: forwards all arguments
+  }
+
+}  // namespace gpuVertexFinder
+
+#endif  // RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksByDensity_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h
new file mode 100644
index 0000000000000..593c7597aecea
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h
@@ -0,0 +1,242 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksDBSCAN_h
+#define RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksDBSCAN_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+#include "gpuVertexFinder.h"
+
+namespace gpuVertexFinder {
+
+  // this algo does not really scale as it works in a single block...
+  // enough for <10K tracks we have
+  __global__ void clusterTracksDBSCAN(ZVertices* pdata,
+                                      WorkSpace* pws,
+                                      int minT,      // min number of neighbours to be "core"
+                                      float eps,     // max absolute distance to cluster
+                                      float errmax,  // max error to be "seed"
+                                      float chi2max  // max normalized distance to cluster
+  ) {  // clusters the ws.ntrks tracks along z; the per-track cluster index is left in ws.iv
+    constexpr bool verbose = false;  // in principle the compiler should optimize out if false
+
+    if (verbose && 0 == threadIdx.x)
+      printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);
+
+    auto er2mx = errmax * errmax;
+
+    auto& __restrict__ data = *pdata;
+    auto& __restrict__ ws = *pws;
+    auto nt = ws.ntrks;
+    float const* __restrict__ zt = ws.zt;
+    float const* __restrict__ ezt2 = ws.ezt2;
+
+    uint32_t& nvFinal = data.nvFinal;
+    uint32_t& nvIntermediate = ws.nvIntermediate;
+
+    uint8_t* __restrict__ izt = ws.izt;
+    int32_t* __restrict__ nn = data.ndof;  // data.ndof is used here to hold the neighbour count of each track
+    int32_t* __restrict__ iv = ws.iv;
+
+    assert(pdata);
+    assert(zt);
+
+    using Hist = HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
+    __shared__ Hist hist;
+    __shared__ typename Hist::Counter hws[32];
+    for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
+      hist.off[j] = 0;
+    }
+    __syncthreads();
+
+    if (verbose && 0 == threadIdx.x)
+      printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);
+
+    assert(nt <= hist.capacity());
+
+    // fill hist  (bin shall be wider than "eps")
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      assert(i < ZVertices::MAXTRACKS);
+      int iz = int(zt[i] * 10.);  // valid if eps<=0.1
+      // iz = std::clamp(iz, INT8_MIN, INT8_MAX);  // sorry c++17 only
+      iz = std::min(std::max(iz, INT8_MIN), INT8_MAX);
+      izt[i] = iz - INT8_MIN;
+      assert(iz - INT8_MIN >= 0);
+      assert(iz - INT8_MIN < 256);
+      hist.count(izt[i]);
+      iv[i] = i;
+      nn[i] = 0;
+    }
+    __syncthreads();
+    if (threadIdx.x < 32)
+      hws[threadIdx.x] = 0;  // used by prefix scan...
+    __syncthreads();
+    hist.finalize(hws);
+    __syncthreads();
+    assert(hist.size() == nt);
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      hist.fill(izt[i], uint16_t(i));
+    }
+    __syncthreads();
+
+    // count neighbours (only for tracks whose squared error does not exceed errmax squared)
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (ezt2[i] > er2mx)
+        continue;
+      auto loop = [&](uint32_t j) {
+        if (i == j)
+          return;
+        auto dist = std::abs(zt[i] - zt[j]);
+        if (dist > eps)
+          return;
+        //        if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return;
+        nn[i]++;
+      };
+
+      forEachInBins(hist, izt[i], 1, loop);
+    }
+
+    __syncthreads();
+
+    // find NN with smaller z...
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (nn[i] < minT)
+        continue;  // DBSCAN core rule
+      float mz = zt[i];
+      auto loop = [&](uint32_t j) {
+        if (zt[j] >= mz)
+          return;
+        if (nn[j] < minT)
+          return;  // DBSCAN core rule
+        auto dist = std::abs(zt[i] - zt[j]);
+        if (dist > eps)
+          return;
+        //        if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return;
+        mz = zt[j];
+        iv[i] = j;  // assign to cluster (better be unique??)
+      };
+      forEachInBins(hist, izt[i], 1, loop);
+    }
+
+    __syncthreads();
+
+#ifdef GPU_DEBUG
+    //  mini verification
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (iv[i] != int(i))
+        assert(iv[iv[i]] != int(i));
+    }
+    __syncthreads();
+#endif
+
+    // consolidate graph (percolate index of seed)
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      auto m = iv[i];
+      while (m != iv[m])
+        m = iv[m];
+      iv[i] = m;
+    }
+
+    __syncthreads();
+
+#ifdef GPU_DEBUG
+    //  mini verification
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (iv[i] != int(i))
+        assert(iv[iv[i]] != int(i));
+    }
+    __syncthreads();
+#endif
+
+#ifdef GPU_DEBUG
+    // and verify that we did not split any cluster...
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (nn[i] < minT)
+        continue;  // DBSCAN core rule
+      assert(zt[iv[i]] <= zt[i]);
+      auto loop = [&](uint32_t j) {
+        if (nn[j] < minT)
+          return;  // DBSCAN core rule
+        auto dist = std::abs(zt[i] - zt[j]);
+        if (dist > eps)
+          return;
+        //  if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return;
+        // they should belong to the same cluster, shouldn't they?
+        if (iv[i] != iv[j]) {
+          printf("ERROR %d %d %f %f %d\n", i, iv[i], zt[i], zt[iv[i]], iv[iv[i]]);
+          printf("      %d %d %f %f %d\n", j, iv[j], zt[j], zt[iv[j]], iv[iv[j]]);
+          ;
+        }
+        assert(iv[i] == iv[j]);
+      };
+      forEachInBins(hist, izt[i], 1, loop);
+    }
+    __syncthreads();
+#endif
+
+    // collect edges (assign to closest cluster of closest point??? here to closest point)
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      //    if (nn[i]==0 || nn[i]>=minT) continue;    // DBSCAN edge rule
+      if (nn[i] >= minT)
+        continue;  // DBSCAN edge rule
+      float mdist = eps;
+      auto loop = [&](uint32_t j) {
+        if (nn[j] < minT)
+          return;  // DBSCAN core rule
+        auto dist = std::abs(zt[i] - zt[j]);
+        if (dist > mdist)
+          return;
+        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+          return;  // needed?
+        mdist = dist;
+        iv[i] = iv[j];  // assign to cluster (better be unique??)
+      };
+      forEachInBins(hist, izt[i], 1, loop);
+    }
+
+    __shared__ unsigned int foundClusters;
+    foundClusters = 0;
+    __syncthreads();
+
+    // find the number of different clusters, identified by a track with iv[i] == i and at least minT neighbours;
+    // mark these tracks with a negative id.
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (iv[i] == int(i)) {
+        if (nn[i] >= minT) {
+          auto old = atomicInc(&foundClusters, 0xffffffff);
+          iv[i] = -(old + 1);
+        } else {  // noise
+          iv[i] = -9998;
+        }
+      }
+    }
+    __syncthreads();
+
+    assert(foundClusters < ZVertices::MAXVTX);
+
+    // propagate the negative id to all the tracks in the cluster.
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (iv[i] >= 0) {
+        // mark each track in a cluster with the same id as the first one
+        iv[i] = iv[iv[i]];
+      }
+    }
+    __syncthreads();
+
+    // adjust the cluster id to be a non-negative value starting from 0 (the noise marker -9998 becomes 9997)
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      iv[i] = -iv[i] - 1;
+    }
+    // every thread stores the same cluster count in both counters
+    nvIntermediate = nvFinal = foundClusters;
+
+    if (verbose && 0 == threadIdx.x)
+      printf("found %d proto vertices\n", foundClusters);
+  }
+
+}  // namespace gpuVertexFinder
+
+#endif  // RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksDBSCAN_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h
new file mode 100644
index 0000000000000..14c825f353960
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h
@@ -0,0 +1,213 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksIterative_h
+#define RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksIterative_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+#include "gpuVertexFinder.h"
+
+namespace gpuVertexFinder {
+
+  // this algo does not really scale as it works in a single block...
+  // enough for <10K tracks we have
+  __global__ void clusterTracksIterative(ZVertices* pdata,
+                                         WorkSpace* pws,
+                                         int minT,      // min number of neighbours to be "core"
+                                         float eps,     // max absolute distance to cluster
+                                         float errmax,  // max error to be "seed"
+                                         float chi2max  // max normalized distance to cluster
+  ) {  // clusters the ws.ntrks tracks along z; the per-track cluster index is left in ws.iv
+    constexpr bool verbose = false;  // in principle the compiler should optimize out if false
+
+    if (verbose && 0 == threadIdx.x)
+      printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);
+
+    auto er2mx = errmax * errmax;
+
+    auto& __restrict__ data = *pdata;
+    auto& __restrict__ ws = *pws;
+    auto nt = ws.ntrks;
+    float const* __restrict__ zt = ws.zt;
+    float const* __restrict__ ezt2 = ws.ezt2;
+
+    uint32_t& nvFinal = data.nvFinal;
+    uint32_t& nvIntermediate = ws.nvIntermediate;
+
+    uint8_t* __restrict__ izt = ws.izt;
+    int32_t* __restrict__ nn = data.ndof;  // data.ndof is used here to hold the neighbour count of each track
+    int32_t* __restrict__ iv = ws.iv;
+
+    assert(pdata);
+    assert(zt);
+
+    using Hist = HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
+    __shared__ Hist hist;
+    __shared__ typename Hist::Counter hws[32];
+    for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
+      hist.off[j] = 0;
+    }
+    __syncthreads();
+
+    if (verbose && 0 == threadIdx.x)
+      printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);
+
+    assert(nt <= hist.capacity());
+
+    // fill hist  (bin shall be wider than "eps")
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      assert(i < ZVertices::MAXTRACKS);
+      int iz = int(zt[i] * 10.);  // valid if eps<=0.1
+      // iz = std::clamp(iz, INT8_MIN, INT8_MAX);  // sorry c++17 only
+      iz = std::min(std::max(iz, INT8_MIN), INT8_MAX);
+      izt[i] = iz - INT8_MIN;
+      assert(iz - INT8_MIN >= 0);
+      assert(iz - INT8_MIN < 256);
+      hist.count(izt[i]);
+      iv[i] = i;
+      nn[i] = 0;
+    }
+    __syncthreads();
+    if (threadIdx.x < 32)
+      hws[threadIdx.x] = 0;  // used by prefix scan...
+    __syncthreads();
+    hist.finalize(hws);
+    __syncthreads();
+    assert(hist.size() == nt);
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      hist.fill(izt[i], uint16_t(i));
+    }
+    __syncthreads();
+
+    // count neighbours (only for tracks whose squared error does not exceed errmax squared)
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (ezt2[i] > er2mx)
+        continue;
+      auto loop = [&](uint32_t j) {
+        if (i == j)
+          return;
+        auto dist = std::abs(zt[i] - zt[j]);
+        if (dist > eps)
+          return;
+        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+          return;
+        nn[i]++;
+      };
+
+      forEachInBins(hist, izt[i], 1, loop);
+    }
+
+    // per-thread pass counter: __syncthreads_or gives every thread the same number of passes, so no sharing needed
+    int nloops = 0;
+
+    __syncthreads();
+
+    // cluster seeds only: even passes link neighbouring core tracks, odd passes follow the links to their end
+    bool more = true;
+    while (__syncthreads_or(more)) {
+      if (1 == nloops % 2) {
+        for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+          auto m = iv[i];
+          while (m != iv[m])
+            m = iv[m];
+          iv[i] = m;
+        }
+      } else {
+        more = false;
+        for (auto k = threadIdx.x; k < hist.size(); k += blockDim.x) {
+          auto p = hist.begin() + k;
+          auto i = (*p);
+          auto be = std::min(Hist::bin(izt[i]) + 1, int(hist.nbins() - 1));
+          if (nn[i] < minT)
+            continue;  // DBSCAN core rule
+          auto loop = [&](uint32_t j) {
+            assert(i != j);
+            if (nn[j] < minT)
+              return;  // DBSCAN core rule
+            auto dist = std::abs(zt[i] - zt[j]);
+            if (dist > eps)
+              return;
+            if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+              return;
+            auto old = atomicMin(&iv[j], iv[i]);
+            if (old != iv[i]) {
+              // end the loop only if no changes were applied
+              more = true;
+            }
+            atomicMin(&iv[i], old);
+          };
+          ++p;
+          for (; p < hist.end(be); ++p)
+            loop(*p);
+        }  // for i
+      }
+      // a shared counter bumped by thread 0 here could be seen by threads that have not yet tested the parity above
+      ++nloops;
+    }  // while
+
+    // collect edges (assign to closest cluster of closest point??? here to closest point)
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      //    if (nn[i]==0 || nn[i]>=minT) continue;    // DBSCAN edge rule
+      if (nn[i] >= minT)
+        continue;  // DBSCAN edge rule
+      float mdist = eps;
+      auto loop = [&](int j) {
+        if (nn[j] < minT)
+          return;  // DBSCAN core rule
+        auto dist = std::abs(zt[i] - zt[j]);
+        if (dist > mdist)
+          return;
+        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+          return;  // needed?
+        mdist = dist;
+        iv[i] = iv[j];  // assign to cluster (better be unique??)
+      };
+      forEachInBins(hist, izt[i], 1, loop);
+    }
+
+    __shared__ unsigned int foundClusters;
+    foundClusters = 0;
+    __syncthreads();
+
+    // find the number of different clusters, identified by a track with iv[i] == i and at least minT neighbours;
+    // mark these tracks with a negative id.
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (iv[i] == int(i)) {
+        if (nn[i] >= minT) {
+          auto old = atomicInc(&foundClusters, 0xffffffff);
+          iv[i] = -(old + 1);
+        } else {  // noise
+          iv[i] = -9998;
+        }
+      }
+    }
+    __syncthreads();
+
+    assert(foundClusters < ZVertices::MAXVTX);
+
+    // propagate the negative id to all the tracks in the cluster.
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (iv[i] >= 0) {
+        // mark each track in a cluster with the same id as the first one
+        iv[i] = iv[iv[i]];
+      }
+    }
+    __syncthreads();
+
+    // adjust the cluster id to be a non-negative value starting from 0 (the noise marker -9998 becomes 9997)
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      iv[i] = -iv[i] - 1;
+    }
+    // every thread stores the same cluster count in both counters
+    nvIntermediate = nvFinal = foundClusters;
+
+    if (verbose && 0 == threadIdx.x)
+      printf("found %d proto vertices\n", foundClusters);
+  }
+
+}  // namespace gpuVertexFinder
+
+#endif  // RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksIterative_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h
new file mode 100644
index 0000000000000..4487cb12ea17b
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h
@@ -0,0 +1,113 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuFitVertices_h
+#define RecoPixelVertexing_PixelVertexFinding_src_gpuFitVertices_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+#include "gpuVertexFinder.h"
+
+namespace gpuVertexFinder {
+
+  __device__ __forceinline__ void fitVertices(ZVertices* pdata,  // vertex SoA: zv, wv, chi2, ndof and nvFinal are written
+                                              WorkSpace* pws,  // per-track zt, ezt2 and the track-to-vertex map iv
+                                              float chi2Max  // for outlier rejection
+  ) {
+    constexpr bool verbose = false;  // in principle the compiler should optimize out if false
+
+    auto& __restrict__ data = *pdata;
+    auto& __restrict__ ws = *pws;
+    auto nt = ws.ntrks;
+    float const* __restrict__ zt = ws.zt;
+    float const* __restrict__ ezt2 = ws.ezt2;
+    float* __restrict__ zv = data.zv;
+    float* __restrict__ wv = data.wv;
+    float* __restrict__ chi2 = data.chi2;
+    uint32_t& nvFinal = data.nvFinal;
+    uint32_t& nvIntermediate = ws.nvIntermediate;
+
+    int32_t* __restrict__ nn = data.ndof;  // ndof array, filled below by counting the tracks of each vertex
+    int32_t* __restrict__ iv = ws.iv;
+
+    assert(pdata);
+    assert(zt);
+
+    assert(nvFinal <= nvIntermediate);
+    nvFinal = nvIntermediate;  // fit every vertex currently counted in the workspace
+    auto foundClusters = nvFinal;
+
+    // zero the per-vertex accumulators (position sum, weight sum, chi2)
+    for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x) {
+      zv[i] = 0;
+      wv[i] = 0;
+      chi2[i] = 0;
+    }
+
+    // counter of skipped tracks, only used when verbose (only for test)
+    __shared__ int noise;
+    if (verbose && 0 == threadIdx.x)
+      noise = 0;
+
+    __syncthreads();
+
+    // compute cluster location: accumulate sum(z * w) and sum(w) for each vertex
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (iv[i] > 9990) {  // ids above 9990 are not vertex indices (e.g. 9999 set below for outliers)
+        if (verbose)
+          atomicAdd(&noise, 1);
+        continue;
+      }
+      assert(iv[i] >= 0);
+      assert(iv[i] < int(foundClusters));
+      auto w = 1.f / ezt2[i];  // weight is the inverse of the squared error on z
+      atomicAdd(&zv[iv[i]], zt[i] * w);
+      atomicAdd(&wv[iv[i]], w);
+    }
+
+    __syncthreads();
+    // turn the sums into the weighted mean; reuse nn as the per-vertex ndof counter
+    for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x) {
+      assert(wv[i] > 0.f);
+      zv[i] /= wv[i];
+      nn[i] = -1;  // ndof: starts at -1 so that n accepted tracks give n - 1
+    }
+    __syncthreads();
+
+    // compute chi2 per vertex, flagging tracks whose contribution exceeds chi2Max
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (iv[i] > 9990)
+        continue;
+
+      auto c2 = zv[iv[i]] - zt[i];
+      c2 *= c2 / ezt2[i];  // (zv - zt)^2 / ezt2
+      if (c2 > chi2Max) {
+        iv[i] = 9999;  // outlier: no longer associated to its vertex
+        continue;
+      }
+      atomicAdd(&chi2[iv[i]], c2);
+      atomicAdd(&nn[iv[i]], 1);
+    }
+    __syncthreads();
+    for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x)
+      if (nn[i] > 0)
+        wv[i] *= float(nn[i]) / chi2[i];  // rescale the weight by ndof / chi2
+
+    if (verbose && 0 == threadIdx.x)
+      printf("found %d proto clusters ", foundClusters);
+    if (verbose && 0 == threadIdx.x)
+      printf("and %d noise\n", noise);
+  }
+
+  __global__ void fitVerticesKernel(ZVertices* pdata,  // kernel entry point wrapping fitVertices
+                                    WorkSpace* pws,
+                                    float chi2Max  // for outlier rejection
+  ) {
+    fitVertices(pdata, pws, chi2Max);  // arguments are forwarded unchanged
+  }
+
+}  // namespace gpuVertexFinder
+
+#endif  // RecoPixelVertexing_PixelVertexFinding_src_gpuFitVertices_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h
new file mode 100644
index 0000000000000..89cc9a3844f76
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h
@@ -0,0 +1,73 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuSortByPt2_h
+#define RecoPixelVertexing_PixelVertexFinding_src_gpuSortByPt2_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+#ifdef __CUDA_ARCH__
+#include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h"
+#endif
+
+#include "gpuVertexFinder.h"
+
+namespace gpuVertexFinder {
+
+  __device__ __forceinline__ void sortByPt2(ZVertices* pdata, WorkSpace* pws) {  // fills idv, ptv2 and sortInd
+    auto& __restrict__ data = *pdata;
+    auto& __restrict__ ws = *pws;
+    auto nt = ws.ntrks;
+    float const* __restrict__ ptt2 = ws.ptt2;
+    uint32_t const& nvFinal = data.nvFinal;
+
+    int32_t const* __restrict__ iv = ws.iv;
+    float* __restrict__ ptv2 = data.ptv2;
+    uint16_t* __restrict__ sortInd = data.sortInd;
+
+    // if (threadIdx.x == 0)
+    //    printf("sorting %d vertices\n",nvFinal);
+
+    if (nvFinal < 1)  // no vertices: nothing is written
+      return;
+
+    // fill indexing: store the vertex id of each selected track at its original track index
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      data.idv[ws.itrk[i]] = iv[i];
+    }
+
+    // can be done asynchronously at the end of previous event
+    for (auto i = threadIdx.x; i < nvFinal; i += blockDim.x) {
+      ptv2[i] = 0;
+    }
+    __syncthreads();
+
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (iv[i] > 9990)  // ids above 9990 are not vertex indices (fitVertices uses 9999 for outliers)
+        continue;
+      atomicAdd(&ptv2[iv[i]], ptt2[i]);  // sum of track pt^2 per vertex
+    }
+    __syncthreads();
+
+    if (1 == nvFinal) {  // single vertex: the ordering is trivial
+      if (threadIdx.x == 0)
+        sortInd[0] = 0;
+      return;
+    }
+#ifdef __CUDA_ARCH__
+    __shared__ uint16_t sws[1024];
+    // sort using only 16 bits
+    radixSort<float, 2>(ptv2, sortInd, sws, nvFinal);
+#else  // not compiling device code: index sort with std::sort
+    for (uint16_t i = 0; i < nvFinal; ++i)
+      sortInd[i] = i;
+    std::sort(sortInd, sortInd + nvFinal, [&](auto i, auto j) { return ptv2[i] < ptv2[j]; });  // ascending ptv2
+#endif
+  }
+
+  __global__ void sortByPt2Kernel(ZVertices* pdata, WorkSpace* pws) { sortByPt2(pdata, pws); }  // kernel entry point wrapping sortByPt2
+
+}  // namespace gpuVertexFinder
+
+#endif  // RecoPixelVertexing_PixelVertexFinding_src_gpuSortByPt2_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h
new file mode 100644
index 0000000000000..694915ab02157
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h
@@ -0,0 +1,139 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuSplitVertices_h
+#define RecoPixelVertexing_PixelVertexFinding_src_gpuSplitVertices_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+#include "gpuVertexFinder.h"
+
+namespace gpuVertexFinder {
+
+  __device__ __forceinline__ void splitVertices(ZVertices* pdata, WorkSpace* pws, float maxChi2) {  // tries to split badly fitted vertices in two
+    constexpr bool verbose = false;  // in principle the compiler should optimize out if false
+
+    auto& __restrict__ data = *pdata;
+    auto& __restrict__ ws = *pws;
+    auto nt = ws.ntrks;
+    float const* __restrict__ zt = ws.zt;
+    float const* __restrict__ ezt2 = ws.ezt2;
+    float* __restrict__ zv = data.zv;
+    float* __restrict__ wv = data.wv;
+    float const* __restrict__ chi2 = data.chi2;
+    uint32_t& nvFinal = data.nvFinal;
+
+    int32_t const* __restrict__ nn = data.ndof;
+    int32_t* __restrict__ iv = ws.iv;
+
+    assert(pdata);
+    assert(zt);
+
+    // one vertex per block
+    for (auto kv = blockIdx.x; kv < nvFinal; kv += gridDim.x) {
+      if (nn[kv] < 4)  // too few degrees of freedom to attempt a split
+        continue;
+      if (chi2[kv] < maxChi2 * float(nn[kv]))  // chi2/ndof below maxChi2: leave the vertex alone
+        continue;
+
+      constexpr int MAXTK = 512;
+      assert(nn[kv] < MAXTK);
+      if (nn[kv] >= MAXTK)
+        continue;                      // too bad FIXME
+      __shared__ uint32_t it[MAXTK];   // track index
+      __shared__ float zz[MAXTK];      // z pos
+      __shared__ uint8_t newV[MAXTK];  // 0 or 1
+      __shared__ float ww[MAXTK];      // z weight
+
+      __shared__ uint32_t nq;  // number of tracks for this vertex
+      nq = 0;
+      __syncthreads();
+
+      // copy the tracks of this vertex to the shared arrays
+      for (auto k = threadIdx.x; k < nt; k += blockDim.x) {
+        if (iv[k] == int(kv)) {
+          auto old = atomicInc(&nq, MAXTK);
+          zz[old] = zt[k] - zv[kv];  // z relative to the fitted vertex position
+          newV[old] = zz[old] < 0 ? 0 : 1;  // initial group chosen by the sign of the offset
+          ww[old] = 1.f / ezt2[k];
+          it[old] = k;
+        }
+      }
+
+      __shared__ float znew[2], wnew[2];  // the new vertices
+
+      __syncthreads();
+      assert(int(nq) == nn[kv] + 1);  // ndof is the number of tracks minus one
+
+      int maxiter = 20;
+      // kt-min....: alternate between computing the two weighted means and reassigning tracks to the nearest one
+      bool more = true;
+      while (__syncthreads_or(more)) {  // continue while any thread changed an assignment
+        more = false;
+        if (0 == threadIdx.x) {
+          znew[0] = 0;
+          znew[1] = 0;
+          wnew[0] = 0;
+          wnew[1] = 0;
+        }
+        __syncthreads();
+        for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
+          auto i = newV[k];
+          atomicAdd(&znew[i], zz[k] * ww[k]);
+          atomicAdd(&wnew[i], ww[k]);
+        }
+        __syncthreads();
+        if (0 == threadIdx.x) {
+          znew[0] /= wnew[0];
+          znew[1] /= wnew[1];
+        }
+        __syncthreads();
+        for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
+          auto d0 = fabs(zz[k] - znew[0]);
+          auto d1 = fabs(zz[k] - znew[1]);
+          auto newer = d0 < d1 ? 0 : 1;  // index of the closer candidate
+          more |= newer != newV[k];
+          newV[k] = newer;
+        }
+        --maxiter;
+        if (maxiter <= 0)  // give up after 20 passes
+          more = false;
+      }
+
+      // avoid empty vertices
+      if (0 == wnew[0] || 0 == wnew[1])
+        continue;
+
+      // quality cut: squared separation of the two candidates over the sum of their inverse weights
+      auto dist2 = (znew[0] - znew[1]) * (znew[0] - znew[1]);
+
+      auto chi2Dist = dist2 / (1.f / wnew[0] + 1.f / wnew[1]);
+
+      if (verbose && 0 == threadIdx.x)
+        printf("inter %d %f %f\n", 20 - maxiter, chi2Dist, dist2 * wv[kv]);
+
+      if (chi2Dist < 4)  // candidates too close: keep the original vertex
+        continue;
+
+      // get a new global vertex: its index is the previous value of the workspace counter
+      __shared__ uint32_t igv;
+      if (0 == threadIdx.x)
+        igv = atomicAdd(&ws.nvIntermediate, 1);
+      __syncthreads();
+      for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
+        if (1 == newV[k])  // tracks of group 1 move to the new vertex, group 0 stays on kv
+          iv[it[k]] = igv;
+      }
+
+    }  // loop on vertices
+  }
+
+  __global__ void splitVerticesKernel(ZVertices* pdata, WorkSpace* pws, float maxChi2) {  // kernel entry point wrapping splitVertices
+    splitVertices(pdata, pws, maxChi2);  // arguments are forwarded unchanged
+  }
+
+}  // namespace gpuVertexFinder
+
+#endif  // RecoPixelVertexing_PixelVertexFinding_src_gpuSplitVertices_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
new file mode 100644
index 0000000000000..084763385bdb4
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
@@ -0,0 +1 @@
+#include "gpuVertexFinderImpl.h"
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cu b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cu
new file mode 100644
index 0000000000000..084763385bdb4
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cu
@@ -0,0 +1 @@
+#include "gpuVertexFinderImpl.h"
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
new file mode 100644
index 0000000000000..6cd86c93a6737
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
@@ -0,0 +1,83 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuVertexFinder_h
+#define RecoPixelVertexing_PixelVertexFinding_src_gpuVertexFinder_h
+
+#include <cstddef>
+#include <cstdint>
+
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+
+namespace gpuVertexFinder {
+
+  using ZVertices = ZVertexSoA;
+  using TkSoA = pixelTrack::TrackSoA;
+
+  // workspace used in the vertex reco algos
+  struct WorkSpace {  // per-event scratch data shared by the vertex reco algos
+    static constexpr uint32_t MAXTRACKS = ZVertexSoA::MAXTRACKS;
+    static constexpr uint32_t MAXVTX = ZVertexSoA::MAXVTX;
+
+    uint32_t ntrks;            // number of "selected tracks"
+    uint16_t itrk[MAXTRACKS];  // index of original track
+    float zt[MAXTRACKS];       // input track z at bs
+    float ezt2[MAXTRACKS];     // input error^2 on the above
+    float ptt2[MAXTRACKS];     // input pt^2 on the above
+    uint8_t izt[MAXTRACKS];    // integerized z-position of input tracks
+    int32_t iv[MAXTRACKS];     // vertex index for each associated track
+
+    uint32_t nvIntermediate;  // the number of vertices after splitting, pruning, etc.
+
+    __host__ __device__ void init() {  // resets the two counters only; the arrays are left untouched
+      ntrks = 0;
+      nvIntermediate = 0;
+    }
+  };
+
+  __global__ void init(ZVertexSoA* pdata, WorkSpace* pws) {  // resets the output SoA and the workspace
+    pdata->init();
+    pws->init();  // zeroes ntrks and nvIntermediate
+  }
+
+  class Producer {  // holds the configuration and runs the vertex finding chain
+  public:
+    using ZVertices = ZVertexSoA;
+    using WorkSpace = gpuVertexFinder::WorkSpace;
+    using TkSoA = pixelTrack::TrackSoA;
+
+    Producer(bool oneKernel,     // request the single-kernel chain (see oneKernel_ below)
+             bool useDensity,    // cluster with clusterTracksByDensity
+             bool useDBSCAN,     // cluster with clusterTracksDBSCAN
+             bool useIterative,  // cluster with clusterTracksIterative
+             int iminT,      // min number of neighbours to be "core"
+             float ieps,     // max absolute distance to cluster
+             float ierrmax,  // max error to be "seed"
+             float ichi2max  // max normalized distance to cluster
+             )
+        : oneKernel_(oneKernel && !(useDBSCAN || useIterative)),  // dropped when DBSCAN or iterative is requested
+          useDensity_(useDensity),
+          useDBSCAN_(useDBSCAN),
+          useIterative_(useIterative),
+          minT(iminT),
+          eps(ieps),
+          errmax(ierrmax),
+          chi2max(ichi2max) {}
+
+    ~Producer() = default;
+
+    ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const;  // defined when __CUDACC__ is set
+    ZVertexHeterogeneous make(TkSoA const* tksoa, float ptMin) const;  // defined when __CUDACC__ is not set
+
+  private:
+    const bool oneKernel_;
+    const bool useDensity_;
+    const bool useDBSCAN_;
+    const bool useIterative_;
+
+    int minT;       // min number of neighbours to be "core"
+    float eps;      // max absolute distance to cluster
+    float errmax;   // max error to be "seed"
+    float chi2max;  // max normalized distance to cluster
+  };
+
+}  // namespace gpuVertexFinder
+
+#endif  // RecoPixelVertexing_PixelVertexFinding_src_gpuVertexFinder_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
new file mode 100644
index 0000000000000..10c487d588e9d
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
@@ -0,0 +1,173 @@
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+#include "gpuClusterTracksByDensity.h"
+#include "gpuClusterTracksDBSCAN.h"
+#include "gpuClusterTracksIterative.h"
+#include "gpuFitVertices.h"
+#include "gpuSortByPt2.h"
+#include "gpuSplitVertices.h"
+
+namespace gpuVertexFinder {
+
+  __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin) {  // selects tracks and copies them to the workspace
+    assert(ptracks);
+    assert(soa);
+    auto const& tracks = *ptracks;
+    auto const& fit = tracks.stateAtBS;
+    auto const* quality = tracks.qualityData();
+
+    auto first = blockIdx.x * blockDim.x + threadIdx.x;  // global thread index
+    for (int idx = first, nt = TkSoA::stride(); idx < nt; idx += gridDim.x * blockDim.x) {
+      auto nHits = tracks.nHits(idx);
+      if (nHits == 0)
+        break;  // this is a guard: maybe we need to move to nTracks...
+
+      // initialize soa: the vertex index of this track defaults to -1
+      soa->idv[idx] = -1;
+
+      if (nHits < 4)
+        continue;  // no triplets
+      if (quality[idx] != trackQuality::loose)  // only tracks whose quality is exactly loose are used
+        continue;
+
+      auto pt = tracks.pt(idx);
+
+      if (pt < ptMin)
+        continue;
+
+      auto& data = *pws;
+      auto it = atomicAdd(&data.ntrks, 1);  // take the next free slot in the workspace
+      data.itrk[it] = idx;  // NOTE(review): 'it' is not checked against WorkSpace::MAXTRACKS - confirm stride() cannot exceed it
+      data.zt[it] = tracks.zip(idx);
+      data.ezt2[it] = fit.covariance(idx)(14);  // NOTE(review): element 14 is presumably the variance of z - confirm
+      data.ptt2[it] = pt * pt;
+    }
+  }
+
+// #define THREE_KERNELS
+#ifndef THREE_KERNELS
+  __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,  // whole chain in one kernel
+                                        gpuVertexFinder::WorkSpace* pws,
+                                        int minT,      // min number of neighbours to be "seed"
+                                        float eps,     // max absolute distance to cluster
+                                        float errmax,  // max error to be "seed"
+                                        float chi2max  // max normalized distance to cluster,
+  ) {
+    clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
+    __syncthreads();
+    fitVertices(pdata, pws, 50.);  // first fit, outlier cut at 50
+    __syncthreads();
+    splitVertices(pdata, pws, 9.f);  // split candidates whose chi2/ndof is not below 9
+    __syncthreads();
+    fitVertices(pdata, pws, 5000.);  // refit including the vertices added by the split, outlier cut at 5000
+    __syncthreads();
+    sortByPt2(pdata, pws);
+  }
+#else
+  __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata,  // clustering followed by the first fit
+                                      gpuVertexFinder::WorkSpace* pws,
+                                      int minT,      // min number of neighbours to be "seed"
+                                      float eps,     // max absolute distance to cluster
+                                      float errmax,  // max error to be "seed"
+                                      float chi2max  // max normalized distance to cluster,
+  ) {
+    clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
+    __syncthreads();
+    fitVertices(pdata, pws, 50.);  // outlier cut at 50
+  }
+
+  __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) {  // final fit and sorting
+    fitVertices(pdata, pws, 5000.);  // outlier cut at 5000
+    __syncthreads();
+    sortByPt2(pdata, pws);
+  }
+#endif
+
+#ifdef __CUDACC__
+  ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const {
+    // std::cout << "producing Vertices on GPU" << std::endl;
+    ZVertexHeterogeneous vertices(cms::cuda::make_device_unique<ZVertexSoA>(stream));
+#else
+  ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const {
+    // std::cout << "producing Vertices on  CPU" <<    std::endl;
+    ZVertexHeterogeneous vertices(std::make_unique<ZVertexSoA>());
+#endif
+    assert(tksoa);
+    auto* soa = vertices.get();
+    assert(soa);
+
+#ifdef __CUDACC__
+    auto ws_d = cms::cuda::make_device_unique<WorkSpace>(stream);
+#else
+    auto ws_d = std::make_unique<WorkSpace>();
+#endif
+
+#ifdef __CUDACC__
+    init<<<1, 1, 0, stream>>>(soa, ws_d.get());  // reset the output SoA and the workspace counters
+    auto blockSize = 128;
+    auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize;  // stride / blockSize, rounded up
+    loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tksoa, soa, ws_d.get(), ptMin);
+    cudaCheck(cudaGetLastError());
+#else
+    cudaCompat::resetGrid();
+    init(soa, ws_d.get());
+    loadTracks(tksoa, soa, ws_d.get(), ptMin);
+#endif
+
+#ifdef __CUDACC__
+    if (oneKernel_) {
+      // implemented only for density clusters
+#ifndef THREE_KERNELS
+      vertexFinderOneKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+#else
+      vertexFinderKernel1<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+      cudaCheck(cudaGetLastError());
+      // one block per vertex...
+      splitVerticesKernel<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f);
+      cudaCheck(cudaGetLastError());
+      vertexFinderKernel2<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get());
+#endif
+    } else {  // five kernels
+      if (useDensity_) {  // the first enabled flag wins: density, then DBSCAN, then iterative
+        clusterTracksByDensityKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+      } else if (useDBSCAN_) {
+        clusterTracksDBSCAN<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+      } else if (useIterative_) {
+        clusterTracksIterative<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+      }
+      cudaCheck(cudaGetLastError());
+      fitVerticesKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 50.);
+      cudaCheck(cudaGetLastError());
+      // one block per vertex...
+      splitVerticesKernel<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f);
+      cudaCheck(cudaGetLastError());
+      fitVerticesKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 5000.);
+      cudaCheck(cudaGetLastError());
+      sortByPt2Kernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get());
+    }
+    cudaCheck(cudaGetLastError());
+#else  // __CUDACC__
+    if (useDensity_) {  // the first enabled flag wins: density, then DBSCAN, then iterative
+      clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max);
+    } else if (useDBSCAN_) {
+      clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max);
+    } else if (useIterative_) {
+      clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max);
+    }
+    // std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl;
+    fitVertices(soa, ws_d.get(), 50.);
+    // one block per vertex! (here a single block, index 0, loops over all the vertices)
+    blockIdx.x = 0;
+    gridDim.x = 1;
+    splitVertices(soa, ws_d.get(), 9.f);
+    resetGrid();
+    fitVertices(soa, ws_d.get(), 5000.);
+    sortByPt2(soa, ws_d.get());
+#endif
+
+    return vertices;
+  }
+
+}  // namespace gpuVertexFinder
+
+#undef FROM
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
index 5b7a1b6eadd0c..f293c78709723 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
+++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
@@ -8,18 +8,18 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 #ifdef USE_DBSCAN
-#include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksDBSCAN.h"
+#include "../plugins/gpuClusterTracksDBSCAN.h"
 #define CLUSTERIZE clusterTracksDBSCAN
 #elif USE_ITERATIVE
-#include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksIterative.h"
+#include "../plugins/gpuClusterTracksIterative.h"
 #define CLUSTERIZE clusterTracksIterative
 #else
-#include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksByDensity.h"
+#include "../plugins/gpuClusterTracksByDensity.h"
 #define CLUSTERIZE clusterTracksByDensityKernel
 #endif
-#include "RecoPixelVertexing/PixelVertexFinding/src/gpuFitVertices.h"
-#include "RecoPixelVertexing/PixelVertexFinding/src/gpuSortByPt2.h"
-#include "RecoPixelVertexing/PixelVertexFinding/src/gpuSplitVertices.h"
+#include "../plugins/gpuFitVertices.h"
+#include "../plugins/gpuSortByPt2.h"
+#include "../plugins/gpuSplitVertices.h"
 
 #ifdef ONE_KERNEL
 #ifdef __CUDACC__

From 5cdde61b152368223331d901617a9748813f82d5 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Thu, 5 Mar 2020 11:00:41 +0100
Subject: [PATCH 31/50] Synchronise with CMSSW_11_1_0_pre4

---
 .../Configuration/python/DQMOffline_SecondStep_cff.py  | 10 +---------
 DQMOffline/Configuration/python/DQMOffline_cff.py      |  5 +++--
 2 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
index ad00af3abdf74..ee175cc2027ea 100644
--- a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
@@ -1,7 +1,6 @@
 import FWCore.ParameterSet.Config as cms
 
 from DQMServices.Components.DQMMessageLoggerClient_cff import *
-from DQMServices.Components.DQMDcsInfoClient_cfi import *
 from DQMServices.Components.DQMFastTimerServiceClient_cfi import *
 
 from DQMOffline.Ecal.ecal_dqm_client_offline_cff import *
@@ -17,8 +16,6 @@
 from DQMOffline.L1Trigger.L1TriggerDqmOffline_cff import *
 from DQM.SiTrackerPhase2.Phase2TrackerDQMHarvesting_cff import *
 
-DQMOffline_SecondStepDCS = cms.Sequence( dqmDcsInfoClient )
-
 DQMOffline_SecondStepEcal = cms.Sequence( ecal_dqm_client_offline *
 					  es_dqm_client_offline )
 
@@ -38,7 +35,7 @@
 
 DQMOffline_SecondStepL1T = cms.Sequence( l1TriggerDqmOfflineClient )
 
-DQMOffline_SecondStep_PreDPG = cms.Sequence( DQMOffline_SecondStepDCS *
+DQMOffline_SecondStep_PreDPG = cms.Sequence( 
                                              DQMOffline_SecondStepEcal *
                                              DQMOffline_SecondStepTrackerStrip *
 					     DQMOffline_SecondStepTrackerPixel *
@@ -119,8 +116,6 @@
 from DQM.TrackingMonitorClient.pixelTrackingEffFromHitPattern_cff import *
 from DQM.TrackingMonitorClient.pixelVertexResolutionClient_cfi import *
 
-DQMHarvestDCS = cms.Sequence ( dqmDcsInfoClient )
-
 DQMHarvestTrackerStrip = cms.Sequence ( SiStripOfflineDQMClient )
 
 DQMHarvestTrackerPixel = cms.Sequence ( PixelOfflineDQMClientNoDataCertification )
@@ -143,7 +138,6 @@
 DQMHarvestL1TMuon = cms.Sequence( l1TriggerMuonDqmOfflineClient )
 
 DQMHarvestCommon = cms.Sequence( DQMMessageLoggerClientSeq *
-                                 DQMHarvestDCS *
                                  DQMHarvestTrackerStrip *
                                  DQMHarvestTrack *
                                  DQMHarvestTrackerPixel *
@@ -159,7 +153,6 @@
 
 DQMHarvestCommonSiStripZeroBias = cms.Sequence(
                                                DQMMessageLoggerClientSeq *
-                                               DQMHarvestDCS *
                                                DQMHarvestTrackerStrip *
                                                DQMHarvestTrack *
                                                DQMHarvestTrackerPixel *
@@ -180,7 +173,6 @@
                                         pixelVertexResolutionClient )
 
 DQMHarvestOuterTracker = cms.Sequence(
-                                 dqmDcsInfoClient *
                                  OuterTrackerClient *
                                  dqmFEDIntegrityClient *
                                  DQMMessageLoggerClientSeq *
diff --git a/DQMOffline/Configuration/python/DQMOffline_cff.py b/DQMOffline/Configuration/python/DQMOffline_cff.py
index ca5dc2ac34a40..216b020a0cafe 100644
--- a/DQMOffline/Configuration/python/DQMOffline_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_cff.py
@@ -1,7 +1,7 @@
 import FWCore.ParameterSet.Config as cms
 
 from DQMServices.Components.DQMMessageLogger_cfi import *
-from DQMServices.Components.DQMDcsInfo_cfi import *
+from DQMServices.Components.DQMProvInfo_cfi import *
 from DQMServices.Components.DQMFastTimerService_cff import *
 
 from DQMOffline.L1Trigger.L1TriggerDqmOffline_cff import *
@@ -22,7 +22,8 @@
 
 DQMMessageLoggerSeq = cms.Sequence( DQMMessageLogger )
 
-DQMOfflineDCS = cms.Sequence( dqmDcsInfo )
+dqmProvInfo.runType = "pp_run"
+DQMOfflineDCS = cms.Sequence( dqmProvInfo )
 
 # L1 trigger sequences
 DQMOfflineL1T = cms.Sequence( l1TriggerDqmOffline ) # L1 emulator is run within this sequence for real data

From f1c358e442fe87df76a91a6edb5aff4ab1925281 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Wed, 25 Mar 2020 00:28:04 +0100
Subject: [PATCH 32/50] Integrate the comments from the upstream PRs
 (cms-patatrack#442)

Clean up the Patatrack code base following the comments received during the integration into the upstream release.

Currently tracks the changes introduced due to
   - cms-sw#29109: Patatrack integration - trivial changes (1/N)
   - cms-sw#29110: Patatrack integration - common tools (2/N)

List of changes:
 * Remove unused files
 * Fix compilation warnings
 * Fix AtomicPairCounter unit test
 * Rename the cudaCompat namespace to cms::cudacompat
 * Remove extra semicolon
 * Move SimpleVector and VecArray to the cms::cuda namespace
 * Add missing dependency
 * Move HistoContainer, AtomicPairCounter, prefixScan and radixSort to the cms::cuda namespace
 * Remove rule exception for HeterogeneousCore
 * Fix code rule violations:
    - replace using namespace cms::cuda in test/OneToManyAssoc_t.h .
    - add an exception for cudaCompat.h:
      cudaCompat relies on defining equivalent symbols to the CUDA
      intrinsics in the cms::cudacompat namespace, and pulling them in the
      global namespace when compiling device code without CUDA.
 * Protect the headers to compile only with a CUDA compiler
---
 .../plugins/gpuClusterTracksByDensity.h                |  8 ++++----
 .../plugins/gpuClusterTracksDBSCAN.h                   | 10 +++++-----
 .../plugins/gpuClusterTracksIterative.h                |  6 +++---
 .../PixelVertexFinding/plugins/gpuVertexFinderImpl.h   |  2 +-
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h
index 871b09599c903..b32c7d5b613db 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h
@@ -48,7 +48,7 @@ namespace gpuVertexFinder {
     assert(pdata);
     assert(zt);
 
-    using Hist = HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
+    using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
     __shared__ Hist hist;
     __shared__ typename Hist::Counter hws[32];
     for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
@@ -101,7 +101,7 @@ namespace gpuVertexFinder {
         nn[i]++;
       };
 
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
     }
 
     __syncthreads();
@@ -122,7 +122,7 @@ namespace gpuVertexFinder {
         mdist = dist;
         iv[i] = j;  // assign to cluster (better be unique??)
       };
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
     }
 
     __syncthreads();
@@ -171,7 +171,7 @@ namespace gpuVertexFinder {
         mdist = dist;
         minJ = j;
       };
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
       // should belong to the same cluster...
       assert(iv[i] == iv[minJ]);
       assert(nn[i] <= nn[iv[i]]);
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h
index 593c7597aecea..ffd7fdc948bf8 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h
@@ -44,7 +44,7 @@ namespace gpuVertexFinder {
     assert(pdata);
     assert(zt);
 
-    using Hist = HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
+    using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
     __shared__ Hist hist;
     __shared__ typename Hist::Counter hws[32];
     for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
@@ -96,7 +96,7 @@ namespace gpuVertexFinder {
         nn[i]++;
       };
 
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
     }
 
     __syncthreads();
@@ -118,7 +118,7 @@ namespace gpuVertexFinder {
         mz = zt[j];
         iv[i] = j;  // assign to cluster (better be unique??)
       };
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
     }
 
     __syncthreads();
@@ -172,7 +172,7 @@ namespace gpuVertexFinder {
         }
         assert(iv[i] == iv[j]);
       };
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
     }
     __syncthreads();
 #endif
@@ -194,7 +194,7 @@ namespace gpuVertexFinder {
         mdist = dist;
         iv[i] = iv[j];  // assign to cluster (better be unique??)
       };
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
     }
 
     __shared__ unsigned int foundClusters;
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h
index 14c825f353960..49da86e941867 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h
@@ -44,7 +44,7 @@ namespace gpuVertexFinder {
     assert(pdata);
     assert(zt);
 
-    using Hist = HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
+    using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
     __shared__ Hist hist;
     __shared__ typename Hist::Counter hws[32];
     for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
@@ -97,7 +97,7 @@ namespace gpuVertexFinder {
         nn[i]++;
       };
 
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
     }
 
     __shared__ int nloops;
@@ -165,7 +165,7 @@ namespace gpuVertexFinder {
         mdist = dist;
         iv[i] = iv[j];  // assign to cluster (better be unique??)
       };
-      forEachInBins(hist, izt[i], 1, loop);
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
     }
 
     __shared__ unsigned int foundClusters;
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
index 10c487d588e9d..0da24cef219e0 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
@@ -109,7 +109,7 @@ namespace gpuVertexFinder {
     loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tksoa, soa, ws_d.get(), ptMin);
     cudaCheck(cudaGetLastError());
 #else
-    cudaCompat::resetGrid();
+    cms::cudacompat::resetGrid();
     init(soa, ws_d.get());
     loadTracks(tksoa, soa, ws_d.get(), ptMin);
 #endif

From 3edbb17c2d1bdea5f16703206e05270595f60add Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Thu, 26 Mar 2020 19:09:30 +0100
Subject: [PATCH 33/50] Synchronise with CMSSW_11_1_0_pre5

---
 DQMOffline/Configuration/python/DQMOffline_cff.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/DQMOffline/Configuration/python/DQMOffline_cff.py b/DQMOffline/Configuration/python/DQMOffline_cff.py
index 216b020a0cafe..f9189cce5cceb 100644
--- a/DQMOffline/Configuration/python/DQMOffline_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_cff.py
@@ -23,6 +23,7 @@
 DQMMessageLoggerSeq = cms.Sequence( DQMMessageLogger )
 
 dqmProvInfo.runType = "pp_run"
+dqmProvInfo.dcsRecord = cms.untracked.InputTag("onlineMetaDataDigis")
 DQMOfflineDCS = cms.Sequence( dqmProvInfo )
 
 # L1 trigger sequences

From b525042fed68e3b56a07bd677caf66ec2a7c4644 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 6 Apr 2020 15:57:55 +0200
Subject: [PATCH 34/50] Backport remove unneeded dependencies in Reco
 subsystems (#29295)

---
 .../PixelVertexFinding/test/BuildFile.xml              | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
index 95a572e68ce5e..f5c154b298574 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
+++ b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
@@ -1,20 +1,10 @@
 <use name="boost"/>
 <use name="root"/>
-<use name="CommonTools/Clustering1D"/>
-<use name="DataFormats/TrackerRecHit2D"/>
 <use name="FWCore/Framework"/>
 <use name="FWCore/ParameterSet"/>
-<use name="FWCore/PluginManager"/>
-<use name="Geometry/CommonDetUnit"/>
-<use name="Geometry/Records"/>
-<use name="Geometry/TrackerGeometryBuilder"/>
 <use name="HeterogeneousCore/CUDAUtilities"/>
 <use name="MagneticField/Engine"/>
 <use name="MagneticField/Records"/>
-<use name="RecoPixelVertexing/PixelTrackFitting"/>
-<use name="RecoPixelVertexing/PixelTriplets"/>
-<use name="RecoTracker/TkHitPairs"/>
-<use name="RecoTracker/TkTrackingRegions"/>
 <use name="RecoVertex/KalmanVertexFit"/>
 <use name="SimDataFormats/Track"/>
 <use name="TrackingTools/TransientTrack"/>

From f203d0d6223375bad8ba4513de2afb0dca87da6c Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Tue, 7 Apr 2020 11:24:15 +0200
Subject: [PATCH 35/50] Fix use of namespaces (cms-patatrack#446)

Clean up instances of using namespace ... from header files,
following the comments from the upstream integration.
---
 .../PixelVertexFinding/test/VertexFinder_t.h  | 32 +++++++++----------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
index f293c78709723..bac599e4fee4d 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
+++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
@@ -9,13 +9,13 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 #ifdef USE_DBSCAN
 #include "../plugins/gpuClusterTracksDBSCAN.h"
-#define CLUSTERIZE clusterTracksDBSCAN
+#define CLUSTERIZE gpuVertexFinder::clusterTracksDBSCAN
 #elif USE_ITERATIVE
 #include "../plugins/gpuClusterTracksIterative.h"
-#define CLUSTERIZE clusterTracksIterative
+#define CLUSTERIZE gpuVertexFinder::clusterTracksIterative
 #else
 #include "../plugins/gpuClusterTracksByDensity.h"
-#define CLUSTERIZE clusterTracksByDensityKernel
+#define CLUSTERIZE gpuVertexFinder::clusterTracksByDensityKernel
 #endif
 #include "../plugins/gpuFitVertices.h"
 #include "../plugins/gpuSortByPt2.h"
@@ -43,8 +43,6 @@ __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,
 #endif
 #endif
 
-using namespace gpuVertexFinder;
-
 struct Event {
   std::vector<float> zvert;
   std::vector<uint16_t> itrack;
@@ -103,10 +101,10 @@ struct ClusterGenerator {
 };
 
 // a macro SORRY
-#define LOC_ONGPU(M) ((char*)(onGPU_d.get()) + offsetof(ZVertices, M))
-#define LOC_WS(M) ((char*)(ws_d.get()) + offsetof(WorkSpace, M))
+#define LOC_ONGPU(M) ((char*)(onGPU_d.get()) + offsetof(gpuVertexFinder::ZVertices, M))
+#define LOC_WS(M) ((char*)(ws_d.get()) + offsetof(gpuVertexFinder::WorkSpace, M))
 
-__global__ void print(ZVertices const* pdata, WorkSpace const* pws) {
+__global__ void print(gpuVertexFinder::ZVertices const* pdata, gpuVertexFinder::WorkSpace const* pws) {
   auto const& __restrict__ data = *pdata;
   auto const& __restrict__ ws = *pws;
   printf("nt,nv %d %d,%d\n", ws.ntrks, data.nvFinal, ws.nvIntermediate);
@@ -116,11 +114,11 @@ int main() {
 #ifdef __CUDACC__
   cms::cudatest::requireDevices();
 
-  auto onGPU_d = cms::cuda::make_device_unique<ZVertices[]>(1, nullptr);
-  auto ws_d = cms::cuda::make_device_unique<WorkSpace[]>(1, nullptr);
+  auto onGPU_d = cms::cuda::make_device_unique<gpuVertexFinder::ZVertices[]>(1, nullptr);
+  auto ws_d = cms::cuda::make_device_unique<gpuVertexFinder::WorkSpace[]>(1, nullptr);
 #else
-  auto onGPU_d = std::make_unique<ZVertices>();
-  auto ws_d = std::make_unique<WorkSpace>();
+  auto onGPU_d = std::make_unique<gpuVertexFinder::ZVertices>();
+  auto ws_d = std::make_unique<gpuVertexFinder::WorkSpace>();
 #endif
 
   Event ev;
@@ -183,7 +181,7 @@ int main() {
       cudaCheck(cudaGetLastError());
       cudaDeviceSynchronize();
 
-      cms::cuda::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
+      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
       cudaCheck(cudaGetLastError());
       cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 
@@ -245,7 +243,7 @@ int main() {
       }
 
 #ifdef __CUDACC__
-      cms::cuda::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
+      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
       cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
       cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
       cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
@@ -265,7 +263,7 @@ int main() {
 
 #ifdef __CUDACC__
       // one vertex per block!!!
-      cms::cuda::launch(splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
+      cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
       cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
       gridDim.x = 1;
@@ -277,10 +275,10 @@ int main() {
       std::cout << "after split " << nv << std::endl;
 
 #ifdef __CUDACC__
-      cms::cuda::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
+      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
       cudaCheck(cudaGetLastError());
 
-      cms::cuda::launch(sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get());
+      cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get());
       cudaCheck(cudaGetLastError());
       cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else

From 1b7463a1773467626b382181db16d9c47afe4c1b Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 11 May 2020 14:49:56 +0200
Subject: [PATCH 36/50] Synchronise with CMSSW_11_1_0_pre7

---
 .../Configuration/python/DQMOffline_SecondStep_cff.py    | 2 ++
 DQMOffline/Configuration/python/DQMOffline_cff.py        | 9 +++++++--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
index ee175cc2027ea..d4a043386678d 100644
--- a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
@@ -16,6 +16,8 @@
 from DQMOffline.L1Trigger.L1TriggerDqmOffline_cff import *
 from DQM.SiTrackerPhase2.Phase2TrackerDQMHarvesting_cff import *
 
+DQMNone = cms.Sequence()
+
 DQMOffline_SecondStepEcal = cms.Sequence( ecal_dqm_client_offline *
 					  es_dqm_client_offline )
 
diff --git a/DQMOffline/Configuration/python/DQMOffline_cff.py b/DQMOffline/Configuration/python/DQMOffline_cff.py
index f9189cce5cceb..9d797d87d3be5 100644
--- a/DQMOffline/Configuration/python/DQMOffline_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_cff.py
@@ -34,8 +34,13 @@
 DQMOfflineL1TMuon = cms.Sequence( l1TriggerMuonDqmOffline )
 
 #DPGs
-DQMOfflineEcal = cms.Sequence( ecal_dqm_source_offline *
-				es_dqm_source_offline )
+DQMOfflineEcalOnly = cms.Sequence(
+    ecalOnly_dqm_source_offline +
+    es_dqm_source_offline )
+
+DQMOfflineEcal = cms.Sequence(
+    ecal_dqm_source_offline +
+    es_dqm_source_offline )
 
 DQMOfflineHcal = cms.Sequence( hcalOfflineSourceSequence )
 

From c6577b39e355a20236172274c325717f94e1f461 Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Tue, 7 Jul 2020 17:19:51 +0200
Subject: [PATCH 37/50] Fix max track size in vertex SoA (cms-patatrack#499)

Make the size consistent with the buffer in track SoA.
---
 CUDADataFormats/Vertex/interface/ZVertexSoA.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoA.h b/CUDADataFormats/Vertex/interface/ZVertexSoA.h
index 5f8a7f65843f1..5f0699d5831ec 100644
--- a/CUDADataFormats/Vertex/interface/ZVertexSoA.h
+++ b/CUDADataFormats/Vertex/interface/ZVertexSoA.h
@@ -8,7 +8,7 @@
 // These vertices are clusterized and fitted only along the beam line (z)
 // to obtain their global coordinate the beam spot position shall be added (eventually correcting for the beam angle as well)
 struct ZVertexSoA {
-  static constexpr uint32_t MAXTRACKS = 16 * 1024;
+  static constexpr uint32_t MAXTRACKS = 32 * 1024;
   static constexpr uint32_t MAXVTX = 1024;
 
   int16_t idv[MAXTRACKS];    // vertex index for each associated (original) track  (-1 == not associate)

From 0863f52287329b0b6d6e9d090fdb0f5e4a81b4b5 Mon Sep 17 00:00:00 2001
From: Mariarosaria D'Alfonso <dalfonso@cern.ch>
Date: Sat, 6 Jun 2020 00:05:23 +0200
Subject: [PATCH 38/50] Backport: add ECAL-only and HCAL-only workflows for MC
 and data (#30350)

Backport #30105: add ECAL-only workflows for data.
Backport #30136: add HCAL-only workflows for MC and data.
---
 DQMOffline/Configuration/python/DQMOffline_cff.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/DQMOffline/Configuration/python/DQMOffline_cff.py b/DQMOffline/Configuration/python/DQMOffline_cff.py
index 9d797d87d3be5..5c0c9fc180187 100644
--- a/DQMOffline/Configuration/python/DQMOffline_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_cff.py
@@ -42,10 +42,16 @@
     ecal_dqm_source_offline +
     es_dqm_source_offline )
 
+#offline version of the online DQM: used in validation/certification
 DQMOfflineHcal = cms.Sequence( hcalOfflineSourceSequence )
 
+# offline DQM: used in Release validation
 DQMOfflineHcal2 = cms.Sequence( HcalDQMOfflineSequence )
 
+DQMOfflineHcalOnly = cms.Sequence( hcalOnlyOfflineSourceSequence )
+
+DQMOfflineHcal2Only = cms.Sequence( RecHitsDQMOffline )
+
 DQMOfflineTrackerStrip = cms.Sequence( SiStripDQMTier0 )
 
 DQMOfflineTrackerPixel = cms.Sequence( 	siPixelOfflineDQM_source )

From 8c29b60adb351d2d8c9e23e0e281e2e621f1887a Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Sun, 12 Jul 2020 23:24:43 +0200
Subject: [PATCH 39/50] Synchronise with CMSSW_11_2_0_pre2

---
 DQMOffline/Configuration/python/DQMOffline_cff.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/DQMOffline/Configuration/python/DQMOffline_cff.py b/DQMOffline/Configuration/python/DQMOffline_cff.py
index 5c0c9fc180187..f09a28eb06861 100644
--- a/DQMOffline/Configuration/python/DQMOffline_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_cff.py
@@ -244,9 +244,9 @@
 PostDQMOfflineMiniAOD = cms.Sequence(miniAODDQMSequence*jetMETDQMOfflineSourceMiniAOD*tracksDQMMiniAOD*topPhysicsminiAOD)
 PostDQMOffline = cms.Sequence()
 
-from Configuration.Eras.Modifier_phase2_hcal_cff import phase2_hcal
-phase2_hcal.toReplaceWith( PostDQMOfflineMiniAOD, PostDQMOfflineMiniAOD.copyAndExclude([
-    pfMetDQMAnalyzerMiniAOD, pfPuppiMetDQMAnalyzerMiniAOD # No hcalnoise yet
+from Configuration.Eras.Modifier_run3_HB_cff import run3_HB
+run3_HB.toReplaceWith( PostDQMOfflineMiniAOD, PostDQMOfflineMiniAOD.copyAndExclude([
+    pfMetDQMAnalyzerMiniAOD, pfPuppiMetDQMAnalyzerMiniAOD # No hcalnoise (yet)
 ]))
 
 from PhysicsTools.NanoAOD.nanoDQM_cff import nanoDQM

From 0f5e3c89006dd88c63bccbb25dd9316106575208 Mon Sep 17 00:00:00 2001
From: Suvankar Roy Chowdhury <suvankar.roy.chowdhury@cern.ch>
Date: Mon, 12 Oct 2020 16:53:54 +0200
Subject: [PATCH 40/50] Update the validation sequence for pixel-only tracking
 workflows (cms-patatrack#548)

---
 DQMOffline/RecoB/python/PixelVertexMonitor_cff.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/DQMOffline/RecoB/python/PixelVertexMonitor_cff.py b/DQMOffline/RecoB/python/PixelVertexMonitor_cff.py
index 3c2e3d7d6700e..9e293f4478bd6 100644
--- a/DQMOffline/RecoB/python/PixelVertexMonitor_cff.py
+++ b/DQMOffline/RecoB/python/PixelVertexMonitor_cff.py
@@ -4,4 +4,5 @@
 pixelPVMonitor = _pvMonitor.clone(
     TopFolderName = "OfflinePixelPV",
     vertexLabel = "pixelVertices",
+    ndof        = cms.int32( 1 )
 )

From 5525ec1f5829bd2ccf2a0d49381ccfeab2b132da Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Fri, 23 Oct 2020 13:43:58 +0200
Subject: [PATCH 41/50] Synchronise with CMSSW_11_2_0_pre8

---
 .../python/DQMOffline_SecondStep_cff.py             | 13 +++++++++++++
 DQMOffline/Configuration/python/DQMOffline_cff.py   | 12 ++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
index d4a043386678d..29bf311c474d4 100644
--- a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
@@ -10,6 +10,7 @@
 from DQM.DTMonitorClient.dtDQMOfflineClients_cff import *
 from DQM.RPCMonitorClient.RPCTier0Client_cff import *
 from DQM.CSCMonitorModule.csc_dqm_offlineclient_collisions_cff import *
+from DQMOffline.Muon.gem_dqm_offline_client_cff import *
 from DQMOffline.Hcal.HcalDQMOfflinePostProcessor_cff import *
 from DQM.HcalTasks.OfflineHarvestingSequence_pp import *
 from DQMServices.Components.DQMFEDIntegrityClient_cff import *
@@ -29,6 +30,11 @@
                                              rpcTier0Client *
                                              cscOfflineCollisionsClients )
 
+from Configuration.Eras.Modifier_run3_GEM_cff import run3_GEM
+_run3_GEM_DQMOffline_SecondStepMuonDPG = DQMOffline_SecondStepMuonDPG.copy()
+_run3_GEM_DQMOffline_SecondStepMuonDPG += gemClients
+run3_GEM.toReplaceWith(DQMOffline_SecondStepMuonDPG, _run3_GEM_DQMOffline_SecondStepMuonDPG)
+
 DQMOffline_SecondStepHcal = cms.Sequence( hcalOfflineHarvesting )
 
 DQMOffline_SecondStepHcal2 = cms.Sequence(  HcalDQMOfflinePostProcessor )
@@ -171,6 +177,9 @@
 DQMHarvestTracking = cms.Sequence( TrackingOfflineDQMClient *
                                    dqmFastTimerServiceClient )
 
+DQMHarvestTrackingZeroBias = cms.Sequence( TrackingOfflineDQMClientZeroBias *
+                                           dqmFastTimerServiceClient )
+
 DQMHarvestPixelTracking = cms.Sequence( pixelTrackingEffFromHitPattern *
                                         pixelVertexResolutionClient )
 
@@ -191,6 +200,10 @@
                                muonQualityTests
                                )
 
+_run3_GEM_DQMHarvestMuon = DQMHarvestMuon.copy()
+_run3_GEM_DQMHarvestMuon += gemClients
+run3_GEM.toReplaceWith(DQMHarvestMuon, _run3_GEM_DQMHarvestMuon)
+
 DQMHarvestEcal = cms.Sequence( ecal_dqm_client_offline *
                                 es_dqm_client_offline
                               )
diff --git a/DQMOffline/Configuration/python/DQMOffline_cff.py b/DQMOffline/Configuration/python/DQMOffline_cff.py
index f09a28eb06861..7c5dd240d6e21 100644
--- a/DQMOffline/Configuration/python/DQMOffline_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_cff.py
@@ -14,6 +14,7 @@
 from DQM.DTMonitorModule.dtDQMOfflineSources_cff import *
 from DQM.RPCMonitorClient.RPCTier0Source_cff import *
 from DQM.CSCMonitorModule.csc_dqm_sourceclient_offline_cff import *
+from DQMOffline.Muon.gem_dqm_offline_source_cff import *
 from DQM.CastorMonitor.castor_dqm_sourceclient_offline_cff import *
 from DQM.CTPPS.ctppsDQM_cff import *
 from DQM.SiTrackerPhase2.Phase2TrackerDQMFirstStep_cff import *
@@ -60,6 +61,11 @@
                                   rpcTier0Source *
                                   cscSources )
 
+from Configuration.Eras.Modifier_run3_GEM_cff import run3_GEM
+_run3_GEM_DQMOfflineMuonDPG = DQMOfflineMuonDPG.copy()
+_run3_GEM_DQMOfflineMuonDPG += gemSources
+run3_GEM.toReplaceWith(DQMOfflineMuonDPG, _run3_GEM_DQMOfflineMuonDPG)
+
 DQMOfflineCASTOR = cms.Sequence( castorSources )
 
 DQMOfflineCTPPS = cms.Sequence( ctppsDQMOfflineSource )
@@ -223,6 +229,10 @@
                                muonMonitors
                               )
 
+_run3_GEM_DQMOfflineMuon = DQMOfflineMuon.copy()
+_run3_GEM_DQMOfflineMuon += gemSources
+run3_GEM.toReplaceWith(DQMOfflineMuon, _run3_GEM_DQMOfflineMuon)
+
 #Taus not created in pp conditions for HI
 from Configuration.Eras.Modifier_pp_on_AA_2018_cff import pp_on_AA_2018
 _DQMOfflineTAU = cms.Sequence()
@@ -252,3 +262,5 @@
 from PhysicsTools.NanoAOD.nanoDQM_cff import nanoDQM
 DQMOfflineNanoAOD = cms.Sequence(nanoDQM)
 #PostDQMOfflineNanoAOD = cms.Sequence(nanoDQM)
+from PhysicsTools.NanoAOD.nanogenDQM_cff import nanogenDQM
+DQMOfflineNanoGen = cms.Sequence(nanogenDQM)

From d5649dd69163f57500126291b07bdda9e1753768 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 16 Nov 2020 11:56:30 +0100
Subject: [PATCH 42/50] Synchronise with CMSSW_11_2_0_pre9

---
 DQMOffline/Configuration/python/DQMOffline_cff.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/DQMOffline/Configuration/python/DQMOffline_cff.py b/DQMOffline/Configuration/python/DQMOffline_cff.py
index 7c5dd240d6e21..ac28700d4eaf4 100644
--- a/DQMOffline/Configuration/python/DQMOffline_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_cff.py
@@ -234,9 +234,9 @@
 run3_GEM.toReplaceWith(DQMOfflineMuon, _run3_GEM_DQMOfflineMuon)
 
 #Taus not created in pp conditions for HI
-from Configuration.Eras.Modifier_pp_on_AA_2018_cff import pp_on_AA_2018
+from Configuration.ProcessModifiers.pp_on_AA_cff import pp_on_AA
 _DQMOfflineTAU = cms.Sequence()
-pp_on_AA_2018.toReplaceWith(DQMOfflineTAU, _DQMOfflineTAU)
+pp_on_AA.toReplaceWith(DQMOfflineTAU, _DQMOfflineTAU)
 
 
 # miniAOD DQM validation

From c37f72d4398e1ae5b104c5f0532efec5e67d6e69 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Thu, 26 Nov 2020 09:50:53 +0100
Subject: [PATCH 43/50] Remove spurious empty lines

---
 .../Configuration/python/RecoPixelVertexing_cff.py               | 1 -
 1 file changed, 1 deletion(-)

diff --git a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
index e1cd387360698..424ac13a43627 100644
--- a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
+++ b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
@@ -22,4 +22,3 @@
 
 gpu.toReplaceWith(pixelVertices,_pixelVertexFromSoA)
 gpu.toReplaceWith(recopixelvertexingTask,_pixelVertexingCUDATask)
-

From bb28343dc461291783e4adf4fc48b4b62d730eac Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Tue, 1 Dec 2020 02:22:18 +0100
Subject: [PATCH 44/50] Simplify cudacompat layer to use a 1-dimensional grid
 (cms-patatrack#586)

Remove the possibility of changing the grid size used by the
cms::cudacompat layer, and make it a constant equal to {1, 1, 1}.

This avoids a thread-related problem caused by TBB using worker threads
where the grid size had not been initialised.

The kernels for pixel clustering need to be rewritten to support a
one-dimensional grid to run on the CPU.
Currently they are only used on the GPU in the Patatrack workflows, but
they are exercised on the CPU by the gpuClustering_t tests; those tests
have been commented out until the kernels can be updated.
---
 .../PixelVertexFinding/plugins/gpuVertexFinderImpl.h          | 4 ----
 RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h   | 3 ---
 2 files changed, 7 deletions(-)

diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
index 0da24cef219e0..cf34d9075b70d 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
@@ -109,7 +109,6 @@ namespace gpuVertexFinder {
     loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tksoa, soa, ws_d.get(), ptMin);
     cudaCheck(cudaGetLastError());
 #else
-    cms::cudacompat::resetGrid();
     init(soa, ws_d.get());
     loadTracks(tksoa, soa, ws_d.get(), ptMin);
 #endif
@@ -157,10 +156,7 @@ namespace gpuVertexFinder {
     // std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl;
     fitVertices(soa, ws_d.get(), 50.);
     // one block per vertex!
-    blockIdx.x = 0;
-    gridDim.x = 1;
     splitVertices(soa, ws_d.get(), 9.f);
-    resetGrid();
     fitVertices(soa, ws_d.get(), 5000.);
     sortByPt2(soa, ws_d.get());
 #endif
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
index bac599e4fee4d..e3298f8c5761b 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
+++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
@@ -266,10 +266,7 @@ int main() {
       cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
       cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
-      gridDim.x = 1;
-      assert(blockIdx.x == 0);
       splitVertices(onGPU_d.get(), ws_d.get(), 9.f);
-      resetGrid();
       nv = ws_d->nvIntermediate;
 #endif
       std::cout << "after split " << nv << std::endl;

From af73ec9845fd91062eab1a3aa50ca19799faa04a Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Fri, 18 Dec 2020 18:26:58 +0100
Subject: [PATCH 45/50] Clean up the pixel local reconstruction code
 (cms-patatrack#593)

Address the pixel local reconstruction review comments.

General clean up of the pixel local reconstruction code:
  - remove commented out and obsolete code and data members
  - use named constants more consistently
  - update variable names to follow the coding rules and for better consistency
  - use member initializer lists in the constructors
  - allow `if constexpr` in CUDA code
  - use `std::size` instead of hardcoding the array size
  - convert iterator-based loops to range-based loops
  - replace `cout` and `printf` with `LogDebug` or `LogWarning`
  - use put tokens
  - reorganise the auto-generated cfi files and use them more consistently
  - adjust code after rearranging an `#ifdef GPU_DEBUG` block
  - apply code formatting
  - other minor changes

Improve comments:
  - improve comments and remove obsolete ones
  - clarify comments and types regarding `HostProduct`
  - update comments about `GPU_SMALL_EVENTS` being kept for testing purposes
  - add notes about the original cpu code

Reuse some more common code:
  - move common pixel cluster code to `PixelClusterizerBase`
  - extend the `SiPixelCluster` constructor

Rename classes and modules for better consistency:
  - remove the `TrackingRecHit2DCUDA.h` and `gpuClusteringConstants.h` forwarding headers
  - rename `PixelRecHits` to `PixelRecHitGPUKernel`
  - rename SiPixelRecHitFromSOA to SiPixelRecHitFromCUDA
  - rename `siPixelClustersCUDAPreSplitting` to `siPixelClustersPreSplittingCUDA`
  - rename `siPixelRecHitsCUDAPreSplitting` to `siPixelRecHitsPreSplittingCUDA`
  - rename `siPixelRecHitsLegacyPreSplitting` to `siPixelRecHitsPreSplittingLegacy`
  - rename `siPixelRecHitHostSoA` to `siPixelRecHitSoAFromLegacy`

Re-apply changes from #29805 that were lost in the Patatrack branch.
---
 .../PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc             | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
index 04faf570c3fcc..2f0965be50eb8 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
@@ -2,7 +2,7 @@
 
 #include "CUDADataFormats/Common/interface/Product.h"
 #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
-#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
 #include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
 #include "DataFormats/Common/interface/Handle.h"
 #include "FWCore/Framework/interface/ConsumesCollector.h"

From 55fe4b3005cc3ab96fe4c1137e361a475134ef4b Mon Sep 17 00:00:00 2001
From: Eric Cano <37585813+ericcano@users.noreply.github.com>
Date: Tue, 23 Mar 2021 22:17:52 +0100
Subject: [PATCH 46/50] Clean up the pixel track reconstruction code
 (cms-patatrack#606)

Update EDM access:
  - switch to consumes() scheme for event setup;
  - simplify some event data access.

Style fixes:
  - make class member private and fix problematic cast;
  - format of comments for clang-tidy;
  - change to enum class to avoid creating a namespace (usage becomes: pixelTrack::Quality::loose);
  - add article reference in comment (it was already further down in the file);
  - fix member functions and classes capitalization;
  - fix one letter or upper case variable names in formulas (trying to keep the naming from the reference article).

Avoid some code repetitions.
---
 .../PixelVertexFinding/plugins/gpuVertexFinderImpl.h            | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
index cf34d9075b70d..ae423dd375e06 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
@@ -27,7 +27,7 @@ namespace gpuVertexFinder {
 
       if (nHits < 4)
         continue;  // no triplets
-      if (quality[idx] != trackQuality::loose)
+      if (quality[idx] != pixelTrack::Quality::loose)
         continue;
 
       auto pt = tracks.pt(idx);

From 0cd8f9453b1159e7d1a5c39bd5dbba8922f1c06e Mon Sep 17 00:00:00 2001
From: Eric Cano <37585813+ericcano@users.noreply.github.com>
Date: Fri, 26 Mar 2021 17:31:38 +0100
Subject: [PATCH 47/50] Clean up the pixel vertex reconstruction code
 (cms-patatrack#609)

Split PixelVertexProducerCUDA produce() method into two methods, for running on the CPU and on the GPU.

Update access to EDM handles and event collections.

General code clean up:
  - use named constants for kernel grid dimensions;
  - replace commented out code with #ifdef-based conditionals;
  - update data member names to follow the coding rules;
  - fix include guard names and replace relative with absolute includes.

Apply code formatting.
---
 CUDADataFormats/Vertex/interface/ZVertexSoA.h |  6 +-
 .../plugins/PixelVertexProducerCUDA.cc        | 98 +++++++++++--------
 .../plugins/PixelVertexProducerFromSoA.cc     | 26 ++---
 .../plugins/gpuVertexFinder.h                 |  6 +-
 .../plugins/gpuVertexFinderImpl.h             | 41 +++++---
 .../PixelVertexFinding/test/VertexFinder_t.h  | 12 +--
 6 files changed, 110 insertions(+), 79 deletions(-)

diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoA.h b/CUDADataFormats/Vertex/interface/ZVertexSoA.h
index 5f0699d5831ec..e31b87f30fa11 100644
--- a/CUDADataFormats/Vertex/interface/ZVertexSoA.h
+++ b/CUDADataFormats/Vertex/interface/ZVertexSoA.h
@@ -1,5 +1,5 @@
-#ifndef CUDADataFormatsVertexZVertexSoA_H
-#define CUDADataFormatsVertexZVertexSoA_H
+#ifndef CUDADataFormats_Vertex_ZVertexSoA_h
+#define CUDADataFormats_Vertex_ZVertexSoA_h
 
 #include <cstdint>
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
@@ -23,4 +23,4 @@ struct ZVertexSoA {
   __host__ __device__ void init() { nvFinal = 0; }
 };
 
-#endif  // CUDADataFormatsVertexZVertexSoA.H
+#endif  // CUDADataFormats_Vertex_ZVertexSoA_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
index e9054dbf17c53..e2c2bc76c8612 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
@@ -19,6 +19,8 @@
 
 #include "gpuVertexFinder.h"
 
+#undef PIXVERTEX_DEBUG_PRODUCE
+
 class PixelVertexProducerCUDA : public edm::global::EDProducer<> {
 public:
   explicit PixelVertexProducerCUDA(const edm::ParameterSet& iConfig);
@@ -27,34 +29,36 @@ class PixelVertexProducerCUDA : public edm::global::EDProducer<> {
   static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
 
 private:
+  void produceOnGPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const;
+  void produceOnCPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const;
   void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override;
 
-  bool m_OnGPU;
+  bool onGPU_;
 
   edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenGPUTrack_;
   edm::EDPutTokenT<ZVertexCUDAProduct> tokenGPUVertex_;
   edm::EDGetTokenT<PixelTrackHeterogeneous> tokenCPUTrack_;
   edm::EDPutTokenT<ZVertexHeterogeneous> tokenCPUVertex_;
 
-  const gpuVertexFinder::Producer m_gpuAlgo;
+  const gpuVertexFinder::Producer gpuAlgo_;
 
   // Tracking cuts before sending tracks to vertex algo
-  const float m_ptMin;
+  const float ptMin_;
 };
 
 PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf)
-    : m_OnGPU(conf.getParameter<bool>("onGPU")),
-      m_gpuAlgo(conf.getParameter<bool>("oneKernel"),
-                conf.getParameter<bool>("useDensity"),
-                conf.getParameter<bool>("useDBSCAN"),
-                conf.getParameter<bool>("useIterative"),
-                conf.getParameter<int>("minT"),
-                conf.getParameter<double>("eps"),
-                conf.getParameter<double>("errmax"),
-                conf.getParameter<double>("chi2max")),
-      m_ptMin(conf.getParameter<double>("PtMin"))  // 0.5 GeV
+    : onGPU_(conf.getParameter<bool>("onGPU")),
+      gpuAlgo_(conf.getParameter<bool>("oneKernel"),
+               conf.getParameter<bool>("useDensity"),
+               conf.getParameter<bool>("useDBSCAN"),
+               conf.getParameter<bool>("useIterative"),
+               conf.getParameter<int>("minT"),
+               conf.getParameter<double>("eps"),
+               conf.getParameter<double>("errmax"),
+               conf.getParameter<double>("chi2max")),
+      ptMin_(conf.getParameter<double>("PtMin"))  // 0.5 GeV
 {
-  if (m_OnGPU) {
+  if (onGPU_) {
     tokenGPUTrack_ =
         consumes<cms::cuda::Product<PixelTrackHeterogeneous>>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
     tokenGPUVertex_ = produces<ZVertexCUDAProduct>();
@@ -87,38 +91,50 @@ void PixelVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& d
   descriptions.add(label, desc);
 }
 
-void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const {
-  if (m_OnGPU) {
-    edm::Handle<cms::cuda::Product<PixelTrackHeterogeneous>> hTracks;
-    iEvent.getByToken(tokenGPUTrack_, hTracks);
+void PixelVertexProducerCUDA::produceOnGPU(edm::StreamID streamID,
+                                           edm::Event& iEvent,
+                                           const edm::EventSetup& iSetup) const {
+  edm::Handle<cms::cuda::Product<PixelTrackHeterogeneous>> hTracks;
+  iEvent.getByToken(tokenGPUTrack_, hTracks);
+
+  cms::cuda::ScopedContextProduce ctx{*hTracks};
+  auto const* tracks = ctx.get(*hTracks).get();
+
+  assert(tracks);
 
-    cms::cuda::ScopedContextProduce ctx{*hTracks};
-    auto const* tracks = ctx.get(*hTracks).get();
+  ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks, ptMin_));
+}
 
-    assert(tracks);
+void PixelVertexProducerCUDA::produceOnCPU(edm::StreamID streamID,
+                                           edm::Event& iEvent,
+                                           const edm::EventSetup& iSetup) const {
+  auto const* tracks = iEvent.get(tokenCPUTrack_).get();
+  assert(tracks);
+
+#ifdef PIXVERTEX_DEBUG_PRODUCE
+  auto const& tsoa = *tracks;
+  auto maxTracks = tsoa.stride();
+  std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl;
+
+  int32_t nt = 0;
+  for (int32_t it = 0; it < maxTracks; ++it) {
+    auto nHits = tsoa.nHits(it);
+    assert(nHits == int(tsoa.hitIndices.size(it)));
+    if (nHits == 0)
+      break;  // this is a guard: maybe we need to move to nTracks...
+    nt++;
+  }
+  std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl;
+#endif  // PIXVERTEX_DEBUG_PRODUCE
 
-    ctx.emplace(iEvent, tokenGPUVertex_, m_gpuAlgo.makeAsync(ctx.stream(), tracks, m_ptMin));
+  iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks, ptMin_));
+}
 
+void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const {
+  if (onGPU_) {
+    produceOnGPU(streamID, iEvent, iSetup);
   } else {
-    auto const* tracks = iEvent.get(tokenCPUTrack_).get();
-    assert(tracks);
-
-    /*
-    auto const & tsoa = *tracks;
-    auto maxTracks = tsoa.stride();
-    std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl;
-
-    int32_t nt = 0;
-    for (int32_t it = 0; it < maxTracks; ++it) {
-      auto nHits = tsoa.nHits(it);
-      assert(nHits==int(tsoa.hitIndices.size(it)));
-      if (nHits == 0) break;  // this is a guard: maybe we need to move to nTracks...
-      nt++;
-    }
-    std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl;
-    */
-
-    iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make(tracks, m_ptMin));
+    produceOnCPU(streamID, iEvent, iSetup);
   }
 }
 
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc
index e642e3fd734f9..d4184c0825c7e 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc
@@ -23,6 +23,8 @@
 #include "DataFormats/VertexReco/interface/Vertex.h"
 #include "DataFormats/VertexReco/interface/VertexFwd.h"
 
+#undef PIXVERTEX_DEBUG_PRODUCE
+
 class PixelVertexProducerFromSoA : public edm::global::EDProducer<> {
 public:
   using IndToEdm = std::vector<uint16_t>;
@@ -62,15 +64,10 @@ void PixelVertexProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions
 void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &) const {
   auto vertexes = std::make_unique<reco::VertexCollection>();
 
-  edm::Handle<reco::TrackCollection> trackCollection;
-  iEvent.getByToken(tokenTracks_, trackCollection);
-  auto const &tracks = *(trackCollection.product());
-  edm::Handle<IndToEdm> indToEdmH;
-  iEvent.getByToken(tokenIndToEdm_, indToEdmH);
-  auto const &indToEdm = *indToEdmH;
-
-  edm::Handle<reco::BeamSpot> bsHandle;
-  iEvent.getByToken(tokenBeamSpot_, bsHandle);
+  auto tracksHandle = iEvent.getHandle(tokenTracks_);
+  auto tracksSize = tracksHandle->size();
+  auto const &indToEdm = iEvent.get(tokenIndToEdm_);
+  auto bsHandle = iEvent.getHandle(tokenBeamSpot_);
 
   float x0 = 0, y0 = 0, z0 = 0, dxdz = 0, dydz = 0;
   std::vector<int32_t> itrk;
@@ -89,7 +86,10 @@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv
 
   int nv = soa.nvFinal;
 
-  // std::cout << "converting " << nv << " vertices " << " from " << indToEdm.size() << " tracks" << std::endl;
+#ifdef PIXVERTEX_DEBUG_PRODUCE
+  std::cout << "converting " << nv << " vertices "
+            << " from " << indToEdm.size() << " tracks" << std::endl;
+#endif  // PIXVERTEX_DEBUG_PRODUCE
 
   std::set<uint16_t> uind;  // fort verifing index consistency
   for (int j = nv - 1; j >= 0; --j) {
@@ -111,7 +111,9 @@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv
     }
     auto nt = itrk.size();
     if (nt == 0) {
+#ifdef PIXVERTEX_DEBUG_PRODUCE
       std::cout << "vertex " << i << " with no tracks..." << std::endl;
+#endif  // PIXVERTEX_DEBUG_PRODUCE
       continue;
     }
     if (nt < 2) {
@@ -123,11 +125,11 @@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv
     for (auto it : itrk) {
       assert(it < int(indToEdm.size()));
       auto k = indToEdm[it];
-      if (k > tracks.size()) {
+      if (k > tracksSize) {
         edm::LogWarning("PixelVertexProducer") << "oops track " << it << " does not exists on CPU " << k;
         continue;
       }
-      auto tk = reco::TrackRef(trackCollection, k);
+      auto tk = reco::TrackRef(tracksHandle, k);
       v.add(reco::TrackBaseRef(tk));
     }
     itrk.clear();
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
index 6cd86c93a6737..b1d581fd2a60d 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
@@ -1,5 +1,5 @@
-#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuVertexFinder_h
-#define RecoPixelVertexing_PixelVertexFinding_src_gpuVertexFinder_h
+#ifndef RecoPixelVertexing_PixelVertexFinding_gpuVertexFinder_h
+#define RecoPixelVertexing_PixelVertexFinding_gpuVertexFinder_h
 
 #include <cstddef>
 #include <cstdint>
@@ -80,4 +80,4 @@ namespace gpuVertexFinder {
 
 }  // namespace gpuVertexFinder
 
-#endif  // RecoPixelVertexing_PixelVertexFinding_src_gpuVertexFinder_h
+#endif  // RecoPixelVertexing_PixelVertexFinding_gpuVertexFinder_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
index ae423dd375e06..8659ca9024568 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
@@ -7,6 +7,8 @@
 #include "gpuSortByPt2.h"
 #include "gpuSplitVertices.h"
 
+#undef PIXVERTEX_DEBUG_PRODUCE
+
 namespace gpuVertexFinder {
 
   __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin) {
@@ -85,11 +87,15 @@ namespace gpuVertexFinder {
 
 #ifdef __CUDACC__
   ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const {
-    // std::cout << "producing Vertices on GPU" << std::endl;
+#ifdef PIXVERTEX_DEBUG_PRODUCE
+    std::cout << "producing Vertices on GPU" << std::endl;
+#endif  // PIXVERTEX_DEBUG_PRODUCE
     ZVertexHeterogeneous vertices(cms::cuda::make_device_unique<ZVertexSoA>(stream));
 #else
   ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const {
-    // std::cout << "producing Vertices on  CPU" <<    std::endl;
+#ifdef PIXVERTEX_DEBUG_PRODUCE
+    std::cout << "producing Vertices on  CPU" << std::endl;
+#endif  // PIXVERTEX_DEBUG_PRODUCE
     ZVertexHeterogeneous vertices(std::make_unique<ZVertexSoA>());
 #endif
     assert(tksoa);
@@ -114,35 +120,40 @@ namespace gpuVertexFinder {
 #endif
 
 #ifdef __CUDACC__
+    // Running too many threads leads to problems when printf is enabled.
+    constexpr int maxThreadsForPrint = 1024 - 256;
+    constexpr int numBlocks = 1024;
+    constexpr int threadsPerBlock = 128;
+
     if (oneKernel_) {
       // implemented only for density clustesrs
 #ifndef THREE_KERNELS
-      vertexFinderOneKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+      vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
 #else
-      vertexFinderKernel1<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+      vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
       cudaCheck(cudaGetLastError());
       // one block per vertex...
-      splitVerticesKernel<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f);
+      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), 9.f);
       cudaCheck(cudaGetLastError());
-      vertexFinderKernel2<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get());
+      vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
 #endif
     } else {  // five kernels
       if (useDensity_) {
-        clusterTracksByDensityKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+        clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
       } else if (useDBSCAN_) {
-        clusterTracksDBSCAN<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+        clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
       } else if (useIterative_) {
-        clusterTracksIterative<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+        clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
       }
       cudaCheck(cudaGetLastError());
-      fitVerticesKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 50.);
+      fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), 50.);
       cudaCheck(cudaGetLastError());
       // one block per vertex...
-      splitVerticesKernel<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f);
+      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), 9.f);
       cudaCheck(cudaGetLastError());
-      fitVerticesKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 5000.);
+      fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), 5000.);
       cudaCheck(cudaGetLastError());
-      sortByPt2Kernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get());
+      sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
     }
     cudaCheck(cudaGetLastError());
 #else  // __CUDACC__
@@ -153,7 +164,9 @@ namespace gpuVertexFinder {
     } else if (useIterative_) {
       clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max);
     }
-    // std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl;
+#ifdef PIXVERTEX_DEBUG_PRODUCE
+    std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl;
+#endif  // PIXVERTEX_DEBUG_PRODUCE
     fitVertices(soa, ws_d.get(), 50.);
     // one block per vertex!
     splitVertices(soa, ws_d.get(), 9.f);
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
index e3298f8c5761b..52253a1e4bbfe 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
+++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
@@ -8,18 +8,18 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 #ifdef USE_DBSCAN
-#include "../plugins/gpuClusterTracksDBSCAN.h"
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h"
 #define CLUSTERIZE gpuVertexFinder::clusterTracksDBSCAN
 #elif USE_ITERATIVE
-#include "../plugins/gpuClusterTracksIterative.h"
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h"
 #define CLUSTERIZE gpuVertexFinder::clusterTracksIterative
 #else
-#include "../plugins/gpuClusterTracksByDensity.h"
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h"
 #define CLUSTERIZE gpuVertexFinder::clusterTracksByDensityKernel
 #endif
-#include "../plugins/gpuFitVertices.h"
-#include "../plugins/gpuSortByPt2.h"
-#include "../plugins/gpuSplitVertices.h"
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h"
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h"
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h"
 
 #ifdef ONE_KERNEL
 #ifdef __CUDACC__

From 545ddeafa42919a170bd42d7848f91b1581dfc85 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Thu, 1 Apr 2021 15:41:18 +0200
Subject: [PATCH 48/50] Add the vertex reconstruction to the Pixel SoA workflow
 on CPU

---
 .../python/customizePixelTracksSoAonCPU.py          | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py
index 24cc16e02b463..1661cac832b8b 100644
--- a/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py
+++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py
@@ -20,12 +20,21 @@ def customizePixelTracksSoAonCPU(process):
     pixelRecHitSrc = 'siPixelRecHitsPreSplitting'
   )
 
+  from RecoPixelVertexing.PixelVertexFinding.pixelVertexCUDA_cfi import pixelVertexCUDA
+  process.pixelVertexSoA = pixelVertexCUDA.clone(
+    onGPU = False,
+    pixelTrackSrc = 'pixelTrackSoA'
+  )
+
   from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromSoA_cfi import pixelTrackProducerFromSoA
   process.pixelTracks = pixelTrackProducerFromSoA.clone(
     pixelRecHitLegacySrc = 'siPixelRecHitsPreSplitting'
   )
 
-  process.reconstruction_step += process.siPixelRecHitsPreSplitting + process.pixelTrackSoA
+  from RecoPixelVertexing.PixelVertexFinding.pixelVertexFromSoA_cfi import pixelVertexFromSoA
+  process.pixelVertices = pixelVertexFromSoA.clone()
+
+  process.reconstruction_step += process.siPixelRecHitsPreSplitting + process.pixelTrackSoA + process.pixelVertexSoA
 
   return process
 
@@ -46,7 +55,7 @@ def customizePixelTracksSoAonCPUForProfiling(process):
 
   process.siPixelRecHitSoAFromLegacy.convertToLegacy = False
   
-  process.TkSoA = cms.Path(process.offlineBeamSpot + process.siPixelDigis + process.siPixelClustersPreSplitting + process.siPixelRecHitSoAFromLegacy + process.pixelTrackSoA)
+  process.TkSoA = cms.Path(process.offlineBeamSpot + process.siPixelDigis + process.siPixelClustersPreSplitting + process.siPixelRecHitSoAFromLegacy + process.pixelTrackSoA + process.pixelVertexSoA)
 
   process.schedule = cms.Schedule(process.TkSoA)
 

From a48c872fae944cec62c391af301479ac68400b69 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Tue, 6 Apr 2021 11:33:28 +0200
Subject: [PATCH 49/50] Address more review comments to the vertex finding code
 (cms-patatrack#612)

Use std::clamp(...) in device code now that CUDA supports C++17.
Name reused constants in the vertex fitting and splitting.
---
 .../plugins/gpuClusterTracksDBSCAN.h          |  3 +-
 .../plugins/gpuClusterTracksIterative.h       |  3 +-
 .../plugins/gpuVertexFinderImpl.h             | 31 ++++++++++++-------
 3 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h
index ffd7fdc948bf8..7f62bb0b1b086 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h
@@ -61,8 +61,7 @@ namespace gpuVertexFinder {
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
       assert(i < ZVertices::MAXTRACKS);
       int iz = int(zt[i] * 10.);  // valid if eps<=0.1
-      // iz = std::clamp(iz, INT8_MIN, INT8_MAX);  // sorry c++17 only
-      iz = std::min(std::max(iz, INT8_MIN), INT8_MAX);
+      iz = std::clamp(iz, INT8_MIN, INT8_MAX);
       izt[i] = iz - INT8_MIN;
       assert(iz - INT8_MIN >= 0);
       assert(iz - INT8_MIN < 256);
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h
index 49da86e941867..098cc82b53117 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h
@@ -61,8 +61,7 @@ namespace gpuVertexFinder {
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
       assert(i < ZVertices::MAXTRACKS);
       int iz = int(zt[i] * 10.);  // valid if eps<=0.1
-      // iz = std::clamp(iz, INT8_MIN, INT8_MAX);  // sorry c++17 only
-      iz = std::min(std::max(iz, INT8_MIN), INT8_MAX);
+      iz = std::clamp(iz, INT8_MIN, INT8_MAX);
       izt[i] = iz - INT8_MIN;
       assert(iz - INT8_MIN >= 0);
       assert(iz - INT8_MIN < 256);
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
index 8659ca9024568..44ad16099e06a 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
@@ -11,6 +11,13 @@
 
 namespace gpuVertexFinder {
 
+  // reject outlier tracks that contribute more than this to the chi2 of the vertex fit
+  constexpr float maxChi2ForFirstFit = 50.f;
+  constexpr float maxChi2ForFinalFit = 5000.f;
+
+  // split vertices with a chi2/NDoF greater than this
+  constexpr float maxChi2ForSplit = 9.f;
+
   __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin) {
     assert(ptracks);
     assert(soa);
@@ -57,11 +64,11 @@ namespace gpuVertexFinder {
   ) {
     clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
     __syncthreads();
-    fitVertices(pdata, pws, 50.);
+    fitVertices(pdata, pws, maxChi2ForFirstFit);
     __syncthreads();
-    splitVertices(pdata, pws, 9.f);
+    splitVertices(pdata, pws, maxChi2ForSplit);
     __syncthreads();
-    fitVertices(pdata, pws, 5000.);
+    fitVertices(pdata, pws, maxChi2ForFinalFit);
     __syncthreads();
     sortByPt2(pdata, pws);
   }
@@ -75,11 +82,11 @@ namespace gpuVertexFinder {
   ) {
     clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
     __syncthreads();
-    fitVertices(pdata, pws, 50.);
+    fitVertices(pdata, pws, maxChi2ForFirstFit);
   }
 
   __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) {
-    fitVertices(pdata, pws, 5000.);
+    fitVertices(pdata, pws, maxChi2ForFinalFit);
     __syncthreads();
     sortByPt2(pdata, pws);
   }
@@ -133,7 +140,7 @@ namespace gpuVertexFinder {
       vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
       cudaCheck(cudaGetLastError());
       // one block per vertex...
-      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), 9.f);
+      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
       cudaCheck(cudaGetLastError());
       vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
 #endif
@@ -146,12 +153,12 @@ namespace gpuVertexFinder {
         clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
       }
       cudaCheck(cudaGetLastError());
-      fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), 50.);
+      fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFirstFit);
       cudaCheck(cudaGetLastError());
       // one block per vertex...
-      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), 9.f);
+      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
       cudaCheck(cudaGetLastError());
-      fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), 5000.);
+      fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFinalFit);
       cudaCheck(cudaGetLastError());
       sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
     }
@@ -167,10 +174,10 @@ namespace gpuVertexFinder {
 #ifdef PIXVERTEX_DEBUG_PRODUCE
     std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl;
 #endif  // PIXVERTEX_DEBUG_PRODUCE
-    fitVertices(soa, ws_d.get(), 50.);
+    fitVertices(soa, ws_d.get(), maxChi2ForFirstFit);
     // one block per vertex!
-    splitVertices(soa, ws_d.get(), 9.f);
-    fitVertices(soa, ws_d.get(), 5000.);
+    splitVertices(soa, ws_d.get(), maxChi2ForSplit);
+    fitVertices(soa, ws_d.get(), maxChi2ForFinalFit);
     sortByPt2(soa, ws_d.get());
 #endif
 

From 0abda2e472476ce6bb4d44419bcb2184f7e13b1a Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Tue, 6 Apr 2021 12:59:15 +0200
Subject: [PATCH 50/50] Update include guards and include files in the pixel
 vertex reconstruction code (#613)

---
 CUDADataFormats/Vertex/BuildFile.xml                 |  2 +-
 CUDADataFormats/Vertex/src/classes.h                 |  6 +++---
 .../plugins/PixelVertexProducerFromSoA.cc            | 12 +++++-------
 .../plugins/gpuClusterTracksByDensity.h              |  6 +++---
 .../plugins/gpuClusterTracksDBSCAN.h                 |  6 +++---
 .../plugins/gpuClusterTracksIterative.h              |  6 +++---
 .../PixelVertexFinding/plugins/gpuFitVertices.h      |  6 +++---
 .../PixelVertexFinding/plugins/gpuSortByPt2.h        |  6 +++---
 .../PixelVertexFinding/plugins/gpuSplitVertices.h    |  6 +++---
 .../PixelVertexFinding/plugins/gpuVertexFinder.h     |  6 +++---
 .../PixelVertexFinding/plugins/gpuVertexFinderImpl.h |  5 ++++-
 11 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/CUDADataFormats/Vertex/BuildFile.xml b/CUDADataFormats/Vertex/BuildFile.xml
index e3f9a0910bbd8..f61e4aff7403f 100644
--- a/CUDADataFormats/Vertex/BuildFile.xml
+++ b/CUDADataFormats/Vertex/BuildFile.xml
@@ -1,9 +1,9 @@
 <use name="cuda"/>
+<use name="eigen"/>
 <use name="rootcore"/>
 <use name="CUDADataFormats/Common"/>
 <use name="DataFormats/Common"/>
 <use name="HeterogeneousCore/CUDAUtilities"/>
-<use name="eigen"/>
 <export>
     <lib name="1"/>
 </export>
diff --git a/CUDADataFormats/Vertex/src/classes.h b/CUDADataFormats/Vertex/src/classes.h
index e7fea871f7d39..7931beaa8f4bd 100644
--- a/CUDADataFormats/Vertex/src/classes.h
+++ b/CUDADataFormats/Vertex/src/classes.h
@@ -1,8 +1,8 @@
-#ifndef CUDADataFormats__src_classes_h
-#define CUDADataFormats__src_classes_h
+#ifndef CUDADataFormats_Vertex_src_classes_h
+#define CUDADataFormats_Vertex_src_classes_h
 
 #include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
 #include "CUDADataFormats/Common/interface/Product.h"
 #include "DataFormats/Common/interface/Wrapper.h"
 
-#endif
+#endif  // CUDADataFormats_Vertex_src_classes_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc
index d4184c0825c7e..62b9bb46bd4a5 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc
@@ -1,28 +1,26 @@
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
 #include "DataFormats/BeamSpot/interface/BeamSpot.h"
 #include "DataFormats/Common/interface/OrphanHandle.h"
 #include "DataFormats/TrackReco/interface/Track.h"
 #include "DataFormats/TrackReco/interface/TrackExtra.h"
 #include "DataFormats/TrackReco/interface/TrackFwd.h"
+#include "DataFormats/VertexReco/interface/Vertex.h"
+#include "DataFormats/VertexReco/interface/VertexFwd.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
 #include "FWCore/Framework/interface/ESHandle.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
 #include "FWCore/Framework/interface/MakerMacros.h"
 #include "FWCore/Framework/interface/global/EDProducer.h"
-#include "FWCore/Framework/interface/ConsumesCollector.h"
 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
-#include "FWCore/Utilities/interface/InputTag.h"
 #include "FWCore/PluginManager/interface/ModuleDef.h"
 #include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
 #include "Geometry/Records/interface/TrackerTopologyRcd.h"
 #include "MagneticField/Records/interface/IdealMagneticFieldRecord.h"
 
-#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
-
-#include "DataFormats/VertexReco/interface/Vertex.h"
-#include "DataFormats/VertexReco/interface/VertexFwd.h"
-
 #undef PIXVERTEX_DEBUG_PRODUCE
 
 class PixelVertexProducerFromSoA : public edm::global::EDProducer<> {
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h
index b32c7d5b613db..b19aeb5930fc6 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h
@@ -1,5 +1,5 @@
-#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksByDensity_h
-#define RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksByDensity_h
+#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h
+#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h
 
 #include <algorithm>
 #include <cmath>
@@ -231,4 +231,4 @@ namespace gpuVertexFinder {
 
 }  // namespace gpuVertexFinder
 
-#endif  // RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksByDensity_h
+#endif  // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h
index 7f62bb0b1b086..22ba1e15b4e05 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h
@@ -1,5 +1,5 @@
-#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksDBSCAN_h
-#define RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksDBSCAN_h
+#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h
+#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h
 
 #include <algorithm>
 #include <cmath>
@@ -238,4 +238,4 @@ namespace gpuVertexFinder {
 
 }  // namespace gpuVertexFinder
 
-#endif  // RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksDBSCAN_h
+#endif  // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h
index 098cc82b53117..1f2934ba15d0c 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h
@@ -1,5 +1,5 @@
-#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksIterative_h
-#define RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksIterative_h
+#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h
+#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h
 
 #include <algorithm>
 #include <cmath>
@@ -209,4 +209,4 @@ namespace gpuVertexFinder {
 
 }  // namespace gpuVertexFinder
 
-#endif  // RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksIterative_h
+#endif  // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h
index 4487cb12ea17b..b8bbd0f601cb6 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h
@@ -1,5 +1,5 @@
-#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuFitVertices_h
-#define RecoPixelVertexing_PixelVertexFinding_src_gpuFitVertices_h
+#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuFitVertices_h
+#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuFitVertices_h
 
 #include <algorithm>
 #include <cmath>
@@ -110,4 +110,4 @@ namespace gpuVertexFinder {
 
 }  // namespace gpuVertexFinder
 
-#endif  // RecoPixelVertexing_PixelVertexFinding_src_gpuFitVertices_h
+#endif  // RecoPixelVertexing_PixelVertexFinding_plugins_gpuFitVertices_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h
index 89cc9a3844f76..841eab3901965 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h
@@ -1,5 +1,5 @@
-#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuSortByPt2_h
-#define RecoPixelVertexing_PixelVertexFinding_src_gpuSortByPt2_h
+#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuSortByPt2_h
+#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuSortByPt2_h
 
 #include <algorithm>
 #include <cmath>
@@ -70,4 +70,4 @@ namespace gpuVertexFinder {
 
 }  // namespace gpuVertexFinder
 
-#endif  // RecoPixelVertexing_PixelVertexFinding_src_gpuSortByPt2_h
+#endif  // RecoPixelVertexing_PixelVertexFinding_plugins_gpuSortByPt2_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h
index 694915ab02157..0fe8bd882dcc5 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h
@@ -1,5 +1,5 @@
-#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuSplitVertices_h
-#define RecoPixelVertexing_PixelVertexFinding_src_gpuSplitVertices_h
+#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuSplitVertices_h
+#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuSplitVertices_h
 
 #include <algorithm>
 #include <cmath>
@@ -136,4 +136,4 @@ namespace gpuVertexFinder {
 
 }  // namespace gpuVertexFinder
 
-#endif  // RecoPixelVertexing_PixelVertexFinding_src_gpuSplitVertices_h
+#endif  // RecoPixelVertexing_PixelVertexFinding_plugins_gpuSplitVertices_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
index b1d581fd2a60d..5f8238c3ea8c8 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
@@ -1,5 +1,5 @@
-#ifndef RecoPixelVertexing_PixelVertexFinding_gpuVertexFinder_h
-#define RecoPixelVertexing_PixelVertexFinding_gpuVertexFinder_h
+#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinder_h
+#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinder_h
 
 #include <cstddef>
 #include <cstdint>
@@ -80,4 +80,4 @@ namespace gpuVertexFinder {
 
 }  // namespace gpuVertexFinder
 
-#endif  // RecoPixelVertexing_PixelVertexFinding_gpuVertexFinder_h
+#endif  // RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinder_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
index 44ad16099e06a..d685ced488233 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
@@ -1,3 +1,6 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h
+#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h
+
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
 #include "gpuClusterTracksByDensity.h"
@@ -186,4 +189,4 @@ namespace gpuVertexFinder {
 
 }  // namespace gpuVertexFinder
 
-#undef FROM
+#endif  // RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h