diff --git a/CUDADataFormats/Track/BuildFile.xml b/CUDADataFormats/Track/BuildFile.xml
new file mode 100644
index 0000000000000..e3f9a0910bbd8
--- /dev/null
+++ b/CUDADataFormats/Track/BuildFile.xml
@@ -0,0 +1,9 @@
+<use name="cuda"/>
+<use name="rootcore"/>
+<use name="CUDADataFormats/Common"/>
+<use name="DataFormats/Common"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+<use name="eigen"/>
+<export>
+    <lib name="1"/>
+</export>
diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
new file mode 100644
index 0000000000000..3ee5af80353dd
--- /dev/null
+++ b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
@@ -0,0 +1,9 @@
+#ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h
+#define CUDADataFormats_Track_PixelTrackHeterogeneous_h
+
+#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h"
+
+using PixelTrackHeterogeneous = HeterogeneousSoA<pixelTrack::TrackSoA>;
+
+#endif  // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h
\ No newline at end of file
diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h
new file mode 100644
index 0000000000000..bd39f3c4d3bfe
--- /dev/null
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h
@@ -0,0 +1,73 @@
+#ifndef CUDADataFormats_Track_TrackHeterogeneousT_H
+#define CUDADataFormats_Track_TrackHeterogeneousT_H
+
+#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+
+#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
+
+namespace pixelTrack {
+  enum class Quality : uint8_t { bad = 0, dup, loose, strict, tight, highPurity };  // one-byte quality flag, values 0..5 in this order
+}  // namespace pixelTrack
+
+template <int32_t S>
+class TrackSoAHeterogeneousT {  // structure-of-arrays track collection; S is the stride given to every column below
+public:
+  static constexpr int32_t stride() { return S; }  // the template parameter itself
+
+  using Quality = pixelTrack::Quality;
+  using hindex_type = uint32_t;
+  using HitContainer = cms::cuda::OneToManyAssoc<hindex_type, S, 5 * S>;  // NOTE(review): presumably S tracks and 5 * S hits in total - confirm against OneToManyAssoc
+
+  // Always check that the quality is at least loose!
+  // The flag is stored as a plain uint8_t because CUDA does not support enums in __ldg ...
+private:
+  eigenSoA::ScalarSoA<uint8_t, S> quality_;
+
+public:
+  constexpr Quality quality(int32_t i) const { return (Quality)(quality_(i)); }  // stored byte of track i, read as the enum
+  constexpr Quality &quality(int32_t i) { return (Quality &)(quality_(i)); }  // writable reference to the same byte
+  constexpr Quality const *qualityData() const { return (Quality const *)(quality_.data()); }  // whole column, viewed as enums
+  constexpr Quality *qualityData() { return (Quality *)(quality_.data()); }
+
+  // this is chi2/ndof, as not necessarily all hits are used in the fit
+  eigenSoA::ScalarSoA<float, S> chi2;
+
+  constexpr int nHits(int i) const { return detIndices.size(i); }  // size of the detIndices entry of track i
+
+  // State at the beam spot
+  // phi, tip, 1/pt, cotan(theta), zip
+  TrajectoryStateSoAT<S> stateAtBS;
+  eigenSoA::ScalarSoA<float, S> eta;
+  eigenSoA::ScalarSoA<float, S> pt;
+  constexpr float charge(int32_t i) const { return std::copysign(1.f, stateAtBS.state(i)(2)); }  // +1.f or -1.f: sign of state component 2 (1/pt in the list above)
+  constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); }  // state component 0
+  constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); }  // state component 1
+  constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); }  // state component 4
+
+  // state at the detector of the outermost hit
+  // representation to be decided...
+  // not yet filled on GPU
+  // TrajectoryStateSoA<S> stateAtOuterDet;
+
+  HitContainer hitIndices;
+  HitContainer detIndices;  // queried by nHits() above
+};
+
+namespace pixelTrack {
+
+#ifdef GPU_SMALL_EVENTS
+  // reduced size (2 * 1024), kept for testing and debugging
+  constexpr uint32_t maxNumber() { return 2 * 1024; }
+#else
+  // default size (32 * 1024), tested on MC events with 55-75 pileup events
+  constexpr uint32_t maxNumber() { return 32 * 1024; }
+#endif
+
+  using TrackSoA = TrackSoAHeterogeneousT<maxNumber()>;  // track SoA with stride maxNumber()
+  using TrajectoryState = TrajectoryStateSoAT<maxNumber()>;  // same stride as TrackSoA
+  using HitContainer = TrackSoA::HitContainer;
+
+}  // namespace pixelTrack
+
+#endif  // CUDADataFormats_Track_TrackHeterogeneousT_H
diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h b/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h
new file mode 100644
index 0000000000000..64fcd573a6991
--- /dev/null
+++ b/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h
@@ -0,0 +1,59 @@
+#ifndef CUDADataFormats_Track_TrajectoryStateSOAT_H
+#define CUDADataFormats_Track_TrajectoryStateSOAT_H
+
+#include <Eigen/Dense>
+#include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h"
+
+template <int32_t S>
+struct TrajectoryStateSoAT {  // SoA of 5-component states with their 15-element packed covariances, stride S
+  using Vector5f = Eigen::Matrix<float, 5, 1>;
+  using Vector15f = Eigen::Matrix<float, 15, 1>;  // 15 = size of the upper triangle (diagonal included) of a 5x5 matrix
+
+  using Vector5d = Eigen::Matrix<double, 5, 1>;
+  using Matrix5d = Eigen::Matrix<double, 5, 5>;
+
+  static constexpr int32_t stride() { return S; }
+
+  eigenSoA::MatrixSoA<Vector5f, S> state;
+  eigenSoA::MatrixSoA<Vector15f, S> covariance;  // elements (j, k) with k >= j, row by row, as written by copyFromDense
+
+  template <typename V3, typename M3, typename V2, typename M2>
+  __host__ __device__ inline void copyFromCircle(  // fills entry i from a 3-component (cp, ccov) and a 2-component (lp, lcov) input
+      V3 const& cp, M3 const& ccov, V2 const& lp, M2 const& lcov, float b, int32_t i) {
+    state(i) << cp.template cast<float>(), lp.template cast<float>();
+    state(i)(2) *= b;  // component 2 is rescaled by b, hence the factors b and b * b in the covariance below
+    auto cov = covariance(i);  // NOTE(review): the writes below reach the SoA only if this is a view (e.g. an Eigen::Map) - confirm in eigenSoA.h
+    cov(0) = ccov(0, 0);
+    cov(1) = ccov(0, 1);
+    cov(2) = b * float(ccov(0, 2));
+    cov(4) = cov(3) = 0;  // elements (0,3) and (0,4): no cross term between the cp and lp blocks is stored
+    cov(5) = ccov(1, 1);
+    cov(6) = b * float(ccov(1, 2));
+    cov(8) = cov(7) = 0;  // elements (1,3) and (1,4)
+    cov(9) = b * b * float(ccov(2, 2));
+    cov(11) = cov(10) = 0;  // elements (2,3) and (2,4)
+    cov(12) = lcov(0, 0);
+    cov(13) = lcov(0, 1);
+    cov(14) = lcov(1, 1);
+  }
+
+  template <typename V5, typename M5>
+  __host__ __device__ inline void copyFromDense(V5 const& v, M5 const& cov, int32_t i) {  // packs a dense state and 5x5 covariance into entry i
+    state(i) = v.template cast<float>();
+    for (int j = 0, ind = 0; j < 5; ++j)  // ind walks the 15 packed slots in order
+      for (auto k = j; k < 5; ++k)  // only elements with k >= j are read from cov
+        covariance(i)(ind++) = cov(j, k);
+  }
+
+  template <typename V5, typename M5>
+  __host__ __device__ inline void copyToDense(V5& v, M5& cov, int32_t i) const {  // inverse of copyFromDense: rebuilds the full symmetric matrix
+    v = state(i).template cast<typename V5::Scalar>();
+    for (int j = 0, ind = 0; j < 5; ++j) {
+      cov(j, j) = covariance(i)(ind++);  // diagonal element comes first in each packed row
+      for (auto k = j + 1; k < 5; ++k)
+        cov(k, j) = cov(j, k) = covariance(i)(ind++);  // each packed off-diagonal value fills both mirrored elements
+    }
+  }
+};
+
+#endif  // CUDADataFormats_Track_TrajectoryStateSOAT_H
diff --git a/CUDADataFormats/Track/src/classes.h b/CUDADataFormats/Track/src/classes.h
new file mode 100644
index 0000000000000..97c116f6c88d3
--- /dev/null
+++ b/CUDADataFormats/Track/src/classes.h
@@ -0,0 +1,9 @@
+#ifndef CUDADataFormats_Track_src_classes_h
+#define CUDADataFormats_Track_src_classes_h
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/Common/interface/HostProduct.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h"
+#include "DataFormats/Common/interface/Wrapper.h"
+
+#endif  // CUDADataFormats_Track_src_classes_h
diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml
new file mode 100644
index 0000000000000..9c80ae91baf29
--- /dev/null
+++ b/CUDADataFormats/Track/src/classes_def.xml
@@ -0,0 +1,6 @@
+<lcgdict>
+  <class name="cms::cuda::Product<HeterogeneousSoA<pixelTrack::TrackSoA>>" persistent="false"/>
+  <class name="edm::Wrapper<cms::cuda::Product<HeterogeneousSoA<pixelTrack::TrackSoA>>>" persistent="false"/>
+  <class name="HeterogeneousSoA<pixelTrack::TrackSoA>" persistent="false"/>
+  <class name="edm::Wrapper<HeterogeneousSoA<pixelTrack::TrackSoA>>" persistent="false"/>
+</lcgdict>
diff --git a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml
new file mode 100644
index 0000000000000..598b345d4709d
--- /dev/null
+++ b/CUDADataFormats/Track/test/BuildFile.xml
@@ -0,0 +1,13 @@
+<use name="HeterogeneousCore/CUDAUtilities"/>
+
+<bin file="TrajectoryStateSOA_t.cpp" name="cpuTrajectoryStateSOA_t">
+  <use name="eigen"/>
+  <flags CXXFLAGS="-g -DGPU_DEBUG"/>
+</bin>
+
+<bin file="TrajectoryStateSOA_t.cu" name="gpuTrajectoryStateSOA_t">
+  <use name="eigen"/>
+  <flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
+  <flags CXXFLAGS="-g -DGPU_DEBUG"/>
+</bin>
+
diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp
new file mode 100644
index 0000000000000..d6ff539a642b0
--- /dev/null
+++ b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp
@@ -0,0 +1 @@
+#include "TrajectoryStateSOA_t.h"
diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu
new file mode 100644
index 0000000000000..d6ff539a642b0
--- /dev/null
+++ b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu
@@ -0,0 +1 @@
+#include "TrajectoryStateSOA_t.h"
diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h
new file mode 100644
index 0000000000000..97b88873c2613
--- /dev/null
+++ b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h
@@ -0,0 +1,75 @@
+#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h"
+
+using Vector5d = Eigen::Matrix<double, 5, 1>;
+using Matrix5d = Eigen::Matrix<double, 5, 5>;
+
+__host__ __device__ Matrix5d loadCov(Vector5d const& e) {
+  Matrix5d result;  // diagonal: e(d) squared; off-diagonal: signed fraction of the geometric mean of two diagonal terms
+  for (int d = 0; d < 5; ++d)
+    result(d, d) = e(d) * e(d);
+  for (int row = 1; row < 5; ++row)
+    for (int col = 0; col < row; ++col) {
+      // the 0.3 factor is what keeps the matrix positive definite
+      double const scale = 0.3 * std::sqrt(result(row, row) * result(col, col));
+      double const offDiag = ((row + col) % 2 != 0) ? -0.4 * scale : 0.1 * scale;
+      result(row, col) = result(col, row) = offDiag;
+    }
+  return result;
+}
+
+using TS = TrajectoryStateSoAT<128>;
+
+__global__ void testTSSoA(TS* pts, int n) {
+  assert(n <= 128);
+  // reference parameters, and the errors from which the reference covariance is built
+  Vector5d refPar;
+  refPar << 0.2, 0.1, 3.5, 0.8, 0.1;
+  Vector5d refErr;
+  refErr << 0.01, 0.01, 0.035, -0.03, -0.01;
+  auto refCov = loadCov(refErr);
+
+  TS& soa = *pts;
+  // each thread starts at its global index and advances by the total number of launched threads
+  int begin = threadIdx.x + blockIdx.x * blockDim.x;
+  for (int slot = begin; slot < n; slot += blockDim.x * gridDim.x) {
+    // store the reference in this slot, then read it back and compare
+    soa.copyFromDense(refPar, refCov, slot);
+    Vector5d backPar;
+    Matrix5d backCov;
+    soa.copyToDense(backPar, backCov, slot);
+    Vector5d diffPar = backPar - refPar;
+    Matrix5d diffCov = backCov - refCov;
+    for (int r = 0; r < 5; ++r) {
+      assert(std::abs(diffPar(r)) < 1.e-5);
+      for (auto c = r; c < 5; ++c) {
+        assert(refCov(c, r) == refCov(r, c));
+        assert(backCov(c, r) == backCov(r, c));
+        assert(std::abs(diffCov(c, r)) < 1.e-5);
+      }
+    }
+  }
+}
+
+#ifdef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#endif
+
+int main() {  // runs testTSSoA as a kernel when built by nvcc, as a plain function call otherwise
+#ifdef __CUDACC__
+  cms::cudatest::requireDevices();
+#endif
+
+  TS ts;
+#ifdef __CUDACC__
+  TS* ts_d = nullptr;  // device-side copy of the SoA
+  cudaCheck(cudaMalloc(&ts_d, sizeof(TS)));
+  testTSSoA<<<1, 64>>>(ts_d, 128);
+  cudaCheck(cudaGetLastError());  // reports launch errors
+  cudaCheck(cudaMemcpy(&ts, ts_d, sizeof(TS), cudaMemcpyDefault));
+  cudaCheck(cudaDeviceSynchronize());
+  cudaCheck(cudaFree(ts_d));  // release the buffer obtained from cudaMalloc above
+#else
+  testTSSoA(&ts, 128);
+#endif
+}
diff --git a/CUDADataFormats/Vertex/BuildFile.xml b/CUDADataFormats/Vertex/BuildFile.xml
new file mode 100644
index 0000000000000..e3f9a0910bbd8
--- /dev/null
+++ b/CUDADataFormats/Vertex/BuildFile.xml
@@ -0,0 +1,9 @@
+<use name="cuda"/>
+<use name="rootcore"/>
+<use name="CUDADataFormats/Common"/>
+<use name="DataFormats/Common"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+<use name="eigen"/>
+<export>
+    <lib name="1"/>
+</export>
diff --git a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
new file mode 100644
index 0000000000000..aacfddc6fe7e2
--- /dev/null
+++ b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
@@ -0,0 +1,14 @@
+#ifndef CUDADataFormatsVertexZVertexHeterogeneous_H
+#define CUDADataFormatsVertexZVertexHeterogeneous_H
+
+#include "CUDADataFormats/Vertex/interface/ZVertexSoA.h"
+#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+
+using ZVertexHeterogeneous = HeterogeneousSoA<ZVertexSoA>;
+#ifndef __CUDACC__  // Product.h and the alias built on it are skipped when nvcc compiles this header
+#include "CUDADataFormats/Common/interface/Product.h"
+using ZVertexCUDAProduct = cms::cuda::Product<ZVertexHeterogeneous>;
+#endif  // __CUDACC__
+
+#endif
diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoA.h b/CUDADataFormats/Vertex/interface/ZVertexSoA.h
new file mode 100644
index 0000000000000..5f0699d5831ec
--- /dev/null
+++ b/CUDADataFormats/Vertex/interface/ZVertexSoA.h
@@ -0,0 +1,26 @@
+#ifndef CUDADataFormatsVertexZVertexSoA_H
+#define CUDADataFormatsVertexZVertexSoA_H
+
+#include <cstdint>
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
+
+// SoA for vertices
+// These vertices are clusterized and fitted only along the beam line (z).
+// To obtain their global coordinates, the beam spot position must be added (possibly correcting for the beam angle as well).
+struct ZVertexSoA {
+  static constexpr uint32_t MAXTRACKS = 32 * 1024;  // length of the per-track arrays (idv, ndof)
+  static constexpr uint32_t MAXVTX = 1024;  // length of the per-vertex arrays
+
+  int16_t idv[MAXTRACKS];    // vertex index for each associated (original) track  (-1 == not associated)
+  float zv[MAXVTX];          // output z-position of found vertices
+  float wv[MAXVTX];          // output weight (1/error^2) on the above
+  float chi2[MAXVTX];        // vertices chi2
+  float ptv2[MAXVTX];        // vertices pt^2
+  int32_t ndof[MAXTRACKS];   // vertices number of dof (reused as workspace for the number of nearest neighbours FIXME)
+  uint16_t sortInd[MAXVTX];  // sorted index (by pt2), ascending
+  uint32_t nvFinal;          // the number of vertices
+
+  __host__ __device__ void init() { nvFinal = 0; }  // resets only the vertex count; the arrays are left as they are
+};
+
+#endif  // CUDADataFormatsVertexZVertexSoA_H
diff --git a/CUDADataFormats/Vertex/src/classes.h b/CUDADataFormats/Vertex/src/classes.h
new file mode 100644
index 0000000000000..e7fea871f7d39
--- /dev/null
+++ b/CUDADataFormats/Vertex/src/classes.h
@@ -0,0 +1,8 @@
+#ifndef CUDADataFormats_Vertex_src_classes_h
+#define CUDADataFormats_Vertex_src_classes_h
+
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "DataFormats/Common/interface/Wrapper.h"
+
+#endif  // CUDADataFormats_Vertex_src_classes_h (guard renamed: the old name contained "__", a reserved identifier)
diff --git a/CUDADataFormats/Vertex/src/classes_def.xml b/CUDADataFormats/Vertex/src/classes_def.xml
new file mode 100644
index 0000000000000..ea633080af9af
--- /dev/null
+++ b/CUDADataFormats/Vertex/src/classes_def.xml
@@ -0,0 +1,6 @@
+<lcgdict>
+  <class name="cms::cuda::Product<ZVertexHeterogeneous>" persistent="false"/>
+  <class name="edm::Wrapper<ZVertexCUDAProduct>" persistent="false"/>
+  <class name="ZVertexHeterogeneous" persistent="false"/>
+  <class name="edm::Wrapper<ZVertexHeterogeneous>" persistent="false"/>
+</lcgdict>
diff --git a/Configuration/PyReleaseValidation/python/relval_2017.py b/Configuration/PyReleaseValidation/python/relval_2017.py
index 193ab79bcd384..87130136c154c 100644
--- a/Configuration/PyReleaseValidation/python/relval_2017.py
+++ b/Configuration/PyReleaseValidation/python/relval_2017.py
@@ -5,7 +5,7 @@
 # here only define the workflows as a combination of the steps defined above:
 workflows = Matrix()
 
-# each workflow defines a name and a list of steps to be done. 
+# each workflow defines a name and a list of steps to be done.
 # if no explicit name/label given for the workflow (first arg),
 # the name of step1 will be used
 
@@ -24,16 +24,16 @@
 #        (HE collapse: TTbar, TTbar PU, TTbar design)
 #        (ParkingBPH: TTbar)
 #        (TTbar PU with JME NanoAOD)
-#        (Patatrack pixel-only: ZMM - on CPU)
-#        (Patatrack pixel-only: TTbar - on CPU)
+#        (Patatrack pixel-only: ZMM - on CPU: quadruplets, triplets)
+#        (Patatrack pixel-only: TTbar - on CPU: quadruplets, triplets)
 #        (Patatrack ECAL-only: TTbar - on CPU)
 #        (Patatrack HCAL-only: TTbar - on CPU)
 #   2021 (DD4HEP: TTbar, ZMM)
 #        (ele guns 10, 35, 1000; pho guns 10, 35; mu guns 1, 10, 100, 1000, QCD 3TeV, QCD Flat)
 #        (ZMM, TTbar, ZEE, MinBias, TTbar PU, TTbar PU premix, ZEE PU, TTbar design)
 #        (TTbar trackingOnly, pixelTrackingOnly, trackingMkFit, trackdnn)
-#        (Patatrack pixel-only: ZMM - on CPU)
-#        (Patatrack pixel-only: TTbar - on CPU)
+#        (Patatrack pixel-only: ZMM - on CPU: quadruplets, triplets)
+#        (Patatrack pixel-only: TTbar - on CPU: quadruplets, triplets)
 #        (Patatrack ECAL-only: TTbar - on CPU)
 #        (Patatrack HCAL-only: TTbar - on CPU)
 #        (TTbar 0T, TTbar PU 0T)
@@ -51,16 +51,16 @@
            10824.6,11024.6,11224.6,
            10824.8,
            11024.15,
-           10842.501,
-           10824.501,
+           10842.501,10842.505,
+           10824.501,10824.505,
            10824.511,
            10824.521,
            11634.911, 11650.911,
            11601.0,11602.0,11603.0,11604.0,11605.0,11606.0,11607.0,11608.0,11609.0,11630.0,11643.0,
            11650.0,11634.0,11646.0,11640.0,11834.0,11834.99,11846.0,12024.0,
            11634.1,11634.5,11634.7,11634.91,
-           11650.501,
-           11634.501,
+           11650.501,11650.505,
+           11634.501,11634.505,
            11634.511,
            11634.521,
            11634.24,11834.24,
diff --git a/Configuration/PyReleaseValidation/python/relval_gpu.py b/Configuration/PyReleaseValidation/python/relval_gpu.py
index 4e49467a0e2e8..43353279ea4ad 100644
--- a/Configuration/PyReleaseValidation/python/relval_gpu.py
+++ b/Configuration/PyReleaseValidation/python/relval_gpu.py
@@ -5,7 +5,7 @@
 # here only define the workflows as a combination of the steps defined above:
 workflows = Matrix()
 
-# each workflow defines a name and a list of steps to be done. 
+# each workflow defines a name and a list of steps to be done.
 # if no explicit name/label given for the workflow (first arg),
 # the name of step1 will be used
 
@@ -14,21 +14,29 @@
 #just define all of them
 
 #WFs to run in IB:
-# mc 2018   (Patatrack pixel-only: ZMM - on GPU, both CPU and GPU, auto)
-#           (Patatrack pixel-only: TTbar - on GPU, both CPU and GPU, auto)
+# mc 2018   (Patatrack pixel-only quadruplets: ZMM - on GPU, both CPU and GPU, auto)
+#           (Patatrack pixel-only triplets: ZMM - on GPU, both CPU and GPU, auto)
+#           (Patatrack pixel-only quadruplets: TTbar - on GPU, both CPU and GPU, auto)
+#           (Patatrack pixel-only triplets: TTbar - on GPU, both CPU and GPU, auto)
 #           (Patatrack ECAL-only: TTbar - on GPU, both CPU and GPU, auto)
 #           (Patatrack HCAL-only: TTbar - on GPU, both CPU and GPU, auto)
-# mc 2021   (Patatrack pixel-only: ZMM - on GPU, both CPU and GPU, auto)
-#           (Patatrack pixel-only: TTbar - on GPU, both CPU and GPU, auto)
+# mc 2021   (Patatrack pixel-only quadruplets: ZMM - on GPU, both CPU and GPU, auto)
+#           (Patatrack pixel-only triplets: ZMM - on GPU, both CPU and GPU, auto)
+#           (Patatrack pixel-only quadruplets: TTbar - on GPU, both CPU and GPU, auto)
+#           (Patatrack pixel-only triplets: TTbar - on GPU, both CPU and GPU, auto)
 #           (Patatrack ECAL-only: TTbar - on GPU, both CPU and GPU, auto)
 #           (Patatrack HCAL-only: TTbar - on GPU, both CPU and GPU, auto)
 numWFIB = [
            10842.502, # 10842.503,10842.504,
+           10842.506, # 10842.507,10842.508,
            10824.502, # 10824.503,10824.504,
+           10824.506, # 10824.507,10824.508,
            10824.512, # 10824.513,10824.514,
            10824.522, # 10824.523,10824.524,
            11650.502, # 11650.503,11650.504,
+           11650.506, # 11650.507,11650.508,
            11634.502, # 11634.503,11634.504,
+           11634.506, # 11634.507,11634.508,
            11634.512, # 11634.513,11634.514,
            11634.522, # 11634.523,11634.524
         ]
diff --git a/Configuration/PyReleaseValidation/python/relval_steps.py b/Configuration/PyReleaseValidation/python/relval_steps.py
index 3556eda005ada..0fc667283361b 100644
--- a/Configuration/PyReleaseValidation/python/relval_steps.py
+++ b/Configuration/PyReleaseValidation/python/relval_steps.py
@@ -2186,8 +2186,11 @@ def gen2021HiMix(fragment,howMuch):
                   '--era'          :'Run2_2016'
                   }
 
-step3_pixelNtupleFit = {
-    '--procModifiers': 'pixelNtupleFit',
+step3_pixel_ntuplet_cpu = {
+    '--customise': 'RecoPixelVertexing/Configuration/customizePixelTracksSoAonCPU.customizePixelTracksSoAonCPU'
+}
+step3_pixel_triplets = {
+    '--customise': 'RecoPixelVertexing/Configuration/customizePixelTracksSoAonCPU.customizePixelTracksForTriplets'
 }
 step3_gpu = {
     '--procModifiers': 'gpu',
@@ -2320,8 +2323,11 @@ def gen2021HiMix(fragment,howMuch):
 steps['RECODR2_2018reHLT_Prompt']=merge([{'--conditions':'auto:run2_data'},steps['RECODR2_2018reHLT']])
 steps['RECODR2_2018reHLT_ZBPrompt']=merge([{'--conditions':'auto:run2_data','-s':'RAW2DIGI,L1Reco,RECO,EI,PAT,ALCA:SiStripCalZeroBias+SiStripCalMinBias+TkAlMinBias+EcalESAlign,DQM:@rerecoZeroBias+@ExtraHLT+@miniAODDQM'},steps['RECODR2_2018reHLT']])
 steps['RECODR2_2018reHLT_Prompt_pixelTrackingOnly']=merge([{'-s': 'RAW2DIGI:RawToDigi_pixelOnly,RECO:reconstruction_pixelTrackingOnly,DQM:@pixelTrackingOnlyDQM'},steps['RECODR2_2018reHLT_Prompt']])
-steps['RECODR2_2018reHLT_Patatrack_PixelOnlyCPU']=merge([step3_pixelNtupleFit, steps['RECODR2_2018reHLT_Prompt_pixelTrackingOnly']])
+steps['RECODR2_2018reHLT_Patatrack_PixelOnlyCPU']=merge([step3_pixel_ntuplet_cpu, steps['RECODR2_2018reHLT_Prompt_pixelTrackingOnly']])
 steps['RECODR2_2018reHLT_Patatrack_PixelOnlyGPU']=merge([step3_gpu, steps['RECODR2_2018reHLT_Prompt_pixelTrackingOnly']])
+steps['RECODR2_2018reHLT_Patatrack_PixelOnlyTripletsCPU']=merge([step3_pixel_ntuplet_cpu, step3_pixel_triplets, steps['RECODR2_2018reHLT_Prompt_pixelTrackingOnly']])
+steps['RECODR2_2018reHLT_Patatrack_PixelOnlyTripletsGPU']=merge([step3_gpu, step3_pixel_triplets, steps['RECODR2_2018reHLT_Prompt_pixelTrackingOnly']])
+
 steps['RECODR2_2018reHLT_ECALOnlyCPU']=merge([{'-s': 'RAW2DIGI:RawToDigi_ecalOnly,RECO:reconstruction_ecalOnly,DQM:@ecalOnly'},steps['RECODR2_2018reHLT_Prompt']])
 steps['RECODR2_2018reHLT_ECALOnlyGPU']=merge([step3_gpu, steps['RECODR2_2018reHLT_ECALOnlyCPU']])
 steps['RECODR2_2018reHLT_HCALOnlyCPU']=merge([{'-s': 'RAW2DIGI:RawToDigi_hcalOnly,RECO:reconstruction_hcalOnly,DQM:@hcalOnly+@hcal2Only'},steps['RECODR2_2018reHLT_Prompt']])
diff --git a/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py b/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py
index 9ff0ae9f22e0d..513462593300e 100644
--- a/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py
+++ b/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py
@@ -453,7 +453,26 @@ def condition_(self, fragment, stepList, key, hasHarvest):
     '-s': 'RAW2DIGI:RawToDigi_pixelOnly,RECO:reconstruction_pixelTrackingOnly,VALIDATION:@pixelTrackingOnlyValidation,DQM:@pixelTrackingOnlyDQM',
     '--datatier': 'GEN-SIM-RECO,DQMIO',
     '--eventcontent': 'RECOSIM,DQM',
-    '--procModifiers': 'pixelNtupleFit'
+    '--customise' : 'RecoPixelVertexing/Configuration/customizePixelTracksSoAonCPU.customizePixelTracksSoAonCPU'
+}
+
+upgradeWFs['PatatrackPixelOnlyTripletsCPU'] = UpgradeWorkflowPatatrack_PixelOnlyCPU(  # triplets variant, reusing the pixel-only CPU workflow class
+    steps = [
+        'Reco',
+        'HARVEST',
+        'RecoGlobal',
+        'HARVESTGlobal',
+    ],
+    PU = [],
+    suffix = 'Patatrack_PixelOnlyTripletsCPU',
+    offset = 0.505,  # corresponds to the *.505 workflow numbers listed in relval_2017.py
+)
+
+upgradeWFs['PatatrackPixelOnlyTripletsCPU'].step3 = {
+    '-s': 'RAW2DIGI:RawToDigi_pixelOnly,RECO:reconstruction_pixelTrackingOnly,VALIDATION:@pixelTrackingOnlyValidation,DQM:@pixelTrackingOnlyDQM',
+    '--datatier': 'GEN-SIM-RECO,DQMIO',
+    '--eventcontent': 'RECOSIM,DQM',
+    '--customise' : 'RecoPixelVertexing/Configuration/customizePixelTracksSoAonCPU.customizePixelTracksSoAonCPU,RecoPixelVertexing/Configuration/customizePixelTracksSoAonCPU.customizePixelTracksForTriplets'
 }
 
 class UpgradeWorkflowPatatrack_PixelOnlyGPU(UpgradeWorkflowPatatrack):
@@ -487,6 +506,26 @@ def condition_(self, fragment, stepList, key, hasHarvest):
     '--procModifiers': 'gpu'
 }
 
+upgradeWFs['PatatrackPixelOnlyTripletsGPU'] = UpgradeWorkflowPatatrack_PixelOnlyGPU(  # triplets variant, reusing the pixel-only GPU workflow class
+    steps = [
+        'Reco',
+        'HARVEST',
+        'RecoGlobal',
+        'HARVESTGlobal',
+    ],
+    PU = [],
+    suffix = 'Patatrack_PixelOnlyTripletsGPU',
+    offset = 0.506,  # corresponds to the *.506 workflow numbers listed in relval_gpu.py
+)
+
+upgradeWFs['PatatrackPixelOnlyTripletsGPU'].step3 = {  # step3 options: the 'gpu' process modifier plus the triplets customisation
+    '-s': 'RAW2DIGI:RawToDigi_pixelOnly,RECO:reconstruction_pixelTrackingOnly,VALIDATION:@pixelTrackingOnlyValidation,DQM:@pixelTrackingOnlyDQM',
+    '--datatier': 'GEN-SIM-RECO,DQMIO',
+    '--eventcontent': 'RECOSIM,DQM',
+    '--procModifiers': 'gpu',
+    '--customise': 'RecoPixelVertexing/Configuration/customizePixelTracksSoAonCPU.customizePixelTracksForTriplets'  # only the triplets function; the SoA-on-CPU one is not applied here
+}
+
 class UpgradeWorkflowPatatrack_ECALOnlyCPU(UpgradeWorkflowPatatrack):
     def setup_(self, step, stepName, stepDict, k, properties):
         if 'Reco' in step:
diff --git a/DQM/Integration/python/clients/beam_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/beam_dqm_sourceclient-live_cfg.py
index efdde1512fcf7..4846de0887fde 100644
--- a/DQM/Integration/python/clients/beam_dqm_sourceclient-live_cfg.py
+++ b/DQM/Integration/python/clients/beam_dqm_sourceclient-live_cfg.py
@@ -317,8 +317,7 @@
 process.pixelTracksTrackingRegions.RegionPSet.originXPos =  0.08
 process.pixelTracksTrackingRegions.RegionPSet.originYPos = -0.03
 process.pixelTracksTrackingRegions.RegionPSet.originZPos = 0.
-
-process.pixelVertices.TkFilterParameters.minPt = process.pixelTracksTrackingRegions.RegionPSet.ptMin
+process.pixelVertices.PtMin = process.pixelTracksTrackingRegions.RegionPSet.ptMin
 
 process.tracking_FirstStep = cms.Sequence(
       process.siPixelDigis 
diff --git a/DQM/Integration/python/clients/beampixel_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/beampixel_dqm_sourceclient-live_cfg.py
index f909104a39834..a3eac2069e6ed 100644
--- a/DQM/Integration/python/clients/beampixel_dqm_sourceclient-live_cfg.py
+++ b/DQM/Integration/python/clients/beampixel_dqm_sourceclient-live_cfg.py
@@ -90,12 +90,12 @@
 process.siPixelClusterShapeCachePreSplitting = siPixelClusterShapeCache.clone(src = 'siPixelClustersPreSplitting')
 process.load("RecoLocalTracker.SiPixelRecHits.PixelCPEGeneric_cfi")
 process.load("RecoPixelVertexing.Configuration.RecoPixelVertexing_cff")
-process.pixelVertices.TkFilterParameters.minPt = process.pixelTracksTrackingRegions.RegionPSet.ptMin
 process.pixelTracksTrackingRegions.RegionPSet.originRadius     = cms.double(0.4)
 process.pixelTracksTrackingRegions.RegionPSet.originHalfLength = cms.double(15.)
 process.pixelTracksTrackingRegions.RegionPSet.originXPos       = cms.double(0.08)
 process.pixelTracksTrackingRegions.RegionPSet.originYPos       = cms.double(-0.03)
 process.pixelTracksTrackingRegions.RegionPSet.originZPos       = cms.double(0.)
+process.pixelVertices.PtMin = process.pixelTracksTrackingRegions.RegionPSet.ptMin
 
 
 #----------------------------
diff --git a/DQM/TrackingMonitorClient/python/pixelTrackingEffFromHitPattern_cff.py b/DQM/TrackingMonitorClient/python/pixelTrackingEffFromHitPattern_cff.py
index 15ceaf93ed20a..cff85e56d94f7 100644
--- a/DQM/TrackingMonitorClient/python/pixelTrackingEffFromHitPattern_cff.py
+++ b/DQM/TrackingMonitorClient/python/pixelTrackingEffFromHitPattern_cff.py
@@ -21,7 +21,10 @@ def _layers(suffix, quant, histoPostfix):
     ]
 
 pixelTrackingEffFromHitPattern = DQMEDHarvester("DQMGenericClient",
-    subDirs = cms.untracked.vstring("Tracking/PixelTrackParameters/HitEffFromHitPattern*"),
+    subDirs = cms.untracked.vstring("Tracking/PixelTrackParameters/pixelTracks/HitEffFromHitPattern*",
+                                    "Tracking/PixelTrackParameters/dzPV0p1/HitEffFromHitPattern*",
+                                    "Tracking/PixelTrackParameters/pt_0to1/HitEffFromHitPattern*",
+                                    "Tracking/PixelTrackParameters/pt_1/HitEffFromHitPattern*"),
     efficiency = cms.vstring(
         _layers("PU", "GoodNumVertices", "") +
         _layers("BX", "BX", "VsBX") +
diff --git a/DQM/TrackingMonitorClient/python/pixelVertexResolutionClient_cfi.py b/DQM/TrackingMonitorClient/python/pixelVertexResolutionClient_cfi.py
new file mode 100644
index 0000000000000..2558e88d26012
--- /dev/null
+++ b/DQM/TrackingMonitorClient/python/pixelVertexResolutionClient_cfi.py
@@ -0,0 +1,7 @@
+import FWCore.ParameterSet.Config as cms
+
+from DQM.TrackingMonitorClient.primaryVertexResolutionClient_cfi import primaryVertexResolutionClient as _primaryVertexResolutionClient
+
+pixelVertexResolutionClient = _primaryVertexResolutionClient.clone(  # primary-vertex resolution client with only subDirs changed
+    subDirs = ["OfflinePixelPV/Resolution/*"]  # the folder that pixelVertexResolution.rootFolder is set to in pixelTracksMonitoring_cff.py
+)
diff --git a/DQM/TrackingMonitorSource/python/pixelTracksMonitoring_cff.py b/DQM/TrackingMonitorSource/python/pixelTracksMonitoring_cff.py
index a075f671f05ce..d5deba78b46c8 100644
--- a/DQM/TrackingMonitorSource/python/pixelTracksMonitoring_cff.py
+++ b/DQM/TrackingMonitorSource/python/pixelTracksMonitoring_cff.py
@@ -1,23 +1,77 @@
 import FWCore.ParameterSet.Config as cms
 
 import DQM.TrackingMonitor.TrackerCollisionTrackingMonitor_cfi
-pixelTracksMonitoring = DQM.TrackingMonitor.TrackerCollisionTrackingMonitor_cfi.TrackerCollisionTrackMon.clone()
-pixelTracksMonitoring.FolderName                = 'Tracking/PixelTrackParameters'
-pixelTracksMonitoring.TrackProducer             = 'pixelTracks'
-pixelTracksMonitoring.allTrackProducer          = 'pixelTracks'
-pixelTracksMonitoring.beamSpot                  = 'offlineBeamSpot'
-pixelTracksMonitoring.primaryVertex             = 'pixelVertices'
-pixelTracksMonitoring.pvNDOF                    = 1
-pixelTracksMonitoring.doAllPlots                = True
-pixelTracksMonitoring.doLumiAnalysis            = True
-pixelTracksMonitoring.doProfilesVsLS            = True
-pixelTracksMonitoring.doDCAPlots                = True
-pixelTracksMonitoring.doProfilesVsLS            = True
-pixelTracksMonitoring.doPlotsVsGoodPVtx         = True
-pixelTracksMonitoring.doEffFromHitPatternVsPU   = False
-pixelTracksMonitoring.doEffFromHitPatternVsBX   = False
-pixelTracksMonitoring.doEffFromHitPatternVsLUMI = False
-pixelTracksMonitoring.doPlotsVsGoodPVtx         = True
-pixelTracksMonitoring.doPlotsVsLUMI             = True
-pixelTracksMonitoring.doPlotsVsBX               = True
+pixelTracksMonitor = DQM.TrackingMonitor.TrackerCollisionTrackingMonitor_cfi.TrackerCollisionTrackMon.clone()
+pixelTracksMonitor.FolderName                = 'Tracking/PixelTrackParameters/pixelTracks'
+pixelTracksMonitor.TrackProducer             = 'pixelTracks'
+pixelTracksMonitor.allTrackProducer          = 'pixelTracks'
+pixelTracksMonitor.beamSpot                  = 'offlineBeamSpot'
+pixelTracksMonitor.primaryVertex             = 'pixelVertices'
+pixelTracksMonitor.pvNDOF                    = 1
+pixelTracksMonitor.doAllPlots                = True
+pixelTracksMonitor.doLumiAnalysis            = True
+pixelTracksMonitor.doProfilesVsLS            = True
+pixelTracksMonitor.doDCAPlots                = True
+pixelTracksMonitor.doProfilesVsLS            = True
+pixelTracksMonitor.doPlotsVsGoodPVtx         = True
+pixelTracksMonitor.doEffFromHitPatternVsPU   = False
+pixelTracksMonitor.doEffFromHitPatternVsBX   = False
+pixelTracksMonitor.doEffFromHitPatternVsLUMI = False
+pixelTracksMonitor.doPlotsVsGoodPVtx         = True
+pixelTracksMonitor.doPlotsVsLUMI             = True
+pixelTracksMonitor.doPlotsVsBX               = True
 
+_trackSelector = cms.EDFilter('TrackSelector',  # template selector on pixelTracks; the clones below only change the cut
+    src = cms.InputTag('pixelTracks'),
+    cut = cms.string("")
+)
+
+pixelTracksPt0to1 = _trackSelector.clone(cut = "pt >= 0 & pt < 1 ")  # tracks with 0 <= pt < 1
+pixelTracksPt1 = _trackSelector.clone(cut = "pt >= 1 ")  # tracks with pt >= 1
+from DQM.TrackingMonitorSource.TrackCollections2monitor_cff import highPurityPV0p1 as _highPurityPV0p1
+pixelTracksPV0p1 = _highPurityPV0p1.clone(  # the highPurityPV0p1 selection, re-pointed at pixel tracks and pixel vertices
+    src = "pixelTracks",
+    quality = "",  # quality string replaced by an empty one
+    vertexTag = "goodPixelVertices"
+)
+
+pixelTracksMonitorPt0to1 = pixelTracksMonitor.clone(  # same monitor, reading the 0 <= pt < 1 selection
+    TrackProducer = "pixelTracksPt0to1",
+    FolderName = "Tracking/PixelTrackParameters/pt_0to1"
+)
+pixelTracksMonitorPt1 = pixelTracksMonitor.clone(  # same monitor, reading the pt >= 1 selection
+    TrackProducer = "pixelTracksPt1",
+    FolderName = "Tracking/PixelTrackParameters/pt_1"
+)
+pixelTracksMonitorPV0p1 = pixelTracksMonitor.clone(  # same monitor, reading the pixelTracksPV0p1 selection
+    TrackProducer = "pixelTracksPV0p1",
+    FolderName = "Tracking/PixelTrackParameters/dzPV0p1"
+)
+
+
+from CommonTools.ParticleFlow.goodOfflinePrimaryVertices_cfi import goodOfflinePrimaryVertices as _goodOfflinePrimaryVertices
+goodPixelVertices = _goodOfflinePrimaryVertices.clone(  # the goodOfflinePrimaryVertices selection, applied to pixelVertices
+    src = "pixelVertices",
+)
+
+from DQM.TrackingMonitor.primaryVertexResolution_cfi import primaryVertexResolution as _primaryVertexResolution
+pixelVertexResolution = _primaryVertexResolution.clone(  # primary-vertex resolution module, run on the selected pixel vertices
+    vertexSrc = "goodPixelVertices",
+    rootFolder = "OfflinePixelPV/Resolution",
+)
+
+pixelTracksMonitoringTask = cms.Task(  # the selectors whose outputs the monitors below read
+    goodPixelVertices,
+    pixelTracksPt0to1,
+    pixelTracksPt1,
+    pixelTracksPV0p1,
+)
+
+pixelTracksMonitoring = cms.Sequence(  # the four track monitors and the vertex-resolution module, with the task attached
+    pixelTracksMonitor +
+    pixelTracksMonitorPt0to1 +
+    pixelTracksMonitorPt1 +
+    pixelTracksMonitorPV0p1 +
+    pixelVertexResolution,
+    pixelTracksMonitoringTask
+)
diff --git a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
index 368b328632fd8..29bf311c474d4 100644
--- a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
@@ -122,6 +122,7 @@
 from DQM.CTPPS.ctppsDQM_cff import *
 from Validation.RecoTau.DQMSequences_cfi import *
 from DQM.TrackingMonitorClient.pixelTrackingEffFromHitPattern_cff import *
+from DQM.TrackingMonitorClient.pixelVertexResolutionClient_cfi import *
 
 DQMHarvestTrackerStrip = cms.Sequence ( SiStripOfflineDQMClient )
 
@@ -179,7 +180,8 @@
 DQMHarvestTrackingZeroBias = cms.Sequence( TrackingOfflineDQMClientZeroBias *
                                            dqmFastTimerServiceClient )
 
-DQMHarvestPixelTracking = cms.Sequence( pixelTrackingEffFromHitPattern )
+DQMHarvestPixelTracking = cms.Sequence( pixelTrackingEffFromHitPattern *
+                                        pixelVertexResolutionClient )
 
 DQMHarvestOuterTracker = cms.Sequence(
                                  OuterTrackerClient *
diff --git a/DQMOffline/Configuration/python/DQMOffline_cff.py b/DQMOffline/Configuration/python/DQMOffline_cff.py
index 2001c22352a48..ac28700d4eaf4 100644
--- a/DQMOffline/Configuration/python/DQMOffline_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_cff.py
@@ -157,10 +157,12 @@
 
 #DQMOfflineCommon
 from DQM.TrackingMonitorSource.pixelTracksMonitoring_cff import *
+from DQMOffline.RecoB.PixelVertexMonitor_cff import *
 from DQM.SiOuterTracker.OuterTrackerSourceConfig_cff import *
 from Validation.RecoTau.DQMSequences_cfi import *
 
-DQMOfflinePixelTracking = cms.Sequence( pixelTracksMonitoring )
+DQMOfflinePixelTracking = cms.Sequence( pixelTracksMonitoring *
+                                        pixelPVMonitor )
 
 DQMOuterTracker = cms.Sequence( DQMOfflineDCS *
                                 OuterTrackerSource *
diff --git a/DQMOffline/RecoB/python/PixelVertexMonitor_cff.py b/DQMOffline/RecoB/python/PixelVertexMonitor_cff.py
new file mode 100644
index 0000000000000..9e293f4478bd6
--- /dev/null
+++ b/DQMOffline/RecoB/python/PixelVertexMonitor_cff.py
@@ -0,0 +1,8 @@
+import FWCore.ParameterSet.Config as cms
+
+from DQMOffline.RecoB.PrimaryVertexMonitor_cff import pvMonitor as _pvMonitor
+pixelPVMonitor = _pvMonitor.clone(  # pvMonitor configuration reading pixelVertices, with TopFolderName OfflinePixelPV
+    TopFolderName = "OfflinePixelPV",
+    vertexLabel = "pixelVertices",
+    ndof        = cms.int32( 1 )
+)
diff --git a/FastSimulation/Tracking/python/SeedingMigration.py b/FastSimulation/Tracking/python/SeedingMigration.py
index 751670daa50c8..3a982eba55e36 100644
--- a/FastSimulation/Tracking/python/SeedingMigration.py
+++ b/FastSimulation/Tracking/python/SeedingMigration.py
@@ -13,8 +13,9 @@ def _hitSetProducerToFactoryPSet(producer):
         "PixelTripletLargeTipEDProducer": "PixelTripletLargeTipGenerator",
         "MultiHitFromChi2EDProducer": "MultiHitGeneratorFromChi2",
         "CAHitTripletEDProducer": "CAHitTripletGenerator",
-        "CAHitQuadrupletEDProducer": "CAHitQuadrupletGenerator",   
-        }
+        "CAHitQuadrupletEDProducer": "CAHitQuadrupletGenerator",
+        "CAHitNtupletHeterogeneousEDProducer": "CAHitQuadrupletGenerator",
+    }
     ret = cms.PSet()
     _copy(producer, ret)
     ret.ComponentName = cms.string(_map[producer._TypedParameterizable__type]);
diff --git a/HLTrigger/Configuration/python/customizeHLTforPatatrack.py b/HLTrigger/Configuration/python/customizeHLTforPatatrack.py
new file mode 100644
index 0000000000000..5164188c94997
--- /dev/null
+++ b/HLTrigger/Configuration/python/customizeHLTforPatatrack.py
@@ -0,0 +1,802 @@
+import copy
+import FWCore.ParameterSet.Config as cms
+from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA
+from HLTrigger.Configuration.common import *
+from Configuration.Eras.Modifier_run3_common_cff import run3_common
+
+
+# pin the backend picked by the SwitchProducerCUDA by overwriting its cached flag: True for offloading to a gpu, False for running on cpu
+def forceGpuOffload(status = True):
+    import HeterogeneousCore.CUDACore.SwitchProducerCUDA as _switchProducerModule
+    _switchProducerModule._cuda_enabled_cached = bool(status)
+
+
+# clear the cached SwitchProducerCUDA choice and evaluate it again, so the backend depends on the availability of a supported gpu
+def resetGpuOffload():
+    import HeterogeneousCore.CUDACore.SwitchProducerCUDA as _switchProducerModule
+    _switchProducerModule._cuda_enabled_cached = None
+    _switchProducerModule._switch_cuda()
+
+
+# customisation for running the Patatrack reconstruction, common parts: CUDA services, the Status_OnGPU path and the "Smart" scouting output filters
+def customiseCommon(process):
+
+    # Services
+
+    process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi")
+
+    process.load("HeterogeneousCore.CUDAServices.NVProfilerService_cfi")
+
+
+    # Paths and EndPaths
+
+    # the hltGetConditions module would force gpu-specific ESProducers to run even if no supported gpu is present
+    if 'hltGetConditions' in process.__dict__:
+        del process.hltGetConditions
+
+    # produce a boolean to track if the events are being processed on gpu (true) or cpu (false)
+    process.statusOnGPU = SwitchProducerCUDA(
+        cpu  = cms.EDProducer("BooleanProducer", value = cms.bool(False)),
+        cuda = cms.EDProducer("BooleanProducer", value = cms.bool(True))
+    )
+
+    process.statusOnGPUFilter = cms.EDFilter("BooleanFilter",
+        src = cms.InputTag("statusOnGPU")
+    )
+
+    if 'Status_OnGPU' in process.__dict__:
+        replace_with(process.Status_OnGPU, cms.Path(process.statusOnGPU + process.statusOnGPUFilter))
+    else:
+        process.Status_OnGPU = cms.Path(process.statusOnGPU + process.statusOnGPUFilter)
+        if 'HLTSchedule' in process.__dict__:
+            process.HLTSchedule.append(process.Status_OnGPU)
+        if process.schedule is not None:
+            process.schedule.append(process.Status_OnGPU)
+
+
+    # make the ScoutingCaloMuonOutput endpath compatible with using Tasks in the Scouting paths
+    if 'hltOutputScoutingCaloMuon' in process.__dict__ and not 'hltPreScoutingCaloMuonOutputSmart' in process.__dict__:
+        process.hltPreScoutingCaloMuonOutputSmart = cms.EDFilter( "TriggerResultsFilter",
+            l1tIgnoreMaskAndPrescale = cms.bool( False ),
+            l1tResults = cms.InputTag( "" ),
+            hltResults = cms.InputTag( 'TriggerResults','','@currentProcess' ),
+            triggerConditions = process.hltOutputScoutingCaloMuon.SelectEvents.SelectEvents,
+            throw = cms.bool( True )
+        )
+        insert_modules_after(process, process.hltPreScoutingCaloMuonOutput, process.hltPreScoutingCaloMuonOutputSmart)
+
+    # make the ScoutingPFOutput endpath compatible with using Tasks in the Scouting paths
+    if 'hltOutputScoutingPF' in process.__dict__ and not 'hltPreScoutingPFOutputSmart' in process.__dict__:
+        process.hltPreScoutingPFOutputSmart = cms.EDFilter( "TriggerResultsFilter",
+            l1tIgnoreMaskAndPrescale = cms.bool( False ),
+            l1tResults = cms.InputTag( "" ),
+            hltResults = cms.InputTag( 'TriggerResults','','@currentProcess' ),
+            triggerConditions = process.hltOutputScoutingPF.SelectEvents.SelectEvents,
+            throw = cms.bool( True )
+        )
+        insert_modules_after(process, process.hltPreScoutingPFOutput, process.hltPreScoutingPFOutputSmart)
+
+
+    # done
+    return process
+
+
+# customisation for running the "Patatrack" pixel local reconstruction (digis, clusters, rechits); returns the process untouched if it has no HLTDoLocalPixelSequence
+def customisePixelLocalReconstruction(process):
+
+    if not 'HLTDoLocalPixelSequence' in process.__dict__:
+        return process
+
+
+    # FIXME replace the Sequences with empty ones to avoid expanding them during the (re)definition of Modules and EDAliases
+
+    process.HLTDoLocalPixelSequence = cms.Sequence()
+
+
+    # Event Setup
+
+    process.load("CalibTracker.SiPixelESProducers.siPixelGainCalibrationForHLTGPU_cfi")                 # this should be used only on GPUs, will crash otherwise
+    process.load("CalibTracker.SiPixelESProducers.siPixelROCsStatusAndMappingWrapperESProducer_cfi")    # this should be used only on GPUs, will crash otherwise
+    process.load("RecoLocalTracker.SiPixelRecHits.PixelCPEFastESProducer_cfi")
+
+
+    # Modules and EDAliases
+
+    # referenced in HLTDoLocalPixelTask
+
+    # transfer the beamspot to the gpu
+    from RecoVertex.BeamSpotProducer.offlineBeamSpotToCUDA_cfi import offlineBeamSpotToCUDA as _offlineBeamSpotToCUDA
+    process.hltOnlineBeamSpotToCUDA = _offlineBeamSpotToCUDA.clone(
+        src = "hltOnlineBeamSpot"
+    )
+
+    # reconstruct the pixel digis and clusters on the gpu
+    from RecoLocalTracker.SiPixelClusterizer.siPixelRawToClusterCUDA_cfi import siPixelRawToClusterCUDA as _siPixelRawToClusterCUDA
+    process.hltSiPixelClustersCUDA = _siPixelRawToClusterCUDA.clone()
+    # use the pixel channel calibrations scheme for Run 3
+    run3_common.toModify(process.hltSiPixelClustersCUDA, isRun2 = False)
+
+    # copy the pixel digis errors to the host
+    from EventFilter.SiPixelRawToDigi.siPixelDigiErrorsSoAFromCUDA_cfi import siPixelDigiErrorsSoAFromCUDA as _siPixelDigiErrorsSoAFromCUDA
+    process.hltSiPixelDigiErrorsSoA = _siPixelDigiErrorsSoAFromCUDA.clone(
+        src = "hltSiPixelClustersCUDA"
+    )
+
+    # convert the pixel digis errors to the legacy format
+    from EventFilter.SiPixelRawToDigi.siPixelDigiErrorsFromSoA_cfi import siPixelDigiErrorsFromSoA as _siPixelDigiErrorsFromSoA
+    process.hltSiPixelDigiErrors = _siPixelDigiErrorsFromSoA.clone(
+        digiErrorSoASrc = "hltSiPixelDigiErrorsSoA",
+        UsePhase1 = True
+    )
+
+    # copy the pixel digis (except errors) and clusters to the host
+    from EventFilter.SiPixelRawToDigi.siPixelDigisSoAFromCUDA_cfi import siPixelDigisSoAFromCUDA as _siPixelDigisSoAFromCUDA
+    process.hltSiPixelDigisSoA = _siPixelDigisSoAFromCUDA.clone(
+        src = "hltSiPixelClustersCUDA"
+    )
+
+    # convert the pixel digis (except errors) and clusters to the legacy format
+    from RecoLocalTracker.SiPixelClusterizer.siPixelDigisClustersFromSoA_cfi import siPixelDigisClustersFromSoA as _siPixelDigisClustersFromSoA
+    process.hltSiPixelDigisClusters = _siPixelDigisClustersFromSoA.clone(
+        src = "hltSiPixelDigisSoA"
+    )
+
+    # SwitchProducer wrapping the legacy pixel digis producer or an alias combining the pixel digis information converted from SoA
+    process.hltSiPixelDigis = SwitchProducerCUDA(
+        # legacy producer
+        cpu = process.hltSiPixelDigis,
+        # alias used to access products from multiple conversion modules
+        cuda = cms.EDAlias(
+            hltSiPixelDigisClusters = cms.VPSet(
+                cms.PSet(type = cms.string("PixelDigiedmDetSetVector"))
+            ),
+            hltSiPixelDigiErrors = cms.VPSet(
+                cms.PSet(type = cms.string("DetIdedmEDCollection")),
+                cms.PSet(type = cms.string("SiPixelRawDataErroredmDetSetVector")),
+                cms.PSet(type = cms.string("PixelFEDChanneledmNewDetSetVector"))
+            )
+        )
+    )
+
+    # SwitchProducer wrapping the legacy pixel cluster producer or an alias for the pixel clusters information converted from SoA
+    process.hltSiPixelClusters = SwitchProducerCUDA(
+        # legacy producer
+        cpu = process.hltSiPixelClusters,
+        # alias used to access the clusters from the single SoA-to-legacy conversion module
+        cuda = cms.EDAlias(
+            hltSiPixelDigisClusters = cms.VPSet(
+                cms.PSet(type = cms.string("SiPixelClusteredmNewDetSetVector"))
+            )
+        )
+    )
+
+    # reconstruct the pixel rechits on the gpu
+    from RecoLocalTracker.SiPixelRecHits.siPixelRecHitCUDA_cfi import siPixelRecHitCUDA as _siPixelRecHitCUDA
+    process.hltSiPixelRecHitsCUDA = _siPixelRecHitCUDA.clone(
+        src = "hltSiPixelClustersCUDA",
+        beamSpot = "hltOnlineBeamSpotToCUDA"
+    )
+
+    # SwitchProducer wrapping the legacy pixel rechit producer or the transfer of the pixel rechits to the host and the conversion from SoA
+    from RecoLocalTracker.SiPixelRecHits.siPixelRecHitFromCUDA_cfi import siPixelRecHitFromCUDA as _siPixelRecHitFromCUDA
+    process.hltSiPixelRecHits = SwitchProducerCUDA(
+        # legacy producer
+        cpu = process.hltSiPixelRecHits,
+        # converter to legacy format
+        cuda = _siPixelRecHitFromCUDA.clone(
+            pixelRecHitSrc = "hltSiPixelRecHitsCUDA",
+            src = "hltSiPixelClusters"
+        )
+    )
+
+
+    # Tasks and Sequences
+
+    process.HLTDoLocalPixelTask = cms.Task(
+          process.hltOnlineBeamSpotToCUDA,                  # transfer the beamspot to the gpu
+          process.hltSiPixelClustersCUDA,                   # reconstruct the pixel digis and clusters on the gpu
+          process.hltSiPixelRecHitsCUDA,                    # reconstruct the pixel rechits on the gpu
+          process.hltSiPixelDigisSoA,                       # copy the pixel digis (except errors) and clusters to the host
+          process.hltSiPixelDigisClusters,                  # convert the pixel digis (except errors) and clusters to the legacy format
+          process.hltSiPixelDigiErrorsSoA,                  # copy the pixel digis errors to the host
+          process.hltSiPixelDigiErrors,                     # convert the pixel digis errors to the legacy format
+          process.hltSiPixelDigis,                          # SwitchProducer wrapping the legacy pixel digis producer or an alias combining the pixel digis information converted from SoA
+          process.hltSiPixelClusters,                       # SwitchProducer wrapping the legacy pixel cluster producer or an alias for the pixel clusters information converted from SoA
+          process.hltSiPixelClustersCache,                  # legacy module, used by the legacy pixel quadruplet producer
+          process.hltSiPixelRecHits)                        # SwitchProducer wrapping the legacy pixel rechit producer or the transfer of the pixel rechits to the host and the conversion from SoA
+
+    process.HLTDoLocalPixelSequence = cms.Sequence(process.HLTDoLocalPixelTask)
+
+
+    # done
+    return process
+
+
+# customisation for running the "Patatrack" pixel track and pixel vertex reconstruction; returns the process untouched if it has no HLTRecoPixelTracksSequence
+def customisePixelTrackReconstruction(process):
+
+    if not 'HLTRecoPixelTracksSequence' in process.__dict__:
+        return process
+
+
+    # FIXME replace the Sequences with empty ones to avoid expanding them during the (re)definition of Modules and EDAliases
+
+    process.HLTRecoPixelTracksSequence = cms.Sequence()
+    process.HLTRecopixelvertexingSequence = cms.Sequence()
+
+
+    # Modules and EDAliases
+
+    # referenced in process.HLTRecoPixelTracksTask
+
+    # cpu only: convert the pixel rechits from legacy to SoA format
+    from RecoLocalTracker.SiPixelRecHits.siPixelRecHitSoAFromLegacy_cfi import siPixelRecHitSoAFromLegacy as _siPixelRecHitSoAFromLegacy
+    process.hltSiPixelRecHitSoA = _siPixelRecHitSoAFromLegacy.clone(
+        src = "hltSiPixelClusters",
+        beamSpot = "hltOnlineBeamSpot",
+        convertToLegacy = True
+    )
+
+    # build pixel ntuplets and pixel tracks in SoA format on gpu
+    from RecoPixelVertexing.PixelTriplets.caHitNtupletCUDA_cfi import caHitNtupletCUDA as _caHitNtupletCUDA
+    process.hltPixelTracksCUDA = _caHitNtupletCUDA.clone(
+        idealConditions = False,
+        pixelRecHitSrc = "hltSiPixelRecHitsCUDA",
+        onGPU = True
+    )
+    # use quality cuts tuned for Run 2 ideal conditions for all Run 3 workflows
+    run3_common.toModify(process.hltPixelTracksCUDA, idealConditions = True)
+
+    # SwitchProducer providing the pixel tracks in SoA format on cpu
+    process.hltPixelTracksSoA = SwitchProducerCUDA(
+        # build pixel ntuplets and pixel tracks in SoA format on cpu
+        cpu = _caHitNtupletCUDA.clone(
+            idealConditions = False,
+            pixelRecHitSrc = "hltSiPixelRecHitSoA",
+            onGPU = False
+        ),
+        # transfer the pixel tracks in SoA format to the host
+        cuda = cms.EDProducer("PixelTrackSoAFromCUDA",
+            src = cms.InputTag("hltPixelTracksCUDA")
+        )
+    )
+    # use quality cuts tuned for Run 2 ideal conditions for all Run 3 workflows
+    run3_common.toModify(process.hltPixelTracksSoA.cpu, idealConditions = True)
+
+    # convert the pixel tracks from SoA to legacy format
+    from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromSoA_cfi import pixelTrackProducerFromSoA as _pixelTrackProducerFromSoA
+    process.hltPixelTracks = _pixelTrackProducerFromSoA.clone(
+        beamSpot = "hltOnlineBeamSpot",
+        pixelRecHitLegacySrc = "hltSiPixelRecHits",
+        trackSrc = "hltPixelTracksSoA"
+    )
+
+
+    # referenced in process.HLTRecopixelvertexingTask
+
+    # build pixel vertices in SoA format on gpu
+    from RecoPixelVertexing.PixelVertexFinding.pixelVertexCUDA_cfi import pixelVertexCUDA as _pixelVertexCUDA
+    process.hltPixelVerticesCUDA = _pixelVertexCUDA.clone(
+        pixelTrackSrc = "hltPixelTracksCUDA",
+        onGPU = True
+    )
+
+    # build or transfer pixel vertices in SoA format on cpu
+    process.hltPixelVerticesSoA = SwitchProducerCUDA(
+        # build pixel vertices in SoA format on cpu
+        cpu = _pixelVertexCUDA.clone(
+            pixelTrackSrc = "hltPixelTracksSoA",
+            onGPU = False
+        ),
+        # transfer the pixel vertices in SoA format to cpu
+        cuda = cms.EDProducer("PixelVertexSoAFromCUDA",
+            src = cms.InputTag("hltPixelVerticesCUDA")
+        )
+    )
+
+    # convert the pixel vertices from SoA to legacy format
+    from RecoPixelVertexing.PixelVertexFinding.pixelVertexFromSoA_cfi import pixelVertexFromSoA as _pixelVertexFromSoA
+    process.hltPixelVertices = _pixelVertexFromSoA.clone(
+        src = "hltPixelVerticesSoA",
+        TrackCollection = "hltPixelTracks",
+        beamSpot = "hltOnlineBeamSpot"
+    )
+
+
+    # Tasks and Sequences
+
+    process.HLTRecoPixelTracksTask = cms.Task(
+          process.hltPixelTracksTrackingRegions,            # from the original sequence
+          process.hltSiPixelRecHitSoA,                      # pixel rechits on cpu, converted to SoA
+          process.hltPixelTracksCUDA,                       # pixel ntuplets on gpu, in SoA format
+          process.hltPixelTracksSoA,                        # pixel ntuplets on cpu, in SoA format
+          process.hltPixelTracks)                           # pixel tracks on cpu, in legacy format
+
+
+    process.HLTRecoPixelTracksSequence = cms.Sequence(process.HLTRecoPixelTracksTask)
+
+    process.HLTRecopixelvertexingTask = cms.Task(
+          process.HLTRecoPixelTracksTask,
+          process.hltPixelVerticesCUDA,                     # pixel vertices on gpu, in SoA format
+          process.hltPixelVerticesSoA,                      # pixel vertices on cpu, in SoA format
+          process.hltPixelVertices,                         # pixel vertices on cpu, in legacy format
+          process.hltTrimmedPixelVertices)                  # from the original sequence
+
+    process.HLTRecopixelvertexingSequence = cms.Sequence(
+          process.hltPixelTracksFitter +                    # not used here, kept for compatibility with legacy sequences
+          process.hltPixelTracksFilter,                     # not used here, kept for compatibility with legacy sequences
+          process.HLTRecopixelvertexingTask)
+
+
+    # done
+    return process
+
+
+# customisation for offloading the ECAL local reconstruction via CUDA if a supported gpu is present; returns the process untouched if it has no HLTDoFullUnpackingEgammaEcalSequence
+def customiseEcalLocalReconstruction(process):
+
+    if not 'HLTDoFullUnpackingEgammaEcalSequence' in process.__dict__:
+        return process
+
+
+    # FIXME replace the Sequences with empty ones to avoid expanding them during the (re)definition of Modules and EDAliases
+
+    process.HLTDoFullUnpackingEgammaEcalMFSequence = cms.Sequence()
+    process.HLTDoFullUnpackingEgammaEcalWithoutPreshowerSequence = cms.Sequence()
+    process.HLTDoFullUnpackingEgammaEcalSequence = cms.Sequence()
+
+
+    # Event Setup
+
+    process.load("EventFilter.EcalRawToDigi.ecalElectronicsMappingGPUESProducer_cfi")
+    process.load("RecoLocalCalo.EcalRecProducers.ecalGainRatiosGPUESProducer_cfi")
+    process.load("RecoLocalCalo.EcalRecProducers.ecalPedestalsGPUESProducer_cfi")
+    process.load("RecoLocalCalo.EcalRecProducers.ecalPulseCovariancesGPUESProducer_cfi")
+    process.load("RecoLocalCalo.EcalRecProducers.ecalPulseShapesGPUESProducer_cfi")
+    process.load("RecoLocalCalo.EcalRecProducers.ecalSamplesCorrelationGPUESProducer_cfi")
+    process.load("RecoLocalCalo.EcalRecProducers.ecalTimeBiasCorrectionsGPUESProducer_cfi")
+    process.load("RecoLocalCalo.EcalRecProducers.ecalTimeCalibConstantsGPUESProducer_cfi")
+    process.load("RecoLocalCalo.EcalRecProducers.ecalMultifitParametersGPUESProducer_cfi")
+
+    process.load("RecoLocalCalo.EcalRecProducers.ecalRechitADCToGeVConstantGPUESProducer_cfi")
+    process.load("RecoLocalCalo.EcalRecProducers.ecalRechitChannelStatusGPUESProducer_cfi")
+    process.load("RecoLocalCalo.EcalRecProducers.ecalIntercalibConstantsGPUESProducer_cfi")
+    process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosGPUESProducer_cfi")
+    process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosRefGPUESProducer_cfi")
+    process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAlphasGPUESProducer_cfi")
+    process.load("RecoLocalCalo.EcalRecProducers.ecalLinearCorrectionsGPUESProducer_cfi")
+    process.load("RecoLocalCalo.EcalRecProducers.ecalRecHitParametersGPUESProducer_cfi")
+
+
+    # Modules and EDAliases
+
+    # ECAL unpacker running on gpu
+    process.hltEcalDigisGPU = cms.EDProducer("EcalRawToDigiGPU",
+        InputLabel = cms.InputTag("rawDataCollector"),
+        FEDs = cms.vint32(
+            601, 602, 603, 604, 605,
+            606, 607, 608, 609, 610,
+            611, 612, 613, 614, 615,
+            616, 617, 618, 619, 620,
+            621, 622, 623, 624, 625,
+            626, 627, 628, 629, 630,
+            631, 632, 633, 634, 635,
+            636, 637, 638, 639, 640,
+            641, 642, 643, 644, 645,
+            646, 647, 648, 649, 650,
+            651, 652, 653, 654
+        ),
+        digisLabelEB = cms.string("ebDigis"),
+        digisLabelEE = cms.string("eeDigis"),
+        maxChannelsEB = cms.uint32(61200),
+        maxChannelsEE = cms.uint32(14648),
+    )
+
+    # SwitchProducer wrapping the legacy ECAL unpacker or the ECAL digi converter from SoA format on gpu to legacy format on cpu
+    process.hltEcalDigisLegacy = process.hltEcalDigis.clone()
+
+    process.hltEcalDigis = SwitchProducerCUDA(
+        # alias to the products of the legacy producer, cloned above as hltEcalDigisLegacy
+        cpu = cms.EDAlias(
+            hltEcalDigisLegacy = cms.VPSet(
+                cms.PSet(type = cms.string("EBDigiCollection")),
+                cms.PSet(type = cms.string("EEDigiCollection")),
+                cms.PSet(type = cms.string("EBDetIdedmEDCollection")),
+                cms.PSet(type = cms.string("EEDetIdedmEDCollection")),
+                cms.PSet(type = cms.string("EBSrFlagsSorted")),
+                cms.PSet(type = cms.string("EESrFlagsSorted")),
+                cms.PSet(type = cms.string("EcalElectronicsIdedmEDCollection"), fromProductInstance = cms.string("EcalIntegrityBlockSizeErrors")),
+                cms.PSet(type = cms.string("EcalElectronicsIdedmEDCollection"), fromProductInstance = cms.string("EcalIntegrityTTIdErrors"))
+            )
+        ),
+        # convert ECAL digis from SoA format on gpu to legacy format on cpu
+        cuda = cms.EDProducer("EcalCPUDigisProducer",
+            digisInLabelEB = cms.InputTag("hltEcalDigisGPU", "ebDigis"),
+            digisInLabelEE = cms.InputTag("hltEcalDigisGPU", "eeDigis"),
+            digisOutLabelEB = cms.string("ebDigis"),
+            digisOutLabelEE = cms.string("eeDigis"),
+            produceDummyIntegrityCollections = cms.bool(True)
+        )
+    )
+
+    # ECAL multifit running on gpu
+    from RecoLocalCalo.EcalRecProducers.ecalUncalibRecHitProducerGPU_cfi import ecalUncalibRecHitProducerGPU as _ecalUncalibRecHitProducerGPU
+    process.hltEcalUncalibRecHitGPU = _ecalUncalibRecHitProducerGPU.clone(
+        digisLabelEB = ("hltEcalDigisGPU", "ebDigis"),
+        digisLabelEE = ("hltEcalDigisGPU", "eeDigis"),
+        shouldRunTimingComputation = False
+    )
+
+    # copy the ECAL uncalibrated rechits from gpu to cpu in SoA format
+    process.hltEcalUncalibRecHitSoA = cms.EDProducer("EcalCPUUncalibRecHitProducer",
+        containsTimingInformation = cms.bool(False),
+        recHitsInLabelEB = cms.InputTag("hltEcalUncalibRecHitGPU", "EcalUncalibRecHitsEB"),
+        recHitsInLabelEE = cms.InputTag("hltEcalUncalibRecHitGPU", "EcalUncalibRecHitsEE"),
+        recHitsOutLabelEB = cms.string("EcalUncalibRecHitsEB"),
+        recHitsOutLabelEE = cms.string("EcalUncalibRecHitsEE")
+    )
+
+    # SwitchProducer wrapping the legacy ECAL uncalibrated rechits producer or a converter from SoA to legacy format
+    process.hltEcalUncalibRecHit = SwitchProducerCUDA(
+        # legacy producer
+        cpu = process.hltEcalUncalibRecHit,
+        # convert the ECAL uncalibrated rechits from SoA to legacy format
+        cuda = cms.EDProducer("EcalUncalibRecHitConvertGPU2CPUFormat",
+            recHitsLabelGPUEB = cms.InputTag("hltEcalUncalibRecHitSoA", "EcalUncalibRecHitsEB"),
+            recHitsLabelGPUEE = cms.InputTag("hltEcalUncalibRecHitSoA", "EcalUncalibRecHitsEE"),
+            recHitsLabelCPUEB = cms.string("EcalUncalibRecHitsEB"),
+            recHitsLabelCPUEE = cms.string("EcalUncalibRecHitsEE")
+        )
+    )
+
+    # Reconstructing the ECAL calibrated rechits on gpu works, but is extremely slow.
+    # Disable it for the time being, until the performance has been addressed.
+    """
+    process.hltEcalRecHitGPU = cms.EDProducer("EcalRecHitProducerGPU",
+        uncalibrecHitsInLabelEB = cms.InputTag("hltEcalUncalibRecHitGPU","EcalUncalibRecHitsEB"),
+        uncalibrecHitsInLabelEE = cms.InputTag("hltEcalUncalibRecHitGPU","EcalUncalibRecHitsEE"),
+        recHitsLabelEB = cms.string("EcalRecHitsEB"),
+        recHitsLabelEE = cms.string("EcalRecHitsEE"),
+        maxNumberHitsEB = cms.uint32(61200),
+        maxNumberHitsEE = cms.uint32(14648),
+        ChannelStatusToBeExcluded = cms.vstring(
+            "kDAC",
+            "kNoisy",
+            "kNNoisy",
+            "kFixedG6",
+            "kFixedG1",
+            "kFixedG0",
+            "kNonRespondingIsolated",
+            "kDeadVFE",
+            "kDeadFE",
+            "kNoDataNoTP"),
+        killDeadChannels = cms.bool(True),
+        EBLaserMIN = cms.double(0.01),
+        EELaserMIN = cms.double(0.01),
+        EBLaserMAX = cms.double(30.0),
+        EELaserMAX = cms.double(30.0),
+        flagsMapDBReco = cms.PSet(
+            kGood = cms.vstring("kOk","kDAC","kNoLaser","kNoisy"),
+            kNoisy = cms.vstring("kNNoisy","kFixedG6","kFixedG1"),
+            kNeighboursRecovered = cms.vstring("kFixedG0", "kNonRespondingIsolated", "kDeadVFE"),
+            kTowerRecovered = cms.vstring("kDeadFE"),
+            kDead = cms.vstring("kNoDataNoTP")
+        ),
+        recoverEBIsolatedChannels = cms.bool(False),
+        recoverEEIsolatedChannels = cms.bool(False),
+        recoverEBVFE = cms.bool(False),
+        recoverEEVFE = cms.bool(False),
+        recoverEBFE = cms.bool(True),
+        recoverEEFE = cms.bool(True),
+    )
+
+    process.hltEcalRecHitSoA = cms.EDProducer("EcalCPURecHitProducer",
+        recHitsInLabelEB = cms.InputTag("hltEcalRecHitGPU", "EcalRecHitsEB"),
+        recHitsInLabelEE = cms.InputTag("hltEcalRecHitGPU", "EcalRecHitsEE"),
+        recHitsOutLabelEB = cms.string("EcalRecHitsEB"),
+        recHitsOutLabelEE = cms.string("EcalRecHitsEE"),
+        containsTimingInformation = cms.bool(False),
+    )
+
+    # SwitchProducer wrapping the legacy ECAL calibrated rechits producer or a converter from SoA to legacy format
+    process.hltEcalRecHit = SwitchProducerCUDA(
+        # legacy producer
+        cpu = process.hltEcalRecHit,
+        # convert the ECAL calibrated rechits from SoA to legacy format
+        cuda = cms.EDProducer("EcalRecHitConvertGPU2CPUFormat",
+            recHitsLabelGPUEB = cms.InputTag("hltEcalRecHitSoA", "EcalRecHitsEB"),
+            recHitsLabelGPUEE = cms.InputTag("hltEcalRecHitSoA", "EcalRecHitsEE"),
+            recHitsLabelCPUEB = cms.string("EcalRecHitsEB"),
+            recHitsLabelCPUEE = cms.string("EcalRecHitsEE"),
+        )
+    """
+
+    
+    # SwitchProducer wrapping the legacy ECAL rechits producer
+    # the gpu unpacker does not produce the TPs used for the recovery, so the SwitchProducer alias does not provide them:
+    #   - the cpu uncalibrated rechit producer may mark them for recovery, read the TPs explicitly from the legacy unpacker
+    #   - the gpu uncalibrated rechit producer does not flag them for recovery, so the TPs are not necessary
+    process.hltEcalRecHit = SwitchProducerCUDA(
+        cpu = process.hltEcalRecHit.clone(
+            triggerPrimitiveDigiCollection = cms.InputTag('hltEcalDigisLegacy', 'EcalTriggerPrimitives')
+        ),
+        cuda = process.hltEcalRecHit.clone(
+            triggerPrimitiveDigiCollection = cms.InputTag('unused')
+        )
+    )
+
+    # Tasks and Sequences
+
+    process.HLTDoFullUnpackingEgammaEcalWithoutPreshowerTask = cms.Task(
+        process.hltEcalDigisGPU,                            # unpack ECAL digis on gpu
+        process.hltEcalDigisLegacy,                         # legacy producer, referenced in the SwitchProducer
+        process.hltEcalDigis,                               # SwitchProducer
+        process.hltEcalUncalibRecHitGPU,                    # run ECAL local reconstruction and multifit on gpu
+        process.hltEcalUncalibRecHitSoA,                    # needed by hltEcalPhiSymFilter - copy to host
+        process.hltEcalUncalibRecHit,                       # needed by hltEcalPhiSymFilter - convert to legacy format
+      # process.hltEcalRecHitGPU,                           # make ECAL calibrated rechits on gpu
+      # process.hltEcalRecHitSoA,                           # copy to host
+        process.hltEcalDetIdToBeRecovered,                  # legacy producer
+        process.hltEcalRecHit)                              # legacy producer
+
+    process.HLTDoFullUnpackingEgammaEcalWithoutPreshowerSequence = cms.Sequence(
+        process.HLTDoFullUnpackingEgammaEcalWithoutPreshowerTask)
+
+    process.HLTPreshowerTask = cms.Task(
+        process.hltEcalPreshowerDigis,                      # unpack ECAL preshower digis on the host
+        process.hltEcalPreshowerRecHit)                     # build ECAL preshower rechits on the host
+
+    process.HLTPreshowerSequence = cms.Sequence(process.HLTPreshowerTask)
+
+    process.HLTDoFullUnpackingEgammaEcalTask = cms.Task(
+        process.HLTDoFullUnpackingEgammaEcalWithoutPreshowerTask,
+        process.HLTPreshowerTask)
+
+    process.HLTDoFullUnpackingEgammaEcalSequence = cms.Sequence(
+        process.HLTDoFullUnpackingEgammaEcalTask)
+
+    process.HLTDoFullUnpackingEgammaEcalMFSequence = cms.Sequence(
+        process.HLTDoFullUnpackingEgammaEcalTask)
+
+
+    # done
+    return process
+
+# customisation for offloading the HCAL local reconstruction via CUDA if a supported gpu is present
+def customiseHcalLocalReconstruction(process):
+
+    if not 'HLTDoLocalHcalSequence' in process.__dict__:
+        return process
+
+
+    # FIXME replace the Sequences with empty ones to avoid expanding them during the (re)definition of Modules and EDAliases
+
+    process.HLTDoLocalHcalSequence = cms.Sequence()
+    process.HLTStoppedHSCPLocalHcalReco = cms.Sequence()
+
+
+    # Event Setup
+
+    process.load("EventFilter.HcalRawToDigi.hcalElectronicsMappingGPUESProducer_cfi")
+
+    process.load("RecoLocalCalo.HcalRecProducers.hcalGainsGPUESProducer_cfi")
+    process.load("RecoLocalCalo.HcalRecProducers.hcalGainWidthsGPUESProducer_cfi")
+    process.load("RecoLocalCalo.HcalRecProducers.hcalLUTCorrsGPUESProducer_cfi")
+    process.load("RecoLocalCalo.HcalRecProducers.hcalConvertedPedestalsGPUESProducer_cfi")
+    process.load("RecoLocalCalo.HcalRecProducers.hcalConvertedEffectivePedestalsGPUESProducer_cfi")
+    process.hcalConvertedEffectivePedestalsGPUESProducer.label0 = "withTopoEff"
+    process.load("RecoLocalCalo.HcalRecProducers.hcalConvertedPedestalWidthsGPUESProducer_cfi")
+    process.load("RecoLocalCalo.HcalRecProducers.hcalConvertedEffectivePedestalWidthsGPUESProducer_cfi")
+    process.hcalConvertedEffectivePedestalWidthsGPUESProducer.label0 = "withTopoEff"
+    process.hcalConvertedEffectivePedestalWidthsGPUESProducer.label1 = "withTopoEff"
+    process.load("RecoLocalCalo.HcalRecProducers.hcalQIECodersGPUESProducer_cfi")
+    process.load("RecoLocalCalo.HcalRecProducers.hcalRecoParamsWithPulseShapesGPUESProducer_cfi")
+    process.load("RecoLocalCalo.HcalRecProducers.hcalRespCorrsGPUESProducer_cfi")
+    process.load("RecoLocalCalo.HcalRecProducers.hcalTimeCorrsGPUESProducer_cfi")
+    process.load("RecoLocalCalo.HcalRecProducers.hcalQIETypesGPUESProducer_cfi")
+    process.load("RecoLocalCalo.HcalRecProducers.hcalSiPMParametersGPUESProducer_cfi")
+    process.load("RecoLocalCalo.HcalRecProducers.hcalSiPMCharacteristicsGPUESProducer_cfi")
+    process.load("RecoLocalCalo.HcalRecProducers.hcalMahiPulseOffsetsGPUESProducer_cfi")
+
+
+    # Modules and EDAliases
+
+    # The HCAL unpacker running on the gpu supports only the HB and HE digis.
+    # So, run the legacy unacker on the cpu, then convert the HB and HE digis
+    # to SoA format and copy them to the gpu.
+    process.hltHcalDigisGPU = cms.EDProducer("HcalDigisProducerGPU",
+        hbheDigisLabel = cms.InputTag("hltHcalDigis"),
+        qie11DigiLabel = cms.InputTag("hltHcalDigis"),
+        digisLabelF01HE = cms.string(""),
+        digisLabelF5HB = cms.string(""),
+        digisLabelF3HB = cms.string(""),
+        maxChannelsF01HE = cms.uint32(10000),
+        maxChannelsF5HB = cms.uint32(10000),
+        maxChannelsF3HB = cms.uint32(10000)
+    )
+
+    # run the HCAL local reconstruction (including Method 0 and MAHI) on gpu
+    from RecoLocalCalo.HcalRecProducers.hbheRecHitProducerGPU_cfi import hbheRecHitProducerGPU as _hbheRecHitProducerGPU
+    process.hltHbherecoGPU = _hbheRecHitProducerGPU.clone(
+        digisLabelF01HE = "hltHcalDigisGPU",
+        digisLabelF5HB = "hltHcalDigisGPU",
+        digisLabelF3HB = "hltHcalDigisGPU",
+        recHitsLabelM0HBHE = ""
+    )
+
+    # transfer the HCAL rechits to the cpu, and convert them to the legacy format
+    from RecoLocalCalo.HcalRecProducers.hcalCPURecHitsProducer_cfi import hcalCPURecHitsProducer as _hcalCPURecHitsProducer
+    process.hltHbherecoFromGPU = _hcalCPURecHitsProducer.clone(
+        recHitsM0LabelIn = "hltHbherecoGPU",
+        recHitsM0LabelOut = "",
+        recHitsLegacyLabelOut = ""
+    )
+
+    # SwitchProducer between the legacy producer and the copy from gpu with conversion
+    process.hltHbhereco = SwitchProducerCUDA(
+        # legacy producer
+        cpu = process.hltHbhereco.clone(),
+        # alias to the rechits converted to legacy format
+        cuda = cms.EDAlias(
+            hltHbherecoFromGPU = cms.VPSet(
+                cms.PSet(type = cms.string("HBHERecHitsSorted"))
+            )
+        )
+    )
+
+
+    # Tasks and Sequences
+
+    process.HLTDoLocalHcalTask = cms.Task(
+        process.hltHcalDigis,                               # legacy producer, unpack HCAL digis on cpu
+        process.hltHcalDigisGPU,                            # copy to gpu and convert to SoA format
+        process.hltHbherecoGPU,                             # run the HCAL local reconstruction (including Method 0 and MAHI) on gpu
+        process.hltHbherecoFromGPU,                         # transfer the HCAL rechits to the cpu, and convert them to the legacy format
+        process.hltHbhereco,                                # SwitchProducer between the legacy producer and the copy from gpu with conversion
+        process.hltHfprereco,                               # legacy producer
+        process.hltHfreco,                                  # legacy producer
+        process.hltHoreco)                                  # legacy producer
+
+    process.HLTDoLocalHcalSequence = cms.Sequence(
+        process.HLTDoLocalHcalTask)
+
+    process.HLTStoppedHSCPLocalHcalRecoTask = cms.Task(
+        process.hltHcalDigis,                               # legacy producer, unpack HCAL digis on cpu
+        process.hltHcalDigisGPU,                            # copy to gpu and convert to SoA format
+        process.hltHbherecoGPU,                             # run the HCAL local reconstruction (including Method 0 and MAHI) on gpu
+        process.hltHbherecoFromGPU,                         # transfer the HCAL rechits to the cpu, and convert them to the legacy format
+        process.hltHbhereco)                                # SwitchProducer between the legacy producer and the copy from gpu with conversion
+
+    process.HLTStoppedHSCPLocalHcalReco = cms.Sequence(
+        process.HLTStoppedHSCPLocalHcalRecoTask)
+
+
+    # done
+    return process
+
+
+# customisation to enable pixel triplets instead of quadruplets
+def enablePatatrackPixelTriplets(process):
+
+  # configure GPU pixel tracks for triplets
+  process.hltPixelTracksCUDA.minHitsPerNtuplet = 3
+  process.hltPixelTracksCUDA.includeJumpingForwardDoublets = True
+
+  # configure CPU pixel tracks for triplets
+  process.hltPixelTracksSoA.cpu.minHitsPerNtuplet = 3
+  process.hltPixelTracksSoA.cpu.includeJumpingForwardDoublets = True
+
+  # done
+  return process
+
+
+# customisation for running the Patatrack reconstruction, with automatic offload via CUDA when a supported gpu is available
+def customizeHLTforPatatrack(process):
+    process = customiseCommon(process)
+    process = customisePixelLocalReconstruction(process)
+    process = customisePixelTrackReconstruction(process)
+    process = customiseEcalLocalReconstruction(process)
+    process = customiseHcalLocalReconstruction(process)
+    return process
+
+
+# customisation for running the Patatrack triplets reconstruction, with automatic offload via CUDA when a supported gpu is available
+def customizeHLTforPatatrackTriplets(process):
+    process = customiseCommon(process)
+    process = customisePixelLocalReconstruction(process)
+    process = customisePixelTrackReconstruction(process)
+    process = customiseEcalLocalReconstruction(process)
+    process = customiseHcalLocalReconstruction(process)
+    process = enablePatatrackPixelTriplets(process)
+    return process
+
+
+def _addConsumerPath(process):
+    # add to a path all consumers and the tasks that define the producers
+    process.Consumer = cms.Path(
+        process.HLTBeginSequence +
+        process.hltPixelConsumer +
+        process.hltEcalConsumer +
+        process.hltHbheConsumer,
+        process.HLTDoLocalPixelTask,
+        process.HLTRecoPixelTracksTask,
+        process.HLTRecopixelvertexingTask,
+        process.HLTDoFullUnpackingEgammaEcalTask,
+        process.HLTDoLocalHcalTask,
+    )
+
+    if 'HLTSchedule' in process.__dict__:
+        process.HLTSchedule.append(process.Consumer)
+    if process.schedule is not None:
+        process.schedule.append(process.Consumer)
+
+    # done
+    return process
+
+
+def consumeGPUSoAProducts(process):
+    # consume the Pixel tracks and vertices on the GPU in SoA format
+    process.hltPixelConsumer = cms.EDAnalyzer("GenericConsumer",
+        eventProducts = cms.untracked.vstring( 'hltPixelTracksCUDA', 'hltPixelVerticesCUDA' )
+    )
+
+    # consume the ECAL uncalibrated rechits on the GPU in SoA format
+    process.hltEcalConsumer = cms.EDAnalyzer("GenericConsumer",
+        eventProducts = cms.untracked.vstring( 'hltEcalUncalibRecHitGPU' )
+    )
+
+    # consume the HCAL rechits on the GPU in SoA format
+    process.hltHbheConsumer = cms.EDAnalyzer("GenericConsumer",
+        eventProducts = cms.untracked.vstring( 'hltHbherecoGPU' )
+    )
+
+    # add to a path all consumers and the tasks that define the producers
+    process = _addConsumerPath(process)
+
+    # done
+    return process
+
+
+def consumeCPUSoAProducts(process):
+    # consume the Pixel tracks and vertices on the CPU in SoA format
+    process.hltPixelConsumer = cms.EDAnalyzer("GenericConsumer",
+        eventProducts = cms.untracked.vstring( 'hltPixelTracksSoA', 'hltPixelVerticesSoA' )
+    )
+
+    # consume the ECAL uncalibrated rechits on the CPU in SoA format
+    process.hltEcalConsumer = cms.EDAnalyzer("GenericConsumer",
+        eventProducts = cms.untracked.vstring( 'hltEcalUncalibRecHitSoA' )
+    )
+
+    # consume the HCAL rechits on the CPU in legacy format
+    process.hltHbheConsumer = cms.EDAnalyzer("GenericConsumer",
+        eventProducts = cms.untracked.vstring( 'hltHbhereco' )
+    )
+
+    # add to a path all consumers and the tasks that define the producers
+    process = _addConsumerPath(process)
+
+    # done
+    return process
+
+def consumeCPULegacyProducts(process):
+    # consume the Pixel tracks and vertices on the CPU in legacy format
+    process.hltPixelConsumer = cms.EDAnalyzer("GenericConsumer",
+        eventProducts = cms.untracked.vstring( 'hltPixelTracks', 'hltPixelVertices' )
+    )
+
+    # consume the ECAL runcalibrated echits on the CPU in legacy format
+    process.hltEcalConsumer = cms.EDAnalyzer("GenericConsumer",
+        eventProducts = cms.untracked.vstring( 'hltEcalUncalibRecHit' )
+    )
+
+    # consume the HCAL rechits on the CPU in legacy format
+    process.hltHbheConsumer = cms.EDAnalyzer("GenericConsumer",
+        eventProducts = cms.untracked.vstring( 'hltHbhereco' )
+    )
+
+    # add to a path all consumers and the tasks that define the producers
+    process = _addConsumerPath(process)
+
+    # done
+    return process
diff --git a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
index 34ee6fadb04de..424ac13a43627 100644
--- a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
+++ b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
@@ -4,7 +4,21 @@
 #
 # for STARTUP ONLY use try and use Offline 3D PV from pixelTracks, with adaptive vertex
 #
-#from RecoPixelVertexing.PixelVertexFinding.PixelVertexes_cff import *
-from RecoVertex.PrimaryVertexProducer.OfflinePixel3DPrimaryVertices_cfi import *
+from RecoPixelVertexing.PixelVertexFinding.PixelVertexes_cff import *
+#from RecoVertex.PrimaryVertexProducer.OfflinePixel3DPrimaryVertices_cfi import *
 recopixelvertexingTask = cms.Task(pixelTracksTask,pixelVertices)
 recopixelvertexing = cms.Sequence(recopixelvertexingTask)
+
+from Configuration.ProcessModifiers.gpu_cff import gpu
+
+from RecoPixelVertexing.PixelVertexFinding.pixelVertexCUDA_cfi import pixelVertexCUDA
+from RecoPixelVertexing.PixelVertexFinding.pixelVertexSoA_cfi import pixelVertexSoA
+from RecoPixelVertexing.PixelVertexFinding.pixelVertexFromSoA_cfi import pixelVertexFromSoA as _pixelVertexFromSoA
+
+_pixelVertexingCUDATask = cms.Task(pixelTracksTask,pixelVertexCUDA,pixelVertexSoA,pixelVertices)
+
+# pixelVertexSoAonCPU = pixelVertexCUDA.clone()
+# pixelVertexSoAonCPU.onGPU = False;
+
+gpu.toReplaceWith(pixelVertices,_pixelVertexFromSoA)
+gpu.toReplaceWith(recopixelvertexingTask,_pixelVertexingCUDATask)
diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py
deleted file mode 100644
index 4713b64e5e48a..0000000000000
--- a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py
+++ /dev/null
@@ -1,15 +0,0 @@
-import FWCore.ParameterSet.Config as cms
-
-def customizePixelTracksForProfiling(process):
-    process.out = cms.OutputModule("AsciiOutputModule",
-        outputCommands = cms.untracked.vstring(
-            "keep *_pixelTracks_*_*",
-        ),
-        verbosity = cms.untracked.uint32(0),
-    )
-
-    process.outPath = cms.EndPath(process.out)
-
-    process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.outPath)
-
-    return process
diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py
new file mode 100644
index 0000000000000..909959f2d81be
--- /dev/null
+++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py
@@ -0,0 +1,61 @@
+import FWCore.ParameterSet.Config as cms
+
+def customizePixelTracksSoAonCPU(process):
+  """Redefine the pixel rechits, tracks and vertices to use the SoA producers with onGPU = False."""
+  process.CUDAService = cms.Service('CUDAService',
+    enabled = cms.untracked.bool(False)  # the CUDAService is explicitly disabled
+  )
+
+  # ensure the same results when running on GPU (which supports only the 'HLT' payload) and CPU
+  process.siPixelClustersPreSplitting.cpu.payloadType = cms.string('HLT')
+
+  from RecoLocalTracker.SiPixelRecHits.siPixelRecHitSoAFromLegacy_cfi import siPixelRecHitSoAFromLegacy
+  process.siPixelRecHitsPreSplitting = siPixelRecHitSoAFromLegacy.clone(
+    convertToLegacy = True  # presumably also produces the legacy rechits read by pixelTracks below -- TODO confirm
+  )
+
+  from RecoPixelVertexing.PixelTriplets.caHitNtupletCUDA_cfi import caHitNtupletCUDA
+  process.pixelTrackSoA = caHitNtupletCUDA.clone(
+    onGPU = False,
+    pixelRecHitSrc = 'siPixelRecHitsPreSplitting'
+  )
+
+  from RecoPixelVertexing.PixelVertexFinding.pixelVertexCUDA_cfi import pixelVertexCUDA
+  process.pixelVertexSoA = pixelVertexCUDA.clone(
+    onGPU = False,
+    pixelTrackSrc = 'pixelTrackSoA'
+  )
+
+  from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromSoA_cfi import pixelTrackProducerFromSoA
+  process.pixelTracks = pixelTrackProducerFromSoA.clone(
+    pixelRecHitLegacySrc = 'siPixelRecHitsPreSplitting'
+  )
+
+  from RecoPixelVertexing.PixelVertexFinding.pixelVertexFromSoA_cfi import pixelVertexFromSoA
+  process.pixelVertices = pixelVertexFromSoA.clone()
+  # append the rechit, SoA track and SoA vertex modules to reconstruction_step
+  process.reconstruction_step += process.siPixelRecHitsPreSplitting + process.pixelTrackSoA + process.pixelVertexSoA
+
+  return process
+
+
+def customizePixelTracksForTriplets(process):
+
+  from HLTrigger.Configuration.common import producers_by_type
+  for producer in producers_by_type(process, 'CAHitNtupletCUDA'):
+        producer.includeJumpingForwardDoublets = True
+        producer.minHitsPerNtuplet = 3
+ 
+  return process
+ 
+
+def customizePixelTracksSoAonCPUForProfiling(process):
+  """Apply customizePixelTracksSoAonCPU, then schedule only a 'TkSoA' Path ending with the SoA tracks and vertices."""
+  process.MessageLogger.cerr.FwkReport.reportEvery = 100
+  process = customizePixelTracksSoAonCPU(process)
+  # NOTE(review): customizePixelTracksSoAonCPU registers its clone as 'siPixelRecHitsPreSplitting';
+  # confirm that the label 'siPixelRecHitSoAFromLegacy' used below is defined on the process elsewhere
+  process.siPixelRecHitSoAFromLegacy.convertToLegacy = False
+  process.TkSoA = cms.Path(process.offlineBeamSpot + process.siPixelDigis + process.siPixelClustersPreSplitting + process.siPixelRecHitSoAFromLegacy + process.pixelTrackSoA + process.pixelVertexSoA)
+  process.schedule = cms.Schedule(process.TkSoA)
+  return process
diff --git a/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml
index e6fc938dc25a7..a589aad036996 100644
--- a/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml
+++ b/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml
@@ -1,3 +1,5 @@
+<use name="cuda"/>
+<use name="eigen"/>
 <use name="root"/>
 <use name="CommonTools/Statistics"/>
 <use name="CommonTools/Utils"/>
diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h b/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h
new file mode 100644
index 0000000000000..86fe6a278777c
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h
@@ -0,0 +1,606 @@
+#ifndef RecoPixelVertexing_PixelTrackFitting_interface_BrokenLine_h
+#define RecoPixelVertexing_PixelTrackFitting_interface_BrokenLine_h
+
+#include <Eigen/Eigenvalues>
+
+#include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h"
+
+namespace brokenline {
+
+  //!< Karimäki's parameters: (phi, d, k=1/R)
+  /*!< covariance matrix: \n
+    |cov(phi,phi)|cov( d ,phi)|cov( k ,phi)| \n
+    |cov(phi, d )|cov( d , d )|cov( k , d )| \n
+    |cov(phi, k )|cov( d , k )|cov( k , k )| \n
+    as defined in Karimäki V., 1990, Effective circle fitting for particle trajectories, 
+    Nucl. Instr. and Meth. A305 (1991) 187.
+  */
+  using karimaki_circle_fit = riemannFit::CircleFit;
+
+  /*!
+    \brief data needed for the Broken Line fit procedure.
+  */
+  template <int n>
+  struct PreparedBrokenLineData {
+    int qCharge;                          //!< sign of the particle charge (+1 or -1)
+    riemannFit::Matrix2xNd<n> radii;      //!< xy data in the system in which the pre-fitted center is the origin
+    riemannFit::VectorNd<n> sTransverse;  //!< total distance traveled in the transverse plane
+                                          //   starting from the pre-fitted closest approach
+    riemannFit::VectorNd<n> sTotal;       //!< total distance traveled (three-dimensional)
+    riemannFit::VectorNd<n> zInSZplane;   //!< orthogonal coordinate to the pre-fitted line in the sz plane
+    riemannFit::VectorNd<n> varBeta;      //!< variances of the kink angles in the sz plane
+  };
+
+  /*!
+    \brief Computes the Coulomb multiple scattering variance of the planar angle.
+    
+    \param length length of the track in the material.
+    \param bField magnetic field in GeV/cm/c.
+    \param radius radius of curvature (needed to evaluate p).
+    \param layer denotes which of the four layers of the detector is the endpoint of the 
+   *             multiple scattered track. For example, if Layer=3, then the particle has 
+   *             just gone through the material between the second and the third layer.
+    
+    \todo add another Layer variable to identify also the start point of the track, 
+   *      so if there are missing hits or multiple hits, the part of the detector that 
+   *      the particle has traversed can be exactly identified.
+    
+    \warning the formula used here assumes beta=1, and so neglects the dependence 
+   *         of theta_0 on the mass of the particle at fixed momentum.
+    
+    \return the variance of the planar angle ((theta_0)^2 /3).
+  */
+  __host__ __device__ inline double multScatt(
+      const double& length, const double bField, const double radius, int layer, double slope) {
+    // NOTE: `layer` is not used, the same material constant is applied whatever its value
+    constexpr double invRadLength = 0.06 / 16.;  //!< inverse of radiation length of the material in cm
+    //! number between 1/3 (uniform material) and 1 (thin scatterer) to be manually tuned
+    constexpr double geomFactor = 0.7;
+    constexpr double prefactor = geomFactor * riemannFit::sqr(13.6 / 1000.);
+
+    // bField * radius, limited to 20 (GeV) before being squared
+    const double ptCapped = std::min(20., bField * radius);
+    const double ptSquared = ptCapped * ptCapped;
+    // |length| scaled by the inverse radiation length
+    const double thickness = std::abs(length) * invRadLength;
+    return prefactor / (ptSquared * (1. + riemannFit::sqr(slope))) * thickness *
+           riemannFit::sqr(1. + 0.038 * log(thickness));
+  }
+
+  /*!
+    \brief Computes the 2D rotation matrix that transforms the line y=slope*x into the line y=0.
+    
+    \param slope tangent of the angle of rotation.
+    
+    \return 2D rotation matrix.
+  */
+  __host__ __device__ inline riemannFit::Matrix2d rotationMatrix(double slope) {
+    // cosine and sine of the rotation angle, computed from its tangent
+    const double cosAngle = 1. / sqrt(1. + riemannFit::sqr(slope));
+    const double sinAngle = slope * cosAngle;
+    riemannFit::Matrix2d rot;
+    rot << cosAngle, sinAngle, -sinAngle, cosAngle;  // filled row by row
+    return rot;
+  }
+
+  /*!
+    \brief Changes the Karimäki parameters (and consequently their covariance matrix) under a 
+   *       translation of the coordinate system, such that the old origin has coordinates (x0,y0) 
+   *       in the new coordinate system. The formulas are taken from Karimäki V., 1990, Effective 
+   *       circle fitting for particle trajectories, Nucl. Instr. and Meth. A305 (1991) 187.
+    
+    \param circle circle fit in the old coordinate system. circle.par(0) is phi, circle.par(1) is d and circle.par(2) is rho. 
+    \param x0 x coordinate of the translation vector.
+    \param y0 y coordinate of the translation vector.
+    \param jacobian passed by reference in order to save stack.
+  */
+  __host__ __device__ inline void translateKarimaki(karimaki_circle_fit& circle,
+                                                    double x0,
+                                                    double y0,
+                                                    riemannFit::Matrix3d& jacobian) {
+    // Read the current parameters (phi, d, rho) once, instead of accessing circle.par repeatedly.
+    using scalar = std::remove_reference<decltype(circle.par(0))>::type;
+    scalar phi = circle.par(0);
+    scalar dee = circle.par(1);
+    scalar rho = circle.par(2);
+
+    // Sine and cosine of phi are each used several times below
+    scalar sinPhi = sin(phi);
+    scalar cosPhi = cos(phi);
+
+    // Intermediate terms: the new phi is atan2(tempB, tempC), the new d is tempA / (1 + tempU)
+    scalar deltaPara = x0 * cosPhi + y0 * sinPhi;
+    scalar deltaOrth = x0 * sinPhi - y0 * cosPhi + dee;
+    scalar tempSmallU = 1 + rho * dee;
+    scalar tempC = -rho * y0 + tempSmallU * cosPhi;
+    scalar tempB = rho * x0 + tempSmallU * sinPhi;
+    scalar tempA = 2. * deltaOrth + rho * (riemannFit::sqr(deltaOrth) + riemannFit::sqr(deltaPara));
+    scalar tempU = sqrt(1. + rho * tempA);
+
+    // Terms of the Jacobian used to transform the error matrix; its last row is (0, 0, 1)
+    scalar xi = 1. / (riemannFit::sqr(tempB) + riemannFit::sqr(tempC));
+    scalar tempV = 1. + rho * deltaOrth;
+    scalar lambda = (0.5 * tempA) / (riemannFit::sqr(1. + tempU) * tempU);
+    scalar mu = 1. / (tempU * (1. + tempU)) + rho * lambda;
+    scalar zeta = riemannFit::sqr(deltaOrth) + riemannFit::sqr(deltaPara);
+    jacobian << xi * tempSmallU * tempV, -xi * riemannFit::sqr(rho) * deltaOrth, xi * deltaPara,
+        2. * mu * tempSmallU * deltaPara, 2. * mu * tempV, mu * zeta - lambda * tempA, 0, 0, 1.;
+
+    // translated circle parameters
+    // phi
+    circle.par(0) = atan2(tempB, tempC);
+    // d
+    circle.par(1) = tempA / (1 + tempU);
+    // rho is invariant under the translation, so circle.par(2) is left as it is:
+    // circle.par(2)= rho;
+
+    // translated error matrix: jacobian * cov * jacobian^T
+    circle.cov = jacobian * circle.cov * jacobian.transpose();
+  }
+
+  /*!
+    \brief Computes the data needed for the Broken Line fit procedure that are mainly common for the circle and the line fit.
+    
+    \param hits hits coordinates.
+    \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)).
+    \param bField magnetic field in GeV/cm/c.
+    \param results PreparedBrokenLineData to be filled (see description of PreparedBrokenLineData).
+  */
+  template <typename M3xN, typename V4, int n>
+  __host__ __device__ inline void prepareBrokenLineData(const M3xN& hits,
+                                                        const V4& fast_fit,
+                                                        const double bField,
+                                                        PreparedBrokenLineData<n>& results) {
+    riemannFit::Vector2d dVec;
+    riemannFit::Vector2d eVec;
+    // charge sign from the orientation of the first xy segment with respect to the last one
+    dVec = hits.block(0, 1, 2, 1) - hits.block(0, 0, 2, 1);
+    eVec = hits.block(0, n - 1, 2, 1) - hits.block(0, n - 2, 2, 1);
+    results.qCharge = riemannFit::cross2D(dVec, eVec) > 0 ? -1 : 1;
+
+    const double slope = -results.qCharge / fast_fit(3);
+
+    riemannFit::Matrix2d rotMat = rotationMatrix(slope);
+
+    // xy positions of the hits relative to the pre-fitted centre, and their transverse arc lengths
+    results.radii = hits.block(0, 0, 2, n) - fast_fit.head(2) * riemannFit::MatrixXd::Constant(1, n, 1);
+    eVec = -fast_fit(2) * fast_fit.head(2) / fast_fit.head(2).norm();
+    for (u_int i = 0; i < n; i++) {
+      dVec = results.radii.block(0, i, 2, 1);
+      results.sTransverse(i) = results.qCharge * fast_fit(2) *
+                               atan2(riemannFit::cross2D(dVec, eVec), dVec.dot(eVec));  // calculates the arc length
+    }
+    riemannFit::VectorNd<n> zVec = hits.block(2, 0, 1, n).transpose();
+
+    // rotate each (sTransverse, z) point with rotMat: the rows of the result are sTotal and zInSZplane
+    riemannFit::Matrix2xNd<n> pointsSZ = riemannFit::Matrix2xNd<n>::Zero();
+    for (u_int i = 0; i < n; i++) {
+      pointsSZ(0, i) = results.sTransverse(i);
+      pointsSZ(1, i) = zVec(i);
+      pointsSZ.block(0, i, 2, 1) = rotMat * pointsSZ.block(0, i, 2, 1);
+    }
+    results.sTotal = pointsSZ.block(0, 0, 1, n).transpose();
+    results.zInSZplane = pointsSZ.block(1, 0, 1, n).transpose();
+
+    // varBeta: zero at the two end points, elsewhere the sum of multScatt over the two adjacent segments
+    results.varBeta(0) = results.varBeta(n - 1) = 0;
+    for (u_int i = 1; i < n - 1; i++) {
+      results.varBeta(i) = multScatt(results.sTotal(i + 1) - results.sTotal(i), bField, fast_fit(2), i + 2, slope) +
+                           multScatt(results.sTotal(i) - results.sTotal(i - 1), bField, fast_fit(2), i + 1, slope);
+    }
+  }
+
+  /*!
+    \brief Computes the n-by-n band matrix obtained minimizing the Broken Line's cost function w.r.t u. 
+   *       This is the whole matrix in the case of the line fit and the main n-by-n block in the case 
+   *       of the circle fit.
+    
+    \param weights weights of the first part of the cost function, the one with the measurements 
+   *         and not the angles (\sum_{i=1}^n w*(y_i-u_i)^2).
+    \param sTotal total distance traveled by the particle from the pre-fitted closest approach.
+    \param varBeta kink angles' variance.
+    
+    \return the n-by-n matrix of the linear system
+  */
+  template <int n>
+  __host__ __device__ inline riemannFit::MatrixNd<n> matrixC_u(const riemannFit::VectorNd<n>& weights,
+                                                               const riemannFit::VectorNd<n>& sTotal,
+                                                               const riemannFit::VectorNd<n>& varBeta) {
+    riemannFit::MatrixNd<n> c_uMat = riemannFit::MatrixNd<n>::Zero();  // only the upper triangle is filled below
+    for (u_int i = 0; i < n; i++) {
+      c_uMat(i, i) = weights(i);  // diagonal; the guards below only ever divide by varBeta(1) ... varBeta(n - 2)
+      if (i > 1)
+        c_uMat(i, i) += 1. / (varBeta(i - 1) * riemannFit::sqr(sTotal(i) - sTotal(i - 1)));
+      if (i > 0 && i < n - 1)
+        c_uMat(i, i) +=
+            (1. / varBeta(i)) * riemannFit::sqr((sTotal(i + 1) - sTotal(i - 1)) /
+                                                ((sTotal(i + 1) - sTotal(i)) * (sTotal(i) - sTotal(i - 1))));
+      if (i < n - 2)
+        c_uMat(i, i) += 1. / (varBeta(i + 1) * riemannFit::sqr(sTotal(i + 1) - sTotal(i)));
+      // first super-diagonal
+      if (i > 0 && i < n - 1)
+        c_uMat(i, i + 1) =
+            1. / (varBeta(i) * (sTotal(i + 1) - sTotal(i))) *
+            (-(sTotal(i + 1) - sTotal(i - 1)) / ((sTotal(i + 1) - sTotal(i)) * (sTotal(i) - sTotal(i - 1))));
+      if (i < n - 2)
+        c_uMat(i, i + 1) +=
+            1. / (varBeta(i + 1) * (sTotal(i + 1) - sTotal(i))) *
+            (-(sTotal(i + 2) - sTotal(i)) / ((sTotal(i + 2) - sTotal(i + 1)) * (sTotal(i + 1) - sTotal(i))));
+      // second super-diagonal
+      if (i < n - 2)
+        c_uMat(i, i + 2) = 1. / (varBeta(i + 1) * (sTotal(i + 2) - sTotal(i + 1)) * (sTotal(i + 1) - sTotal(i)));
+      // halve the diagonal: it is counted twice when the transpose is added in the return statement
+      c_uMat(i, i) *= 0.5;
+    }
+    return c_uMat + c_uMat.transpose();
+  }
+
+  /*!
+    \brief A very fast helix fit.
+    
+    \param hits the measured hits.
+    
+    \return (X0,Y0,R,tan(theta)).
+    
+    \warning sign of theta is (intentionally, for now) mistaken for negative charges.
+  */
+
+  template <typename M3xN, typename V4>
+  __host__ __device__ inline void fastFit(const M3xN& hits, V4& result) {
+    constexpr uint32_t n = M3xN::ColsAtCompileTime;
+    // xy differences between the first, the middle (index n / 2) and the last hit
+    const riemannFit::Vector2d a = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1);
+    const riemannFit::Vector2d b = hits.block(0, n - 1, 2, 1) - hits.block(0, n / 2, 2, 1);
+    const riemannFit::Vector2d c = hits.block(0, 0, 2, 1) - hits.block(0, n - 1, 2, 1);
+
+    auto tmp = 0.5 / riemannFit::cross2D(c, a);
+    result(0) = hits(0, 0) - (a(1) * c.squaredNorm() + c(1) * a.squaredNorm()) * tmp;
+    result(1) = hits(1, 0) + (a(0) * c.squaredNorm() + c(0) * a.squaredNorm()) * tmp;
+    // (X0, Y0): presumably the centre of the circle through these three hits; check Wikipedia for these formulas
+
+    result(2) = sqrt(a.squaredNorm() * b.squaredNorm() * c.squaredNorm()) / (2. * std::abs(riemannFit::cross2D(b, a)));
+    // Using Math Olympiad's formula R=abc/(4A), with |cross2D(b, a)| in the place of 2A
+
+    const riemannFit::Vector2d d = hits.block(0, 0, 2, 1) - result.head(2);
+    const riemannFit::Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2);
+
+    result(3) = result(2) * atan2(riemannFit::cross2D(d, e), d.dot(e)) / (hits(2, n - 1) - hits(2, 0));
+    // ds/dz slope between last and first point
+  }
+
+  /*!
+    \brief Performs the Broken Line fit in the curved track case (that is, the fit 
+   *       parameters are the interceptions u and the curvature correction \Delta\kappa).
+    
+    \param hits hits coordinates.
+    \param hits_cov hits covariance matrix.
+    \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)).
+    \param bField magnetic field in GeV/cm/c.
+    \param data PreparedBrokenLineData.
+    \param circle_results struct to be filled with the results in this form:
+    -par parameter of the line in this form: (phi, d, k); \n
+    -cov covariance matrix of the fitted parameter; \n
+    -chi2 value of the cost function in the minimum.
+    
+    \details The function implements the steps 2 and 3 of the Broken Line fit 
+   *         with the curvature correction.\n
+   * The step 2 is the least square fit, done by imposing the minimum constraint on 
+   * the cost function and solving the consequent linear system. It determines the 
+   * fitted parameters u and \Delta\kappa and their covariance matrix.
+   * The step 3 is the correction of the fast pre-fitted parameters for the innermost 
+   * part of the track. It is first done in a comfortable coordinate system (the one 
+   * in which the first hit is the origin) and then the parameters and their 
+   * covariance matrix are transformed to the original coordinate system.
+  */
+  template <typename M3xN, typename M6xN, typename V4, int n>
+  __host__ __device__ inline void circleFit(const M3xN& hits,
+                                            const M6xN& hits_ge,
+                                            const V4& fast_fit,
+                                            const double bField,
+                                            PreparedBrokenLineData<n>& data,
+                                            karimaki_circle_fit& circle_results) {
+    circle_results.qCharge = data.qCharge;
+    auto& radii = data.radii;
+    const auto& sTransverse = data.sTransverse;
+    const auto& sTotal = data.sTotal;
+    auto& zInSZplane = data.zInSZplane;
+    auto& varBeta = data.varBeta;
+    const double slope = -circle_results.qCharge / fast_fit(3);
+    varBeta *= 1. + riemannFit::sqr(slope);  // the kink angles are projected!
+
+    for (u_int i = 0; i < n; i++) {
+      zInSZplane(i) = radii.block(0, i, 2, 1).norm() - fast_fit(2);
+    }
+
+    riemannFit::Matrix2d vMat;           // covariance matrix
+    riemannFit::VectorNd<n> weightsVec;  // weights
+    riemannFit::Matrix2d rotMat;         // rotation matrix point by point
+    for (u_int i = 0; i < n; i++) {
+      vMat(0, 0) = hits_ge.col(i)[0];               // x errors
+      vMat(0, 1) = vMat(1, 0) = hits_ge.col(i)[1];  // cov_xy
+      vMat(1, 1) = hits_ge.col(i)[2];               // y errors
+      rotMat = rotationMatrix(-radii(0, i) / radii(1, i));
+      weightsVec(i) =
+          1. / ((rotMat * vMat * rotMat.transpose())(1, 1));  // compute the orthogonal weight point by point
+    }
+
+    riemannFit::VectorNplusONEd<n> r_uVec;
+    r_uVec(n) = 0;
+    for (u_int i = 0; i < n; i++) {
+      r_uVec(i) = weightsVec(i) * zInSZplane(i);
+    }
+
+    riemannFit::MatrixNplusONEd<n> c_uMat;
+    c_uMat.block(0, 0, n, n) = matrixC_u(weightsVec, sTransverse, varBeta);
+    c_uMat(n, n) = 0;
+    //add the border to the c_uMat matrix
+    for (u_int i = 0; i < n; i++) {
+      c_uMat(i, n) = 0;
+      if (i > 0 && i < n - 1) {
+        c_uMat(i, n) +=
+            -(sTransverse(i + 1) - sTransverse(i - 1)) * (sTransverse(i + 1) - sTransverse(i - 1)) /
+            (2. * varBeta(i) * (sTransverse(i + 1) - sTransverse(i)) * (sTransverse(i) - sTransverse(i - 1)));
+      }
+      if (i > 1) {
+        c_uMat(i, n) +=
+            (sTransverse(i) - sTransverse(i - 2)) / (2. * varBeta(i - 1) * (sTransverse(i) - sTransverse(i - 1)));
+      }
+      if (i < n - 2) {
+        c_uMat(i, n) +=
+            (sTransverse(i + 2) - sTransverse(i)) / (2. * varBeta(i + 1) * (sTransverse(i + 1) - sTransverse(i)));
+      }
+      c_uMat(n, i) = c_uMat(i, n);
+      if (i > 0 && i < n - 1)
+        c_uMat(n, n) += riemannFit::sqr(sTransverse(i + 1) - sTransverse(i - 1)) / (4. * varBeta(i));
+    }
+
+#ifdef CPP_DUMP
+    std::cout << "CU5\n" << c_uMat << std::endl;
+#endif
+    riemannFit::MatrixNplusONEd<n> iMat;
+    math::cholesky::invert(c_uMat, iMat);
+#ifdef CPP_DUMP
+    std::cout << "I5\n" << iMat << std::endl;
+#endif
+
+    riemannFit::VectorNplusONEd<n> uVec = iMat * r_uVec;  // obtain the fitted parameters by solving the linear system
+
+    // compute (phi, d_ca, k) in the system in which the midpoint of the first two corrected hits is the origin...
+
+    radii.block(0, 0, 2, 1) /= radii.block(0, 0, 2, 1).norm();
+    radii.block(0, 1, 2, 1) /= radii.block(0, 1, 2, 1).norm();
+
+    riemannFit::Vector2d dVec = hits.block(0, 0, 2, 1) + (-zInSZplane(0) + uVec(0)) * radii.block(0, 0, 2, 1);
+    riemannFit::Vector2d eVec = hits.block(0, 1, 2, 1) + (-zInSZplane(1) + uVec(1)) * radii.block(0, 1, 2, 1);
+
+    circle_results.par << atan2((eVec - dVec)(1), (eVec - dVec)(0)),
+        -circle_results.qCharge *
+            (fast_fit(2) - sqrt(riemannFit::sqr(fast_fit(2)) - 0.25 * (eVec - dVec).squaredNorm())),
+        circle_results.qCharge * (1. / fast_fit(2) + uVec(n));
+
+    assert(circle_results.qCharge * circle_results.par(1) <= 0);
+
+    riemannFit::Vector2d eMinusd = eVec - dVec;
+    double tmp1 = eMinusd.squaredNorm();
+    double tmp2 = sqrt(riemannFit::sqr(2 * fast_fit(2)) - tmp1);
+
+    riemannFit::Matrix3d jacobian;
+    jacobian << (radii(1, 0) * eMinusd(0) - eMinusd(1) * radii(0, 0)) / tmp1,
+        (radii(1, 1) * eMinusd(0) - eMinusd(1) * radii(0, 1)) / tmp1, 0,
+        (circle_results.qCharge / 2) * (eMinusd(0) * radii(0, 0) + eMinusd(1) * radii(1, 0)) / tmp2,
+        (circle_results.qCharge / 2) * (eMinusd(0) * radii(0, 1) + eMinusd(1) * radii(1, 1)) / tmp2, 0, 0, 0,
+        circle_results.qCharge;
+
+    circle_results.cov << iMat(0, 0), iMat(0, 1), iMat(0, n), iMat(1, 0), iMat(1, 1), iMat(1, n), iMat(n, 0),
+        iMat(n, 1), iMat(n, n);
+
+    circle_results.cov = jacobian * circle_results.cov * jacobian.transpose();
+
+    //...Translate in the system in which the first corrected hit is the origin, adding the m.s. correction...
+
+    auto eMinusDVec = eVec - dVec;
+    translateKarimaki(circle_results, 0.5 * eMinusDVec(0), 0.5 * eMinusDVec(1), jacobian);
+    circle_results.cov(0, 0) +=
+        (1 + riemannFit::sqr(slope)) * multScatt(sTotal(1) - sTotal(0), bField, fast_fit(2), 2, slope);
+
+    //...And translate back to the original system
+
+    translateKarimaki(circle_results, dVec(0), dVec(1), jacobian);
+
+    // compute chi2
+    circle_results.chi2 = 0;
+    for (u_int i = 0; i < n; i++) {
+      circle_results.chi2 += weightsVec(i) * riemannFit::sqr(zInSZplane(i) - uVec(i));
+      if (i > 0 && i < n - 1)
+        circle_results.chi2 +=
+            riemannFit::sqr(uVec(i - 1) / (sTransverse(i) - sTransverse(i - 1)) -
+                            uVec(i) * (sTransverse(i + 1) - sTransverse(i - 1)) /
+                                ((sTransverse(i + 1) - sTransverse(i)) * (sTransverse(i) - sTransverse(i - 1))) +
+                            uVec(i + 1) / (sTransverse(i + 1) - sTransverse(i)) +
+                            (sTransverse(i + 1) - sTransverse(i - 1)) * uVec(n) / 2) /
+            varBeta(i);
+    }
+
+    // assert(circle_results.chi2>=0);
+  }
+
+  /*!
+    \brief Performs the Broken Line fit in the straight track case (that is, the fit parameters are only the interceptions u).
+    
+    \param hits_ge errors of the hits, one column per hit, packed as (xx, xy, yy, xz, yz, zz).
+    \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)).
+    \param bField magnetic field in GeV/cm/c.
+    \param data PreparedBrokenLineData.
+    \param line_results struct to be filled with the results in this form:
+    -par parameter of the line in this form: (cot(theta), Zip); \n
+    -cov covariance matrix of the fitted parameter; \n
+    -chi2 value of the cost function in the minimum.
+    
+    \details The function implements the steps 2 and 3 of the Broken Line fit without 
+   *        the curvature correction.\n
+   * The step 2 is the least square fit, done by imposing the minimum constraint 
+   * on the cost function and solving the consequent linear system. It determines 
+   * the fitted parameters u and their covariance matrix.
+   * The step 3 is the correction of the fast pre-fitted parameters for the innermost 
+   * part of the track. It is first done in a comfortable coordinate system (the one 
+   * in which the first hit is the origin) and then the parameters and their covariance 
+   * matrix are transformed to the original coordinate system.
+   */
+  template <typename V4, typename M6xN, int n>
+  __host__ __device__ inline void lineFit(const M6xN& hits_ge,
+                                          const V4& fast_fit,
+                                          const double bField,
+                                          const PreparedBrokenLineData<n>& data,
+                                          riemannFit::LineFit& line_results) {
+    const auto& radii = data.radii;
+    const auto& sTotal = data.sTotal;
+    const auto& zInSZplane = data.zInSZplane;
+    const auto& varBeta = data.varBeta;
+
+    const double slope = -data.qCharge / fast_fit(3);
+    riemannFit::Matrix2d rotMat = rotationMatrix(slope);
+
+    riemannFit::Matrix3d vMat = riemannFit::Matrix3d::Zero();  // covariance matrix XYZ
+    riemannFit::Matrix2x3d jacobXYZtosZ =
+        riemannFit::Matrix2x3d::Zero();  // jacobian for computation of the error on s (xyz -> sz)
+    riemannFit::VectorNd<n> weights = riemannFit::VectorNd<n>::Zero();
+    for (u_int i = 0; i < n; i++) {
+      vMat(0, 0) = hits_ge.col(i)[0];               // x errors
+      vMat(0, 1) = vMat(1, 0) = hits_ge.col(i)[1];  // cov_xy
+      vMat(0, 2) = vMat(2, 0) = hits_ge.col(i)[3];  // cov_xz
+      vMat(1, 1) = hits_ge.col(i)[2];               // y errors
+      vMat(2, 1) = vMat(1, 2) = hits_ge.col(i)[4];  // cov_yz
+      vMat(2, 2) = hits_ge.col(i)[5];               // z errors
+      auto tmp = 1. / radii.block(0, i, 2, 1).norm();
+      jacobXYZtosZ(0, 0) = radii(1, i) * tmp;
+      jacobXYZtosZ(0, 1) = -radii(0, i) * tmp;
+      jacobXYZtosZ(1, 2) = 1.;
+      weights(i) = 1. / ((rotMat * jacobXYZtosZ * vMat * jacobXYZtosZ.transpose() * rotMat.transpose())(
+                            1, 1));  // compute the orthogonal weight point by point
+    }
+
+    riemannFit::VectorNd<n> r_u;  // right-hand side of the linear system: weights(i) * zInSZplane(i)
+    for (u_int i = 0; i < n; i++) {
+      r_u(i) = weights(i) * zInSZplane(i);
+    }
+#ifdef CPP_DUMP
+    std::cout << "CU4\n" << matrixC_u(weights, sTotal, varBeta) << std::endl;
+#endif
+    riemannFit::MatrixNd<n> iMat;  // receives the inverse of matrixC_u(weights, sTotal, varBeta)
+    math::cholesky::invert(matrixC_u(weights, sTotal, varBeta), iMat);
+#ifdef CPP_DUMP
+    std::cout << "I4\n" << iMat << std::endl;
+#endif
+
+    riemannFit::VectorNd<n> uVec = iMat * r_u;  // obtain the fitted parameters by solving the linear system
+
+    // line parameters in the system in which the first hit is the origin and with axis along SZ
+    line_results.par << (uVec(1) - uVec(0)) / (sTotal(1) - sTotal(0)), uVec(0);
+    auto idiff = 1. / (sTotal(1) - sTotal(0));  // 1 / (s1 - s0), shared by the covariance terms below
+    line_results.cov << (iMat(0, 0) - 2 * iMat(0, 1) + iMat(1, 1)) * riemannFit::sqr(idiff) +
+                            multScatt(sTotal(1) - sTotal(0), bField, fast_fit(2), 2, slope),
+        (iMat(0, 1) - iMat(0, 0)) * idiff, (iMat(0, 1) - iMat(0, 0)) * idiff, iMat(0, 0);
+
+    // translate to the original SZ system
+    riemannFit::Matrix2d jacobian;
+    jacobian(0, 0) = 1.;
+    jacobian(0, 1) = 0;
+    jacobian(1, 0) = -sTotal(0);
+    jacobian(1, 1) = 1.;
+    line_results.par(1) += -line_results.par(0) * sTotal(0);
+    line_results.cov = jacobian * line_results.cov * jacobian.transpose();
+
+    // rotate to the original sz system
+    auto tmp = rotMat(0, 0) - line_results.par(0) * rotMat(0, 1);
+    jacobian(1, 1) = 1. / tmp;
+    jacobian(0, 0) = jacobian(1, 1) * jacobian(1, 1);
+    jacobian(0, 1) = 0;
+    jacobian(1, 0) = line_results.par(1) * rotMat(0, 1) * jacobian(0, 0);
+    line_results.par(1) = line_results.par(1) * jacobian(1, 1);
+    line_results.par(0) = (rotMat(0, 1) + line_results.par(0) * rotMat(0, 0)) * jacobian(1, 1);
+    line_results.cov = jacobian * line_results.cov * jacobian.transpose();
+
+    // compute chi2: weighted residuals of every hit plus the kink terms of the inner hits
+    line_results.chi2 = 0;
+    for (u_int i = 0; i < n; i++) {
+      line_results.chi2 += weights(i) * riemannFit::sqr(zInSZplane(i) - uVec(i));
+      if (i > 0 && i < n - 1)
+        line_results.chi2 += riemannFit::sqr(uVec(i - 1) / (sTotal(i) - sTotal(i - 1)) -
+                                             uVec(i) * (sTotal(i + 1) - sTotal(i - 1)) /
+                                                 ((sTotal(i + 1) - sTotal(i)) * (sTotal(i) - sTotal(i - 1))) +
+                                             uVec(i + 1) / (sTotal(i + 1) - sTotal(i))) /
+                             varBeta(i);
+    }
+  }
+
+  /*!
+    \brief Helix fit in three steps:
+    -fast pre-fit (see fastFit() for further info); \n
+    -circle fit of the hits projected in the transverse plane by Broken Line algorithm (see circleFit() for further info); \n
+    -line fit of the hits projected on the (pre-fitted) cylinder surface by Broken Line algorithm (see lineFit() for further info); \n
+    Points must be passed ordered (from inner to outer layer).
+    
+    \param hits Matrix3xNd hits coordinates in this form: \n
+    |x1|x2|x3|...|xn| \n
+    |y1|y2|y3|...|yn| \n
+    |z1|z2|z3|...|zn|
+    \param hits_cov Matrix3Nd covariance matrix in this form (()->cov()): \n
+    |(x1,x1)|(x2,x1)|(x3,x1)|(x4,x1)|.|(y1,x1)|(y2,x1)|(y3,x1)|(y4,x1)|.|(z1,x1)|(z2,x1)|(z3,x1)|(z4,x1)| \n
+    |(x1,x2)|(x2,x2)|(x3,x2)|(x4,x2)|.|(y1,x2)|(y2,x2)|(y3,x2)|(y4,x2)|.|(z1,x2)|(z2,x2)|(z3,x2)|(z4,x2)| \n
+    |(x1,x3)|(x2,x3)|(x3,x3)|(x4,x3)|.|(y1,x3)|(y2,x3)|(y3,x3)|(y4,x3)|.|(z1,x3)|(z2,x3)|(z3,x3)|(z4,x3)| \n
+    |(x1,x4)|(x2,x4)|(x3,x4)|(x4,x4)|.|(y1,x4)|(y2,x4)|(y3,x4)|(y4,x4)|.|(z1,x4)|(z2,x4)|(z3,x4)|(z4,x4)| \n
+    .       .       .       .       . .       .       .       .       . .       .       .       .       . \n
+    |(x1,y1)|(x2,y1)|(x3,y1)|(x4,y1)|.|(y1,y1)|(y2,y1)|(y3,y1)|(y4,y1)|.|(z1,y1)|(z2,y1)|(z3,y1)|(z4,y1)| \n
+    |(x1,y2)|(x2,y2)|(x3,y2)|(x4,y2)|.|(y1,y2)|(y2,y2)|(y3,y2)|(y4,y2)|.|(z1,y2)|(z2,y2)|(z3,y2)|(z4,y2)| \n
+    |(x1,y3)|(x2,y3)|(x3,y3)|(x4,y3)|.|(y1,y3)|(y2,y3)|(y3,y3)|(y4,y3)|.|(z1,y3)|(z2,y3)|(z3,y3)|(z4,y3)| \n
+    |(x1,y4)|(x2,y4)|(x3,y4)|(x4,y4)|.|(y1,y4)|(y2,y4)|(y3,y4)|(y4,y4)|.|(z1,y4)|(z2,y4)|(z3,y4)|(z4,y4)| \n
+    .       .       .    .          . .       .       .       .       . .       .       .       .       . \n
+    |(x1,z1)|(x2,z1)|(x3,z1)|(x4,z1)|.|(y1,z1)|(y2,z1)|(y3,z1)|(y4,z1)|.|(z1,z1)|(z2,z1)|(z3,z1)|(z4,z1)| \n
+    |(x1,z2)|(x2,z2)|(x3,z2)|(x4,z2)|.|(y1,z2)|(y2,z2)|(y3,z2)|(y4,z2)|.|(z1,z2)|(z2,z2)|(z3,z2)|(z4,z2)| \n
+    |(x1,z3)|(x2,z3)|(x3,z3)|(x4,z3)|.|(y1,z3)|(y2,z3)|(y3,z3)|(y4,z3)|.|(z1,z3)|(z2,z3)|(z3,z3)|(z4,z3)| \n
+    |(x1,z4)|(x2,z4)|(x3,z4)|(x4,z4)|.|(y1,z4)|(y2,z4)|(y3,z4)|(y4,z4)|.|(z1,z4)|(z2,z4)|(z3,z4)|(z4,z4)|
+    \param bField magnetic field in the center of the detector in GeV/cm/c, in order to perform the p_t calculation.
+    
+    \warning see circleFit(), lineFit() and fastFit() warnings.
+    
+    \bug see circleFit(), lineFit() and fastFit() bugs.
+    
+    \return (phi,Tip,p_t,cot(theta),Zip), their covariance matrix and the chi2's of the circle and line fits.
+  */
+  template <int n>
+  inline riemannFit::HelixFit helixFit(const riemannFit::Matrix3xNd<n>& hits,
+                                       const Eigen::Matrix<float, 6, 4>& hits_ge,
+                                       const double bField) {
+    static_assert(n <= 4, "helixFit: hits_ge has 4 columns");  // lineFit()/circleFit() read hits_ge.col(i), i < n
+    riemannFit::HelixFit helix;
+    riemannFit::Vector4d fast_fit;
+    fastFit(hits, fast_fit);
+
+    PreparedBrokenLineData<n> data;
+    karimaki_circle_fit circle;
+    riemannFit::LineFit line;
+    riemannFit::Matrix3d jacobian;
+    prepareBrokenLineData(hits, fast_fit, bField, data);
+    lineFit(hits_ge, fast_fit, bField, data, line);
+    circleFit(hits, hits_ge, fast_fit, bField, data, circle);
+
+    // the circle fit gives k, but here we want p_t, so let's change the parameter and the covariance matrix
+    jacobian << 1., 0, 0, 0, 1., 0, 0, 0,
+        -std::abs(circle.par(2)) * bField / (riemannFit::sqr(circle.par(2)) * circle.par(2));
+    circle.par(2) = bField / std::abs(circle.par(2));
+    circle.cov = jacobian * circle.cov * jacobian.transpose();
+
+    helix.par << circle.par, line.par;
+    helix.cov = riemannFit::Matrix5d::Zero();  // the terms mixing circle and line parameters are left at zero
+    helix.cov.block(0, 0, 3, 3) = circle.cov;
+    helix.cov.block(3, 3, 2, 2) = line.cov;
+    helix.qCharge = circle.qCharge;
+    helix.chi2_circle = circle.chi2;
+    helix.chi2_line = line.chi2;
+
+    return helix;
+  }
+
+}  // namespace brokenline
+
+#endif  // RecoPixelVertexing_PixelTrackFitting_interface_BrokenLine_h
diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h b/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h
new file mode 100644
index 0000000000000..01497719d2998
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h
@@ -0,0 +1,65 @@
+#ifndef RecoPixelVertexing_PixelTrackFitting_interface_FitResult_h
+#define RecoPixelVertexing_PixelTrackFitting_interface_FitResult_h
+
+#include <cmath>
+#include <cstdint>
+
+#include <cuda_runtime.h>
+#include <Eigen/Core>
+#include <Eigen/Eigenvalues>
+
+namespace riemannFit {
+
+  using Vector2d = Eigen::Vector2d;
+  using Vector3d = Eigen::Vector3d;
+  using Vector4d = Eigen::Vector4d;
+  using Vector5d = Eigen::Matrix<double, 5, 1>;  // 5-component column vector of doubles (HelixFit::par)
+  using Matrix2d = Eigen::Matrix2d;
+  using Matrix3d = Eigen::Matrix3d;
+  using Matrix4d = Eigen::Matrix4d;
+  using Matrix5d = Eigen::Matrix<double, 5, 5>;  // 5x5 matrix of doubles (HelixFit::cov)
+  using Matrix6d = Eigen::Matrix<double, 6, 6>;  // 6x6 matrix of doubles
+
+  template <int N>
+  using Matrix3xNd = Eigen::Matrix<double, 3, N>;  // used for inputs hits: 3 rows, one column per hit
+
+  struct CircleFit {  // result of a circle fit: parameters, their covariance matrix, charge and chi2
+    Vector3d par;  //!< parameter: (X0,Y0,R); par_uvrtopak() and fromCircleToPerigee() overwrite it in place
+    Matrix3d cov;
+    /*!< covariance matrix: \n
+      |cov(X0,X0)|cov(Y0,X0)|cov( R,X0)| \n
+      |cov(X0,Y0)|cov(Y0,Y0)|cov( R,Y0)| \n
+      |cov(X0, R)|cov(Y0, R)|cov( R, R)|
+    */
+    int32_t qCharge;  //!< particle charge
+    float chi2;       //!< chi2 stored for the fit; single precision, whereas LineFit::chi2 is a double
+  };
+
+  struct LineFit {  // result of a line fit: parameters, their covariance matrix and chi2
+    Vector2d par;  //!<(cotan(theta),Zip)
+    Matrix2d cov;
+    /*!< covariance matrix of par, where c_t stands for cotan(theta): \n
+      |cov(c_t,c_t)|cov(Zip,c_t)| \n
+      |cov(c_t,Zip)|cov(Zip,Zip)|
+    */
+    double chi2;  //!< value of the cost function in the minimum
+  };
+
+  struct HelixFit {  // result of a helix fit: circle and line results merged into a single 5-parameter set
+    Vector5d par;  //!<(phi,Tip,pt,cotan(theta),Zip)
+    Matrix5d cov;
+    /*!< ()->cov() \n
+      |(phi,phi)|(Tip,phi)|(p_t,phi)|(c_t,phi)|(Zip,phi)| \n
+      |(phi,Tip)|(Tip,Tip)|(p_t,Tip)|(c_t,Tip)|(Zip,Tip)| \n
+      |(phi,p_t)|(Tip,p_t)|(p_t,p_t)|(c_t,p_t)|(Zip,p_t)| \n
+      |(phi,c_t)|(Tip,c_t)|(p_t,c_t)|(c_t,c_t)|(Zip,c_t)| \n
+      |(phi,Zip)|(Tip,Zip)|(p_t,Zip)|(c_t,Zip)|(Zip,Zip)|
+    */
+    float chi2_circle;  //!< chi2 of the circle fit
+    float chi2_line;    //!< chi2 of the line fit
+    //    Vector4d fast_fit;
+    int32_t qCharge;  //!< particle charge
+  };                  // __attribute__((aligned(16)));
+
+}  // namespace riemannFit
+#endif
diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h b/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h
new file mode 100644
index 0000000000000..2fe74f53a7bd2
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h
@@ -0,0 +1,243 @@
+#ifndef RecoPixelVertexing_PixelTrackFitting_interface_FitUtils_h
+#define RecoPixelVertexing_PixelTrackFitting_interface_FitUtils_h
+
+#include "DataFormats/Math/interface/choleskyInversion.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+#include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h"
+
+namespace riemannFit {
+
+  constexpr double epsilon = 1.e-4;  //!< used in numerical derivative (J2 in Circle_fit())
+
+  using VectorXd = Eigen::VectorXd;  // dynamic-size vector of doubles
+  using MatrixXd = Eigen::MatrixXd;  // dynamic-size matrix of doubles
+  template <int N>
+  using MatrixNd = Eigen::Matrix<double, N, N>;
+  template <int N>
+  using MatrixNplusONEd = Eigen::Matrix<double, N + 1, N + 1>;
+  template <int N>
+  using ArrayNd = Eigen::Array<double, N, N>;
+  template <int N>
+  using Matrix2Nd = Eigen::Matrix<double, 2 * N, 2 * N>;
+  template <int N>
+  using Matrix3Nd = Eigen::Matrix<double, 3 * N, 3 * N>;
+  template <int N>
+  using Matrix2xNd = Eigen::Matrix<double, 2, N>;
+  template <int N>
+  using Array2xNd = Eigen::Array<double, 2, N>;
+  template <int N>
+  using MatrixNx3d = Eigen::Matrix<double, N, 3>;
+  template <int N>
+  using MatrixNx5d = Eigen::Matrix<double, N, 5>;
+  template <int N>
+  using VectorNd = Eigen::Matrix<double, N, 1>;
+  template <int N>
+  using VectorNplusONEd = Eigen::Matrix<double, N + 1, 1>;
+  template <int N>
+  using Vector2Nd = Eigen::Matrix<double, 2 * N, 1>;
+  template <int N>
+  using Vector3Nd = Eigen::Matrix<double, 3 * N, 1>;
+  template <int N>
+  using RowVectorNd = Eigen::Matrix<double, 1, N>;  // 1 x N row vector (N is the column count, not the Options flag)
+  template <int N>
+  using RowVector2Nd = Eigen::Matrix<double, 1, 2 * N>;
+
+  using Matrix2x3d = Eigen::Matrix<double, 2, 3>;
+
+  using Matrix3f = Eigen::Matrix3f;
+  using Vector3f = Eigen::Vector3f;
+  using Vector4f = Eigen::Vector4f;
+  using Vector6f = Eigen::Matrix<double, 6, 1>;  // NOTE(review): 'f' suffix but double scalar - confirm this is intended
+
+  template <class C>  // C: any type providing rows(), cols() and operator()(row, column)
+  __host__ __device__ void printIt(C* m, const char* prefix = "") {
+#ifdef RFIT_DEBUG  // debugging aid: without RFIT_DEBUG the function body is empty
+    for (uint r = 0; r < m->rows(); ++r) {
+      for (uint c = 0; c < m->cols(); ++c) {
+        printf("%s Matrix(%u,%u) = %g\n", prefix, r, c, (*m)(r, c));  // r and c are unsigned, hence %u
+      }
+    }
+#endif
+  }
+
+  /*!
+    \brief Square of the argument, i.e. the argument multiplied by itself.
+  */
+  template <class T>
+  constexpr auto sqr(const T a) -> T {
+    return a * a;
+  }
+
+  /*!
+    \brief z component of the cross product of two 2D vectors
+    (both are taken to have a vanishing z component).
+    \param a left operand of the product.
+    \param b right operand of the product.
+    \return a.x * b.y - a.y * b.x.
+  */
+
+  __host__ __device__ inline double cross2D(const Vector2d& a, const Vector2d& b) {
+    return a(0) * b(1) - a(1) * b(0);
+  }
+
+  /*!
+   *  Load the hit errors from the packed CMSSW format into our formalism,
+   *  restricted to the transverse (x, y) components.
+   */
+  template <typename M6xNf, typename M2Nd>
+  __host__ __device__ void loadCovariance2D(M6xNf const& ge, M2Nd& hits_cov) {
+    // Input: column i of ge is the packed symmetric error matrix of hit i,
+    //   ge.col(i) ==> [xx, xy, yy, xz, yz, zz]
+    // so that the packed index maps onto the 3x3 matrix as
+    //   | 0  1  3 |
+    //   | 1  2  4 |
+    //   | 3  4  5 |
+    //
+    // Output: the term coupling space components j and l of hit i is stored at
+    //   hits_cov(i + j * nHits, i + l * nHits)
+    // with j, l = 0 for x and 1 for y; no other element of hits_cov is written
+    // (rows and columns therefore run over all the hits for x, then over all the hits for y).
+    constexpr uint32_t nHits = M6xNf::ColsAtCompileTime;
+
+    for (uint32_t i = 0; i < nHits; ++i) {
+      const uint32_t ix = i;          // row/column of the x component of hit i
+      const uint32_t iy = i + nHits;  // row/column of the y component of hit i
+
+      // diagonal terms
+      // xx
+      hits_cov(ix, ix) = ge.col(i)[0];
+      // yy
+      hits_cov(iy, iy) = ge.col(i)[2];
+
+      // off-diagonal term, stored on both sides of the diagonal
+      // xy
+      hits_cov(iy, ix) = ge.col(i)[1];
+      hits_cov(ix, iy) = hits_cov(iy, ix);
+    }
+  }
+
+  template <typename M6xNf, typename M3xNd>
+  __host__ __device__ void loadCovariance(M6xNf const& ge, M3xNd& hits_cov) {
+    // Load the hit errors from the packed CMSSW format into our formalism, for all
+    // three space components (x, y, z).
+    //
+    // Input: column i of ge is the packed symmetric error matrix of hit i,
+    //   ge.col(i) ==> [xx, xy, yy, xz, yz, zz]
+    // so that the packed index maps onto the 3x3 matrix as
+    //   | 0  1  3 |
+    //   | 1  2  4 |
+    //   | 3  4  5 |
+    //
+    // Output: the term coupling space components j and l of hit i is stored at
+    //   hits_cov(i + j * nHits, i + l * nHits)
+    // with j, l = 0 for x, 1 for y and 2 for z; no other element of hits_cov is written
+    // (rows and columns therefore run over all the hits for x, then for y, then for z).
+    constexpr uint32_t nHits = M6xNf::ColsAtCompileTime;
+
+    for (uint32_t i = 0; i < nHits; ++i) {
+      const uint32_t ix = i;              // row/column of the x component of hit i
+      const uint32_t iy = i + nHits;      // row/column of the y component of hit i
+      const uint32_t iz = i + 2 * nHits;  // row/column of the z component of hit i
+
+      // diagonal terms
+      // xx
+      hits_cov(ix, ix) = ge.col(i)[0];
+      // yy
+      hits_cov(iy, iy) = ge.col(i)[2];
+      // zz
+      hits_cov(iz, iz) = ge.col(i)[5];
+
+      // off-diagonal terms, each stored on both sides of the diagonal
+      // xy
+      hits_cov(iy, ix) = ge.col(i)[1];
+      hits_cov(ix, iy) = hits_cov(iy, ix);
+
+      // xz
+      hits_cov(iz, ix) = ge.col(i)[3];
+      hits_cov(ix, iz) = hits_cov(iz, ix);
+
+      // yz
+      hits_cov(iz, iy) = ge.col(i)[4];
+      hits_cov(iy, iz) = hits_cov(iz, iy);
+    }
+  }
+
+  /*!
+    \brief Transform circle parameter from (X0,Y0,R) to (phi,Tip,p_t) and
+    consequently covariance matrix.
+    \param circle parameter (X0,Y0,R), covariance matrix to be transformed and particle charge;
+    the parameters (and, if requested, the covariance matrix) are overwritten in place.
+    \param B magnetic field in GeV/cm/c unit.
+    \param error flag for errors computation: when false the covariance matrix is left untouched.
+  */
+  __host__ __device__ inline void par_uvrtopak(CircleFit& circle, const double B, const bool error) {
+    Vector3d par_pak;
+    const double temp0 = circle.par.head(2).squaredNorm();  // X0^2 + Y0^2
+    const double temp1 = sqrt(temp0);
+    par_pak << atan2(circle.qCharge * circle.par(0), -circle.qCharge * circle.par(1)),
+        circle.qCharge * (temp1 - circle.par(2)), circle.par(2) * B;
+    if (error) {
+      const double temp2 = 1. / temp0;  // d(phi)/d(X0) = -Y0 * temp2, d(phi)/d(Y0) = X0 * temp2: finite for X0 == 0
+      const double temp3 = 1. / temp1 * circle.qCharge;
+      Matrix3d j4Mat;
+      j4Mat << -circle.par(1) * temp2, circle.par(0) * temp2, 0., circle.par(0) * temp3,
+          circle.par(1) * temp3, -circle.qCharge, 0., 0., B;
+      circle.cov = j4Mat * circle.cov * j4Mat.transpose();
+    }
+    circle.par = par_pak;
+  }
+
+  /*!
+    \brief Transform circle parameter from (X0,Y0,R) to (phi,Tip,q/R) and
+    consequently covariance matrix.
+    \param circle parameter (X0,Y0,R), covariance matrix to
+    be transformed and particle charge; parameters and covariance matrix are overwritten in place.
+  */
+  __host__ __device__ inline void fromCircleToPerigee(CircleFit& circle) {
+    Vector3d par_pak;
+    const double temp0 = circle.par.head(2).squaredNorm();  // X0^2 + Y0^2
+    const double temp1 = sqrt(temp0);
+    par_pak << atan2(circle.qCharge * circle.par(0), -circle.qCharge * circle.par(1)),
+        circle.qCharge * (temp1 - circle.par(2)), circle.qCharge / circle.par(2);
+
+    const double temp2 = 1. / temp0;  // d(phi)/d(X0) = -Y0 * temp2, d(phi)/d(Y0) = X0 * temp2: finite for X0 == 0
+    const double temp3 = 1. / temp1 * circle.qCharge;
+    Matrix3d j4Mat;
+    j4Mat << -circle.par(1) * temp2, circle.par(0) * temp2, 0., circle.par(0) * temp3,
+        circle.par(1) * temp3, -circle.qCharge, 0., 0., -circle.qCharge / (circle.par(2) * circle.par(2));
+    circle.cov = j4Mat * circle.cov * j4Mat.transpose();
+
+    circle.par = par_pak;
+  }
+
+  // Transformation from the "perigee" parametrisation to the cmssw localcoord frame,
+  // the plane of the latter being the perigee plane.
+  //   input  (ip, icov): (phi,Tip,q/pt,cotan(theta),Zip) and its covariance matrix
+  //   output (op, ocov): (q/p,dx/dz,dy/dz,x,z) and its covariance matrix, ocov = J * icov * J^T
+  template <typename VI5, typename MI5, typename VO5, typename MO5>
+  __host__ __device__ inline void transformToPerigeePlane(VI5 const& ip, MI5 const& icov, VO5& op, MO5& ocov) {
+    const auto cotTheta = ip(3);
+    const auto sin2Theta = 1. / (1. + cotTheta * cotTheta);
+    const auto sinT = std::sqrt(sin2Theta);
+    const auto cosT = cotTheta * sinT;
+
+    op(0) = sinT * ip(2);  // q/p obtained from q/pt
+    op(1) = 0.;
+    op(2) = -cotTheta;
+    op(3) = ip(1);
+    op(4) = -ip(4);
+
+    // jacobian J = d(op)/d(ip); every entry that is not set below is zero
+    Matrix5d jacobian = Matrix5d::Zero();
+    jacobian(0, 2) = sinT;
+    jacobian(0, 3) = -sin2Theta * cosT * ip(2);
+    jacobian(1, 0) = 1.;
+    jacobian(2, 3) = -1.;
+    jacobian(3, 1) = 1.;
+    jacobian(4, 4) = -1;
+    ocov = jacobian * icov * jacobian.transpose();
+  }
+
+}  // namespace riemannFit
+
+#endif  // RecoPixelVertexing_PixelTrackFitting_interface_FitUtils_h
diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/PixelNtupletsFitter.h b/RecoPixelVertexing/PixelTrackFitting/interface/PixelNtupletsFitter.h
new file mode 100644
index 0000000000000..9fb8843589669
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/interface/PixelNtupletsFitter.h
@@ -0,0 +1,27 @@
+#ifndef RecoPixelVertexing_PixelTrackFitting_interface_PixelNtupletsFitter_h
+#define RecoPixelVertexing_PixelTrackFitting_interface_PixelNtupletsFitter_h
+
+#include <vector>
+
+#include "DataFormats/TrackReco/interface/Track.h"
+#include "DataFormats/TrackingRecHit/interface/TrackingRecHit.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "RecoPixelVertexing/PixelTrackFitting/interface/PixelFitterBase.h"
+#include "RecoTracker/TkTrackingRegions/interface/TrackingRegion.h"
+
+class PixelNtupletsFitter final : public PixelFitterBase {  // declaration only: the member functions are defined elsewhere
+public:
+  explicit PixelNtupletsFitter(float nominalB, const MagneticField* field, bool useRiemannFit);
+  ~PixelNtupletsFitter() override = default;
+  std::unique_ptr<reco::Track> run(const std::vector<const TrackingRecHit*>& hits,
+                                   const TrackingRegion& region,
+                                   const edm::EventSetup& setup) const override;  // overrides the base-class run()
+
+private:
+  float nominalB_;              // NOTE(review): presumably the constructor's nominalB - confirm in the implementation
+  const MagneticField* field_;  // raw pointer to a const field; ownership is not visible from this header
+  bool useRiemannFit_;          // NOTE(review): presumably selects the Riemann fit over the broken-line fit - confirm
+};
+
+#endif  // RecoPixelVertexing_PixelTrackFitting_interface_PixelNtupletsFitter_h
diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h
new file mode 100644
index 0000000000000..52cf4b637fb37
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h
@@ -0,0 +1,1008 @@
+#ifndef RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h
+#define RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h
+
+#include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h"
+
+namespace riemannFit {
+
+  /*!  Convert path lengths into radiation lengths in the uniform-material hypothesis
+ *
+ * The pixel detector, barrel and forward, is treated as a homogeneous volume
+ * of material in which 16 cm of path correspond to 0.06 radiation lengths
+ * (number derived from the TDR material plot), i.e. one radiation length is
+ * 16cm/0.06 =~ 267 cm. The same constant is used for barrel and endcap.
+ *
+ * NB: no angle corrections nor projections are applied inside this routine.
+ * It is the responsibility of the caller to supply the proper lengths, namely
+ * the path travelled by the particle along its trajectory (the so called S of
+ * the helix in 3D space).
+ *
+ * \param length_values path lengths, one per point. Element 0 is converted as
+ * it is (it has no reference point); every following element j contributes
+ * |length_values(j) - length_values(j - 1)|.
+ * \param rad_lengths output: radiation length of each segment, written
+ * element by element (it must already have at least as many entries as
+ * length_values has rows).
+ */
+
+  template <typename VNd1, typename VNd2>
+  __host__ __device__ inline void computeRadLenUniformMaterial(const VNd1& length_values, VNd2& rad_lengths) {
+    // Inverse of the radiation length in the uniform assumption:
+    // 0.06 radiation lengths every 16 cm.
+    constexpr double invRadLength = 0.06 / 16.;
+    const uint nPoints = length_values.rows();
+    // The first point has no predecessor: use its full length.
+    rad_lengths(0) = length_values(0) * invRadLength;
+    // Every other point uses the distance from the previous one.
+    for (uint prev = 0, cur = 1; cur < nPoints; ++prev, ++cur) {
+      rad_lengths(cur) = std::abs(length_values(cur) - length_values(prev)) * invRadLength;
+    }
+  }
+
+  /*!
+    \brief Compute the covariance matrix along cartesian S-Z of the points due
+    to multiple Coulomb scattering, to be used in the line fit, for the barrel
+    and forward cases.
+    The input covariance matrices are in the original, unrotated s-z
+    variables.
+    The multiple scattering component is computed in the usual linear
+    approximation, using the 3D path obtained as the square root of the sum of
+    the squares of the s and z components passed in.
+    A 2N x 2N matrix is assembled from the per-hit 2x2 covariances, the
+    scattering terms are accumulated on its lower-right N x N block and only
+    that block is copied to \p ret.
+    The fit that consumes this matrix puts the horizontal axis along the S3D
+    direction so that ordinary least squares with the trivial parametrization
+    y = mx + q can be used, avoiding the pathological case m = +/- inf at
+    eta = 0. NOTE(review): no rotation is performed inside this function.
+    \param cov_sz pointer to N per-hit 2x2 covariance matrices.
+    \param fast_fit pre-fit result; element 2 (multiplied by bField) and
+    element 3 are read here.
+    \param s_arcs s component of each point.
+    \param z_values z component of each point.
+    \param theta not read by this implementation.
+    \param bField factor multiplying fast_fit(2) to get p_t.
+    \param ret output N x N covariance matrix.
+ */
+
+  template <typename V4, typename VNd1, typename VNd2, int N>
+  __host__ __device__ inline auto scatterCovLine(Matrix2d const* cov_sz,
+                                                 const V4& fast_fit,
+                                                 VNd1 const& s_arcs,
+                                                 VNd2 const& z_values,
+                                                 const double theta,
+                                                 const double bField,
+                                                 MatrixNd<N>& ret) {
+#ifdef RFIT_DEBUG
+    riemannFit::printIt(&s_arcs, "Scatter_cov_line - s_arcs: ");
+#endif
+    constexpr uint nHits = N;
+    // limit pt to avoid too small error!!!
+    const double ptCapped = std::min(20., fast_fit(2) * bField);
+    const double pSquared = ptCapped * ptCapped * (1. + 1. / sqr(fast_fit(3)));
+    // Coefficient-wise operations on Eigen matrices/vectors need the array view, see
+    // http://eigen.tuxfamily.org/dox/group__TutorialArrayClass.html
+    // 3D path of each point: sqrt(s^2 + z^2).
+    VectorNd<N> path3D = s_arcs.array() * s_arcs.array() + z_values.array() * z_values.array();
+    path3D = path3D.array().sqrt();
+    VectorNd<N> radLen;
+    computeRadLenUniformMaterial(path3D, radLen);
+    // Scattering variance of each segment.
+    VectorNd<N> sigma2;
+    sigma2 = .000225 / pSquared * (1. + 0.038 * radLen.array().log()).abs2() * radLen.array();
+#ifdef RFIT_DEBUG
+    riemannFit::printIt(cov_sz, "Scatter_cov_line - cov_sz: ");
+#endif
+    // Spread the per-hit 2x2 covariances over the four N x N blocks.
+    Matrix2Nd<N> fullCov = Matrix2Nd<N>::Zero();
+    for (uint hit = 0; hit < nHits; ++hit) {
+      const uint hitZ = hit + nHits;
+      fullCov(hit, hit) = cov_sz[hit](0, 0);
+      fullCov(hitZ, hitZ) = cov_sz[hit](1, 1);
+      fullCov(hitZ, hit) = cov_sz[hit](0, 1);
+      fullCov(hit, hitZ) = fullCov(hitZ, hit);
+    }
+    // Add the scattering contribution to the upper triangle of the lower-right
+    // block and mirror it.
+    for (uint row = 0; row < nHits; ++row) {
+      for (uint col = row; col < nHits; ++col) {
+        auto& cell = fullCov(row + nHits, col + nHits);
+        // col >= row, hence min(row, col) == row: only the points before `row` contribute.
+        for (uint prev = 0; prev < row; ++prev) {
+          cell += std::abs(path3D(row) - path3D(prev)) * std::abs(path3D(col) - path3D(prev)) * sigma2(prev);
+        }
+        fullCov(col + nHits, row + nHits) = cell;
+      }
+    }
+    // We are interested only in the errors orthogonal to the rotated s-axis
+    // which, in our formalism, are in the lower square matrix.
+#ifdef RFIT_DEBUG
+    riemannFit::printIt(&fullCov, "Scatter_cov_line - tmp: ");
+#endif
+    ret = fullCov.block(nHits, nHits, nHits, nHits);
+  }
+
+  /*!
+    \brief Compute the covariance matrix (in radial coordinates) of points in
+    the transverse plane due to multiple Coulomb scattering.
+    \param p2D 2D points in the transverse plane.
+    \param fast_fit Vector4d result of the previous pre-fit, structured in
+    this form: (X0, Y0, R, Tan(Theta)).
+    \param rad value associated with each point, whose differences weight the
+    scattering terms (presumably the radial position of the hit -- confirm
+    with the caller).
+    \param B magnetic field used to compute p.
+    \return the N x N matrix of errors due to multiple scattering.
+    \warning input points must be ordered radially from the detector center
+    (from inner layer to outer ones; points on the same layer must be ordered
+    too).
+    \details Only the tangential component is computed (the radial one is
+    negligible).
+ */
+  template <typename M2xN, typename V4, int N>
+  __host__ __device__ inline MatrixNd<N> scatter_cov_rad(const M2xN& p2D,
+                                                         const V4& fast_fit,
+                                                         VectorNd<N> const& rad,
+                                                         double B) {
+    constexpr uint nHits = N;
+    // limit pt to avoid too small error!!!
+    const double ptCapped = std::min(20., fast_fit(2) * B);
+    const double pSquared = ptCapped * ptCapped * (1. + 1. / sqr(fast_fit(3)));
+    // Angle from tan(theta), moved to [0, pi) when atan returns a negative value.
+    double theta = atan(fast_fit(3));
+    if (theta < 0.)
+      theta = theta + M_PI;
+    const Vector2d center(fast_fit(0), fast_fit(1));
+
+    // Arc length (in the transverse plane) of each point, measured on the
+    // pre-fitted circle starting from the direction center -> origin.
+    VectorNd<N> arcLen;
+    for (uint hit = 0; hit < nHits; ++hit) {
+      Vector2d fromCenter = p2D.block(0, hit, 2, 1) - center;
+      const double sinTerm = cross2D(-center, fromCenter);
+      const double cosTerm = (-center).dot(fromCenter);
+      const double angle = atan2(sinTerm, cosTerm);
+      arcLen(hit) = std::abs(angle * fast_fit(2));
+    }
+    // Scale the transverse arc by sqrt(1 + 1/tan^2(theta)) before converting to radiation lengths.
+    VectorNd<N> radLen;
+    computeRadLenUniformMaterial(arcLen * sqrt(1. + 1. / sqr(fast_fit(3))), radLen);
+    MatrixNd<N> covMat = MatrixNd<N>::Zero();
+    VectorNd<N> sigma2 = (1. + 0.038 * radLen.array().log()).abs2() * radLen.array();
+    sigma2 *= 0.000225 / (pSquared * sqr(sin(theta)));
+    // Fill the upper triangle and mirror it.
+    for (uint row = 0; row < nHits; ++row) {
+      for (uint col = row; col < nHits; ++col) {
+        auto& cell = covMat(row, col);
+        // col >= row, hence min(row, col) == row: only the points before `row` contribute.
+        for (uint prev = 0; prev < row; ++prev) {
+          cell += (rad(row) - rad(prev)) * (rad(col) - rad(prev)) * sigma2(prev);
+        }
+        covMat(col, row) = cell;
+      }
+    }
+#ifdef RFIT_DEBUG
+    riemannFit::printIt(&covMat, "Scatter_cov_rad - scatter_cov_rad: ");
+#endif
+    return covMat;
+  }
+
+  /*!
+    \brief Transform a covariance matrix from radial (only tangential
+    component) to Cartesian coordinates (only transverse plane component).
+    \param p2D 2D points in the transverse plane.
+    \param cov_rad covariance matrix in radial coordinates.
+    \param rad value whose coefficient-wise inverse normalizes the point
+    coordinates (presumably the radius of each point -- confirm with the
+    caller).
+    \return the 2N x 2N covariance matrix in Cartesian coordinates.
+*/
+
+  template <typename M2xN, int N>
+  __host__ __device__ inline Matrix2Nd<N> cov_radtocart(const M2xN& p2D,
+                                                        const MatrixNd<N>& cov_rad,
+                                                        const VectorNd<N>& rad) {
+#ifdef RFIT_DEBUG
+    printf("Address of p2D: %p\n", &p2D);
+#endif
+    printIt(&p2D, "cov_radtocart - p2D:");
+    constexpr uint nHits = N;
+    Matrix2Nd<N> cartCov = Matrix2Nd<N>::Zero();
+    VectorNd<N> invRad = rad.cwiseInverse();
+    printIt(&invRad, "cov_radtocart - rad_inv:");
+    for (uint a = 0; a < nHits; ++a) {
+      const uint aLow = a + nHits;
+      for (uint b = a; b < nHits; ++b) {
+        const uint bLow = b + nHits;
+        // Entries with b >= a of the four N x N blocks.
+        cartCov(a, b) = cov_rad(a, b) * p2D(1, a) * invRad(a) * p2D(1, b) * invRad(b);
+        cartCov(aLow, bLow) = cov_rad(a, b) * p2D(0, a) * invRad(a) * p2D(0, b) * invRad(b);
+        cartCov(a, bLow) = -cov_rad(a, b) * p2D(1, a) * invRad(a) * p2D(0, b) * invRad(b);
+        cartCov(aLow, b) = -cov_rad(a, b) * p2D(0, a) * invRad(a) * p2D(1, b) * invRad(b);
+        // Mirror them; the values are read back from the matrix, in this
+        // order, so that the a == b case ends up with the same entries.
+        cartCov(b, a) = cartCov(a, b);
+        cartCov(bLow, aLow) = cartCov(aLow, bLow);
+        cartCov(bLow, a) = cartCov(a, bLow);
+        cartCov(b, aLow) = cartCov(aLow, b);
+      }
+    }
+    return cartCov;
+  }
+
+  /*!
+    \brief Transform a covariance matrix from Cartesian coordinates (only
+    transverse plane component) to radial coordinates. Only one term per
+    point is produced: correlations between different points are not
+    handled.
+    \param p2D 2D points in the transverse plane.
+    \param cov_cart covariance matrix in Cartesian coordinates.
+    \param rad value used as the radius of each point; entries below 1.e-4
+    bypass the transformation.
+    \return vector with the covariance term of each point in radial
+    coordinates.
+    \warning correlations between different points are not computed.
+*/
+  template <typename M2xN, int N>
+  __host__ __device__ inline VectorNd<N> cov_carttorad(const M2xN& p2D,
+                                                       const Matrix2Nd<N>& cov_cart,
+                                                       const VectorNd<N>& rad) {
+    constexpr uint nHits = N;
+    VectorNd<N> radCov;
+    const VectorNd<N> invRad2 = rad.cwiseInverse().array().square();
+    for (uint hit = 0; hit < nHits; ++hit) {
+      // in case you have (0,0): avoid dividing by 0 radius and copy the x-x term
+      if (rad(hit) < 1.e-4) {
+        radCov(hit) = cov_cart(hit, hit);
+        continue;
+      }
+      const uint hitLow = hit + nHits;
+      radCov(hit) = invRad2(hit) * (cov_cart(hit, hit) * sqr(p2D(1, hit)) + cov_cart(hitLow, hitLow) * sqr(p2D(0, hit)) -
+                                    2. * cov_cart(hit, hitLow) * p2D(0, hit) * p2D(1, hit));
+    }
+    return radCov;
+  }
+
+  /*!
+    \brief Transform a covariance matrix from Cartesian coordinates (only
+    transverse plane component) to the coordinate system orthogonal to the
+    pre-fitted circle in each point.
+    Further information in attached documentation.
+    \param p2D 2D points in the transverse plane.
+    \param cov_cart covariance matrix in Cartesian coordinates.
+    \param fast_fit Vector4d result of the previous pre-fit, structured in
+    this form: (X0, Y0, R, tan(theta)); only the first two elements are read.
+    \param rad value used as the radius of each point; entries below 1.e-4
+    bypass the transformation.
+    \return vector with the covariance term of each point in the pre-fitted
+    circle's orthogonal system.
+*/
+  template <typename M2xN, typename V4, int N>
+  __host__ __device__ inline VectorNd<N> cov_carttorad_prefit(const M2xN& p2D,
+                                                              const Matrix2Nd<N>& cov_cart,
+                                                              V4& fast_fit,
+                                                              const VectorNd<N>& rad) {
+    constexpr uint nHits = N;
+    VectorNd<N> radCov;
+    for (uint hit = 0; hit < nHits; ++hit) {
+      // in case you have (0,0): avoid dividing by 0 radius
+      if (rad(hit) < 1.e-4) {
+        radCov(hit) = cov_cart(hit, hit);  // TO FIX
+        continue;
+      }
+      const uint hitLow = hit + nHits;
+      // Point seen from the origin and from the pre-fitted circle center.
+      Vector2d fromOrigin = p2D.col(hit);
+      Vector2d fromCenter = p2D.col(hit) - fast_fit.head(2);
+      const double cosTerm = fromOrigin.dot(fromCenter);
+      const double sinTerm = cross2D(fromOrigin, fromCenter);
+      const double tanC = -sinTerm / cosTerm;
+      const double tanC2 = sqr(tanC);
+      radCov(hit) = 1. / (1. + tanC2) *
+                    (cov_cart(hit, hit) + cov_cart(hitLow, hitLow) * tanC2 + 2 * cov_cart(hit, hitLow) * tanC);
+    }
+    return radCov;
+  }
+
+  /*!
+    \brief Compute the vector of the points' weights for the circle fit when
+    multiple scattering is handled.
+    Further information in attached documentation.
+    \param cov_rad_inv inverse of the covariance matrix in radial coordinates
+    (or, better, in the pre-fitted circle's orthogonal system).
+    \return VectorNd whose i-th element is the sum of the i-th column of
+    cov_rad_inv.
+    \bug I'm not sure this is the right way to compute the weights for a non
+    diagonal cov matrix. Further investigation needed.
+*/
+
+  template <int N>
+  __host__ __device__ inline VectorNd<N> weightCircle(const MatrixNd<N>& cov_rad_inv) {
+    // Column sums form a row vector: transpose it into the returned column vector.
+    VectorNd<N> weights = cov_rad_inv.colwise().sum().transpose();
+    return weights;
+  }
+
+  /*!
+    \brief Find the particle charge q from the sign of the cross product
+    between the particle velocity (estimated from the first 2 hits) and the
+    vector going from the first hit to the center of the fitted circle.
+    \param p2D 2D points in the transverse plane.
+    \param par_uvr result of the circle fit in this form: (X0,Y0,R).
+    \return q: -1 when the cross product is positive, 1 otherwise.
+*/
+  template <typename M2xN>
+  __host__ __device__ inline int32_t charge(const M2xN& p2D, const Vector3d& par_uvr) {
+    // Direction of flight: from the first to the second hit.
+    const auto stepX = p2D(0, 1) - p2D(0, 0);
+    const auto stepY = p2D(1, 1) - p2D(1, 0);
+    // From the first hit to the circle center.
+    const auto toCenterX = par_uvr.x() - p2D(0, 0);
+    const auto toCenterY = par_uvr.y() - p2D(1, 0);
+    if (stepX * toCenterY - stepY * toCenterX > 0)
+      return -1;
+    return 1;
+  }
+
+  /*!
+    \brief Compute the eigenvector associated with the minimum eigenvalue.
+    \param A the matrix whose eigenvector and eigenvalue are wanted.
+    \param chi2 the double where the chi2-related quantity (the minimum
+    eigenvalue) will be stored.
+    \return the eigenvector associated with the minimum eigenvalue.
+    \warning double precision is needed for a correct assessment of chi2.
+    \details The minimum eigenvalue is related to chi2.
+    We exploit the fact that the matrix is symmetric and small (2x2 for the
+    line fit and 3x3 for the circle fit), so the SelfAdjointEigenSolver from
+    the Eigen library is used, with the computeDirect method (available only
+    for 2x2 and 3x3 matrices), which computes the eigendecomposition of the
+    given matrix using a fast closed-form algorithm.
+    For this optimization the matrix type must be known at compile time.
+*/
+
+  __host__ __device__ inline Vector3d min_eigen3D(const Matrix3d& A, double& chi2) {
+#ifdef RFIT_DEBUG
+    printf("min_eigen3D - enter\n");
+#endif
+    Eigen::SelfAdjointEigenSolver<Matrix3d> eigenSolver(3);
+    eigenSolver.computeDirect(A);
+    // Position of the smallest eigenvalue, filled by minCoeff.
+    int smallest;
+    const auto& eigenValues = eigenSolver.eigenvalues();
+    chi2 = eigenValues.minCoeff(&smallest);
+#ifdef RFIT_DEBUG
+    printf("min_eigen3D - exit\n");
+#endif
+    return eigenSolver.eigenvectors().col(smallest);
+  }
+
+  /*!
+    \brief A faster version of min_eigen3D() for when double precision is not
+    needed.
+    \param A the matrix whose eigenvector is wanted.
+    \return the eigenvector associated with the minimum eigenvalue.
+    \details The computeDirect() method of SelfAdjointEigenSolver for a 3x3
+    matrix uses trigonometric functions (it solves a third degree equation),
+    which are faster in single precision. The input is cast to float, the
+    result is cast back to double and the eigenvalue itself is discarded.
+*/
+
+  __host__ __device__ inline Vector3d min_eigen3D_fast(const Matrix3d& A) {
+    Eigen::SelfAdjointEigenSolver<Matrix3f> eigenSolver(3);
+    eigenSolver.computeDirect(A.cast<float>());
+    // Only the position of the smallest eigenvalue is needed here.
+    int smallest;
+    const auto& eigenValues = eigenSolver.eigenvalues();
+    eigenValues.minCoeff(&smallest);
+    return eigenSolver.eigenvectors().col(smallest).cast<double>();
+  }
+
+  /*!
+    \brief 2D version of min_eigen3D().
+    \param aMat the matrix whose eigenvector and eigenvalue are wanted.
+    \param chi2 the double where the chi2-related quantity (the minimum
+    eigenvalue) will be stored.
+    \return the eigenvector associated with the minimum eigenvalue.
+    \details The computeDirect() method of SelfAdjointEigenSolver for a 2x2
+    matrix does not use special math functions (just sqrt), therefore it does
+    not speed up significantly in single precision.
+*/
+
+  __host__ __device__ inline Vector2d min_eigen2D(const Matrix2d& aMat, double& chi2) {
+    Eigen::SelfAdjointEigenSolver<Matrix2d> eigenSolver(2);
+    eigenSolver.computeDirect(aMat);
+    // Position of the smallest eigenvalue, filled by minCoeff.
+    int smallest;
+    const auto& eigenValues = eigenSolver.eigenvalues();
+    chi2 = eigenValues.minCoeff(&smallest);
+    return eigenSolver.eigenvectors().col(smallest);
+  }
+
+  /*!
+    \brief A very fast helix fit: it fits a circle through three points
+    (first, middle and last point) and a line through two points (first and
+    last).
+    \param hits points to be fitted, one per column (rows are x, y, z).
+    \param result output, in this form: (X0,Y0,R,tan(theta)).
+    \warning points must be passed ordered (from internal layer to external)
+    in order to maximize accuracy and not to mistake the sign of tan(theta).
+    \warning no protection is applied when the three points are aligned
+    (div == 0) or when first and last hit share the same z (dz == 0).
+    \details This fast fit is used as a pre-fit which is needed for:
+    - weights estimation and chi2 computation in line fit (fundamental);
+    - weights estimation and chi2 computation in circle fit (useful);
+    - computation of error due to multiple scattering.
+*/
+
+  template <typename M3xN, typename V4>
+  __host__ __device__ inline void fastFit(const M3xN& hits, V4& result) {
+    constexpr uint32_t N = M3xN::ColsAtCompileTime;
+    constexpr auto n = N;  // get the number of hits
+    printIt(&hits, "Fast_fit - hits: ");
+
+    // CIRCLE FIT
+    // Make segments between middle-to-first(b) and last-to-first(c) hits
+    const Vector2d bVec = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1);
+    const Vector2d cVec = hits.block(0, n - 1, 2, 1) - hits.block(0, 0, 2, 1);
+    printIt(&bVec, "Fast_fit - b: ");
+    printIt(&cVec, "Fast_fit - c: ");
+    // Compute their lengths
+    auto b2 = bVec.squaredNorm();
+    auto c2 = cVec.squaredNorm();
+    // The algebra has been verified (MR). The usual approach has been followed:
+    // * use an orthogonal reference frame passing from the first point.
+    // * build the segments (chords)
+    // * build orthogonal lines through mid points
+    // * make a system and solve for X0 and Y0.
+    // * add the initial point
+    // Swap the roles of x and y when |b.x| < |b.y|, so that the division by bx
+    // below uses the larger component (covers b.x == 0, i.e. 2 hits with same x).
+    // std::abs is used explicitly: an unqualified abs may resolve to the
+    // integer overload on host and truncate the doubles.
+    bool flip = std::abs(bVec.x()) < std::abs(bVec.y());
+    auto bx = flip ? bVec.y() : bVec.x();
+    auto by = flip ? bVec.x() : bVec.y();
+    auto cx = flip ? cVec.y() : cVec.x();
+    auto cy = flip ? cVec.x() : cVec.y();
+    auto div = 2. * (cx * by - bx * cy);
+    // if aligned TO FIX
+    auto y0 = (cx * b2 - bx * c2) / div;
+    auto x0 = (0.5 * b2 - y0 * by) / bx;
+    // Undo the swap and move back from the first-hit frame to the global one.
+    result(0) = hits(0, 0) + (flip ? y0 : x0);
+    result(1) = hits(1, 0) + (flip ? x0 : y0);
+    result(2) = sqrt(sqr(x0) + sqr(y0));
+    printIt(&result, "Fast_fit - result: ");
+
+    // LINE FIT
+    // First (d) and last (e) hit as seen from the circle center
+    const Vector2d dVec = hits.block(0, 0, 2, 1) - result.head(2);
+    const Vector2d eVec = hits.block(0, n - 1, 2, 1) - result.head(2);
+    printIt(&eVec, "Fast_fit - e: ");
+    printIt(&dVec, "Fast_fit - d: ");
+    // Compute the arc-length between first and last point: L = R * theta = R * atan (tan (Theta) )
+    auto dr = result(2) * atan2(cross2D(dVec, eVec), dVec.dot(eVec));
+    // Simple difference in Z between last and first hit
+    auto dz = hits(2, n - 1) - hits(2, 0);
+
+    result(3) = (dr / dz);
+
+#ifdef RFIT_DEBUG
+    printf("Fast_fit: [%f, %f, %f, %f]\n", result(0), result(1), result(2), result(3));
+#endif
+  }
+
+  /*!
+    \brief Fit a generic number of 2D points with a circle using the
+    Riemann-Chernov algorithm. The covariance matrix of the fitted parameters
+    is optionally computed. The multiple scattering contribution is always
+    added to the hit covariance.
+    \param hits2D 2D points to be fitted.
+    \param hits_cov2D covariance matrix of 2D points.
+    \param fast_fit pre-fit result in this form: (X0,Y0,R,tan(theta)).
+    \param rad per-point value forwarded to the covariance helpers
+    (presumably the radius of each hit -- confirm with the caller).
+    \param bField magnetic field
+    \param error flag for error computation.
+    \return circle circle_fit:
+    -par parameter of the fitted circle in this form (X0,Y0,R); \n
+    -cov covariance matrix of the fitted parameter (not assigned here if
+    error = false); \n
+    -q charge of the particle; \n
+    -chi2.
+    \warning hits must be passed ordered from inner to outer layer (double hits
+    on the same layer must be ordered too) so that multiple scattering is
+    treated properly.
+    \warning Multiple scattering for barrel is still not tested.
+    \warning Multiple scattering for endcap hits is not handled (yet).
+    \bug for small pt (<0.3 Gev/c) chi2 could be slightly underestimated.
+    \bug further investigation needed for error propagation with multiple
+    scattering.
+*/
+  template <typename M2xN, typename V4, int N>
+  __host__ __device__ inline CircleFit circleFit(const M2xN& hits2D,
+                                                 const Matrix2Nd<N>& hits_cov2D,
+                                                 const V4& fast_fit,
+                                                 const VectorNd<N>& rad,
+                                                 const double bField,
+                                                 const bool error) {
+#ifdef RFIT_DEBUG
+    printf("circle_fit - enter\n");
+#endif
+    // INITIALIZATION
+    Matrix2Nd<N> vMat = hits_cov2D;
+    constexpr uint n = N;
+    printIt(&hits2D, "circle_fit - hits2D:");
+    printIt(&hits_cov2D, "circle_fit - hits_cov2D:");
+
+#ifdef RFIT_DEBUG
+    printf("circle_fit - WEIGHT COMPUTATION\n");
+#endif
+    // WEIGHT COMPUTATION
+    VectorNd<N> weight;
+    MatrixNd<N> gMat;
+    double renorm;
+    {
+      // Hit covariance in the pre-fitted circle's orthogonal system (diagonal),
+      // plus the multiple scattering term; the latter is also added, converted
+      // to Cartesian coordinates, to the full hit covariance vMat.
+      MatrixNd<N> cov_rad = cov_carttorad_prefit(hits2D, vMat, fast_fit, rad).asDiagonal();
+      MatrixNd<N> scatterCovRadMat = scatter_cov_rad(hits2D, fast_fit, rad, bField);
+      printIt(&scatterCovRadMat, "circle_fit - scatter_cov_rad:");
+      printIt(&hits2D, "circle_fit - hits2D bis:");
+#ifdef RFIT_DEBUG
+      printf("Address of hits2D: a) %p\n", &hits2D);
+#endif
+      vMat += cov_radtocart(hits2D, scatterCovRadMat, rad);
+      printIt(&vMat, "circle_fit - V:");
+      cov_rad += scatterCovRadMat;
+      printIt(&cov_rad, "circle_fit - cov_rad:");
+      math::cholesky::invert(cov_rad, gMat);
+      // gMat = cov_rad.inverse();
+      // Normalize the inverse covariance so that its elements sum to 1; the
+      // factor is restored when chi2 is computed.
+      renorm = gMat.sum();
+      gMat *= 1. / renorm;
+      weight = weightCircle(gMat);
+    }
+    printIt(&weight, "circle_fit - weight:");
+
+    // SPACE TRANSFORMATION
+#ifdef RFIT_DEBUG
+    printf("circle_fit - SPACE TRANSFORMATION\n");
+#endif
+
+    // center
+#ifdef RFIT_DEBUG
+    printf("Address of hits2D: b) %p\n", &hits2D);
+#endif
+    const Vector2d hCentroid = hits2D.rowwise().mean();  // centroid
+    printIt(&hCentroid, "circle_fit - h_:");
+    Matrix3xNd<N> p3D;
+    p3D.block(0, 0, 2, n) = hits2D.colwise() - hCentroid;
+    printIt(&p3D, "circle_fit - p3D: a)");
+    Vector2Nd<N> mc;  // centered hits, used in error computation
+    mc << p3D.row(0).transpose(), p3D.row(1).transpose();
+    printIt(&mc, "circle_fit - mc(centered hits):");
+
+    // scale
+    const double tempQ = mc.squaredNorm();
+    const double tempS = sqrt(n * 1. / tempQ);  // scaling factor
+    p3D *= tempS;
+
+    // project on paraboloid
+    p3D.row(2) = p3D.block(0, 0, 2, n).colwise().squaredNorm();
+    printIt(&p3D, "circle_fit - p3D: b)");
+
+#ifdef RFIT_DEBUG
+    printf("circle_fit - COST FUNCTION\n");
+#endif
+    // COST FUNCTION
+
+    // compute
+    Vector3d r0;
+    r0.noalias() = p3D * weight;  // center of gravity
+    const Matrix3xNd<N> xMat = p3D.colwise() - r0;
+    Matrix3d aMat = xMat * gMat * xMat.transpose();
+    printIt(&aMat, "circle_fit - A:");
+
+#ifdef RFIT_DEBUG
+    printf("circle_fit - MINIMIZE\n");
+#endif
+    // minimize
+    double chi2;
+    Vector3d vVec = min_eigen3D(aMat, chi2);
+#ifdef RFIT_DEBUG
+    printf("circle_fit - AFTER MIN_EIGEN\n");
+#endif
+    printIt(&vVec, "v BEFORE INVERSION");
+    vVec *= (vVec(2) > 0) ? 1 : -1;  // TO FIX: should be N(3)>0
+    printIt(&vVec, "v AFTER INVERSION");
+    // This hack to be able to run on GPU where the automatic assignment to a
+    // double from the vector multiplication is not working.
+#ifdef RFIT_DEBUG
+    printf("circle_fit - AFTER MIN_EIGEN 1\n");
+#endif
+    Eigen::Matrix<double, 1, 1> cm;
+#ifdef RFIT_DEBUG
+    printf("circle_fit - AFTER MIN_EIGEN 2\n");
+#endif
+    cm = -vVec.transpose() * r0;
+#ifdef RFIT_DEBUG
+    printf("circle_fit - AFTER MIN_EIGEN 3\n");
+#endif
+    const double tempC = cm(0, 0);
+
+#ifdef RFIT_DEBUG
+    printf("circle_fit - COMPUTE CIRCLE PARAMETER\n");
+#endif
+    // COMPUTE CIRCLE PARAMETER
+
+    // auxiliary quantities
+    const double tempH = sqrt(1. - sqr(vVec(2)) - 4. * tempC * vVec(2));
+    const double v2x2_inv = 1. / (2. * vVec(2));
+    const double s_inv = 1. / tempS;
+    Vector3d par_uvr;  // used in error propagation
+    par_uvr << -vVec(0) * v2x2_inv, -vVec(1) * v2x2_inv, tempH * v2x2_inv;
+
+    // Undo scaling and centering to get (X0, Y0, R) in the input frame.
+    CircleFit circle;
+    circle.par << par_uvr(0) * s_inv + hCentroid(0), par_uvr(1) * s_inv + hCentroid(1), par_uvr(2) * s_inv;
+    circle.qCharge = charge(hits2D, circle.par);
+    // std::abs is used explicitly: an unqualified abs may resolve to the
+    // integer overload on host and truncate chi2.
+    circle.chi2 = std::abs(chi2) * renorm / sqr(2 * vVec(2) * par_uvr(2) * tempS);
+    printIt(&circle.par, "circle_fit - CIRCLE PARAMETERS:");
+    // NOTE(review): circle.cov has not been assigned by this function yet at this point.
+    printIt(&circle.cov, "circle_fit - CIRCLE COVARIANCE:");
+#ifdef RFIT_DEBUG
+    printf("circle_fit - CIRCLE CHARGE: %d\n", circle.qCharge);
+#endif
+
+#ifdef RFIT_DEBUG
+    printf("circle_fit - ERROR PROPAGATION\n");
+#endif
+    // ERROR PROPAGATION
+    if (error) {
+#ifdef RFIT_DEBUG
+      printf("circle_fit - ERROR PRPAGATION ACTIVATED\n");
+#endif
+      ArrayNd<N> vcsMat[2][2];  // cov matrix of center & scaled points
+      MatrixNd<N> cMat[3][3];   // cov matrix of 3D transformed points
+#ifdef RFIT_DEBUG
+      printf("circle_fit - ERROR PRPAGATION ACTIVATED 2\n");
+#endif
+      {
+        Eigen::Matrix<double, 1, 1> cm;
+        cm = mc.transpose() * vMat * mc;
+        const double tempC2 = cm(0, 0);
+        Matrix2Nd<N> tempVcsMat;
+        // Only the upper triangle is computed; the blocks below are read
+        // either through selfadjointView or from the upper-right block.
+        tempVcsMat.template triangularView<Eigen::Upper>() =
+            (sqr(tempS) * vMat + sqr(sqr(tempS)) * 1. / (4. * tempQ * n) *
+                                     (2. * vMat.squaredNorm() + 4. * tempC2) *  // mc.transpose() * V * mc) *
+                                     (mc * mc.transpose()));
+
+        printIt(&tempVcsMat, "circle_fit - Vcs:");
+        cMat[0][0] = tempVcsMat.block(0, 0, n, n).template selfadjointView<Eigen::Upper>();
+        vcsMat[0][1] = tempVcsMat.block(0, n, n, n);
+        cMat[1][1] = tempVcsMat.block(n, n, n, n).template selfadjointView<Eigen::Upper>();
+        vcsMat[1][0] = vcsMat[0][1].transpose();
+        printIt(&tempVcsMat, "circle_fit - Vcs:");
+      }
+
+      {
+        const ArrayNd<N> t0 = (VectorXd::Constant(n, 1.) * p3D.row(0));
+        const ArrayNd<N> t1 = (VectorXd::Constant(n, 1.) * p3D.row(1));
+        const ArrayNd<N> t00 = p3D.row(0).transpose() * p3D.row(0);
+        const ArrayNd<N> t01 = p3D.row(0).transpose() * p3D.row(1);
+        const ArrayNd<N> t11 = p3D.row(1).transpose() * p3D.row(1);
+        const ArrayNd<N> t10 = t01.transpose();
+        vcsMat[0][0] = cMat[0][0];
+        cMat[0][1] = vcsMat[0][1];
+        cMat[0][2] = 2. * (vcsMat[0][0] * t0 + vcsMat[0][1] * t1);
+        vcsMat[1][1] = cMat[1][1];
+        cMat[1][2] = 2. * (vcsMat[1][0] * t0 + vcsMat[1][1] * t1);
+        MatrixNd<N> tmp;
+        tmp.template triangularView<Eigen::Upper>() =
+            (2. * (vcsMat[0][0] * vcsMat[0][0] + vcsMat[0][0] * vcsMat[0][1] + vcsMat[1][1] * vcsMat[1][0] +
+                   vcsMat[1][1] * vcsMat[1][1]) +
+             4. * (vcsMat[0][0] * t00 + vcsMat[0][1] * t01 + vcsMat[1][0] * t10 + vcsMat[1][1] * t11))
+                .matrix();
+        cMat[2][2] = tmp.template selfadjointView<Eigen::Upper>();
+      }
+      printIt(&cMat[0][0], "circle_fit - C[0][0]:");
+
+      Matrix3d c0Mat;  // cov matrix of center of gravity (r0.x,r0.y,r0.z)
+      for (uint i = 0; i < 3; ++i) {
+        for (uint j = i; j < 3; ++j) {
+          Eigen::Matrix<double, 1, 1> tmp;
+          tmp = weight.transpose() * cMat[i][j] * weight;
+          // Workaround to get things working in GPU
+          const double tempC = tmp(0, 0);
+          c0Mat(i, j) = tempC;  //weight.transpose() * C[i][j] * weight;
+          c0Mat(j, i) = c0Mat(i, j);
+        }
+      }
+      printIt(&c0Mat, "circle_fit - C0:");
+
+      const MatrixNd<N> wMat = weight * weight.transpose();
+      const MatrixNd<N> hMat = MatrixNd<N>::Identity().rowwise() - weight.transpose();
+      const MatrixNx3d<N> s_v = hMat * p3D.transpose();
+      printIt(&wMat, "circle_fit - W:");
+      printIt(&hMat, "circle_fit - H:");
+      printIt(&s_v, "circle_fit - s_v:");
+
+      MatrixNd<N> dMat[3][3];  // cov(s_v)
+      dMat[0][0] = (hMat * cMat[0][0] * hMat.transpose()).cwiseProduct(wMat);
+      dMat[0][1] = (hMat * cMat[0][1] * hMat.transpose()).cwiseProduct(wMat);
+      dMat[0][2] = (hMat * cMat[0][2] * hMat.transpose()).cwiseProduct(wMat);
+      dMat[1][1] = (hMat * cMat[1][1] * hMat.transpose()).cwiseProduct(wMat);
+      dMat[1][2] = (hMat * cMat[1][2] * hMat.transpose()).cwiseProduct(wMat);
+      dMat[2][2] = (hMat * cMat[2][2] * hMat.transpose()).cwiseProduct(wMat);
+      dMat[1][0] = dMat[0][1].transpose();
+      dMat[2][0] = dMat[0][2].transpose();
+      dMat[2][1] = dMat[1][2].transpose();
+      printIt(&dMat[0][0], "circle_fit - D_[0][0]:");
+
+      // (row, column) of the 6 independent elements of the symmetric 3x3 matrix A
+      constexpr uint nu[6][2] = {{0, 0}, {0, 1}, {0, 2}, {1, 1}, {1, 2}, {2, 2}};
+
+      Matrix6d eMat;  // cov matrix of the 6 independent elements of A
+      for (uint a = 0; a < 6; ++a) {
+        const uint i = nu[a][0], j = nu[a][1];
+        for (uint b = a; b < 6; ++b) {
+          const uint k = nu[b][0], l = nu[b][1];
+          VectorNd<N> t0(n);
+          VectorNd<N> t1(n);
+          if (l == k) {
+            t0 = 2. * dMat[j][l] * s_v.col(l);
+            if (i == j)
+              t1 = t0;
+            else
+              t1 = 2. * dMat[i][l] * s_v.col(l);
+          } else {
+            t0 = dMat[j][l] * s_v.col(k) + dMat[j][k] * s_v.col(l);
+            if (i == j)
+              t1 = t0;
+            else
+              t1 = dMat[i][l] * s_v.col(k) + dMat[i][k] * s_v.col(l);
+          }
+
+          if (i == j) {
+            Eigen::Matrix<double, 1, 1> cm;
+            cm = s_v.col(i).transpose() * (t0 + t1);
+            // Workaround to get things working in GPU
+            const double tempC = cm(0, 0);
+            eMat(a, b) = 0. + tempC;
+          } else {
+            Eigen::Matrix<double, 1, 1> cm;
+            cm = (s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1);
+            // Workaround to get things working in GPU
+            const double tempC = cm(0, 0);
+            eMat(a, b) = 0. + tempC;  //(s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1);
+          }
+          if (b != a)
+            eMat(b, a) = eMat(a, b);
+        }
+      }
+      printIt(&eMat, "circle_fit - E:");
+
+      Eigen::Matrix<double, 3, 6> j2Mat;  // Jacobian of min_eigen() (numerically computed)
+      for (uint a = 0; a < 6; ++a) {
+        const uint i = nu[a][0], j = nu[a][1];
+        Matrix3d delta = Matrix3d::Zero();
+        // Finite-difference step proportional to the perturbed element;
+        // std::abs keeps the floating point value (see chi2 above).
+        delta(i, j) = delta(j, i) = std::abs(aMat(i, j) * epsilon);
+        j2Mat.col(a) = min_eigen3D_fast(aMat + delta);
+        const int sign = (j2Mat.col(a)(2) > 0) ? 1 : -1;
+        j2Mat.col(a) = (j2Mat.col(a) * sign - vVec) / delta(i, j);
+      }
+      printIt(&j2Mat, "circle_fit - J2:");
+
+      Matrix4d cvcMat;  // joint cov matrix of (v0,v1,v2,c)
+      {
+        Matrix3d t0 = j2Mat * eMat * j2Mat.transpose();
+        Vector3d t1 = -t0 * r0;
+        cvcMat.block(0, 0, 3, 3) = t0;
+        cvcMat.block(0, 3, 3, 1) = t1;
+        cvcMat.block(3, 0, 1, 3) = t1.transpose();
+        Eigen::Matrix<double, 1, 1> cm1;
+        Eigen::Matrix<double, 1, 1> cm3;
+        cm1 = (vVec.transpose() * c0Mat * vVec);
+        //      cm2 = (c0Mat.cwiseProduct(t0)).sum();
+        cm3 = (r0.transpose() * t0 * r0);
+        // Workaround to get things working in GPU
+        const double tempC = cm1(0, 0) + (c0Mat.cwiseProduct(t0)).sum() + cm3(0, 0);
+        cvcMat(3, 3) = tempC;
+        // (v.transpose() * c0Mat * v) + (c0Mat.cwiseProduct(t0)).sum() + (r0.transpose() * t0 * r0);
+      }
+      printIt(&cvcMat, "circle_fit - Cvc:");
+
+      Eigen::Matrix<double, 3, 4> j3Mat;  // Jacobian (v0,v1,v2,c)->(X0,Y0,R)
+      {
+        const double t = 1. / tempH;
+        j3Mat << -v2x2_inv, 0, vVec(0) * sqr(v2x2_inv) * 2., 0, 0, -v2x2_inv, vVec(1) * sqr(v2x2_inv) * 2., 0,
+            vVec(0) * v2x2_inv * t, vVec(1) * v2x2_inv * t,
+            -tempH * sqr(v2x2_inv) * 2. - (2. * tempC + vVec(2)) * v2x2_inv * t, -t;
+      }
+      printIt(&j3Mat, "circle_fit - J3:");
+
+      const RowVector2Nd<N> Jq = mc.transpose() * tempS * 1. / n;  // var(q)
+      printIt(&Jq, "circle_fit - Jq:");
+
+      Matrix3d cov_uvr = j3Mat * cvcMat * j3Mat.transpose() * sqr(s_inv)  // cov(X0,Y0,R)
+                         + (par_uvr * par_uvr.transpose()) * (Jq * vMat * Jq.transpose());
+
+      circle.cov = cov_uvr;
+    }
+
+    printIt(&circle.cov, "Circle cov:");
+#ifdef RFIT_DEBUG
+    printf("circle_fit - exit\n");
+#endif
+    return circle;
+  }
+
+  /*!  \brief Perform an ordinary least square fit in the s-z plane to compute
+ * the parameters cotTheta and Zip.
+ *
+ * The fit is performed in the rotated S3D-Z' plane, following the formalism of
+ * Frodesen, Chapter 10, p. 259.
+ *
+ * The system has been rotated to both try to use the combined errors in s-z
+ * along Z', as errors in the Y direction and to avoid the pathological case of
+ * degenerate lines with angular coefficient m = +/- inf.
+ *
+ * The rotation is using the information on the theta angle computed in the
+ * fast fit. The rotation is such that the S3D axis will be the X-direction,
+ * while the rotated Z-axis will be the Y-direction. This pretty much follows
+ * what is done in the same fit in the Broken Line approach.
+ */
+
+  template <typename M3xN, typename M6xN, typename V4>
+  __host__ __device__ inline LineFit lineFit(const M3xN& hits,
+                                             const M6xN& hits_ge,
+                                             const CircleFit& circle,
+                                             const V4& fast_fit,
+                                             const double bField,
+                                             const bool error) {
+    constexpr uint32_t N = M3xN::ColsAtCompileTime;  // number of hits: hits holds one column per hit
+    constexpr auto n = N;
+    double theta = -circle.qCharge * atan(fast_fit(3));
+    theta = theta < 0. ? theta + M_PI : theta;  // negative angles are shifted by pi
+
+    // Prepare the Rotation Matrix to rotate the points
+    Eigen::Matrix<double, 2, 2> rot;
+    rot << sin(theta), cos(theta), -cos(theta), sin(theta);
+
+    // PROJECTION ON THE CYLINDER
+    //
+    // p2D will be:
+    // [s1, s2, s3, ..., sn]
+    // [z1, z2, z3, ..., zn]
+    // s values will be ordinary x-values
+    // z values will be ordinary y-values
+
+    Matrix2xNd<N> p2D = Matrix2xNd<N>::Zero();
+    Eigen::Matrix<double, 2, 6> jxMat;  // Jacobian of (s, z) wrt (X0, Y0, R, x, y, z), rebuilt for every hit
+
+#ifdef RFIT_DEBUG
+    printf("Line_fit - B: %g\n", bField);
+    printIt(&hits, "Line_fit points: ");
+    printIt(&hits_ge, "Line_fit covs: ");
+    printIt(&rot, "Line_fit rot: ");
+#endif
+    // x & associated Jacobian
+    // cfr https://indico.cern.ch/event/663159/contributions/2707659/attachments/1517175/2368189/Riemann_fit.pdf
+    // Slide 11
+    // a ==> -o i.e. the origin of the circle in XY plane, negative
+    // b ==> p i.e. distances of the points wrt the origin of the circle.
+    const Vector2d oVec(circle.par(0), circle.par(1));
+
+    // joint covariance of (circle parameters, hit x, y, z); refilled for every hit in the loop below
+    Matrix6d covMat = Matrix6d::Zero();
+    Matrix2d cov_sz[N];  // per-hit 2x2 covariance in (s, z), already rotated by rot
+    for (uint i = 0; i < n; ++i) {
+      Vector2d pVec = hits.block(0, i, 2, 1) - oVec;
+      const double cross = cross2D(-oVec, pVec);
+      const double dot = (-oVec).dot(pVec);
+      // atan2(cross, dot) gives back the angle in the transverse plane so that the
+      // final equation reads: x_i = -q*R*theta (theta = angle returned by atan2)
+      const double tempQAtan2 = -circle.qCharge * atan2(cross, dot);
+      // row 0 of p2D holds the s coordinate of hit i (signed angle times circle.par(2))
+      p2D(0, i) = tempQAtan2 * circle.par(2);
+
+      // associated Jacobian, used in weights and errors computation
+      const double temp0 = -circle.qCharge * circle.par(2) * 1. / (sqr(dot) + sqr(cross));
+      double d_X0 = 0., d_Y0 = 0., d_R = 0.;  // good approximation for big pt and eta
+      if (error) {
+        d_X0 = -temp0 * ((pVec(1) + oVec(1)) * dot - (pVec(0) - oVec(0)) * cross);
+        d_Y0 = temp0 * ((pVec(0) + oVec(0)) * dot - (oVec(1) - pVec(1)) * cross);
+        d_R = tempQAtan2;
+      }
+      const double d_x = temp0 * (oVec(1) * dot + oVec(0) * cross);
+      const double d_y = temp0 * (-oVec(0) * dot + oVec(1) * cross);
+      jxMat << d_X0, d_Y0, d_R, d_x, d_y, 0., 0., 0., 0., 0., 0., 1.;
+
+      covMat.block(0, 0, 3, 3) = circle.cov;
+      covMat(3, 3) = hits_ge.col(i)[0];                 // x errors
+      covMat(4, 4) = hits_ge.col(i)[2];                 // y errors
+      covMat(5, 5) = hits_ge.col(i)[5];                 // z errors
+      covMat(3, 4) = covMat(4, 3) = hits_ge.col(i)[1];  // cov_xy
+      covMat(3, 5) = covMat(5, 3) = hits_ge.col(i)[3];  // cov_xz
+      covMat(4, 5) = covMat(5, 4) = hits_ge.col(i)[4];  // cov_yz
+      Matrix2d tmp = jxMat * covMat * jxMat.transpose();
+      cov_sz[i].noalias() = rot * tmp * rot.transpose();
+    }
+    // Math of d_{X0,Y0,R,x,y} all verified by hand
+    p2D.row(1) = hits.row(2);  // row 1 of p2D holds the z coordinate of every hit
+
+    // The following matrix will contain errors orthogonal to the rotated S
+    // component only, with the Multiple Scattering properly treated!!
+    MatrixNd<N> cov_with_ms;
+    scatterCovLine(cov_sz, fast_fit, p2D.row(0), p2D.row(1), theta, bField, cov_with_ms);
+#ifdef RFIT_DEBUG
+    printIt(cov_sz, "line_fit - cov_sz:");
+    printIt(&cov_with_ms, "line_fit - cov_with_ms: ");
+#endif
+
+    // Rotate Points with the shape [2, n]
+    Matrix2xNd<N> p2D_rot = rot * p2D;
+
+#ifdef RFIT_DEBUG
+    printf("Fast fit Tan(theta): %g\n", fast_fit(3));
+    printf("Rotation angle: %g\n", theta);
+    printIt(&rot, "Rotation Matrix:");
+    printIt(&p2D, "Original Hits(s,z):");
+    printIt(&p2D_rot, "Rotated hits(S3D, Z'):");
+    printIt(&rot, "Rotation Matrix:");
+#endif
+
+    // Build the A Matrix
+    Matrix2xNd<N> aMat;
+    aMat << MatrixXd::Ones(1, n), p2D_rot.row(0);  // rotated s values
+
+#ifdef RFIT_DEBUG
+    printIt(&aMat, "A Matrix:");
+#endif
+
+    // Build aMat * V^-1 * aMat^T (aMat is stored 2xN), where V is cov_with_ms and vyInvMat its inverse.
+    MatrixNd<N> vyInvMat;
+    math::cholesky::invert(cov_with_ms, vyInvMat);
+    // (math::cholesky::invert is used here in place of cov_with_ms.inverse())
+    Eigen::Matrix<double, 2, 2> covParamsMat = aMat * vyInvMat * aMat.transpose();
+    // Compute the Covariance Matrix of the fit parameters (inverted in place)
+    math::cholesky::invert(covParamsMat, covParamsMat);
+
+    // Now Compute the Parameters in the form [2,1]
+    // The first component is q.
+    // The second component is m.
+    Eigen::Matrix<double, 2, 1> sol = covParamsMat * aMat * vyInvMat * p2D_rot.row(1).transpose();
+
+#ifdef RFIT_DEBUG
+    printIt(&sol, "Rotated solutions:");
+#endif
+
+    // We need now to transfer back the results in the original s-z plane
+    const auto sinTheta = sin(theta);
+    const auto cosTheta = cos(theta);
+    auto common_factor = 1. / (sinTheta - sol(1, 0) * cosTheta);
+    Eigen::Matrix<double, 2, 2> jMat;
+    jMat << 0., common_factor * common_factor, common_factor, sol(0, 0) * cosTheta * common_factor * common_factor;
+
+    double tempM = common_factor * (sol(1, 0) * sinTheta + cosTheta);
+    double tempQ = common_factor * sol(0, 0);
+    auto cov_mq = jMat * covParamsMat * jMat.transpose();
+
+    VectorNd<N> res = p2D_rot.row(1).transpose() - aMat.transpose() * sol;
+    double chi2 = res.transpose() * vyInvMat * res;
+
+    LineFit line;
+    line.par << tempM, tempQ;
+    line.cov << cov_mq;
+    line.chi2 = chi2;
+
+#ifdef RFIT_DEBUG
+    printf("Common_factor: %g\n", common_factor);
+    printIt(&jMat, "Jacobian:");
+    printIt(&sol, "Rotated solutions:");
+    printIt(&covParamsMat, "Cov_params:");
+    printIt(&cov_mq, "Rotated Covariance Matrix:");
+    printIt(&(line.par), "Real Parameters:");
+    printIt(&(line.cov), "Real Covariance Matrix:");
+    printf("Chi2: %g\n", chi2);
+#endif
+
+    return line;
+  }
+
+  /*!
+    \brief Helix fit in three steps:
+    -fast pre-fit (see Fast_fit() for further info); \n
+    -circle fit of hits projected in the transverse plane by Riemann-Chernov
+        algorithm (see Circle_fit() for further info); \n
+    -line fit of hits projected on cylinder surface by orthogonal distance
+        regression (see Line_fit for further info). \n
+    Points must be passed ordered (from inner to outer layer).
+    \param hits Matrix3xNd hits coordinates in this form: \n
+        |x0|x1|x2|...|xn| \n
+        |y0|y1|y2|...|yn| \n
+        |z0|z1|z2|...|zn|
+    \param hits_cov Matrix3Nd covariance matrix in this form (()->cov()): \n
+   |(x0,x0)|(x1,x0)|(x2,x0)|.|(y0,x0)|(y1,x0)|(y2,x0)|.|(z0,x0)|(z1,x0)|(z2,x0)| \n
+   |(x0,x1)|(x1,x1)|(x2,x1)|.|(y0,x1)|(y1,x1)|(y2,x1)|.|(z0,x1)|(z1,x1)|(z2,x1)| \n
+   |(x0,x2)|(x1,x2)|(x2,x2)|.|(y0,x2)|(y1,x2)|(y2,x2)|.|(z0,x2)|(z1,x2)|(z2,x2)| \n
+       .       .       .    .    .       .       .    .    .       .       .     \n
+   |(x0,y0)|(x1,y0)|(x2,y0)|.|(y0,y0)|(y1,y0)|(y2,y0)|.|(z0,y0)|(z1,y0)|(z2,y0)| \n
+   |(x0,y1)|(x1,y1)|(x2,y1)|.|(y0,y1)|(y1,y1)|(y2,y1)|.|(z0,y1)|(z1,y1)|(z2,y1)| \n
+   |(x0,y2)|(x1,y2)|(x2,y2)|.|(y0,y2)|(y1,y2)|(y2,y2)|.|(z0,y2)|(z1,y2)|(z2,y2)| \n
+       .       .       .    .    .       .       .    .    .       .       .     \n
+   |(x0,z0)|(x1,z0)|(x2,z0)|.|(y0,z0)|(y1,z0)|(y2,z0)|.|(z0,z0)|(z1,z0)|(z2,z0)| \n
+   |(x0,z1)|(x1,z1)|(x2,z1)|.|(y0,z1)|(y1,z1)|(y2,z1)|.|(z0,z1)|(z1,z1)|(z2,z1)| \n
+   |(x0,z2)|(x1,z2)|(x2,z2)|.|(y0,z2)|(y1,z2)|(y2,z2)|.|(z0,z2)|(z1,z2)|(z2,z2)|
+   \param bField magnetic field in the center of the detector in Gev/cm/c
+   unit, in order to perform pt calculation.
+   \param error flag for error computation.
+   \param scattering flag for multiple scattering treatment.
+   (see Circle_fit() documentation for further info).
+   \warning see Circle_fit(), Line_fit() and Fast_fit() warnings.
+   \bug see Circle_fit(), Line_fit() and Fast_fit() bugs.
+*/
+
+  template <int N>
+  inline HelixFit helixFit(const Matrix3xNd<N>& hits,
+                           const Eigen::Matrix<float, 6, N>& hits_ge,
+                           const double bField,
+                           const bool error) {
+    constexpr uint n = N;
+    // Transverse radius of every hit; sized with N (it was hard-coded to 4) so any hit count can be used.
+    VectorNd<N> rad = (hits.block(0, 0, 2, n).colwise().norm());
+    // Fast_fit gives back (X0, Y0, R, theta) w/o errors, using only 3 points.
+    Vector4d fast_fit;
+    fastFit(hits, fast_fit);
+    riemannFit::Matrix2Nd<N> hits_cov = MatrixXd::Zero(2 * n, 2 * n);
+    riemannFit::loadCovariance2D(hits_ge, hits_cov);
+    CircleFit circle = circleFit(hits.block(0, 0, 2, n), hits_cov, fast_fit, rad, bField, error);
+    LineFit line = lineFit(hits, hits_ge, circle, fast_fit, bField, error);
+
+    par_uvrtopak(circle, bField, error);
+    // helix.cov is written only when error is set; it is left untouched otherwise.
+    HelixFit helix;
+    helix.par << circle.par, line.par;
+    if (error) {
+      helix.cov = MatrixXd::Zero(5, 5);
+      helix.cov.block(0, 0, 3, 3) = circle.cov;
+      helix.cov.block(3, 3, 2, 2) = line.cov;
+    }
+    helix.qCharge = circle.qCharge;
+    helix.chi2_circle = circle.chi2;
+    helix.chi2_line = line.chi2;
+
+    return helix;
+  }
+
+}  // namespace riemannFit
+
+#endif  // RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml
index be113d7a5a3dc..ecfbd99b667fc 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml
@@ -1,3 +1,6 @@
+<use name="cuda"/>
+<use name="CUDADataFormats/Track"/>
+<use name="HeterogeneousCore/CUDACore"/>
 <use name="RecoPixelVertexing/PixelTrackFitting"/>
 <library file="*.cc" name="RecoPixelVertexingPixelTrackFittingPlugins">
   <flags EDM_PLUGIN="1"/>
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelNtupletsFitterProducer.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelNtupletsFitterProducer.cc
new file mode 100644
index 0000000000000..f49d2f01f48c6
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelNtupletsFitterProducer.cc
@@ -0,0 +1,44 @@
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "MagneticField/Engine/interface/MagneticField.h"
+#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h"
+#include "RecoPixelVertexing/PixelTrackFitting/interface/PixelFitter.h"
+#include "RecoPixelVertexing/PixelTrackFitting/interface/PixelNtupletsFitter.h"
+#include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h"
+
+// Puts into the event a PixelFitter wrapping a PixelNtupletsFitter (Riemann or BrokenLine, per configuration).
+class PixelNtupletsFitterProducer : public edm::global::EDProducer<> {
+public:
+  explicit PixelNtupletsFitterProducer(const edm::ParameterSet& iConfig)
+      : useRiemannFit_(iConfig.getParameter<bool>("useRiemannFit")), idealMagneticFieldToken_(esConsumes()) {
+    produces<PixelFitter>();
+  }
+  ~PixelNtupletsFitterProducer() override = default;
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+    edm::ParameterSetDescription desc;
+    desc.add<bool>("useRiemannFit", false)->setComment("true for Riemann, false for BrokenLine");
+    descriptions.add("pixelNtupletsFitterDefault", desc);
+  }
+
+private:
+  const bool useRiemannFit_;  // fixed at construction; produce() is const and only reads it
+  const edm::ESGetToken<MagneticField, IdealMagneticFieldRecord> idealMagneticFieldToken_;
+  void produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override;
+};
+
+void PixelNtupletsFitterProducer::produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const {
+  const auto& magField = iSetup.getData(idealMagneticFieldToken_);
+  const float fitterBField = 1 / PixelRecoUtilities::fieldInInvGev(iSetup);  // reciprocal of fieldInInvGev
+  // Build the n-tuplet fitter, wrap it in the generic PixelFitter product and hand it to the event.
+  auto fitter = std::make_unique<PixelNtupletsFitter>(fitterBField, &magField, useRiemannFit_);
+  iEvent.put(std::make_unique<PixelFitter>(std::move(fitter)));
+}
+
+DEFINE_FWK_MODULE(PixelNtupletsFitterProducer);
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
new file mode 100644
index 0000000000000..2f0965be50eb8
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
@@ -0,0 +1,86 @@
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDAnalyzer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/Utilities/interface/RunningAverage.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h"
+
+class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> {
+public:
+  explicit PixelTrackDumpCUDA(const edm::ParameterSet& iConfig);
+  ~PixelTrackDumpCUDA() override = default;
+  // Declares the parameters onGPU, pixelTrackSrc and pixelVertexSrc with their defaults.
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override;
+  const bool m_onGPU;  // value of the "onGPU" parameter; decides which token pair below gets registered
+  edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenGPUTrack_;  // set only when m_onGPU
+  edm::EDGetTokenT<cms::cuda::Product<ZVertexHeterogeneous>> tokenGPUVertex_;  // set only when m_onGPU
+  edm::EDGetTokenT<PixelTrackHeterogeneous> tokenSoATrack_;  // set only when !m_onGPU
+  edm::EDGetTokenT<ZVertexHeterogeneous> tokenSoAVertex_;  // set only when !m_onGPU
+};
+
+PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig)
+    : m_onGPU(iConfig.getParameter<bool>("onGPU")) {
+  auto const trackTag = iConfig.getParameter<edm::InputTag>("pixelTrackSrc");  // same tags in both modes,
+  auto const vertexTag = iConfig.getParameter<edm::InputTag>("pixelVertexSrc");  // only the product type differs
+  if (not m_onGPU) {
+    tokenSoATrack_ = consumes<PixelTrackHeterogeneous>(trackTag);
+    tokenSoAVertex_ = consumes<ZVertexHeterogeneous>(vertexTag);
+  } else {
+    tokenGPUTrack_ = consumes<cms::cuda::Product<PixelTrackHeterogeneous>>(trackTag);
+    tokenGPUVertex_ = consumes<cms::cuda::Product<ZVertexHeterogeneous>>(vertexTag);
+  }
+}
+
+void PixelTrackDumpCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  // Defaults: onGPU enabled, reading the products labelled caHitNtupletCUDA and pixelVertexCUDA.
+  edm::ParameterSetDescription psd;
+  psd.add<bool>("onGPU", true);
+  psd.add<edm::InputTag>("pixelTrackSrc", edm::InputTag{"caHitNtupletCUDA"});
+  psd.add<edm::InputTag>("pixelVertexSrc", edm::InputTag{"pixelVertexCUDA"});
+  descriptions.add("pixelTrackDumpCUDA", psd);
+}
+
+// Fetch the track and vertex SoAs (device or host flavour, per m_onGPU) and assert that both are non-null.
+void PixelTrackDumpCUDA::analyze(edm::StreamID streamID,
+                                 edm::Event const& iEvent,
+                                 const edm::EventSetup& iSetup) const {
+  if (m_onGPU) {
+    auto const& hTracks = iEvent.get(tokenGPUTrack_);
+    // An analyzer produces nothing, so open the analyze-flavoured context instead of ScopedContextProduce.
+    cms::cuda::ScopedContextAnalyze ctx{hTracks};
+    auto const& tracks = ctx.get(hTracks);
+    auto const* tsoa = tracks.get();
+    assert(tsoa);
+
+    auto const& vertices = ctx.get(iEvent.get(tokenGPUVertex_));
+    auto const* vsoa = vertices.get();
+    assert(vsoa);
+  } else {
+    auto const* tsoa = iEvent.get(tokenSoATrack_).get();
+    assert(tsoa);
+
+    auto const* vsoa = iEvent.get(tokenSoAVertex_).get();
+    assert(vsoa);
+  }
+}
+
+DEFINE_FWK_MODULE(PixelTrackDumpCUDA);
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.cc
index bd390f5f65352..91c3a44cc8643 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.cc
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.cc
@@ -1,23 +1,22 @@
-#include "PixelTrackProducer.h"
+#include <vector>
 
+#include "DataFormats/Common/interface/OrphanHandle.h"
+#include "DataFormats/TrackReco/interface/Track.h"
+#include "DataFormats/TrackReco/interface/TrackExtra.h"
+#include "DataFormats/TrackReco/interface/TrackFwd.h"
+#include "DataFormats/TrackerCommon/interface/TrackerTopology.h"
+#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h"
+#include "FWCore/Framework/interface/ESHandle.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/ESHandle.h"
 #include "FWCore/MessageLogger/interface/MessageLogger.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
-
-#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h"
-#include "DataFormats/TrackReco/interface/Track.h"
-#include "DataFormats/TrackReco/interface/TrackFwd.h"
-#include "DataFormats/TrackReco/interface/TrackExtra.h"
-#include "DataFormats/Common/interface/OrphanHandle.h"
-
-#include "DataFormats/TrackerCommon/interface/TrackerTopology.h"
 #include "Geometry/Records/interface/TrackerTopologyRcd.h"
 
-#include <vector>
+#include "PixelTrackProducer.h"
+#include "storeTracks.h"
 
 using namespace pixeltrackfitting;
 using edm::ParameterSet;
@@ -45,62 +44,9 @@ void PixelTrackProducer::produce(edm::Event& ev, const edm::EventSetup& es) {
 
   TracksWithTTRHs tracks;
   theReconstruction.run(tracks, ev, es);
-
   edm::ESHandle<TrackerTopology> httopo;
   es.get<TrackerTopologyRcd>().get(httopo);
 
   // store tracks
-  store(ev, tracks, *httopo);
-}
-
-void PixelTrackProducer::store(edm::Event& ev, const TracksWithTTRHs& tracksWithHits, const TrackerTopology& ttopo) {
-  auto tracks = std::make_unique<reco::TrackCollection>();
-  auto recHits = std::make_unique<TrackingRecHitCollection>();
-  auto trackExtras = std::make_unique<reco::TrackExtraCollection>();
-
-  int cc = 0, nTracks = tracksWithHits.size();
-
-  for (int i = 0; i < nTracks; i++) {
-    reco::Track* track = tracksWithHits.at(i).first;
-    const SeedingHitSet& hits = tracksWithHits.at(i).second;
-
-    for (unsigned int k = 0; k < hits.size(); k++) {
-      TrackingRecHit* hit = hits[k]->hit()->clone();
-
-      track->appendHitPattern(*hit, ttopo);
-      recHits->push_back(hit);
-    }
-    tracks->push_back(*track);
-    delete track;
-  }
-
-  LogDebug("TrackProducer") << "put the collection of TrackingRecHit in the event"
-                            << "\n";
-  edm::OrphanHandle<TrackingRecHitCollection> ohRH = ev.put(std::move(recHits));
-
-  edm::RefProd<TrackingRecHitCollection> hitCollProd(ohRH);
-  for (int k = 0; k < nTracks; k++) {
-    reco::TrackExtra theTrackExtra{};
-
-    //fill the TrackExtra with TrackingRecHitRef
-    unsigned int nHits = tracks->at(k).numberOfValidHits();
-    theTrackExtra.setHits(hitCollProd, cc, nHits);
-    cc += nHits;
-    AlgebraicVector5 v = AlgebraicVector5(0, 0, 0, 0, 0);
-    reco::TrackExtra::TrajParams trajParams(nHits, LocalTrajectoryParameters(v, 1.));
-    reco::TrackExtra::Chi2sFive chi2s(nHits, 0);
-    theTrackExtra.setTrajParams(std::move(trajParams), std::move(chi2s));
-    trackExtras->push_back(theTrackExtra);
-  }
-
-  LogDebug("TrackProducer") << "put the collection of TrackExtra in the event"
-                            << "\n";
-  edm::OrphanHandle<reco::TrackExtraCollection> ohTE = ev.put(std::move(trackExtras));
-
-  for (int k = 0; k < nTracks; k++) {
-    const reco::TrackExtraRef theTrackExtraRef(ohTE, k);
-    (tracks->at(k)).setExtra(theTrackExtraRef);
-  }
-
-  ev.put(std::move(tracks));
+  storeTracks(ev, tracks, *httopo);
 }
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h
index d756a9cf963f5..c38fd44c0d7f5 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h
@@ -1,8 +1,7 @@
-#ifndef PixelTrackProducer_H
-#define PixelTrackProducer_H
+#ifndef RecoPixelVertexing_PixelTrackFitting_plugins_PixelTrackProducer_h
+#define RecoPixelVertexing_PixelTrackFitting_plugins_PixelTrackProducer_h
 
 #include "FWCore/Framework/interface/stream/EDProducer.h"
-#include "RecoPixelVertexing/PixelTrackFitting/interface/TracksWithHits.h"
 #include "RecoPixelVertexing/PixelTrackFitting/interface/PixelTrackReconstruction.h"
 
 namespace edm {
@@ -24,7 +23,7 @@ class PixelTrackProducer : public edm::stream::EDProducer<> {
   void produce(edm::Event& ev, const edm::EventSetup& es) override;
 
 private:
-  void store(edm::Event& ev, const pixeltrackfitting::TracksWithTTRHs& selectedTracks, const TrackerTopology& ttopo);
   PixelTrackReconstruction theReconstruction;
 };
-#endif
+
+#endif  // RecoPixelVertexing_PixelTrackFitting_plugins_PixelTrackProducer_h
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
new file mode 100644
index 0000000000000..94c490e948575
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
@@ -0,0 +1,205 @@
+#include "DataFormats/BeamSpot/interface/BeamSpot.h"
+#include "DataFormats/Common/interface/OrphanHandle.h"
+#include "DataFormats/TrackReco/interface/Track.h"
+#include "DataFormats/TrackReco/interface/TrackExtra.h"
+#include "DataFormats/TrackReco/interface/TrackFwd.h"
+#include "DataFormats/TrackerCommon/interface/TrackerTopology.h"
+#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h"
+#include "DataFormats/GeometrySurface/interface/Plane.h"
+#include "DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDProducer.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "Geometry/Records/interface/TrackerTopologyRcd.h"
+#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h"
+
+#include "TrackingTools/AnalyticalJacobians/interface/JacobianLocalToCurvilinear.h"
+#include "TrackingTools/TrajectoryParametrization/interface/GlobalTrajectoryParameters.h"
+#include "TrackingTools/TrajectoryParametrization/interface/CurvilinearTrajectoryError.h"
+#include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h"
+
+#include "CUDADataFormats/Common/interface/HostProduct.h"
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
+
+#include "storeTracks.h"
+#include "CUDADataFormats/Common/interface/HostProduct.h"
+
+/**
+ * This class creates "legacy" reco::Track
+ * objects from the output of SoA CA. 
+ */
+class PixelTrackProducerFromSoA : public edm::global::EDProducer<> {
+public:
+  using IndToEdm = std::vector<uint16_t>;  // per visited SoA track: index of its reco::Track, or uint16_t(-1)
+
+  explicit PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig);
+  ~PixelTrackProducerFromSoA() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions &descriptions);
+
+  // Product type of the hits-module-start array that produce() reads through hmsToken_.
+  using HMSstorage = HostProduct<uint32_t[]>;
+
+private:
+  void produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const override;
+
+  // Event Data tokens
+  const edm::EDGetTokenT<reco::BeamSpot> tBeamSpot_;
+  const edm::EDGetTokenT<PixelTrackHeterogeneous> tokenTrack_;
+  const edm::EDGetTokenT<SiPixelRecHitCollectionNew> cpuHits_;
+  const edm::EDGetTokenT<HMSstorage> hmsToken_;
+  // Event Setup tokens
+  const edm::ESGetToken<MagneticField, IdealMagneticFieldRecord> idealMagneticFieldToken_;
+  const edm::ESGetToken<TrackerTopology, TrackerTopologyRcd> ttTopoToken_;
+  // Tracks with fewer hits than this are skipped by produce().
+  int32_t const minNumberOfHits_;
+};
+
+PixelTrackProducerFromSoA::PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig)
+    : tBeamSpot_(consumes<reco::BeamSpot>(iConfig.getParameter<edm::InputTag>("beamSpot"))),
+      tokenTrack_(consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("trackSrc"))),
+      cpuHits_(consumes<SiPixelRecHitCollectionNew>(iConfig.getParameter<edm::InputTag>("pixelRecHitLegacySrc"))),
+      hmsToken_(consumes<HMSstorage>(iConfig.getParameter<edm::InputTag>("pixelRecHitLegacySrc"))),  // same tag
+      idealMagneticFieldToken_(esConsumes()),
+      ttTopoToken_(esConsumes()),
+      minNumberOfHits_(iConfig.getParameter<int>("minNumberOfHits")) {
+  produces<reco::TrackCollection>();
+  produces<TrackingRecHitCollection>();
+  produces<reco::TrackExtraCollection>();
+  produces<IndToEdm>();  // SoA-track index -> reco::Track index map filled in produce()
+}
+
+void PixelTrackProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions &descriptions) {
+  // Default input labels, plus the hit-count cut (with 0 the cut in produce() never rejects a track).
+  edm::ParameterSetDescription psd;
+  psd.add<edm::InputTag>("beamSpot", edm::InputTag{"offlineBeamSpot"});
+  psd.add<edm::InputTag>("trackSrc", edm::InputTag{"pixelTrackSoA"});
+  psd.add<edm::InputTag>("pixelRecHitLegacySrc", edm::InputTag{"siPixelRecHitsPreSplittingLegacy"});
+  psd.add<int>("minNumberOfHits", 0);
+  descriptions.addWithDefaultLabel(psd);
+}
+
+void PixelTrackProducerFromSoA::produce(edm::StreamID streamID,
+                                        edm::Event &iEvent,
+                                        const edm::EventSetup &iSetup) const {
+  // Convert the SoA tracks of quality "loose" into reco::Track objects and store them in the event.
+
+  auto indToEdmP = std::make_unique<IndToEdm>();
+  auto &indToEdm = *indToEdmP;
+
+  auto const &idealField = iSetup.getData(idealMagneticFieldToken_);
+
+  pixeltrackfitting::TracksWithRecHits tracks;
+
+  auto const &httopo = iSetup.getData(ttTopoToken_);
+
+  const auto &bsh = iEvent.get(tBeamSpot_);
+  GlobalPoint bs(bsh.x0(), bsh.y0(), bsh.z0());
+
+  auto const &rechits = iEvent.get(cpuHits_);
+  std::vector<TrackingRecHit const *> hitmap;  // hit index (as used by the SoA) -> legacy rechit
+  auto const &rcs = rechits.data();
+  auto nhits = rcs.size();
+  hitmap.resize(nhits, nullptr);
+
+  auto const *hitsModuleStart = iEvent.get(hmsToken_).get();
+  auto fc = hitsModuleStart;
+
+  for (auto const &h : rcs) {
+    auto const &thit = static_cast<BaseTrackerRecHit const &>(h);
+    auto detI = thit.det()->index();
+    auto const &clus = thit.firstClusterRef();
+    assert(clus.isPixel());
+    auto i = fc[detI] + clus.pixelCluster().originalId();  // hitsModuleStart[det index] + cluster originalId
+    if (i >= hitmap.size())
+      hitmap.resize(i + 256, nullptr);  // only in case of hit overflow in one module
+    assert(nullptr == hitmap[i]);
+    hitmap[i] = &h;
+  }
+
+  std::vector<const TrackingRecHit *> hits;
+  hits.reserve(5);
+
+  const auto &tsoa = *iEvent.get(tokenTrack_);
+
+  auto const *quality = tsoa.qualityData();
+  auto const &fit = tsoa.stateAtBS;
+  auto const &hitIndices = tsoa.hitIndices;
+  auto maxTracks = tsoa.stride();
+
+  int32_t nt = 0;
+
+  for (int32_t it = 0; it < maxTracks; ++it) {
+    auto nHits = tsoa.nHits(it);
+    if (nHits == 0)
+      break;  // this is a guard: maybe we need to move to nTracks...
+    indToEdm.push_back(-1);  // stored as uint16_t(-1); overwritten below only for tracks that are converted
+    auto q = quality[it];
+    if (q != pixelTrack::Quality::loose)
+      continue;
+    if (nHits < minNumberOfHits_)
+      continue;
+    indToEdm.back() = nt;
+    ++nt;
+
+    hits.resize(nHits);
+    auto b = hitIndices.begin(it);
+    for (int iHit = 0; iHit < nHits; ++iHit)
+      hits[iHit] = hitmap[*(b + iHit)];
+
+    // note: these values are taken from stateAtBS, i.e. presumably relative to the beamspot
+
+    float chi2 = tsoa.chi2(it);
+    float phi = tsoa.phi(it);
+
+    riemannFit::Vector5d ipar, opar;
+    riemannFit::Matrix5d icov, ocov;
+    fit.copyToDense(ipar, icov, it);
+    riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov);
+
+    LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.);
+    AlgebraicSymMatrix55 m;
+    for (int i = 0; i < 5; ++i)
+      for (int j = i; j < 5; ++j)
+        m(i, j) = ocov(i, j);
+
+    float sp = std::sin(phi);
+    float cp = std::cos(phi);
+    Surface::RotationType rot(sp, -cp, 0, 0, 0, -1.f, cp, sp, 0);
+
+    Plane impPointPlane(bs, rot);  // plane positioned at the beamspot, oriented by the track phi
+    GlobalTrajectoryParameters gp(
+        impPointPlane.toGlobal(lpar.position()), impPointPlane.toGlobal(lpar.momentum()), lpar.charge(), &idealField);
+    JacobianLocalToCurvilinear jl2c(impPointPlane, lpar, idealField);
+
+    AlgebraicSymMatrix55 mo = ROOT::Math::Similarity(jl2c.jacobian(), m);
+
+    int ndof = 2 * hits.size() - 5;
+    chi2 = chi2 * ndof;  // NOTE(review): presumably tsoa.chi2 is stored per degree of freedom - confirm
+    GlobalPoint vv = gp.position();
+    math::XYZPoint pos(vv.x(), vv.y(), vv.z());
+    GlobalVector pp = gp.momentum();
+    math::XYZVector mom(pp.x(), pp.y(), pp.z());
+
+    auto track = std::make_unique<reco::Track>(chi2, ndof, pos, mom, gp.charge(), CurvilinearTrajectoryError(mo));
+    // filter???
+    tracks.emplace_back(track.release(), hits);  // raw pointer is released into the tracks container
+  }
+  // std::cout << "processed " << nt << " good tuples " << tracks.size() << "out of " << indToEdm.size() << std::endl;
+
+  // store tracks
+  storeTracks(iEvent, tracks, httopo);
+  iEvent.put(std::move(indToEdmP));
+}
+
+DEFINE_FWK_MODULE(PixelTrackProducerFromSoA);
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
new file mode 100644
index 0000000000000..2de8ec6c335b5
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
@@ -0,0 +1,86 @@
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/Common/interface/HostProduct.h"
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+
+// Switch on to enable checks and printout for found tracks
+#undef PIXEL_DEBUG_PRODUCE
+// Stream producer: acquire() requests the host copy of the CUDA-wrapped track SoA, produce() puts it in the event.
+class PixelTrackSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  explicit PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig);
+  ~PixelTrackSoAFromCUDA() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  void acquire(edm::Event const& iEvent,
+               edm::EventSetup const& iSetup,
+               edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+  void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
+  // Tokens: CUDA-wrapped input read in acquire(), host-side output emplaced in produce().
+  edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenCUDA_;
+  edm::EDPutTokenT<PixelTrackHeterogeneous> tokenSOA_;
+  // Host buffer assigned in acquire() and moved into the event product in produce().
+  cms::cuda::host::unique_ptr<pixelTrack::TrackSoA> soa_;
+};
+// Registers the consumed CUDA product (InputTag "src") and the produced host-side PixelTrackHeterogeneous.
+PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig)
+    : tokenCUDA_(consumes<cms::cuda::Product<PixelTrackHeterogeneous>>(iConfig.getParameter<edm::InputTag>("src"))),
+      tokenSOA_(produces<PixelTrackHeterogeneous>()) {}
+// Describes the configuration: a single InputTag "src"; the description is registered as "pixelTrackSoA".
+void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription desc;
+
+  desc.add<edm::InputTag>("src", edm::InputTag("caHitNtupletCUDA"));
+  descriptions.add("pixelTrackSoA", desc);
+}
+// Gets the CUDA-wrapped SoA from the event and asks for its host copy on the stream of the scoped context.
+void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent,
+                                    edm::EventSetup const& iSetup,
+                                    edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+  cms::cuda::Product<PixelTrackHeterogeneous> const& inputDataWrapped = iEvent.get(tokenCUDA_);
+  cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
+  auto const& inputData = ctx.get(inputDataWrapped);
+  // NOTE(review): toHostAsync() presumably only enqueues the copy; soa_ is consumed in produce() - confirm ordering.
+  soa_ = inputData.toHostAsync(ctx.stream());
+}
+// Hands the host buffer stored in soa_ over to the event as a PixelTrackHeterogeneous product.
+void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
+#ifdef PIXEL_DEBUG_PRODUCE
+  auto const& tsoa = *soa_;
+  auto maxTracks = tsoa.stride();
+  std::cout << "size of SoA" << sizeof(tsoa) << " stride " << maxTracks << std::endl;
+
+  int32_t nt = 0;
+  for (int32_t it = 0; it < maxTracks; ++it) {
+    auto nHits = tsoa.nHits(it);
+    assert(nHits == int(tsoa.hitIndices.size(it)));
+    if (nHits == 0)
+      break;  // this is a guard: the first track without hits ends the count (maybe we need to move to nTracks...)
+    nt++;
+  }
+  std::cout << "found " << nt << " tracks in cpu SoA at " << &tsoa << std::endl;
+#endif
+
+  // Move soa_ into the product: do NOT make a copy of the SoA here (it would actually be TWO copies...)
+  iEvent.emplace(tokenSOA_, PixelTrackHeterogeneous(std::move(soa_)));
+  // The move above must have left the member pointer empty.
+  assert(!soa_);
+}
+
+DEFINE_FWK_MODULE(PixelTrackSoAFromCUDA);
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h b/RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h
new file mode 100644
index 0000000000000..59101b6ba5214
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h
@@ -0,0 +1,72 @@
+#ifndef RecoPixelVertexingPixelTrackFittingStoreTracks_H
+#define RecoPixelVertexingPixelTrackFittingStoreTracks_H
+
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+
+#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h"
+#include "DataFormats/TrackReco/interface/Track.h"
+#include "DataFormats/TrackReco/interface/TrackFwd.h"
+#include "DataFormats/TrackReco/interface/TrackExtra.h"
+#include "DataFormats/Common/interface/OrphanHandle.h"
+#include "RecoPixelVertexing/PixelTrackFitting/interface/TracksWithHits.h"
+
+#include "DataFormats/TrackerCommon/interface/TrackerTopology.h"
+#include "Geometry/Records/interface/TrackerTopologyRcd.h"
+// Puts tracksWithHits into the event `ev` as three linked collections: TrackingRecHits, TrackExtras and Tracks.
+template <typename Ev, typename TWH>
+void storeTracks(Ev& ev, const TWH& tracksWithHits, const TrackerTopology& ttopo) {
+  auto tracks = std::make_unique<reco::TrackCollection>();
+  auto recHits = std::make_unique<TrackingRecHitCollection>();
+  auto trackExtras = std::make_unique<reco::TrackExtraCollection>();
+  // cc: running offset of the current track's first hit inside the stored rec-hit collection.
+  int cc = 0, nTracks = tracksWithHits.size();
+  // Pass 1: clone each hit into recHits, update the track's hit pattern, then copy the track into `tracks`.
+  for (int i = 0; i < nTracks; i++) {
+    reco::Track* track = tracksWithHits[i].first;
+    const auto& hits = tracksWithHits[i].second;
+
+    for (unsigned int k = 0; k < hits.size(); k++) {
+      auto* hit = hits[k]->clone();
+
+      track->appendHitPattern(*hit, ttopo);
+      recHits->push_back(hit);
+    }
+    tracks->push_back(*track);
+    delete track;  // the raw Track pointers held by tracksWithHits are released here, after being copied
+  }
+
+  LogDebug("TrackProducer") << "put the collection of TrackingRecHit in the event"
+                            << "\n";
+  edm::OrphanHandle<TrackingRecHitCollection> ohRH = ev.put(std::move(recHits));
+  // Pass 2: one TrackExtra per track, referencing nHits consecutive hits starting at offset cc.
+  edm::RefProd<TrackingRecHitCollection> hitCollProd(ohRH);
+  for (int k = 0; k < nTracks; k++) {
+    reco::TrackExtra theTrackExtra{};
+    // NOTE(review): cc advances by numberOfValidHits() while hits.size() hits were stored per track - confirm they agree.
+    //fill the TrackExtra with TrackingRecHitRef
+    unsigned int nHits = tracks->at(k).numberOfValidHits();
+    theTrackExtra.setHits(hitCollProd, cc, nHits);
+    cc += nHits;
+    AlgebraicVector5 v = AlgebraicVector5(0, 0, 0, 0, 0);
+    reco::TrackExtra::TrajParams trajParams(nHits, LocalTrajectoryParameters(v, 1.));
+    reco::TrackExtra::Chi2sFive chi2s(nHits, 0);
+    theTrackExtra.setTrajParams(std::move(trajParams), std::move(chi2s));
+    trackExtras->push_back(theTrackExtra);
+  }
+
+  LogDebug("TrackProducer") << "put the collection of TrackExtra in the event"
+                            << "\n";
+  edm::OrphanHandle<reco::TrackExtraCollection> ohTE = ev.put(std::move(trackExtras));
+  // Pass 3: point every track to its TrackExtra, then put the tracks in the event.
+  for (int k = 0; k < nTracks; k++) {
+    const reco::TrackExtraRef theTrackExtraRef(ohTE, k);
+    (tracks->at(k)).setExtra(theTrackExtraRef);
+  }
+
+  ev.put(std::move(tracks));
+}
+
+#endif
diff --git a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py
index 4334d724358f3..5ff404cb603d4 100644
--- a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py
+++ b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py
@@ -11,6 +11,7 @@
 from RecoTracker.TkSeedingLayers.PixelLayerTriplets_cfi import *
 from RecoTracker.TkSeedingLayers.TTRHBuilderWithoutAngle4PixelTriplets_cfi import *
 from RecoPixelVertexing.PixelTrackFitting.pixelFitterByHelixProjections_cfi import pixelFitterByHelixProjections
+from RecoPixelVertexing.PixelTrackFitting.pixelNtupletsFitter_cfi import pixelNtupletsFitter
 from RecoPixelVertexing.PixelTrackFitting.pixelTrackFilterByKinematics_cfi import pixelTrackFilterByKinematics
 from RecoPixelVertexing.PixelTrackFitting.pixelTrackCleanerBySharedHits_cfi import pixelTrackCleanerBySharedHits
 from RecoPixelVertexing.PixelTrackFitting.pixelTracks_cfi import pixelTracks as _pixelTracks
@@ -76,4 +77,26 @@
 _pixelTracksTask_lowPU.replace(pixelTracksHitQuadruplets, pixelTracksHitTriplets)
 trackingLowPU.toReplaceWith(pixelTracksTask, _pixelTracksTask_lowPU)
 
+# Use ntuple fit and substitute previous Fitter producer with the ntuple one
+from Configuration.ProcessModifiers.pixelNtupleFit_cff import pixelNtupleFit as ntupleFit
+ntupleFit.toModify(pixelTracks, Fitter = "pixelNtupletsFitter")
+_pixelTracksTask_ntupleFit = pixelTracksTask.copy()
+_pixelTracksTask_ntupleFit.replace(pixelFitterByHelixProjections, pixelNtupletsFitter)
+ntupleFit.toReplaceWith(pixelTracksTask, _pixelTracksTask_ntupleFit)
+
+
+from Configuration.ProcessModifiers.gpu_cff import gpu
+from RecoPixelVertexing.PixelTriplets.caHitNtupletCUDA_cfi import caHitNtupletCUDA
+from RecoPixelVertexing.PixelTrackFitting.pixelTrackSoA_cfi import pixelTrackSoA
+from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromSoA_cfi import pixelTrackProducerFromSoA as _pixelTrackFromSoA
+_pixelTracksGPUTask = cms.Task(
+  caHitNtupletCUDA,
+  pixelTrackSoA,
+  pixelTracks # replaced by pixelTrackProducerFromSoA through the gpu.toReplaceWith(pixelTracks, ...) call below
+)
+
+gpu.toReplaceWith(pixelTracksTask, _pixelTracksGPUTask)
+gpu.toReplaceWith(pixelTracks,_pixelTrackFromSoA)
+
+
 pixelTracksSequence = cms.Sequence(pixelTracksTask)
diff --git a/RecoPixelVertexing/PixelTrackFitting/python/pixelNtupletsFitter_cfi.py b/RecoPixelVertexing/PixelTrackFitting/python/pixelNtupletsFitter_cfi.py
new file mode 100644
index 0000000000000..10e1e3852e9c4
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/python/pixelNtupletsFitter_cfi.py
@@ -0,0 +1,6 @@
+import FWCore.ParameterSet.Config as cms
+
+from RecoPixelVertexing.PixelTrackFitting.pixelNtupletsFitterDefault_cfi import pixelNtupletsFitterDefault
+
+pixelNtupletsFitter = pixelNtupletsFitterDefault.clone()
+
diff --git a/RecoPixelVertexing/PixelTrackFitting/src/PixelNtupletsFitter.cc b/RecoPixelVertexing/PixelTrackFitting/src/PixelNtupletsFitter.cc
new file mode 100644
index 0000000000000..96f5d5fe03448
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/src/PixelNtupletsFitter.cc
@@ -0,0 +1,102 @@
+#include "CommonTools/Utils/interface/DynArray.h"
+#include "DataFormats/GeometryCommonDetAlgo/interface/GlobalError.h"
+#include "DataFormats/GeometryCommonDetAlgo/interface/Measurement1D.h"
+#include "DataFormats/GeometryVector/interface/GlobalPoint.h"
+#include "DataFormats/GeometryVector/interface/LocalPoint.h"
+#include "DataFormats/GeometryVector/interface/Pi.h"
+#include "DataFormats/TrackingRecHit/interface/TrackingRecHit.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "Geometry/CommonDetUnit/interface/GeomDet.h"
+#include "Geometry/CommonDetUnit/interface/GeomDetType.h"
+#include "MagneticField/Engine/interface/MagneticField.h"
+#include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h"
+#include "RecoPixelVertexing/PixelTrackFitting/interface/PixelNtupletsFitter.h"
+#include "RecoPixelVertexing/PixelTrackFitting/interface/PixelTrackBuilder.h"
+#include "RecoPixelVertexing/PixelTrackFitting/interface/PixelTrackErrorParam.h"
+#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h"
+#include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h"
+
+using namespace std;
+// Stores the nominal B value, the field pointer and the fit choice: Riemann if useRiemannFit, else broken line.
+PixelNtupletsFitter::PixelNtupletsFitter(float nominalB, const MagneticField* field, bool useRiemannFit)
+    : nominalB_(nominalB), field_(field), useRiemannFit_(useRiemannFit) {}
+// Fits a hit quadruplet with the Riemann or broken-line helix fit and builds a reco::Track (null if no fit is done).
+std::unique_ptr<reco::Track> PixelNtupletsFitter::run(const std::vector<const TrackingRecHit*>& hits,
+                                                      const TrackingRegion& region,
+                                                      const edm::EventSetup&) const {
+  using namespace riemannFit;
+
+  std::unique_ptr<reco::Track> ret;
+  // ret stays null on every early return below.
+  unsigned int nhits = hits.size();
+
+  if (nhits < 2)
+    return ret;
+
+  declareDynArray(GlobalPoint, nhits, points);
+  declareDynArray(GlobalError, nhits, errors);
+  // Hit positions are taken relative to the origin of the tracking region.
+
+  for (unsigned int i = 0; i != nhits; ++i) {
+    auto const& recHit = hits[i];
+    points[i] = GlobalPoint(recHit->globalPosition().basicVector() - region.origin().basicVector());
+    errors[i] = recHit->globalPositionError();
+  }
+  // Quadruplets only (4-column inputs below); the explicit check protects builds compiled without assertions.
+  assert(nhits == 4);
+  if (nhits != 4)
+    return ret;
+
+  riemannFit::Matrix3xNd<4> hits_gp;
+  Eigen::Matrix<float, 6, 4> hits_ge = Eigen::Matrix<float, 6, 4>::Zero();
+  for (unsigned int i = 0; i < nhits; ++i) {
+    hits_gp.col(i) << points[i].x(), points[i].y(), points[i].z();
+    // packed covariance of hit i, in the order xx, yx, yy, zx, zy, zz
+    hits_ge.col(i) << errors[i].cxx(), errors[i].cyx(), errors[i].cyy(), errors[i].czx(), errors[i].czy(),
+        errors[i].czz();
+  }
+
+  HelixFit fittedTrack = useRiemannFit_ ? riemannFit::helixFit(hits_gp, hits_ge, nominalB_, true)
+                                        : brokenline::helixFit(hits_gp, hits_ge, nominalB_);
+
+  int iCharge = fittedTrack.qCharge;
+
+  // parameters are:
+  // 0: phi
+  // 1: tip
+  // 2: curvature
+  // 3: cottheta
+  // 4: zip
+  float valPhi = fittedTrack.par(0);
+
+  float valTip = fittedTrack.par(1);
+
+  float valCotTheta = fittedTrack.par(3);
+
+  float valZip = fittedTrack.par(4);
+  float valPt = fittedTrack.par(2);
+  // NOTE(review): par(2) is listed as "curvature" above but is handed to the builder as pt - confirm its meaning.
+  //  PixelTrackErrorParam param(valEta, valPt);
+  float errValPhi = std::sqrt(fittedTrack.cov(0, 0));
+  float errValTip = std::sqrt(fittedTrack.cov(1, 1));
+
+  float errValPt = std::sqrt(fittedTrack.cov(2, 2));
+
+  float errValCotTheta = std::sqrt(fittedTrack.cov(3, 3));
+  float errValZip = std::sqrt(fittedTrack.cov(4, 4));
+
+  float chi2 = fittedTrack.chi2_line + fittedTrack.chi2_circle;
+
+  PixelTrackBuilder builder;
+  Measurement1D phi(valPhi, errValPhi);
+  Measurement1D tip(valTip, errValTip);
+
+  Measurement1D pt(valPt, errValPt);
+  Measurement1D cotTheta(valCotTheta, errValCotTheta);
+  Measurement1D zip(valZip, errValZip);
+
+  ret.reset(builder.build(pt, phi, cotTheta, tip, zip, chi2, iCharge, hits, field_, region.origin()));
+  return ret;
+}
diff --git a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml
index 44820da381dd1..98dc3d9b282f1 100644
--- a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml
+++ b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml
@@ -1,8 +1,80 @@
-<use name="boost"/>
-<use name="root"/>
-<use name="DataFormats/TrackReco"/>
-<use name="FWCore/Framework"/>
-<use name="FWCore/ParameterSet"/>
 <library file="PixelTrackTest.cc" name="PixelTrackTest">
+  <use name="boost"/>
+  <use name="root"/>
+  <use name="DataFormats/Common"/>
+  <use name="DataFormats/TrackReco"/>
+  <use name="FWCore/Framework"/>
+  <use name="FWCore/MessageLogger"/>
+  <use name="FWCore/ParameterSet"/>
   <flags EDM_PLUGIN="1"/>
 </library>
+
+<bin file="testRiemannFit.cpp">
+  <use name="cuda"/>
+  <use name="eigen"/>
+  <flags CXXFLAGS="-g"/>
+</bin>
+
+<bin file="testFits.cpp" name="testBrokenLineFit">
+  <use name="cuda"/>
+  <use name="eigen"/>
+  <flags CXXFLAGS="-g -DUSE_BL"/>
+</bin>
+
+<bin file="testFits.cpp" name="testRiemannFitDump">
+  <use name="cuda"/>
+  <use name="eigen"/>
+  <flags CXXFLAGS="-g -DRFIT_DEBUG"/>
+</bin>
+
+<bin file="testEigenGPU.cu" name="testRiemannFitGPU_t">
+  <use name="HeterogeneousCore/CUDAUtilities"/>
+  <use name="cuda"/>
+  <use name="eigen"/>
+  <flags CXXFLAGS="-g"/>
+</bin>
+
+<bin file="testEigenGPU.cu" name="testBrokenLineFitGPU_t">
+  <use name="HeterogeneousCore/CUDAUtilities"/>
+  <use name="cuda"/>
+  <use name="eigen"/>
+  <flags CXXFLAGS="-g -DUSE_BL"/>
+</bin>
+
+<bin file="testEigenGPUNoFit.cu" name="testEigenGPUNoFit_t">
+  <use name="cuda"/>
+  <use name="eigen"/>
+  <use name="HeterogeneousCore/CUDAUtilities"/>
+  <flags CXXFLAGS="-g"/>
+</bin>
+
+<bin file="PixelTrackRiemannFit.cc">
+  <use name="cuda"/>
+  <use name="eigen"/>
+  <use name="root"/>
+  <flags CXXFLAGS="-DEIGEN_NO_DEBUG"/>
+</bin>
+
+<bin file="PixelTrackFits.cc" name="PixelTrackBrokenLineFit">
+  <use name="cuda"/>
+  <use name="eigen"/>
+  <use name="root"/>
+  <flags CXXFLAGS="-DEIGEN_NO_DEBUG -DUSE_BL"/>
+</bin>
+
+<bin file="PixelTrackFits.cc" name="PixelTrackRiemannFit_Debug">
+  <use name="cuda"/>
+  <use name="eigen"/>
+  <use name="root"/>
+  <flags CXXFLAGS="-g"/>
+</bin>
+
+<bin file="testEigenJacobian.cpp">  
+  <use name="cuda"/>
+  <use name="eigen"/>
+  <use name="DataFormats/GeometrySurface"/>
+  <use name="MagneticField/Engine"/>
+  <use name="TrackingTools/AnalyticalJacobians"/>
+  <use name="TrackingTools/TrajectoryParametrization"/>
+  <flags CXXFLAGS="-g"/>
+</bin>
diff --git a/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackFits.cc b/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackFits.cc
new file mode 100644
index 0000000000000..e5a652e9d43f8
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackFits.cc
@@ -0,0 +1,431 @@
+#define _USE_MATH_DEFINES
+
+#include <chrono>
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <random>
+
+#include <TFile.h>
+#include <TH1F.h>
+
+#ifdef USE_BL
+#include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h"
+#else
+#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h"
+#endif
+
+using namespace std;
+using namespace Eigen;
+using namespace riemannFit;
+using std::unique_ptr;
+// Additional fixed-size Eigen column-vector aliases, declared inside the riemannFit namespace for this test.
+namespace riemannFit {
+  using Vector3i = Eigen::Matrix<int, 3, 1>;
+  using Vector4i = Eigen::Matrix<int, 4, 1>;
+  using Vector6d = Eigen::Matrix<double, 6, 1>;
+  using Vector8d = Eigen::Matrix<double, 8, 1>;
+};  // namespace riemannFit
+
+// Generated quadruplet: hit positions (3x4), per-hit packed covariances (6x4, one column per hit), true parameters.
+struct hits_gen {
+  Matrix3xNd<4> hits;
+  Eigen::Matrix<float, 6, 4> hits_ge;
+  Vector5d true_par;
+};
+// Detector description container. NOTE(review): no use of this struct is visible in this part of the file - confirm.
+struct geometry {
+  Vector8d barrel;
+  Vector4i barrel_2;
+  Vector8d R_err;
+  Vector8d Rp_err;
+  Vector8d z_err;
+  Vector6d hand;
+  Vector3i hand_2;
+  Vector6d xy_err;
+  Vector6d zh_err;
+  double z_max;
+  double r_max;
+};
+
+void test_helix_fit();  // NOTE(review): the definition further down takes a bool - confirm this overload is needed
+
+constexpr int c_speed = 299792458;  // speed of light in m/s
+constexpr double pi = M_PI;
+default_random_engine generator(1);  // seeded with 1, shared by all the smearing calls
+// Smears the hit position (x, y, z) in place with Gaussian noise whose widths are taken from err.
+void smearing(const Vector5d& err, const bool& isbarrel, double& x, double& y, double& z) {
+  normal_distribution<double> dist_R(0., err[0]);
+  normal_distribution<double> dist_Rp(0., err[1]);
+  normal_distribution<double> dist_z(0., err[2]);
+  normal_distribution<double> dist_xyh(0., err[3]);
+  normal_distribution<double> dist_zh(0., err[4]);
+  if (isbarrel) {  // shifts along the radial (err[0]) and tangential (err[1]) directions in x-y, and along z (err[2])
+    double dev_Rp = dist_Rp(generator);
+    double dev_R = dist_R(generator);
+    double R = sqrt(riemannFit::sqr(x) + riemannFit::sqr(y));
+    x += dev_Rp * +y / R + dev_R * -x / R;
+    y += dev_Rp * -x / R + dev_R * -y / R;
+    z += dist_z(generator);
+  } else {  // independent shifts of x and y (err[3]) and of z (err[4])
+    x += dist_xyh(generator);
+    y += dist_xyh(generator);
+    z += dist_zh(generator);
+  }
+}
+// Writes the covariance of hit i into column i of V: slots 0 (xx), 2 (yy), 5 (zz) and, for barrel hits, 1 (xy).
+template <int N>
+void Hits_cov(Eigen::Matrix<float, 6, 4>& V,
+              const unsigned int& i,
+              const unsigned int& n,  // not used in the body
+              const Matrix3xNd<N>& hits,
+              const Vector5d& err,
+              bool isbarrel) {
+  if (isbarrel) {  // combines err[0] (weighted along the hit's own x-y direction) and err[1] (perpendicular to it)
+    double R2 = riemannFit::sqr(hits(0, i)) + riemannFit::sqr(hits(1, i));
+    V.col(i)[0] = (riemannFit::sqr(err[1]) * riemannFit::sqr(hits(1, i)) +
+                   riemannFit::sqr(err[0]) * riemannFit::sqr(hits(0, i))) /
+                  R2;
+    V.col(i)[2] = (riemannFit::sqr(err[1]) * riemannFit::sqr(hits(0, i)) +
+                   riemannFit::sqr(err[0]) * riemannFit::sqr(hits(1, i))) /
+                  R2;
+    V.col(i)[1] = (riemannFit::sqr(err[0]) - riemannFit::sqr(err[1])) * hits(1, i) * hits(0, i) / R2;
+    V.col(i)[5] = riemannFit::sqr(err[2]);
+  } else {  // diagonal terms only: err[3] for x and y, err[4] for z
+    V.col(i)[0] = riemannFit::sqr(err[3]);
+    V.col(i)[2] = riemannFit::sqr(err[3]);
+    V.col(i)[5] = riemannFit::sqr(err[4]);
+  }
+}
+// Generates n smeared hits at the radii rad[i] from the circle/helix described by gen_par, with their covariances.
+hits_gen Hits_gen(const unsigned int& n, const Matrix<double, 6, 1>& gen_par) {
+  hits_gen gen;
+  gen.hits = MatrixXd::Zero(3, n);  // NOTE(review): hits and hits_ge have 4 columns, so n is presumably expected to be 4
+  gen.hits_ge = Eigen::Matrix<float, 6, 4>::Zero();
+  // err /= 10000.;
+  constexpr double rad[8] = {2.95, 6.8, 10.9, 16., 3.1, 7., 11., 16.2};
+  // constexpr double R_err[8] = {5./10000, 5./10000, 5./10000, 5./10000, 5./10000,
+  // 5./10000, 5./10000, 5./10000};  constexpr double Rp_err[8] = {35./10000, 18./10000,
+  // 15./10000, 34./10000, 35./10000, 18./10000, 15./10000, 34./10000};  constexpr double z_err[8] =
+  // {72./10000, 38./10000, 25./10000, 56./10000, 72./10000, 38./10000, 25./10000, 56./10000};
+  constexpr double R_err[8] = {
+      10. / 10000, 10. / 10000, 10. / 10000, 10. / 10000, 10. / 10000, 10. / 10000, 10. / 10000, 10. / 10000};
+  constexpr double Rp_err[8] = {
+      35. / 10000, 18. / 10000, 15. / 10000, 34. / 10000, 35. / 10000, 18. / 10000, 15. / 10000, 34. / 10000};
+  constexpr double z_err[8] = {
+      72. / 10000, 38. / 10000, 25. / 10000, 56. / 10000, 72. / 10000, 38. / 10000, 25. / 10000, 56. / 10000};
+  const double x2 = gen_par(0) + gen_par(4) * cos(gen_par(3) * pi / 180);  // (x2, y2): point at distance gen_par(4)
+  const double y2 = gen_par(1) + gen_par(4) * sin(gen_par(3) * pi / 180);  // from (gen_par(0), gen_par(1))
+  const double alpha = atan2(y2, x2);
+  // For each radius rad[i]: beta comes from the law of cosines on the triangle with sides a, b, c.
+  for (unsigned int i = 0; i < n; ++i) {
+    const double a = gen_par(4);
+    const double b = rad[i];
+    const double c = sqrt(riemannFit::sqr(x2) + riemannFit::sqr(y2));
+    const double beta = acos((riemannFit::sqr(a) - riemannFit::sqr(b) - riemannFit::sqr(c)) / (-2. * b * c));
+    const double gamma = alpha + beta;
+    gen.hits(0, i) = rad[i] * cos(gamma);
+    gen.hits(1, i) = rad[i] * sin(gamma);
+    gen.hits(2, i) =
+        gen_par(2) +
+        1 / tan(gen_par(5) * pi / 180) * 2. *
+            asin(sqrt(riemannFit::sqr((gen_par(0) - gen.hits(0, i))) + riemannFit::sqr((gen_par(1) - gen.hits(1, i)))) /
+                 (2. * gen_par(4))) *
+            gen_par(4);
+    // isbarrel(i) = ??  (every hit is smeared and given a covariance as a barrel hit: `true` is passed below)
+    Vector5d err;
+    err << R_err[i], Rp_err[i], z_err[i], 0, 0;
+    smearing(err, true, gen.hits(0, i), gen.hits(1, i), gen.hits(2, i));
+    Hits_cov(gen.hits_ge, i, n, gen.hits, err, true);
+  }
+
+  return gen;
+}
+// Builds the 5 true track parameters from gen_par. NOTE(review): `charge` is unused, qCharge is fixed to 1 - confirm.
+Vector5d True_par(const Matrix<double, 6, 1>& gen_par, const int& charge, const double& B_field) {
+  Vector5d true_par;
+  const double x0 = gen_par(0) + gen_par(4) * cos(gen_par(3) * pi / 180);
+  const double y0 = gen_par(1) + gen_par(4) * sin(gen_par(3) * pi / 180);
+  CircleFit circle;
+  circle.par << x0, y0, gen_par(4);
+  circle.qCharge = 1;
+  riemannFit::par_uvrtopak(circle, B_field, false);  // converts circle.par in place; its 3 values fill true_par(0..2)
+  true_par.block(0, 0, 3, 1) = circle.par;
+  true_par(3) = 1 / tan(gen_par(5) * pi / 180);
+  const int dir = ((gen_par(0) - cos(true_par(0) - pi / 2) * true_par(1)) * (gen_par(1) - y0) -
+                       (gen_par(1) - sin(true_par(0) - pi / 2) * true_par(1)) * (gen_par(0) - x0) >
+                   0)
+                      ? -1
+                      : 1;
+  true_par(4) = gen_par(2) + 1 / tan(gen_par(5) * pi / 180) * dir * 2.f *
+                                 asin(sqrt(riemannFit::sqr((gen_par(0) - cos(true_par(0) - pi / 2) * true_par(1))) +
+                                           riemannFit::sqr((gen_par(1) - sin(true_par(0) - pi / 2) * true_par(1)))) /
+                                      (2.f * gen_par(4))) *
+                                 gen_par(4);
+  return true_par;
+}
+// Converts the input parameters: entry 3 shifted by charge * 90, entry 4 divided by B_field, entry 5 to degrees.
+Matrix<double, 6, 1> New_par(const Matrix<double, 6, 1>& gen_par, const int& charge, const double& B_field) {
+  Matrix<double, 6, 1> new_par;
+  new_par.block(0, 0, 3, 1) = gen_par.block(0, 0, 3, 1);  // entries 0..2 are copied unchanged
+  new_par(3) = gen_par(3) - charge * 90;
+  new_par(4) = gen_par(4) / B_field;
+  //  new_par(5) = atan(sinh(gen_par(5))) * 180 / pi;
+  new_par(5) = 2. * atan(exp(-gen_par(5))) * 180 / pi;  // 2 * atan(exp(-eta)): polar angle from pseudorapidity
+  return new_par;
+}
+// Prints pulls, errors and correlations of the first `iteration` fits (iteration <= N) vs true_par; writes histograms.
+template <typename Fit, size_t N>
+void computePull(std::array<Fit, N>& fit, const char* label, int n_, int iteration, const Vector5d& true_par) {
+  Eigen::Matrix<double, 41, Eigen::Dynamic, 1> score(41, iteration);
+  // score(k, x): quantity k for fit x; rows 26-29 and 40 are neither filled nor read.
+  std::string histo_name("Phi Pull");
+  histo_name += label;
+  TH1F phi_pull(histo_name.data(), histo_name.data(), 100, -10., 10.);
+  histo_name = "dxy Pull ";
+  histo_name += label;
+  TH1F dxy_pull(histo_name.data(), histo_name.data(), 100, -10., 10.);
+  histo_name = "dz Pull ";
+  histo_name += label;
+  TH1F dz_pull(histo_name.data(), histo_name.data(), 100, -10., 10.);
+  histo_name = "Theta Pull ";
+  histo_name += label;
+  TH1F theta_pull(histo_name.data(), histo_name.data(), 100, -10., 10.);
+  histo_name = "Pt Pull ";
+  histo_name += label;
+  TH1F pt_pull(histo_name.data(), histo_name.data(), 100, -10., 10.);
+  histo_name = "Phi Error ";
+  histo_name += label;
+  TH1F phi_error(histo_name.data(), histo_name.data(), 100, 0., 0.1);
+  histo_name = "dxy error ";
+  histo_name += label;
+  TH1F dxy_error(histo_name.data(), histo_name.data(), 100, 0., 0.1);
+  histo_name = "dz error ";
+  histo_name += label;
+  TH1F dz_error(histo_name.data(), histo_name.data(), 100, 0., 0.1);
+  histo_name = "Theta error ";
+  histo_name += label;
+  TH1F theta_error(histo_name.data(), histo_name.data(), 100, 0., 0.1);
+  histo_name = "Pt error ";
+  histo_name += label;
+  TH1F pt_error(histo_name.data(), histo_name.data(), 100, 0., 0.1);
+  for (int x = 0; x < iteration; x++) {
+    // Compute PULLS information
+    score(0, x) = (fit[x].par(0) - true_par(0)) / sqrt(fit[x].cov(0, 0));
+    score(1, x) = (fit[x].par(1) - true_par(1)) / sqrt(fit[x].cov(1, 1));
+    score(2, x) = (fit[x].par(2) - true_par(2)) / sqrt(fit[x].cov(2, 2));
+    score(3, x) = (fit[x].par(3) - true_par(3)) / sqrt(fit[x].cov(3, 3));
+    score(4, x) = (fit[x].par(4) - true_par(4)) / sqrt(fit[x].cov(4, 4));
+    phi_pull.Fill(score(0, x));
+    dxy_pull.Fill(score(1, x));
+    pt_pull.Fill(score(2, x));
+    theta_pull.Fill(score(3, x));
+    dz_pull.Fill(score(4, x));
+    phi_error.Fill(sqrt(fit[x].cov(0, 0)));
+    dxy_error.Fill(sqrt(fit[x].cov(1, 1)));
+    pt_error.Fill(sqrt(fit[x].cov(2, 2)));
+    theta_error.Fill(sqrt(fit[x].cov(3, 3)));
+    dz_error.Fill(sqrt(fit[x].cov(4, 4)));
+    score(5, x) = (fit[x].par(0) - true_par(0)) * (fit[x].par(1) - true_par(1)) / (fit[x].cov(0, 1));
+    score(6, x) = (fit[x].par(0) - true_par(0)) * (fit[x].par(2) - true_par(2)) / (fit[x].cov(0, 2));
+    score(7, x) = (fit[x].par(1) - true_par(1)) * (fit[x].par(2) - true_par(2)) / (fit[x].cov(1, 2));
+    score(8, x) = (fit[x].par(3) - true_par(3)) * (fit[x].par(4) - true_par(4)) / (fit[x].cov(3, 4));
+    score(9, x) = fit[x].chi2_circle;
+    score(25, x) = fit[x].chi2_line;
+    // relative errors (%) of all five parameters go to rows 10..14: rows 11 and 12 are printed below, so fill them too
+    for (int p = 0; p < 5; ++p)
+      score(10 + p, x) = sqrt(fit[x].cov(p, p)) / fit[x].par(p) * 100;
+    score(15, x) =
+        (fit[x].par(0) - true_par(0)) * (fit[x].par(3) - true_par(3)) / sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(3, 3));
+    score(16, x) =
+        (fit[x].par(1) - true_par(1)) * (fit[x].par(3) - true_par(3)) / sqrt(fit[x].cov(1, 1)) / sqrt(fit[x].cov(3, 3));
+    score(17, x) =
+        (fit[x].par(2) - true_par(2)) * (fit[x].par(3) - true_par(3)) / sqrt(fit[x].cov(2, 2)) / sqrt(fit[x].cov(3, 3));
+    score(18, x) =
+        (fit[x].par(0) - true_par(0)) * (fit[x].par(4) - true_par(4)) / sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(4, 4));
+    score(19, x) =
+        (fit[x].par(1) - true_par(1)) * (fit[x].par(4) - true_par(4)) / sqrt(fit[x].cov(1, 1)) / sqrt(fit[x].cov(4, 4));
+    score(20, x) =
+        (fit[x].par(2) - true_par(2)) * (fit[x].par(4) - true_par(4)) / sqrt(fit[x].cov(2, 2)) / sqrt(fit[x].cov(4, 4));
+    score(21, x) =
+        (fit[x].par(0) - true_par(0)) * (fit[x].par(1) - true_par(1)) / sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(1, 1));
+    score(22, x) =
+        (fit[x].par(0) - true_par(0)) * (fit[x].par(2) - true_par(2)) / sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(2, 2));
+    score(23, x) =
+        (fit[x].par(1) - true_par(1)) * (fit[x].par(2) - true_par(2)) / sqrt(fit[x].cov(1, 1)) / sqrt(fit[x].cov(2, 2));
+    score(24, x) =
+        (fit[x].par(3) - true_par(3)) * (fit[x].par(4) - true_par(4)) / sqrt(fit[x].cov(3, 3)) / sqrt(fit[x].cov(4, 4));
+    score(30, x) = fit[x].par(0);
+    score(31, x) = fit[x].par(1);
+    score(32, x) = fit[x].par(2);
+    score(33, x) = fit[x].par(3);
+    score(34, x) = fit[x].par(4);
+    score(35, x) = sqrt(fit[x].cov(0, 0));
+    score(36, x) = sqrt(fit[x].cov(1, 1));
+    score(37, x) = sqrt(fit[x].cov(2, 2));
+    score(38, x) = sqrt(fit[x].cov(3, 3));
+    score(39, x) = sqrt(fit[x].cov(4, 4));
+  }
+
+  double phi_ = score.row(0).mean();
+  double a_ = score.row(1).mean();
+  double pt_ = score.row(2).mean();
+  double coT_ = score.row(3).mean();
+  double Zip_ = score.row(4).mean();
+  std::cout << std::setprecision(5) << std::scientific << label << " AVERAGE FITTED VALUES: \n"
+            << "phi: " << score.row(30).mean() << " +/- " << score.row(35).mean() << " [+/-] "
+            << sqrt(score.row(35).array().abs2().mean() - score.row(35).mean() * score.row(35).mean()) << std::endl
+            << "d0:  " << score.row(31).mean() << " +/- " << score.row(36).mean() << " [+/-] "
+            << sqrt(score.row(36).array().abs2().mean() - score.row(36).mean() * score.row(36).mean()) << std::endl
+            << "pt:  " << score.row(32).mean() << " +/- " << score.row(37).mean() << " [+/-] "
+            << sqrt(score.row(37).array().abs2().mean() - score.row(37).mean() * score.row(37).mean()) << std::endl
+            << "coT: " << score.row(33).mean() << " +/- " << score.row(38).mean() << " [+/-] "
+            << sqrt(score.row(38).array().abs2().mean() - score.row(38).mean() * score.row(38).mean()) << std::endl
+            << "Zip: " << score.row(34).mean() << " +/- " << score.row(39).mean() << " [+/-] "
+            << sqrt(score.row(39).array().abs2().mean() - score.row(39).mean() * score.row(39).mean()) << std::endl;
+  // Symmetric 5x5 matrix of mean normalized residual products; order phi, d0, pt, coT, Zip; (0,4) is row 18 (phi-Zip).
+  Matrix5d correlation;
+  correlation << 1., score.row(21).mean(), score.row(22).mean(), score.row(15).mean(), score.row(18).mean(),
+      score.row(21).mean(), 1., score.row(23).mean(), score.row(16).mean(), score.row(19).mean(), score.row(22).mean(),
+      score.row(23).mean(), 1., score.row(17).mean(), score.row(20).mean(), score.row(15).mean(), score.row(16).mean(),
+      score.row(17).mean(), 1., score.row(24).mean(), score.row(18).mean(), score.row(19).mean(), score.row(20).mean(),
+      score.row(24).mean(), 1.;
+
+  cout << "\n"
+       << label << " PULLS (mean, sigma, relative_error):\n"
+       << "phi:  " << phi_ << "     " << sqrt((score.row(0).array() - phi_).square().sum() / (iteration - 1)) << "   "
+       << abs(score.row(10).mean()) << "%\n"
+       << "a0 :  " << a_ << "     " << sqrt((score.row(1).array() - a_).square().sum() / (iteration - 1)) << "   "
+       << abs(score.row(11).mean()) << "%\n"
+       << "pt :  " << pt_ << "     " << sqrt((score.row(2).array() - pt_).square().sum() / (iteration - 1)) << "   "
+       << abs(score.row(12).mean()) << "%\n"
+       << "coT:  " << coT_ << "     " << sqrt((score.row(3).array() - coT_).square().sum() / (iteration - 1)) << "   "
+       << abs(score.row(13).mean()) << "%\n"
+       << "Zip:  " << Zip_ << "     " << sqrt((score.row(4).array() - Zip_).square().sum() / (iteration - 1)) << "   "
+       << abs(score.row(14).mean()) << "%\n\n"
+       << "cov(phi,a0)_:  " << score.row(5).mean() << "\n"
+       << "cov(phi,pt)_:  " << score.row(6).mean() << "\n"
+       << "cov(a0,pt)_:   " << score.row(7).mean() << "\n"
+       << "cov(coT,Zip)_: " << score.row(8).mean() << "\n\n"
+       << "chi2_circle:  " << score.row(9).mean() << " vs " << n_ - 3 << "\n"
+       << "chi2_line:    " << score.row(25).mean() << " vs " << n_ - 2 << "\n\n"
+       << "correlation matrix:\n"
+       << correlation << "\n\n"
+       << endl;
+
+  phi_pull.Fit("gaus", "Q");
+  dxy_pull.Fit("gaus", "Q");
+  dz_pull.Fit("gaus", "Q");
+  theta_pull.Fit("gaus", "Q");
+  pt_pull.Fit("gaus", "Q");
+  phi_pull.Write();
+  dxy_pull.Write();
+  dz_pull.Write();
+  theta_pull.Write();
+  pt_pull.Write();
+  phi_error.Write();
+  dxy_error.Write();
+  dz_error.Write();
+  theta_error.Write();
+  pt_error.Write();
+}
+
+// Closure test of the helix fit on generated hits.
+//
+// Hit sets are generated `100 * iteration` times from a single set of track parameters
+// and fitted (brokenline::helixFit when USE_BL is defined, riemannFit::helixFit
+// otherwise). Results are stored in a ring buffer of `iteration` entries, so only the
+// last `iteration` fits are handed to computePull(). Only the time spent inside the
+// fit call itself is accumulated and printed (nanoseconds / 1e6).
+//
+// getcin: if true, the number of hits and the generation parameters are read from
+//         std::cin; otherwise the built-in defaults below are used.
+void test_helix_fit(bool getcin) {
+  int n_;
+  const double B_field = 3.8 * c_speed / pow(10, 9) / 100;
+  Matrix<double, 6, 1> gen_par;
+  Vector5d true_par;
+  generator.seed(1);  // fixed seed: every run generates the same hits
+  std::cout << std::setprecision(6);
+  cout << "_________________________________________________________________________\n";
+  cout << "n x(cm) y(cm) z(cm) phi(grad) R(Gev/c) eta iteration debug" << endl;
+  if (getcin) {
+    cout << "hits: ";
+    cin >> n_;
+    cout << "x: ";
+    cin >> gen_par(0);
+    cout << "y: ";
+    cin >> gen_par(1);
+    cout << "z: ";
+    cin >> gen_par(2);
+    cout << "phi: ";
+    cin >> gen_par(3);
+    cout << "p_t: ";
+    cin >> gen_par(4);
+    cout << "eta: ";
+    cin >> gen_par(5);
+  } else {
+    n_ = 4;
+    gen_par(0) = -0.1;  // x
+    gen_par(1) = 0.1;   // y
+    gen_par(2) = -1.;   // z
+    gen_par(3) = 45.;   // phi
+    gen_par(4) = 10.;   // R (p_t)
+    gen_par(5) = 1.;    // eta
+  }
+
+  const int iteration = 5000;
+  gen_par = New_par(gen_par, 1, B_field);
+  true_par = True_par(gen_par, 1, B_field);
+  std::array<HelixFit, iteration> helixRiemann_fit;
+
+  std::cout << "\nTrue parameters: "
+            << "phi: " << true_par(0) << " "
+            << "dxy: " << true_par(1) << " "
+            << "pt: " << true_par(2) << " "
+            << "CotT: " << true_par(3) << " "
+            << "Zip: " << true_par(4) << " " << std::endl;
+
+  // accumulated time spent in the fit calls only (hit generation is excluded)
+  std::chrono::high_resolution_clock::duration delta{};
+  for (int i = 0; i < 100 * iteration; i++) {
+    hits_gen gen;
+    gen = Hits_gen(n_, gen_par);
+    // To fit a fixed set of hits instead of generated ones, uncomment:
+    //      gen.hits = MatrixXd::Zero(3, 4);
+    //      gen.hits_cov = MatrixXd::Zero(3 * 4, 3 * 4);
+    //      gen.hits.col(0) << 1.82917642593, 2.0411875248, 7.18495464325;
+    //      gen.hits.col(1) << 4.47041416168, 4.82704305649, 18.6394691467;
+    //      gen.hits.col(2) << 7.25991010666, 7.74653434753, 30.6931324005;
+    //      gen.hits.col(3) << 8.99161434174, 9.54262828827, 38.1338043213;
+
+    auto& fit = helixRiemann_fit[i % iteration];  // ring-buffer slot for this iteration
+    const auto fitBegin = std::chrono::high_resolution_clock::now();
+#ifdef USE_BL
+    fit = brokenline::helixFit(gen.hits, gen.hits_ge, B_field);
+#else
+    fit = riemannFit::helixFit(gen.hits, gen.hits_ge, B_field, true);
+#endif
+    delta += std::chrono::high_resolution_clock::now() - fitBegin;
+
+    if (fit.par(0) > 10.)
+      std::cout << "error" << std::endl;
+    // dump the very first fit in full for a quick visual check
+    if (0 == i)
+      cout << std::setprecision(6) << "phi:  " << fit.par(0) << " +/- " << sqrt(fit.cov(0, 0)) << " vs "
+           << true_par(0) << endl
+           << "Tip:  " << fit.par(1) << " +/- " << sqrt(fit.cov(1, 1)) << " vs " << true_par(1) << endl
+           << "p_t:  " << fit.par(2) << " +/- " << sqrt(fit.cov(2, 2)) << " vs " << true_par(2) << endl
+           << "theta:" << fit.par(3) << " +/- " << sqrt(fit.cov(3, 3)) << " vs " << true_par(3) << endl
+           << "Zip:  " << fit.par(4) << " +/- " << sqrt(fit.cov(4, 4)) << " vs " << true_par(4) << endl
+           << "charge:" << fit.qCharge << " vs 1" << endl
+           << "covariance matrix:" << endl
+           << fit.cov << endl
+           << "Initial hits:\n"
+           << gen.hits << endl
+           << "Initial Covariance:\n"
+           << gen.hits_ge << endl;
+  }
+  std::cout << "elapsed time " << double(std::chrono::duration_cast<std::chrono::nanoseconds>(delta).count()) / 1.e6
+            << std::endl;
+  computePull(helixRiemann_fit, "Riemann", n_, iteration, true_par);
+}
+
+// Entry point: opens the output file, runs the helix-fit test and closes the file.
+// Passing any command-line argument makes the test read its parameters from std::cin.
+int main(int nargs, char**) {
+  const bool readParametersFromStdin = nargs > 1;
+
+  // the file must exist before the test runs, because the test writes its histograms
+  TFile outputFile("TestFitResults.root", "RECREATE");
+  test_helix_fit(readParametersFromStdin);
+  outputFile.Close();
+
+  return 0;
+}
diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu
new file mode 100644
index 0000000000000..d5eba9be26594
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu
@@ -0,0 +1,343 @@
+#include <iostream>
+
+#include <Eigen/Core>
+#include <Eigen/Eigenvalues>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+
+#ifdef USE_BL
+#include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h"
+#else
+#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h"
+#endif
+
+#include "test_common.h"
+
+using namespace Eigen;
+
+namespace riemannFit {
+  // Number of tracks every buffer in this test is dimensioned for.
+  constexpr uint32_t maxNumberOfTracks() { return 5 * 1024; }
+  // Distance, in elements, between two consecutive coefficients of the same track.
+  constexpr uint32_t stride() { return maxNumberOfTracks(); }
+
+  // Hits: a 3 x N matrix of doubles per track.
+  template <int N>
+  using Matrix3xNd = Eigen::Matrix<double, 3, N>;
+  // View of one track's hits: rows are stride() apart, columns 3 * stride() apart.
+  template <int N>
+  using Map3xNd = Eigen::Map<Matrix3xNd<N>, Eigen::Unaligned, Eigen::Stride<3 * stride(), stride()>>;
+
+  // Hit errors: a 6 x N matrix of floats per track.
+  template <int N>
+  using Matrix6xNf = Eigen::Matrix<float, 6, N>;
+  // View of one track's hit errors: rows are stride() apart, columns 6 * stride() apart.
+  template <int N>
+  using Map6xNf = Eigen::Map<Matrix6xNf<N>, Eigen::Unaligned, Eigen::Stride<6 * stride(), stride()>>;
+
+  // Fast-fit result: view of one track's 4 doubles, stride() apart.
+  using Map4d = Eigen::Map<Vector4d, Eigen::Unaligned, Eigen::InnerStride<stride()>>;
+
+}  // namespace riemannFit
+
+// Debug kernel: the thread with global index 0 prints a few sizeof() values as seen by
+// the device compiler; every other thread returns immediately.
+//
+// phits, phits_ge: device buffers laid out as expected by Map3xNd<N> / Map6xNf<N>.
+// They are only used to build the maps; no element is read.
+template <int N>
+__global__ void kernelPrintSizes(double* __restrict__ phits, float* __restrict__ phits_ge) {
+  auto i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i != 0)
+    return;
+  // The runtime dimensions must match the compile-time ones (3 x N and 6 x N):
+  // a hard-coded column count is only correct for N == 4.
+  riemannFit::Map3xNd<N> hits(phits + i, 3, N);
+  riemannFit::Map6xNf<N> hits_ge(phits_ge + i, 6, N);
+  // sizeof(hits[i]) / sizeof(hits_ge[i]) are unevaluated: they give the size of one coefficient
+  printf("GPU sizes %lu %lu %lu %lu %lu\n",
+         sizeof(hits[i]),
+         sizeof(hits_ge[i]),
+         sizeof(Vector4d),
+         sizeof(riemannFit::LineFit),
+         sizeof(riemannFit::CircleFit));
+}
+
+// Fast fit, one thread per track: reads the hits of track `tk` through a strided map
+// and writes the 4 fast-fit values of that track into presults.
+// No bounds check is done: the launch must not cover more tracks than the buffers hold.
+template <int N>
+__global__ void kernelFastFit(double* __restrict__ phits, double* __restrict__ presults) {
+  const auto tk = blockIdx.x * blockDim.x + threadIdx.x;
+
+  riemannFit::Map3xNd<N> hits(phits + tk, 3, N);
+  riemannFit::Map4d result(presults + tk, 4);
+
+  // same entry point name in both fitters, selected at compile time
+#ifdef USE_BL
+  brokenline::fastFit(hits, result);
+#else
+  riemannFit::fastFit(hits, result);
+#endif
+}
+
+#ifdef USE_BL
+
+// BrokenLine fit, one thread per track: prepares the fit data, then runs the line fit
+// and the circle fit, writing into line_fit[i] and circle_fit[i].
+// After the circle fit, par(2) is replaced by B / |par(2)| and the covariance is
+// transformed with the matching Jacobian.
+// No bounds check is done: the launch must not cover more tracks than the buffers hold.
+template <int N>
+__global__ void kernelBrokenLineFit(double* __restrict__ phits,
+                                    float* __restrict__ phits_ge,
+                                    double* __restrict__ pfast_fit_input,
+                                    double B,
+                                    riemannFit::CircleFit* circle_fit,
+                                    riemannFit::LineFit* line_fit) {
+  const auto i = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // strided views of this track's inputs
+  riemannFit::Map3xNd<N> hits(phits + i, 3, N);
+  riemannFit::Map6xNf<N> hits_ge(phits_ge + i, 6, N);
+  riemannFit::Map4d fast_fit_input(pfast_fit_input + i, 4);
+
+  // this track's output slots
+  auto& circle = circle_fit[i];
+  auto& line = line_fit[i];
+
+  brokenline::PreparedBrokenLineData<N> data;
+  brokenline::prepareBrokenLineData(hits, fast_fit_input, B, data);
+  brokenline::lineFit(hits_ge, fast_fit_input, B, data, line);
+  brokenline::circleFit(hits, hits_ge, fast_fit_input, B, data, circle);
+
+  // value of par(2) as returned by the circle fit, before it is overwritten below
+  const auto fittedPar2 = circle.par(2);
+
+  riemannFit::Matrix3d Jacob;
+  Jacob << 1., 0, 0,
+      0, 1., 0,
+      0, 0, -B / std::copysign(riemannFit::sqr(fittedPar2), fittedPar2);
+
+  circle.par(2) = B / std::abs(fittedPar2);
+  circle.cov = Jacob * circle.cov * Jacob.transpose();
+
+#ifdef TEST_DEBUG
+  if (0 == i) {
+    printf("Circle param %f,%f,%f\n", circle_fit[i].par(0), circle_fit[i].par(1), circle_fit[i].par(2));
+  }
+#endif
+}
+
+#else
+
+// Riemann circle fit, one thread per track: builds the 2D hit covariance of track `i`
+// from the packed errors, runs riemannFit::circleFit on the first two rows of the hits
+// and stores the result in circle_fit_resultsGPU[i].
+// No bounds check is done: the launch must not cover more tracks than the buffers hold.
+template <int N>
+__global__ void kernel_CircleFit(double* __restrict__ phits,
+                                 float* __restrict__ phits_ge,
+                                 double* __restrict__ pfast_fit_input,
+                                 double B,
+                                 riemannFit::CircleFit* circle_fit_resultsGPU) {
+  auto i = blockIdx.x * blockDim.x + threadIdx.x;
+  riemannFit::Map3xNd<N> hits(phits + i, 3, N);
+  riemannFit::Map4d fast_fit_input(pfast_fit_input + i, 4);
+  riemannFit::Map6xNf<N> hits_ge(phits_ge + i, 6, N);
+
+  constexpr auto n = N;
+
+  // norm of the first two rows of each hit column
+  riemannFit::VectorNd<N> rad = (hits.block(0, 0, 2, n).colwise().norm());
+  // fixed-size zero initialisation, same as on the host side
+  riemannFit::Matrix2Nd<N> hits_cov = riemannFit::Matrix2Nd<N>::Zero();
+  riemannFit::loadCovariance2D(hits_ge, hits_cov);
+
+#ifdef TEST_DEBUG
+  if (0 == i) {
+    printf("hits %f, %f\n", hits.block(0, 0, 2, n)(0, 0), hits.block(0, 0, 2, n)(0, 1));
+    printf("hits %f, %f\n", hits.block(0, 0, 2, n)(1, 0), hits.block(0, 0, 2, n)(1, 1));
+    printf("fast_fit_input(0): %f\n", fast_fit_input(0));
+    printf("fast_fit_input(1): %f\n", fast_fit_input(1));
+    printf("fast_fit_input(2): %f\n", fast_fit_input(2));
+    printf("fast_fit_input(3): %f\n", fast_fit_input(3));
+    // rad is a vector: use a single index (N >= 3 in all instantiations of this test)
+    printf("rad(0): %f\n", rad(0));
+    printf("rad(1): %f\n", rad(1));
+    printf("rad(2): %f\n", rad(2));
+    // hits_cov is a 2N x 2N matrix object, not a pointer
+    printf("hits_cov(0,0): %f\n", hits_cov(0, 0));
+    printf("hits_cov(1,1): %f\n", hits_cov(1, 1));
+    printf("hits_cov(2,2): %f\n", hits_cov(2, 2));
+    // last diagonal element; a fixed index such as 11 is out of range for N <= 5
+    printf("hits_cov(%d,%d): %f\n", 2 * n - 1, 2 * n - 1, hits_cov(2 * n - 1, 2 * n - 1));
+    printf("B: %f\n", B);
+  }
+#endif
+  circle_fit_resultsGPU[i] = riemannFit::circleFit(hits.block(0, 0, 2, n), hits_cov, fast_fit_input, rad, B, true);
+#ifdef TEST_DEBUG
+  if (0 == i) {
+    printf("Circle param %f,%f,%f\n",
+           circle_fit_resultsGPU[i].par(0),
+           circle_fit_resultsGPU[i].par(1),
+           circle_fit_resultsGPU[i].par(2));
+  }
+#endif
+}
+
+// Riemann line fit, one thread per track: combines the hits, their errors, the circle
+// fit of the same track and the fast-fit values, and stores the result in line_fit[tk].
+// No bounds check is done: the launch must not cover more tracks than the buffers hold.
+template <int N>
+__global__ void kernelLineFit(double* __restrict__ phits,
+                              float* __restrict__ phits_ge,
+                              double B,
+                              riemannFit::CircleFit* circle_fit,
+                              double* __restrict__ pfast_fit_input,
+                              riemannFit::LineFit* line_fit) {
+  const auto tk = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // strided views of this track's inputs
+  riemannFit::Map3xNd<N> hits(phits + tk, 3, N);
+  riemannFit::Map6xNf<N> hits_ge(phits_ge + tk, 6, N);
+  riemannFit::Map4d fast_fit_input(pfast_fit_input + tk, 4);
+
+  const auto& circle = circle_fit[tk];
+  line_fit[tk] = riemannFit::lineFit(hits, hits_ge, circle, fast_fit_input, B, true);
+}
+#endif
+
+// Fills `hits` (3 x N) and `hits_ge` (6 x N) with the fixed test track for N = 3, 4 or 5.
+// For N == 5 every coefficient of hits_ge is written. For N == 3 and N == 4 only rows
+// 0, 1, 2 and 5 are written, so the caller has to zero hits_ge beforehand.
+template <typename M3xN, typename M6xN>
+__device__ __host__ void fillHitsAndHitsCov(M3xN& hits, M6xN& hits_ge) {
+  constexpr uint32_t N = M3xN::ColsAtCompileTime;
+
+  if (N == 5) {
+    // one line per row of hits
+    hits << 2.934787, 6.314229, 8.936963, 10.360559, 12.856387,
+        0.773211, 1.816356, 2.765734, 3.330824, 4.422212,
+        -10.980247, -23.162731, -32.759060, -38.061260, -47.518867;
+    // one full column of errors per hit
+    hits_ge.col(0) << 1.424715e-07, -4.996975e-07, 1.752614e-06, 3.660689e-11, 1.644638e-09, 7.346080e-05;
+    hits_ge.col(1) << 6.899177e-08, -1.873414e-07, 5.087101e-07, -2.078806e-10, -2.210498e-11, 4.346079e-06;
+    hits_ge.col(2) << 1.406273e-06, 4.042467e-07, 6.391180e-07, -3.141497e-07, 6.513821e-08, 1.163863e-07;
+    hits_ge.col(3) << 1.176358e-06, 2.154100e-07, 5.072816e-07, -8.161219e-08, 1.437878e-07, 5.951832e-08;
+    hits_ge.col(4) << 2.852843e-05, 7.956492e-06, 3.117701e-06, -1.060541e-06, 8.777413e-09, 1.426417e-07;
+    return;
+  }
+
+  // errors of the first three hits, shared by the 3-hit and the 4-hit track
+  hits_ge(0, 0) = 7.14652e-06;
+  hits_ge(1, 0) = -5.60077e-06;
+  hits_ge(2, 0) = 6.10348e-06;
+  hits_ge(5, 0) = 5.184e-05;
+
+  hits_ge(0, 1) = 2.15789e-06;
+  hits_ge(1, 1) = -1.11936e-06;
+  hits_ge(2, 1) = 2.08211e-06;
+  hits_ge(5, 1) = 1.444e-05;
+
+  hits_ge(0, 2) = 1.63328e-06;
+  hits_ge(1, 2) = -6.24945e-07;
+  hits_ge(2, 2) = 1.61672e-06;
+  hits_ge(5, 2) = 6.25e-06;
+
+  if (N > 3) {
+    // 4-hit track: positions of all four hits plus the errors of the fourth one
+    hits << 1.98645, 4.72598, 7.65632, 11.3151,
+        2.18002, 4.88864, 7.75845, 11.3134,
+        2.46338, 6.99838, 11.808, 17.793;
+
+    hits_ge(0, 3) = 6.27919e-06;
+    hits_ge(1, 3) = -5.28e-06;
+    hits_ge(2, 3) = 6.28081e-06;
+    hits_ge(5, 3) = 3.136e-05;
+  } else {
+    // 3-hit track: the first three hits of the 4-hit track
+    hits << 1.98645, 4.72598, 7.65632,
+        2.18002, 4.88864, 7.75845,
+        2.46338, 6.99838, 11.808;
+  }
+}
+
+// One thread per track: zeroes the error matrix of track `tk` and then fills the hits
+// and errors of that track with the fixed test values.
+// No bounds check is done: the launch must not cover more tracks than the buffers hold.
+template <int N>
+__global__ void kernelFillHitsAndHitsCov(double* __restrict__ phits, float* phits_ge) {
+  const auto tk = blockIdx.x * blockDim.x + threadIdx.x;
+
+  riemannFit::Map3xNd<N> hits(phits + tk, 3, N);
+  riemannFit::Map6xNf<N> hits_ge(phits_ge + tk, 6, N);
+
+  // fillHitsAndHitsCov does not write every coefficient for N < 5
+  hits_ge.setZero();
+  fillHitsAndHitsCov(hits, hits_ge);
+}
+
+// Compares the CPU and GPU versions of the fast fit, the circle fit and the line fit on
+// one fixed N-hit track (BrokenLine when USE_BL is defined, Riemann otherwise).
+//
+// The kernels are launched on Ntracks identical tracks. The fast-fit result of track 10
+// and the circle/line results of track 0 are copied back and asserted to match the CPU
+// results. Every launch and synchronisation is checked, and all host and device memory
+// is released before returning.
+template <int N>
+void testFit() {
+  constexpr double B = 0.0113921;
+  riemannFit::Matrix3xNd<N> hits;
+  riemannFit::Matrix6xNf<N> hits_ge = MatrixXf::Zero(6, N);
+  double* hitsGPU = nullptr;
+  float* hits_geGPU = nullptr;
+  double* fast_fit_resultsGPU = nullptr;
+  // host copy of the fast-fit results: sizeof(Vector4d) bytes per track, counted in doubles
+  constexpr auto fastFitResultDoubles = riemannFit::maxNumberOfTracks() * sizeof(Vector4d) / sizeof(double);
+  double* fast_fit_resultsGPUret = new double[fastFitResultDoubles];
+  riemannFit::CircleFit* circle_fit_resultsGPU = nullptr;
+  riemannFit::CircleFit circle_fit_resultsGPUret;
+  riemannFit::LineFit* line_fit_resultsGPU = nullptr;
+  riemannFit::LineFit line_fit_resultsGPUret;
+
+  fillHitsAndHitsCov(hits, hits_ge);
+
+  std::cout << "sizes " << N << ' ' << sizeof(hits) << ' ' << sizeof(hits_ge) << ' ' << sizeof(Vector4d) << ' '
+            << sizeof(riemannFit::LineFit) << ' ' << sizeof(riemannFit::CircleFit) << std::endl;
+
+  std::cout << "Generated hits:\n" << hits << std::endl;
+  std::cout << "Generated cov:\n" << hits_ge << std::endl;
+
+  // FAST_FIT_CPU
+#ifdef USE_BL
+  Vector4d fast_fit_results;
+  brokenline::fastFit(hits, fast_fit_results);
+#else
+  Vector4d fast_fit_results;
+  riemannFit::fastFit(hits, fast_fit_results);
+#endif
+  std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]):\n" << fast_fit_results << std::endl;
+
+  // for timing purposes we fit 4096 tracks (buffers are sized for maxNumberOfTracks())
+  constexpr uint32_t Ntracks = 4096;
+  cudaCheck(cudaMalloc(&hitsGPU, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::Matrix3xNd<N>)));
+  cudaCheck(cudaMalloc(&hits_geGPU, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::Matrix6xNf<N>)));
+  cudaCheck(cudaMalloc(&fast_fit_resultsGPU, riemannFit::maxNumberOfTracks() * sizeof(Vector4d)));
+  cudaCheck(cudaMalloc(&line_fit_resultsGPU, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::LineFit)));
+  cudaCheck(cudaMalloc(&circle_fit_resultsGPU, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::CircleFit)));
+
+  cudaCheck(cudaMemset(fast_fit_resultsGPU, 0, riemannFit::maxNumberOfTracks() * sizeof(Vector4d)));
+  cudaCheck(cudaMemset(line_fit_resultsGPU, 0, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::LineFit)));
+
+  kernelPrintSizes<N><<<Ntracks / 64, 64>>>(hitsGPU, hits_geGPU);
+  cudaCheck(cudaGetLastError());
+  kernelFillHitsAndHitsCov<N><<<Ntracks / 64, 64>>>(hitsGPU, hits_geGPU);
+  cudaCheck(cudaGetLastError());
+
+  // FAST_FIT GPU
+  kernelFastFit<N><<<Ntracks / 64, 64>>>(hitsGPU, fast_fit_resultsGPU);
+  cudaCheck(cudaGetLastError());
+  cudaCheck(cudaDeviceSynchronize());
+
+  cudaCheck(cudaMemcpy(fast_fit_resultsGPUret,
+                       fast_fit_resultsGPU,
+                       riemannFit::maxNumberOfTracks() * sizeof(Vector4d),
+                       cudaMemcpyDeviceToHost));
+  // look at track 10: any track would do, they are all identical
+  riemannFit::Map4d fast_fit(fast_fit_resultsGPUret + 10, 4);
+  std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]): GPU\n" << fast_fit << std::endl;
+  assert(isEqualFuzzy(fast_fit_results, fast_fit));
+
+#ifdef USE_BL
+  // CIRCLE AND LINE FIT CPU
+  brokenline::PreparedBrokenLineData<N> data;
+  brokenline::karimaki_circle_fit circle_fit_results;
+  riemannFit::LineFit line_fit_results;
+  riemannFit::Matrix3d Jacob;
+  brokenline::prepareBrokenLineData(hits, fast_fit_results, B, data);
+  brokenline::lineFit(hits_ge, fast_fit_results, B, data, line_fit_results);
+  brokenline::circleFit(hits, hits_ge, fast_fit_results, B, data, circle_fit_results);
+  Jacob << 1., 0, 0, 0, 1., 0, 0, 0,
+      -B / std::copysign(riemannFit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2));
+  circle_fit_results.par(2) = B / std::abs(circle_fit_results.par(2));
+  circle_fit_results.cov = Jacob * circle_fit_results.cov * Jacob.transpose();
+
+  // fit on GPU
+  kernelBrokenLineFit<N>
+      <<<Ntracks / 64, 64>>>(hitsGPU, hits_geGPU, fast_fit_resultsGPU, B, circle_fit_resultsGPU, line_fit_resultsGPU);
+  cudaCheck(cudaGetLastError());
+  cudaCheck(cudaDeviceSynchronize());
+
+#else
+  // CIRCLE_FIT CPU
+  riemannFit::VectorNd<N> rad = (hits.block(0, 0, 2, N).colwise().norm());
+
+  riemannFit::Matrix2Nd<N> hits_cov = riemannFit::Matrix2Nd<N>::Zero();
+  riemannFit::loadCovariance2D(hits_ge, hits_cov);
+  riemannFit::CircleFit circle_fit_results =
+      riemannFit::circleFit(hits.block(0, 0, 2, N), hits_cov, fast_fit_results, rad, B, true);
+
+  // CIRCLE_FIT GPU
+  kernel_CircleFit<N><<<Ntracks / 64, 64>>>(hitsGPU, hits_geGPU, fast_fit_resultsGPU, B, circle_fit_resultsGPU);
+  cudaCheck(cudaGetLastError());
+  cudaCheck(cudaDeviceSynchronize());
+
+  // LINE_FIT CPU
+  riemannFit::LineFit line_fit_results =
+      riemannFit::lineFit(hits, hits_ge, circle_fit_results, fast_fit_results, B, true);
+
+  // LINE_FIT GPU
+  kernelLineFit<N>
+      <<<Ntracks / 64, 64>>>(hitsGPU, hits_geGPU, B, circle_fit_resultsGPU, fast_fit_resultsGPU, line_fit_resultsGPU);
+  cudaCheck(cudaGetLastError());
+  cudaCheck(cudaDeviceSynchronize());
+#endif
+
+  std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl;
+
+  // only the result of track 0 is copied back
+  cudaCheck(cudaMemcpy(
+      &circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(riemannFit::CircleFit), cudaMemcpyDeviceToHost));
+  std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret.par << std::endl;
+  assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret.par));
+
+  std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << std::endl;
+  // only the result of track 0 is copied back
+  cudaCheck(
+      cudaMemcpy(&line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(riemannFit::LineFit), cudaMemcpyDeviceToHost));
+  std::cout << "Fitted values (LineFit) GPU:\n" << line_fit_resultsGPUret.par << std::endl;
+  assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret.par, N == 5 ? 1e-4 : 1e-6));  // requires fma on CPU
+
+  std::cout << "Fitted cov (CircleFit) CPU:\n" << circle_fit_results.cov << std::endl;
+  std::cout << "Fitted cov (LineFit): CPU\n" << line_fit_results.cov << std::endl;
+  std::cout << "Fitted cov (CircleFit) GPU:\n" << circle_fit_resultsGPUret.cov << std::endl;
+  std::cout << "Fitted cov (LineFit): GPU\n" << line_fit_resultsGPUret.cov << std::endl;
+
+  // release everything: this function is instantiated and called once per hit multiplicity
+  delete[] fast_fit_resultsGPUret;
+  cudaCheck(cudaFree(circle_fit_resultsGPU));
+  cudaCheck(cudaFree(line_fit_resultsGPU));
+  cudaCheck(cudaFree(fast_fit_resultsGPU));
+  cudaCheck(cudaFree(hits_geGPU));
+  cudaCheck(cudaFree(hitsGPU));
+}
+
+// Runs the CPU-vs-GPU fit comparison for 4-, 3- and 5-hit tracks, in that order.
+// requireDevices() is called first so that nothing is attempted without a usable device.
+int main(int argc, char* argv[]) {
+  cms::cudatest::requireDevices();
+
+  testFit<4>();
+  testFit<3>();
+  testFit<5>();
+
+  // reached only if none of the assertions above fired
+  std::cout << "TEST FIT, NO ERRORS" << '\n' << std::flush;
+  return 0;
+}
diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu
new file mode 100644
index 0000000000000..6ac1088943305
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu
@@ -0,0 +1,248 @@
+#include <iostream>
+
+#include <Eigen/Core>
+#include <Eigen/Eigenvalues>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+#include "test_common.h"
+
+using namespace Eigen;
+
+using Matrix5d = Matrix<double, 5, 5>;
+
+// Writes into *ret the eigenvalues of the 3x3 matrix *m, as computed by Eigen's
+// SelfAdjointEigenSolver::computeDirect. Callable from both host and device code.
+__host__ __device__ void eigenValues(Matrix3d *m, Eigen::SelfAdjointEigenSolver<Matrix3d>::RealVectorType *ret) {
+#if TEST_DEBUG
+  printf("Matrix(0,0): %f\n", (*m)(0, 0));
+  printf("Matrix(1,1): %f\n", (*m)(1, 1));
+  printf("Matrix(2,2): %f\n", (*m)(2, 2));
+#endif
+  SelfAdjointEigenSolver<Matrix3d> solver;
+  solver.computeDirect(*m);
+  *ret = solver.eigenvalues();
+}
+
+// Device entry point for eigenValues(): every launched thread processes the same
+// matrix, so the test launches it with a single thread.
+__global__ void kernel(Matrix3d *m, Eigen::SelfAdjointEigenSolver<Matrix3d>::RealVectorType *ret) {
+  eigenValues(m, ret);
+}
+
+// Stores the inverse of the 3x3 matrix *in into *out (meant for a single-thread launch).
+__global__ void kernelInverse3x3(Matrix3d *in, Matrix3d *out) {
+  *out = (*in).inverse();
+}
+
+// Stores the inverse of the 4x4 matrix *in into *out (meant for a single-thread launch).
+__global__ void kernelInverse4x4(Matrix4d *in, Matrix4d *out) {
+  *out = (*in).inverse();
+}
+
+// Stores the inverse of the 5x5 matrix *in into *out (meant for a single-thread launch).
+__global__ void kernelInverse5x5(Matrix5d *in, Matrix5d *out) {
+  *out = (*in).inverse();
+}
+
+// Computes *result = (*J) * (*C) on the device (meant for a single-thread launch).
+// Both inputs are passed to printIt() before the product is evaluated; the product is
+// assigned directly to *result rather than through an Eigen::Map with noalias().
+template <typename M1, typename M2, typename M3>
+__global__ void kernelMultiply(M1 *J, M2 *C, M3 *result) {
+#if TEST_DEBUG
+  printf("*** GPU IN ***\n");
+#endif
+
+  printIt(J);
+  printIt(C);
+  *result = (*J) * (*C);
+
+#if TEST_DEBUG
+  printf("*** GPU OUT ***\n");
+#endif
+}
+
+// Multiplies a row1 x col1 matrix by a row2 x col2 matrix on the CPU and on the GPU and
+// asserts that the two products agree.
+//
+// The device result buffer is cleared before the launch (not pre-loaded with the CPU
+// product), so the comparison can only succeed if the kernel really ran. The launch and
+// the synchronisation are checked, and all device memory is released before returning.
+template <int row1, int col1, int row2, int col2>
+void testMultiply() {
+  using MatrixJ = Eigen::Matrix<double, row1, col1>;
+  using MatrixC = Eigen::Matrix<double, row2, col2>;
+  using MatrixR = Eigen::Matrix<double, row1, col2>;
+
+  std::cout << "TEST MULTIPLY" << std::endl;
+  std::cout << "Product of type " << row1 << "x" << col1 << " * " << row2 << "x" << col2 << std::endl;
+  MatrixJ J;
+  fillMatrix(J);
+  MatrixC C;
+  fillMatrix(C);
+  MatrixR multiply_result = J * C;
+#if TEST_DEBUG
+  std::cout << "Input J:" << std::endl;
+  printIt(&J);
+  std::cout << "Input C:" << std::endl;
+  printIt(&C);
+  std::cout << "Output:" << std::endl;
+  printIt(&multiply_result);
+#endif
+  // GPU
+  MatrixJ *JGPU = nullptr;
+  MatrixC *CGPU = nullptr;
+  MatrixR *multiply_resultGPU = nullptr;
+  MatrixR multiply_resultGPUret;
+
+  cudaCheck(cudaMalloc((void **)&JGPU, sizeof(MatrixJ)));
+  cudaCheck(cudaMalloc((void **)&CGPU, sizeof(MatrixC)));
+  cudaCheck(cudaMalloc((void **)&multiply_resultGPU, sizeof(MatrixR)));
+  cudaCheck(cudaMemcpy(JGPU, &J, sizeof(MatrixJ), cudaMemcpyHostToDevice));
+  cudaCheck(cudaMemcpy(CGPU, &C, sizeof(MatrixC), cudaMemcpyHostToDevice));
+  // start from a known state that is not the expected answer
+  cudaCheck(cudaMemset(multiply_resultGPU, 0, sizeof(MatrixR)));
+
+  kernelMultiply<<<1, 1>>>(JGPU, CGPU, multiply_resultGPU);
+  cudaCheck(cudaGetLastError());
+  cudaCheck(cudaDeviceSynchronize());
+
+  cudaCheck(cudaMemcpy(&multiply_resultGPUret, multiply_resultGPU, sizeof(MatrixR), cudaMemcpyDeviceToHost));
+  printIt(&multiply_resultGPUret);
+  assert(isEqualFuzzy(multiply_result, multiply_resultGPUret));
+
+  cudaCheck(cudaFree(multiply_resultGPU));
+  cudaCheck(cudaFree(CGPU));
+  cudaCheck(cudaFree(JGPU));
+}
+
+// Inverts a symmetrised 3x3 matrix on the CPU and on the GPU and asserts that the two
+// inverses agree. The device output is cleared before the launch, the launch and the
+// synchronisation are checked, and the device memory is released before returning.
+void testInverse3x3() {
+  std::cout << "TEST INVERSE 3x3" << std::endl;
+  Matrix3d m;
+  fillMatrix(m);
+  m += m.transpose().eval();
+
+  Matrix3d m_inv = m.inverse();
+  Matrix3d *mGPU = nullptr;
+  Matrix3d *mGPUret = nullptr;
+  Matrix3d mCPUret;
+
+#if TEST_DEBUG
+  std::cout << "Here is the matrix m:" << std::endl << m << std::endl;
+  std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl;
+#endif
+  cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix3d)));
+  cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix3d)));
+  cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice));
+  // known initial content, so a kernel that did not run cannot go unnoticed
+  cudaCheck(cudaMemset(mGPUret, 0, sizeof(Matrix3d)));
+
+  kernelInverse3x3<<<1, 1>>>(mGPU, mGPUret);
+  cudaCheck(cudaGetLastError());
+  cudaCheck(cudaDeviceSynchronize());
+
+  cudaCheck(cudaMemcpy(&mCPUret, mGPUret, sizeof(Matrix3d), cudaMemcpyDeviceToHost));
+#if TEST_DEBUG
+  std::cout << "Its GPU inverse is:" << std::endl << mCPUret << std::endl;
+#endif
+  assert(isEqualFuzzy(m_inv, mCPUret));
+
+  cudaCheck(cudaFree(mGPUret));
+  cudaCheck(cudaFree(mGPU));
+}
+
+// Inverts a symmetrised 4x4 matrix on the CPU and on the GPU and asserts that the two
+// inverses agree. The device output is cleared before the launch, the launch and the
+// synchronisation are checked, and the device memory is released before returning.
+void testInverse4x4() {
+  std::cout << "TEST INVERSE 4x4" << std::endl;
+  Matrix4d m;
+  fillMatrix(m);
+  m += m.transpose().eval();
+
+  Matrix4d m_inv = m.inverse();
+  Matrix4d *mGPU = nullptr;
+  Matrix4d *mGPUret = nullptr;
+  Matrix4d mCPUret;
+
+#if TEST_DEBUG
+  std::cout << "Here is the matrix m:" << std::endl << m << std::endl;
+  std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl;
+#endif
+  cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix4d)));
+  cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix4d)));
+  cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix4d), cudaMemcpyHostToDevice));
+  // known initial content, so a kernel that did not run cannot go unnoticed
+  cudaCheck(cudaMemset(mGPUret, 0, sizeof(Matrix4d)));
+
+  kernelInverse4x4<<<1, 1>>>(mGPU, mGPUret);
+  cudaCheck(cudaGetLastError());
+  cudaCheck(cudaDeviceSynchronize());
+
+  cudaCheck(cudaMemcpy(&mCPUret, mGPUret, sizeof(Matrix4d), cudaMemcpyDeviceToHost));
+#if TEST_DEBUG
+  std::cout << "Its GPU inverse is:" << std::endl << mCPUret << std::endl;
+#endif
+  assert(isEqualFuzzy(m_inv, mCPUret));
+
+  cudaCheck(cudaFree(mGPUret));
+  cudaCheck(cudaFree(mGPU));
+}
+
+// Inverts a symmetrised 5x5 matrix on the CPU and on the GPU and asserts that the two
+// inverses agree. The device output is cleared before the launch, the launch and the
+// synchronisation are checked, and the device memory is released before returning.
+void testInverse5x5() {
+  std::cout << "TEST INVERSE 5x5" << std::endl;
+  Matrix5d m;
+  fillMatrix(m);
+  m += m.transpose().eval();
+
+  Matrix5d m_inv = m.inverse();
+  Matrix5d *mGPU = nullptr;
+  Matrix5d *mGPUret = nullptr;
+  Matrix5d mCPUret;
+
+#if TEST_DEBUG
+  std::cout << "Here is the matrix m:" << std::endl << m << std::endl;
+  std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl;
+#endif
+  cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix5d)));
+  cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix5d)));
+  cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix5d), cudaMemcpyHostToDevice));
+  // known initial content, so a kernel that did not run cannot go unnoticed
+  cudaCheck(cudaMemset(mGPUret, 0, sizeof(Matrix5d)));
+
+  kernelInverse5x5<<<1, 1>>>(mGPU, mGPUret);
+  cudaCheck(cudaGetLastError());
+  cudaCheck(cudaDeviceSynchronize());
+
+  cudaCheck(cudaMemcpy(&mCPUret, mGPUret, sizeof(Matrix5d), cudaMemcpyDeviceToHost));
+#if TEST_DEBUG
+  std::cout << "Its GPU inverse is:" << std::endl << mCPUret << std::endl;
+#endif
+  assert(isEqualFuzzy(m_inv, mCPUret));
+
+  cudaCheck(cudaFree(mGPUret));
+  cudaCheck(cudaFree(mGPU));
+}
+
+// Computes the eigenvalues of a symmetrised 3x3 matrix with eigenValues() on the CPU
+// and, through `kernel`, on the GPU, and asserts that the two results agree.
+// The launch and the synchronisation are checked, and the device memory is released
+// before returning.
+void testEigenvalues() {
+  using Eigenvalues = Eigen::SelfAdjointEigenSolver<Matrix3d>::RealVectorType;
+
+  std::cout << "TEST EIGENVALUES" << std::endl;
+  Matrix3d m;
+  fillMatrix(m);
+  m += m.transpose().eval();
+
+  Matrix3d *m_gpu = nullptr;
+  Matrix3d mgpudebug;   // input matrix as read back from the device (printed in debug mode)
+  Eigenvalues ret;      // CPU result
+  Eigenvalues ret1;     // GPU result
+  Eigenvalues *ret_gpu = nullptr;
+  eigenValues(&m, &ret);
+#if TEST_DEBUG
+  std::cout << "Generated Matrix M 3x3:\n" << m << std::endl;
+  std::cout << "The eigenvalues of M are:" << std::endl << ret << std::endl;
+  std::cout << "*************************\n\n" << std::endl;
+#endif
+  cudaCheck(cudaMalloc((void **)&m_gpu, sizeof(Matrix3d)));
+  cudaCheck(cudaMalloc((void **)&ret_gpu, sizeof(Eigenvalues)));
+  cudaCheck(cudaMemcpy(m_gpu, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice));
+  // known initial content, so a kernel that did not run cannot go unnoticed
+  cudaCheck(cudaMemset(ret_gpu, 0, sizeof(Eigenvalues)));
+
+  kernel<<<1, 1>>>(m_gpu, ret_gpu);
+  cudaCheck(cudaGetLastError());
+  cudaCheck(cudaDeviceSynchronize());
+
+  cudaCheck(cudaMemcpy(&mgpudebug, m_gpu, sizeof(Matrix3d), cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(&ret1, ret_gpu, sizeof(Eigenvalues), cudaMemcpyDeviceToHost));
+#if TEST_DEBUG
+  std::cout << "GPU Generated Matrix M 3x3:\n" << mgpudebug << std::endl;
+  std::cout << "GPU The eigenvalues of M are:" << std::endl << ret1 << std::endl;
+  std::cout << "*************************\n\n" << std::endl;
+#endif
+  assert(isEqualFuzzy(ret, ret1));
+
+  cudaCheck(cudaFree(ret_gpu));
+  cudaCheck(cudaFree(m_gpu));
+}
+
+// Runs the Eigen-on-GPU checks that do not involve any track fit: eigenvalues, 3x3/4x4/5x5
+// inversion and a list of fixed-size matrix products.
+int main(int argc, char *argv[]) {
+  cms::cudatest::requireDevices();
+
+  // decomposition and inversion
+  testEigenvalues();
+  testInverse3x3();
+  testInverse4x4();
+  testInverse5x5();
+
+  // products with a 1 x 2 left operand
+  testMultiply<1, 2, 2, 1>();
+  testMultiply<1, 2, 2, 2>();
+  testMultiply<1, 2, 2, 3>();
+  testMultiply<1, 2, 2, 4>();
+  testMultiply<1, 2, 2, 5>();
+
+  // products with a 2 x 1 left operand
+  testMultiply<2, 1, 1, 2>();
+  testMultiply<2, 1, 1, 3>();
+  testMultiply<2, 1, 1, 4>();
+  testMultiply<2, 1, 1, 5>();
+
+  // products with a 2 x 2 or 2 x 3 left operand
+  testMultiply<2, 2, 2, 2>();
+  testMultiply<2, 3, 3, 1>();
+  testMultiply<2, 3, 3, 2>();
+  testMultiply<2, 3, 3, 4>();
+  testMultiply<2, 3, 3, 5>();
+
+  // remaining shapes
+  testMultiply<3, 2, 2, 3>();
+  testMultiply<2, 3, 3, 3>();  // DOES NOT COMPILE W/O PATCHING EIGEN
+  testMultiply<3, 3, 3, 3>();
+  testMultiply<8, 8, 8, 8>();
+  testMultiply<3, 4, 4, 3>();
+  testMultiply<2, 4, 4, 2>();
+  testMultiply<3, 4, 4, 2>();  // DOES NOT COMPILE W/O PATCHING EIGEN
+
+  return 0;
+}
diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp b/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp
new file mode 100644
index 0000000000000..a8e040fa0df38
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp
@@ -0,0 +1,134 @@
+#include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h"
+#include <cmath>
+
+using riemannFit::Matrix5d;
+using riemannFit::Vector5d;
+
+#include "TrackingTools/AnalyticalJacobians/interface/JacobianLocalToCurvilinear.h"
+
+#include "DataFormats/GeometrySurface/interface/Surface.h"
+#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h"
+#include "TrackingTools/TrajectoryParametrization/interface/GlobalTrajectoryParameters.h"
+
+#include "DataFormats/GeometrySurface/interface/Plane.h"
+
+#include "MagneticField/Engine/interface/MagneticField.h"
+
+namespace {
+
+  // Magnetic field used by this test: inTesla() returns the same vector (0, 0, 5)
+  // whatever point it is asked for.
+  struct M5T : public MagneticField {
+    M5T() : mf(0., 0., 5.) {}
+    // `override` makes the compiler verify that this really implements the base-class hook
+    GlobalVector inTesla(const GlobalPoint&) const override { return mf; }
+
+    GlobalVector mf;  // field value returned for every point
+  };
+
+}  // namespace
+
+// Old pixeltrack version of the covariance transformation, kept for reference.
+// Takes `cov` by value, rewrites the (2,2), (2,3) and (3,2) elements from the
+// parameters p(2) and p(3), and returns the modified copy.
+Matrix5d transfFast(Matrix5d cov, Vector5d const& p) {
+  auto sqr = [](auto x) { return x * x; };
+
+  const auto p2 = p(2);
+  const auto p3 = p(3);
+  const auto sinTheta = 1 / std::sqrt(1 + p3 * p3);
+  const auto cosTheta = p3 * sinTheta;
+
+  const auto term22 = cov(2, 2) * sqr(1. / (p2 * p2));
+  const auto term33 = cov(3, 3) * sqr(cosTheta * sinTheta / p2);
+  cov(2, 2) = sqr(sinTheta) * (term22 + term33);
+
+  const auto mixed = cov(3, 3) * cosTheta * sqr(sinTheta) / p2;
+  cov(2, 3) = mixed;
+  cov(3, 2) = mixed;
+  // for (int i=0; i<5; ++i) cov(i,2) *= -sinTheta/(p(2)*p(2));
+  // for (int i=0; i<5; ++i) cov(2,i) *= -sinTheta/(p(2)*p(2));
+  return cov;
+}
+
+// Builds a symmetric 5x5 covariance from the vector of errors `e`: the diagonal holds
+// e(i)^2, and each off-diagonal element is 0.3 * sqrt(cov(i,i) * cov(j,j)) scaled by
+// -0.4 when i + j is odd and by 0.1 when it is even.
+Matrix5d loadCov(Vector5d const& e) {
+  Matrix5d cov;
+  for (int row = 0; row < 5; ++row) {
+    cov(row, row) = e(row) * e(row);
+    // all diagonal elements with index < row are already set at this point
+    for (int col = 0; col < row; ++col) {
+      const double base = 0.3 * std::sqrt(cov(row, row) * cov(col, col));  // this makes the matrix pos defined
+      const bool oddSum = ((row + col) % 2) != 0;
+      const double offDiagonal = oddSum ? -0.4 * base : 0.1 * base;
+      cov(row, col) = offDiagonal;
+      cov(col, row) = offDiagonal;
+    }
+  }
+  return cov;
+}
+
+#include <iostream>
+// For every sign combination of the three loop variables, prints the covariance given by
+// riemannFit::transformToPerigeePlane and the result of applying the
+// JacobianLocalToCurvilinear similarity transform to it. Nothing is asserted: the
+// output is meant for inspection.
+int main() {
+  M5T const mf;
+
+  for (int charge = -1; charge <= 1; charge += 2) {
+    for (int szip = -1; szip <= 1; szip += 2) {
+      for (int stip = -1; stip <= 1; stip += 2) {
+        //!<(phi,Tip,pt,cotan(theta)),Zip)
+        Vector5d par0;
+        par0 << 0.2, 0.1, 3.5, 0.8, 0.1;
+        Vector5d del0;
+        del0 << 0.01, 0.01, 0.035, -0.03, -0.01;
+        par0(1) *= stip;
+        par0(4) *= szip;
+
+        Matrix5d cov0 = loadCov(del0);
+
+        Vector5d par1;
+        Vector5d par2;
+
+        Matrix5d cov1;
+        Matrix5d cov2;
+
+        // Matrix5d covf = transfFast(cov0,par0);
+
+        riemannFit::transformToPerigeePlane(par0, cov0, par1, cov1);
+
+        std::cout << "cov1\n" << cov1 << std::endl;
+
+        LocalTrajectoryParameters lpar(par1(0), par1(1), par1(2), par1(3), par1(4), 1.);
+
+        // only the upper triangle is copied into the symmetric matrix
+        AlgebraicSymMatrix55 m;
+        for (int row = 0; row < 5; ++row) {
+          for (int col = row; col < 5; ++col) {
+            m(row, col) = cov1(row, col);
+          }
+        }
+
+        // plane at the origin, with a rotation built from phi
+        float phi = par0(0);
+        float sp = std::sin(phi);
+        float cp = std::cos(phi);
+        Surface::RotationType rot(sp, -cp, 0, 0, 0, -1.f, cp, sp, 0);
+        Surface::PositionType bs(0., 0., 0.);
+        Plane plane(bs, rot);
+
+        GlobalTrajectoryParameters gp(
+            plane.toGlobal(lpar.position()), plane.toGlobal(lpar.momentum()), lpar.charge(), &mf);
+        std::cout << "global par " << gp.position() << ' ' << gp.momentum() << ' ' << gp.charge() << std::endl;
+
+        JacobianLocalToCurvilinear jl2c(plane, lpar, mf);
+        std::cout << "jac l2c" << jl2c.jacobian() << std::endl;
+
+        AlgebraicSymMatrix55 mo = ROOT::Math::Similarity(jl2c.jacobian(), m);
+        std::cout << "curv error\n" << mo << std::endl;
+
+        /*
+
+  // not accurate as the perigee plane move as well...
+  Vector5d del1 = par2-par1;
+
+
+  // don't ask: guess
+  std::cout << "charge " << charge << std::endl;
+  std::cout << "par0 " << par0.transpose() << std::endl;
+  std::cout << "del0 " << del0.transpose() << std::endl;
+
+
+  std::cout << "par1 " << par1.transpose() << std::endl;
+  std::cout << "del1 " << del1.transpose() << std::endl;
+  // std::cout << "del2 " << (J*del0).transpose() << std::endl;
+
+  std::cout << "del1^2 " << (del1.array()*del1.array()).transpose() << std::endl;
+  std::cout << std::endl;
+  
+  std::cout << "cov0\n" << cov0 << std::endl;
+  std::cout << "cov1\n" << cov1 << std::endl;
+  std::cout << "cov2\n" << cov2 << std::endl;
+  */
+
+        std::cout << std::endl << "----------" << std::endl;
+      }
+    }
+  }  // loop over signs
+
+  return 0;
+}
diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testFits.cpp b/RecoPixelVertexing/PixelTrackFitting/test/testFits.cpp
new file mode 100644
index 0000000000000..7c0dab3be3e00
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/test/testFits.cpp
@@ -0,0 +1,154 @@
+#include <iostream>
+
+#include <Eigen/Core>
+#include <Eigen/Eigenvalues>
+
+#ifdef USE_BL
+#include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h"
+#else
+#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h"
+#endif
+
+#include "test_common.h"
+
+using namespace Eigen;
+
+namespace riemannFit {
+  constexpr uint32_t maxNumberOfTracks() { return 5 * 1024; }  // 5120
+  constexpr uint32_t stride() { return maxNumberOfTracks(); }  // element stride used by the Map aliases below
+  // hits: 3 x N matrix of doubles (one column per hit), plus a strided view of such a matrix
+  template <int N>
+  using Matrix3xNd = Eigen::Matrix<double, 3, N>;
+  template <int N>
+  using Map3xNd = Eigen::Map<Matrix3xNd<N>, 0, Eigen::Stride<3 * stride(), stride()> >;  // outer 3*stride, inner stride
+  // hit errors: 6 x N matrix of floats (one column per hit), plus a strided view of such a matrix
+  template <int N>
+  using Matrix6xNf = Eigen::Matrix<float, 6, N>;
+  template <int N>
+  using Map6xNf = Eigen::Map<Matrix6xNf<N>, 0, Eigen::Stride<6 * stride(), stride()> >;  // outer 6*stride, inner stride
+  // fast fit: strided view of a Vector4d (consecutive elements are stride() apart)
+  using Map4d = Eigen::Map<Vector4d, 0, Eigen::InnerStride<stride()> >;
+
+}  // namespace riemannFit
+
+/*
+Hit global: 641,0 2: 2.934787,0.773211,-10.980247
+Error: 641,0 2: 1.424715e-07,-4.996975e-07,1.752614e-06,3.660689e-11,1.644638e-09,7.346080e-05
+Hit global: 641,1 104: 6.314229,1.816356,-23.162731
+Error: 641,1 104: 6.899177e-08,-1.873414e-07,5.087101e-07,-2.078806e-10,-2.210498e-11,4.346079e-06
+Hit global: 641,2 1521: 8.936963,2.765734,-32.759060
+Error: 641,2 1521: 1.406273e-06,4.042467e-07,6.391180e-07,-3.141497e-07,6.513821e-08,1.163863e-07
+Hit global: 641,3 1712: 10.360559,3.330824,-38.061260
+Error: 641,3 1712: 1.176358e-06,2.154100e-07,5.072816e-07,-8.161219e-08,1.437878e-07,5.951832e-08
+Hit global: 641,4 1824: 12.856387,4.422212,-47.518867
+Error: 641,4 1824: 2.852843e-05,7.956492e-06,3.117701e-06,-1.060541e-06,8.777413e-09,1.426417e-07
+*/
+
+template <typename M3xN, typename M6xN>
+void fillHitsAndHitsCov(M3xN& hits, M6xN& hits_ge) {  // fills hits (3 x N) and hits_ge (6 x N) with hard-coded values
+  constexpr uint32_t N = M3xN::ColsAtCompileTime;  // number of hits = compile-time column count of `hits`
+
+  if (N == 5) {  // 5-hit case: same numbers as the "Hit global" / "Error" listing in the comment above
+    hits << 2.934787, 6.314229, 8.936963, 10.360559, 12.856387, 0.773211, 1.816356, 2.765734, 3.330824, 4.422212,
+        -10.980247, -23.162731, -32.759060, -38.061260, -47.518867;  // filled row by row: 5 x, 5 y, then 5 z
+    hits_ge.col(0) << 1.424715e-07, -4.996975e-07, 1.752614e-06, 3.660689e-11, 1.644638e-09, 7.346080e-05;  // hit 0
+    hits_ge.col(1) << 6.899177e-08, -1.873414e-07, 5.087101e-07, -2.078806e-10, -2.210498e-11, 4.346079e-06;  // hit 1
+    hits_ge.col(2) << 1.406273e-06, 4.042467e-07, 6.391180e-07, -3.141497e-07, 6.513821e-08, 1.163863e-07;  // hit 2
+    hits_ge.col(3) << 1.176358e-06, 2.154100e-07, 5.072816e-07, -8.161219e-08, 1.437878e-07, 5.951832e-08;  // hit 3
+    hits_ge.col(4) << 2.852843e-05, 7.956492e-06, 3.117701e-06, -1.060541e-06, 8.777413e-09, 1.426417e-07;  // hit 4
+    return;
+  }
+
+  if (N > 3)  // 4-hit case: 4 x, 4 y, 4 z
+    hits << 1.98645, 4.72598, 7.65632, 11.3151, 2.18002, 4.88864, 7.75845, 11.3134, 2.46338, 6.99838, 11.808, 17.793;
+  else  // 3-hit case: the first three hits of the 4-hit set
+    hits << 1.98645, 4.72598, 7.65632, 2.18002, 4.88864, 7.75845, 2.46338, 6.99838, 11.808;
+
+  hits_ge.col(0)[0] = 7.14652e-06;  // component 0 of each hit's error column
+  hits_ge.col(1)[0] = 2.15789e-06;
+  hits_ge.col(2)[0] = 1.63328e-06;
+  if (N > 3)  // the fourth hit exists only when N > 3
+    hits_ge.col(3)[0] = 6.27919e-06;
+  hits_ge.col(0)[2] = 6.10348e-06;  // component 2
+  hits_ge.col(1)[2] = 2.08211e-06;
+  hits_ge.col(2)[2] = 1.61672e-06;
+  if (N > 3)
+    hits_ge.col(3)[2] = 6.28081e-06;
+  hits_ge.col(0)[5] = 5.184e-05;  // component 5
+  hits_ge.col(1)[5] = 1.444e-05;
+  hits_ge.col(2)[5] = 6.25e-06;
+  if (N > 3)
+    hits_ge.col(3)[5] = 3.136e-05;
+  hits_ge.col(0)[1] = -5.60077e-06;  // component 1; components 3 and 4 are left as the caller initialised them
+  hits_ge.col(1)[1] = -1.11936e-06;
+  hits_ge.col(2)[1] = -6.24945e-07;
+  if (N > 3)
+    hits_ge.col(3)[1] = -5.28e-06;
+}
+
+template <int N>
+void testFit() {  // runs fast fit, circle fit and line fit on the hard-coded N-hit track and prints every result
+  constexpr double B = 0.0113921;  // passed as the field argument to the fit calls below
+  riemannFit::Matrix3xNd<N> hits;
+  riemannFit::Matrix6xNf<N> hits_ge = MatrixXf::Zero(6, N);  // zeroed first: the filler sets only some components
+
+  fillHitsAndHitsCov(hits, hits_ge);
+
+  std::cout << "sizes " << N << ' ' << sizeof(hits) << ' ' << sizeof(hits_ge) << ' ' << sizeof(Vector4d) << std::endl;
+
+  std::cout << "Generated hits:\n" << hits << std::endl;
+  std::cout << "Generated cov:\n" << hits_ge << std::endl;
+
+  // fast fit on CPU; the label printed below lists the result as [X0, Y0, R, tan(theta)]
+#ifdef USE_BL
+  Vector4d fast_fit_results;
+  brokenline::fastFit(hits, fast_fit_results);
+#else
+  Vector4d fast_fit_results;
+  riemannFit::fastFit(hits, fast_fit_results);
+#endif
+  std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]):\n" << fast_fit_results << std::endl;
+
+  // circle and line fit on CPU: broken-line code when USE_BL is defined, Riemann code otherwise
+
+#ifdef USE_BL
+  brokenline::PreparedBrokenLineData<N> data;
+  brokenline::karimaki_circle_fit circle_fit_results;
+  riemannFit::Matrix3d Jacob;
+
+  brokenline::prepareBrokenLineData(hits, fast_fit_results, B, data);
+  riemannFit::LineFit line_fit_results;
+  brokenline::lineFit(hits_ge, fast_fit_results, B, data, line_fit_results);
+  brokenline::circleFit(hits, hits_ge, fast_fit_results, B, data, circle_fit_results);
+  Jacob << 1., 0, 0, 0, 1., 0, 0, 0,  // diagonal Jacobian of the par(2) -> B / |par(2)| change made just below
+      -B / std::copysign(riemannFit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2));
+  circle_fit_results.par(2) = B / std::abs(circle_fit_results.par(2));
+  circle_fit_results.cov = Jacob * circle_fit_results.cov * Jacob.transpose();  // covariance propagated as J C J^T
+#else
+  riemannFit::VectorNd<N> rad = (hits.block(0, 0, 2, N).colwise().norm());  // per-hit norm of the (x, y) rows
+  riemannFit::Matrix2Nd<N> hits_cov = riemannFit::Matrix2Nd<N>::Zero();
+  riemannFit::loadCovariance2D(hits_ge, hits_cov);
+  riemannFit::CircleFit circle_fit_results =
+      riemannFit::circleFit(hits.block(0, 0, 2, N), hits_cov, fast_fit_results, rad, B, true);
+  // line fit on CPU, using the circle fit result
+  riemannFit::LineFit line_fit_results =
+      riemannFit::lineFit(hits, hits_ge, circle_fit_results, fast_fit_results, B, true);
+  riemannFit::par_uvrtopak(circle_fit_results, B, true);  // applied after lineFit, which receives the unconverted circle
+
+#endif
+
+  std::cout << "Fitted values (CircleFit):\n"
+            << circle_fit_results.par << "\nchi2 " << circle_fit_results.chi2 << std::endl;
+  std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << "\nchi2 " << line_fit_results.chi2 << std::endl;
+
+  std::cout << "Fitted cov (CircleFit) CPU:\n" << circle_fit_results.cov << std::endl;
+  std::cout << "Fitted cov (LineFit): CPU\n" << line_fit_results.cov << std::endl;
+}
+
+int main(int argc, char* argv[]) {  // runs the fit test for 4-, 3- and 5-hit tracks; argc and argv are not used
+  testFit<4>();
+  testFit<3>();
+  testFit<5>();
+
+  return 0;
+}
diff --git a/RecoPixelVertexing/PixelTrackFitting/test/test_common.h b/RecoPixelVertexing/PixelTrackFitting/test/test_common.h
new file mode 100644
index 0000000000000..6377628b0eeca
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/test/test_common.h
@@ -0,0 +1,47 @@
+#ifndef RecoPixelVertexing__PixelTrackFitting__test_common_h
+#define RecoPixelVertexing__PixelTrackFitting__test_common_h
+
+#include <algorithm>
+#include <cassert>
+#include <random>
+
+template <class C>
+__host__ __device__ void printIt(C* m) {  // debug dump of a matrix-like object; does nothing unless TEST_DEBUG is defined
+#ifdef TEST_DEBUG
+  printf("\nMatrix %dx%d\n", (int)m->rows(), (int)m->cols());
+  for (int r = 0; r < (int)m->rows(); ++r) {  // int indices: they are printed with %d below (u_int is also non-standard)
+    for (int c = 0; c < (int)m->cols(); ++c) {
+      printf("Matrix(%d,%d) = %f\n", r, c, (*m)(r, c));  // one line per element
+    }
+  }
+#endif
+}
+
+template <class C1, class C2>
+bool isEqualFuzzy(C1 a, C2 b, double epsilon = 1e-6) {  // element-wise relative comparison over a's rows x cols
+  bool allClose = true;  // stays meaningful when NDEBUG removes the assert below
+  for (unsigned int i = 0; i < a.rows(); ++i)
+    for (unsigned int j = 0; j < a.cols(); ++j)  // <= (not <) so that identical entries, including 0 == 0, match
+      allClose &= std::abs(a(i, j) - b(i, j)) <= std::min(std::abs(a(i, j)), std::abs(b(i, j))) * epsilon;
+  assert(allClose);  // debug builds still abort on a mismatch, as before
+  return allClose;
+}
+
+inline bool isEqualFuzzy(double a, double b, double epsilon = 1e-6) {  // inline: this header-defined function is not a template
+  return std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * epsilon;  // relative to the smaller magnitude; <= so a == b passes, even at 0
+}
+
+// Overwrite every element of t with a pseudo-random value drawn uniformly between 0.0 and 2.0.
+template <typename T>
+void fillMatrix(T& t) {
+  // Mersenne Twister seeded from std::random_device, so each call produces different numbers
+  std::random_device seedSource;
+  std::mt19937 engine(seedSource());
+  std::uniform_real_distribution<> uniform(0.0, 2.0);
+  // row-major traversal, one draw per element
+  for (int r = 0; r < t.rows(); ++r)
+    for (int c = 0; c < t.cols(); ++c)
+      t(r, c) = uniform(engine);
+}
+
+#endif
diff --git a/RecoPixelVertexing/PixelTriplets/interface/CAHitQuadrupletGenerator.h b/RecoPixelVertexing/PixelTriplets/interface/CAHitQuadrupletGenerator.h
index 9d149533eefbc..deb2beb6099ee 100644
--- a/RecoPixelVertexing/PixelTriplets/interface/CAHitQuadrupletGenerator.h
+++ b/RecoPixelVertexing/PixelTriplets/interface/CAHitQuadrupletGenerator.h
@@ -42,7 +42,7 @@ class CAHitQuadrupletGenerator {
   ~CAHitQuadrupletGenerator() = default;
 
   static void fillDescriptions(edm::ParameterSetDescription& desc);
-  static const char* fillDescriptionsLabel() { return "caHitQuadruplet"; }
+  static const char* fillDescriptionsLabel() { return "caHitQuadrupletDefault"; }
 
   void initEvent(const edm::Event& ev, const edm::EventSetup& es);
 
diff --git a/RecoPixelVertexing/PixelTriplets/interface/CircleEq.h b/RecoPixelVertexing/PixelTriplets/interface/CircleEq.h
new file mode 100644
index 0000000000000..986fe2e2992b9
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/interface/CircleEq.h
@@ -0,0 +1,97 @@
+#ifndef RecoPixelVertexingPixelTripletsCircleEq_H
+#define RecoPixelVertexingPixelTripletsCircleEq_H
+/**
+| 1) circle is parameterized as:                                              |
+|    C*[(X-Xp)**2+(Y-Yp)**2] - 2*alpha*(X-Xp) - 2*beta*(Y-Yp) = 0             |
+|    Xp,Yp is a point on the track;                                           |
+|    C = 1/r0 is the curvature  ( sign of C is charge of particle );          |
+|    alpha & beta are the direction cosines of the radial vector at Xp,Yp     |
+|    i.e.  alpha = C*(X0-Xp),                                                 |
+|          beta  = C*(Y0-Yp),                                                 |
+|    where center of circle is at X0,Y0.                                      |
+|                                                                             |
+|    Slope dy/dx of tangent at Xp,Yp is -alpha/beta.                          |
+| 2) the z dimension of the helix is parameterized by gamma = dZ/dSperp       |
+|    this is also the tangent of the pitch angle of the helix.                |
+|    with this parameterization, (alpha,beta,gamma) rotate like a vector.     |
+| 3) For tracks going inward at (Xp,Yp), C, alpha, beta, and gamma change sign|
+|
+*/
+
+#include <cmath>
+
+template <typename T>
+class CircleEq {
+public:
+  CircleEq() {}
+
+  // build directly from the three points (x1,y1), (x2,y2), (x3,y3); all the work is done by compute()
+  constexpr CircleEq(T x1, T y1, T x2, T y2, T x3, T y3) { compute(x1, y1, x2, y2, x3, y3); }
+  constexpr void compute(T x1, T y1, T x2, T y2, T x3, T y3);
+
+  // dca to the origin multiplied by |C| (i.e. in units of the radius): sqrt((C*Xp+alpha)^2 + (C*Yp+beta)^2) - 1
+  constexpr T dca0() const {
+    const auto cx = m_c * m_xp + m_alpha;
+    const auto cy = m_c * m_yp + m_beta;
+    return std::sqrt(cx * cx + cy * cy) - T(1);
+  }
+
+  // same quantity measured from the point (x, y) instead of the origin
+  constexpr T dca(T x, T y) const {
+    const T cx = m_c * (m_xp - x) + m_alpha;
+    const T cy = m_c * (m_yp - y) + m_beta;
+    return std::sqrt(cx * cx + cy * cy) - T(1);
+  }
+
+  // signed curvature C
+  constexpr auto curvature() const { return m_c; }
+
+  // (alpha, beta) at the reference point (Xp, Yp); NOTE(review): std::pair needs <utility>, only <cmath> is included
+  constexpr std::pair<T, T> cosdir() const { return std::make_pair(m_alpha, m_beta); }
+
+  // (alpha, beta) at the given point (x, y)
+  constexpr std::pair<T, T> cosdir(T x, T y) const {
+    return std::make_pair(m_alpha - m_c * (x - m_xp), m_beta - m_c * (y - m_yp));
+  }
+
+  // center (Xp + alpha/C, Yp + beta/C) and radius 1/C; both divide by C without checking for zero
+  constexpr std::pair<T, T> center() const { return std::make_pair(m_xp + m_alpha / m_c, m_yp + m_beta / m_c); }
+
+  constexpr auto radius() const { return T(1) / m_c; }
+
+  T m_xp = 0;
+  T m_yp = 0;
+  T m_c = 0;
+  T m_alpha = 0;
+  T m_beta = 0;
+};
+
+template <typename T>
+constexpr void CircleEq<T>::compute(T x1, T y1, T x2, T y2, T x3, T y3) {
+  const bool keepAxes = std::abs(x3 - x1) < std::abs(y3 - y1);  // otherwise x and y swap roles below
+
+  auto u1 = keepAxes ? x1 - x2 : y1 - y2;  // points 1 and 3 relative to point 2, in the (possibly swapped) frame
+  auto v1 = keepAxes ? y1 - y2 : x1 - x2;
+  auto r12 = u1 * u1 + v1 * v1;  // squared distance between points 1 and 2
+  auto u3 = keepAxes ? x3 - x2 : y3 - y2;
+  auto v3 = keepAxes ? y3 - y2 : x3 - x2;
+  auto r32 = u3 * u3 + v3 * v3;  // squared distance between points 3 and 2
+
+  auto cross = u1 * v3 - v1 * u3;  // its sign becomes the sign of the curvature
+  auto det = r12 * v3 - r32 * v1;
+
+  auto st2 = (r12 * u3 - r32 * u1);
+  auto norm2 = det * det + st2 * st2;
+  auto invNorm = T(1.) / std::sqrt(norm2);  // no guard against norm2 == 0
+  auto beta = -st2 * invNorm;
+  auto curv = T(2.) * cross * invNorm;
+  auto alpha = invNorm * det;
+
+  m_xp = x2;  // the middle point is stored as the reference point (Xp, Yp)
+  m_yp = y2;
+  m_c = keepAxes ? curv : -curv;  // undo the axis swap: curvature flips sign, alpha and beta trade places
+  m_alpha = keepAxes ? alpha : -beta;
+  m_beta = keepAxes ? beta : -alpha;
+}
+
+#endif
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc
new file mode 100644
index 0000000000000..bebfe0e08008e
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc
@@ -0,0 +1,70 @@
+#include "BrokenLineFitOnGPU.h"
+
+void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitsView const* hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples) {
+  assert(tuples_);  // hitsInFit is not referenced in this function
+
+  //  Fit internals: host scratch buffers reused by every fit below, sized with the 4-hit matrix types.
+  auto hitsGPU_ =  // NOTE(review): the !fit5as4_ branch runs the N=5 kernels on these 4-hit-sized buffers - confirm bounds
+      std::make_unique<double[]>(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double));
+  auto hits_geGPU_ =
+      std::make_unique<float[]>(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float));
+  auto fast_fit_resultsGPU_ =
+      std::make_unique<double[]>(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double));
+
+  for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) {
+    // fit triplets: the kernels are called as ordinary functions; template argument = hits used, last-but-one = bin
+    kernel_BLFastFit<3>(
+        tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset);
+
+    kernel_BLFit<3>(tupleMultiplicity_,
+                    bField_,
+                    outputSoa_,
+                    hitsGPU_.get(),
+                    hits_geGPU_.get(),
+                    fast_fit_resultsGPU_.get(),
+                    3,
+                    offset);
+
+    // fit quads (tuples with 4 hits)
+    kernel_BLFastFit<4>(
+        tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset);
+
+    kernel_BLFit<4>(tupleMultiplicity_,
+                    bField_,
+                    outputSoa_,
+                    hitsGPU_.get(),
+                    hits_geGPU_.get(),
+                    fast_fit_resultsGPU_.get(),
+                    4,
+                    offset);
+
+    if (fit5as4_) {
+      // fit penta (only first 4): 5-hit tuples, fitted with the 4-hit kernels
+      kernel_BLFastFit<4>(
+          tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset);
+
+      kernel_BLFit<4>(tupleMultiplicity_,
+                      bField_,
+                      outputSoa_,
+                      hitsGPU_.get(),
+                      hits_geGPU_.get(),
+                      fast_fit_resultsGPU_.get(),
+                      5,
+                      offset);
+    } else {
+      // fit penta (all 5): 5-hit tuples, fitted with the 5-hit kernels
+      kernel_BLFastFit<5>(
+          tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset);
+
+      kernel_BLFit<5>(tupleMultiplicity_,
+                      bField_,
+                      outputSoa_,
+                      hitsGPU_.get(),
+                      hits_geGPU_.get(),
+                      fast_fit_resultsGPU_.get(),
+                      5,
+                      offset);
+    }
+
+  }  // loop on concurrent fits: each pass handles the next maxNumberOfConcurrentFits_ tuples of every bin
+}
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu
new file mode 100644
index 0000000000000..d2ca583e86bd0
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu
@@ -0,0 +1,85 @@
+#include "BrokenLineFitOnGPU.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+
+void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv,  // hits view forwarded to the fast-fit kernels
+                                            uint32_t hitsInFit,  // not referenced in this function
+                                            uint32_t maxNumberOfTuples,  // upper bound of the offset loop below
+                                            cudaStream_t stream) {  // stream for the allocations and all launches
+  assert(tuples_);
+
+  auto blockSize = 64;
+  auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize;
+  auto numberOfBlocksDiv4 = numberOfBlocks < 4 ? 1 : numberOfBlocks / 4;  // clamp: a grid of 0 blocks is an invalid launch
+  //  Fit internals: device scratch buffers reused by every fit below, sized with the 4-hit matrix types.
+  auto hitsGPU_ = cms::cuda::make_device_unique<double[]>(  // NOTE(review): N=5 kernels also use these - confirm bounds
+      maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double), stream);
+  auto hits_geGPU_ = cms::cuda::make_device_unique<float[]>(
+      maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float), stream);
+  auto fast_fit_resultsGPU_ = cms::cuda::make_device_unique<double[]>(
+      maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), stream);
+
+  for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) {
+    // fit triplets (full grid); every launch is followed by a cudaGetLastError() check
+    kernel_BLFastFit<3><<<numberOfBlocks, blockSize, 0, stream>>>(
+        tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset);
+    cudaCheck(cudaGetLastError());
+
+    kernel_BLFit<3><<<numberOfBlocks, blockSize, 0, stream>>>(tupleMultiplicity_,
+                                                              bField_,
+                                                              outputSoa_,
+                                                              hitsGPU_.get(),
+                                                              hits_geGPU_.get(),
+                                                              fast_fit_resultsGPU_.get(),
+                                                              3,
+                                                              offset);
+    cudaCheck(cudaGetLastError());
+
+    // fit quads (a quarter of the blocks; the kernels loop with a grid stride, so all slots are still covered)
+    kernel_BLFastFit<4><<<numberOfBlocksDiv4, blockSize, 0, stream>>>(
+        tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset);
+    cudaCheck(cudaGetLastError());
+
+    kernel_BLFit<4><<<numberOfBlocksDiv4, blockSize, 0, stream>>>(tupleMultiplicity_,
+                                                                  bField_,
+                                                                  outputSoa_,
+                                                                  hitsGPU_.get(),
+                                                                  hits_geGPU_.get(),
+                                                                  fast_fit_resultsGPU_.get(),
+                                                                  4,
+                                                                  offset);
+    cudaCheck(cudaGetLastError());
+
+    if (fit5as4_) {
+      // fit penta (only first 4): 5-hit tuples, fitted with the 4-hit kernels
+      kernel_BLFastFit<4><<<numberOfBlocksDiv4, blockSize, 0, stream>>>(
+          tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset);
+      cudaCheck(cudaGetLastError());
+
+      kernel_BLFit<4><<<numberOfBlocksDiv4, blockSize, 0, stream>>>(tupleMultiplicity_,
+                                                                    bField_,
+                                                                    outputSoa_,
+                                                                    hitsGPU_.get(),
+                                                                    hits_geGPU_.get(),
+                                                                    fast_fit_resultsGPU_.get(),
+                                                                    5,
+                                                                    offset);
+      cudaCheck(cudaGetLastError());
+    } else {
+      // fit penta (all 5): 5-hit tuples, fitted with the 5-hit kernels
+      kernel_BLFastFit<5><<<numberOfBlocksDiv4, blockSize, 0, stream>>>(
+          tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset);
+      cudaCheck(cudaGetLastError());
+
+      kernel_BLFit<5><<<numberOfBlocksDiv4, blockSize, 0, stream>>>(tupleMultiplicity_,
+                                                                    bField_,
+                                                                    outputSoa_,
+                                                                    hitsGPU_.get(),
+                                                                    hits_geGPU_.get(),
+                                                                    fast_fit_resultsGPU_.get(),
+                                                                    5,
+                                                                    offset);
+      cudaCheck(cudaGetLastError());
+    }
+
+  }  // loop on concurrent fits
+}
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h
new file mode 100644
index 0000000000000..ee5065e81fc45
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h
@@ -0,0 +1,184 @@
+//
+// Author: Felice Pantaleo, CERN
+//
+
+// #define BROKENLINE_DEBUG
+
+#include <cstdint>
+
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
+#include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h"
+
+#include "HelixFitOnGPU.h"
+
+using HitsOnGPU = TrackingRecHit2DSOAView;
+using Tuples = pixelTrack::HitContainer;
+using OutputSoA = pixelTrack::TrackSoA;
+
+// #define BL_DUMP_HITS
+
+template <int N>
+__global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets,  // hit indices of every tuple
+                                 caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity,
+                                 HitsOnGPU const *__restrict__ hhp,  // hit positions, local errors and CPE parameters
+                                 double *__restrict__ phits,  // out: hit coordinates, one strided 3 x N map per fit
+                                 float *__restrict__ phits_ge,  // out: hit errors, one strided 6 x N map per fit
+                                 double *__restrict__ pfast_fit,  // out: fast-fit results, one strided 4-vector per fit
+                                 uint32_t nHits,  // multiplicity bin to process (hits per tuple)
+                                 uint32_t offset) {  // first tuple of the bin handled by this launch
+  constexpr uint32_t hitsInFit = N;  // only the first N hits of each tuple enter the fit
+
+  assert(hitsInFit <= nHits);
+
+  assert(hhp);
+  assert(pfast_fit);
+  assert(foundNtuplets);
+  assert(tupleMultiplicity);
+
+  // look in bin for this hit multiplicity
+  auto local_start = blockIdx.x * blockDim.x + threadIdx.x;
+
+#ifdef BROKENLINE_DEBUG
+  if (0 == local_start) {
+    printf("%d total Ntuple\n", foundNtuplets->nbins());
+    printf("%d Ntuple of size %d for %d hits to fit\n", tupleMultiplicity->size(nHits), nHits, hitsInFit);
+  }
+#endif
+
+  for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt;
+       local_idx += gridDim.x * blockDim.x) {  // grid-stride loop over the fit slots of this launch
+    auto tuple_idx = local_idx + offset;
+    if (tuple_idx >= tupleMultiplicity->size(nHits))
+      break;  // past the last tuple of this bin
+
+    // get it from the ntuple container (one to one to helix)
+    auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx);
+    assert(tkid < foundNtuplets->nbins());
+
+    assert(foundNtuplets->size(tkid) == nHits);
+
+    riemannFit::Map3xNd<N> hits(phits + local_idx);  // the slot index selects this fit's element in each buffer
+    riemannFit::Map4d fast_fit(pfast_fit + local_idx);
+    riemannFit::Map6xNf<N> hits_ge(phits_ge + local_idx);
+
+#ifdef BL_DUMP_HITS
+    __shared__ int done;  // NOTE(review): threads that already left the loop never reach the barrier below
+    done = 0;
+    __syncthreads();
+    bool dump = (foundNtuplets->size(tkid) == 5 && 0 == atomicAdd(&done, 1));  // first thread with a 5-hit tuple dumps
+#endif
+
+    // Prepare data structure
+    auto const *hitId = foundNtuplets->begin(tkid);
+    for (unsigned int i = 0; i < hitsInFit; ++i) {
+      auto hit = hitId[i];
+      float ge[6];  // six global error components computed from the two local errors
+      hhp->cpeParams()
+          .detParams(hhp->detectorIndex(hit))
+          .frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge);
+#ifdef BL_DUMP_HITS
+      if (dump) {
+        printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n",
+               tkid,
+               hhp->detectorIndex(hit),
+               i,
+               hhp->xGlobal(hit),
+               hhp->yGlobal(hit),
+               hhp->zGlobal(hit));
+        printf("Error: %d: %d  hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n",
+               tkid,
+               hhp->detectorIndex(hit),
+               i,
+               ge[0],
+               ge[1],
+               ge[2],
+               ge[3],
+               ge[4],
+               ge[5]);
+      }
+#endif
+      hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit);
+      hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5];
+    }
+    brokenline::fastFit(hits, fast_fit);
+
+    // no NaN here.... (x == x is false only for NaN)
+    assert(fast_fit(0) == fast_fit(0));
+    assert(fast_fit(1) == fast_fit(1));
+    assert(fast_fit(2) == fast_fit(2));
+    assert(fast_fit(3) == fast_fit(3));
+  }
+}
+
+template <int N>
+__global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity,
+                             double bField,
+                             OutputSoA *results,
+                             double *__restrict__ phits,
+                             float *__restrict__ phits_ge,
+                             double *__restrict__ pfast_fit,
+                             uint32_t nHits,
+                             uint32_t offset) {
+  assert(N <= nHits);
+  assert(results);
+  assert(pfast_fit);
+
+  // each thread handles the fit slots firstSlot, firstSlot + gridDim.x * blockDim.x, ... of this launch
+  auto firstSlot = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int slot = firstSlot, maxSlots = riemannFit::maxNumberOfConcurrentFits; slot < maxSlots;
+       slot += gridDim.x * blockDim.x) {
+    // position of this slot inside the bin of tuples with nHits hits; stop once the bin is exhausted
+    auto binPos = slot + offset;
+    if (binPos >= tupleMultiplicity->size(nHits))
+      break;
+    // index stored at that position in the bin; it addresses the output SoA
+    auto trackId = *(tupleMultiplicity->begin(nHits) + binPos);
+
+    // strided views of this slot's hits, hit errors and fast-fit result
+    riemannFit::Map3xNd<N> hits(phits + slot);
+    riemannFit::Map6xNf<N> hits_ge(phits_ge + slot);
+    riemannFit::Map4d fast_fit(pfast_fit + slot);
+
+    // intermediate data shared by the two fits, and their outputs
+    brokenline::PreparedBrokenLineData<N> data;
+    brokenline::karimaki_circle_fit circle;
+    riemannFit::LineFit line;
+
+    brokenline::prepareBrokenLineData(hits, fast_fit, bField, data);
+    brokenline::lineFit(hits_ge, fast_fit, bField, data, line);
+    brokenline::circleFit(hits, hits_ge, fast_fit, bField, data, circle);
+
+    // store the fitted state plus pt, eta and the summed chi2 divided by (2 * N - 5)
+    const float fieldF = float(bField);
+    results->stateAtBS.copyFromCircle(circle.par, circle.cov, line.par, line.cov, 1.f / fieldF, trackId);
+    results->pt(trackId) = fieldF / float(std::abs(circle.par(2)));
+    results->eta(trackId) = asinhf(line.par(0));
+    results->chi2(trackId) = (circle.chi2 + line.chi2) / (2 * N - 5);
+
+#ifdef BROKENLINE_DEBUG
+    if (!(circle.chi2 >= 0) || !(line.chi2 >= 0))
+      printf("kernelBLFit failed! %f/%f\n", circle.chi2, line.chi2);
+    printf("kernelBLFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n",
+           N,
+           nHits,
+           trackId,
+           circle.par(0),
+           circle.par(1),
+           circle.par(2));
+    printf("kernelBLHits line.par(0,1): %d %f,%f\n", trackId, line.par(0), line.par(1));
+    printf("kernelBLHits chi2 cov %f/%f  %e,%e,%e,%e,%e\n",
+           circle.chi2,
+           line.chi2,
+           circle.cov(0, 0),
+           circle.cov(1, 1),
+           circle.cov(2, 2),
+           line.cov(0, 0),
+           line.cov(1, 1));
+#endif
+  }
+}
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml
index f76451675de59..3a54cd1134bc2 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml
+++ b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml
@@ -1,10 +1,14 @@
+<use name="cuda"/>
+<use name="ofast-flag"/>
+<use name="CUDADataFormats/Track"/>
 <use name="CommonTools/RecoAlgos"/>
+<use name="FWCore/Framework"/>
+<use name="FWCore/ParameterSet"/>
+<use name="FWCore/PluginManager"/>
+<use name="HeterogeneousCore/CUDACore"/>
 <use name="RecoTracker/TkTrackingRegions"/>
 <use name="RecoPixelVertexing/PixelTriplets"/>
 <use name="RecoTracker/TkSeedingLayers"/>
-<library file="*.cc" name="RecoPixelVertexingPixelTripletsPlugins">
+<library file="*.cc *.cu" name="RecoPixelVertexingPixelTripletsPlugins">
   <flags EDM_PLUGIN="1"/>
 </library>
-
-<use name="ofast-flag"/>
-<flags CXXFLAGS="-fno-math-errno"/>
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h
new file mode 100644
index 0000000000000..5342141d2c9e4
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h
@@ -0,0 +1,83 @@
+#ifndef RecoPixelVertexing_PixelTriplets_plugins_CAConstants_h
+#define RecoPixelVertexing_PixelTriplets_plugins_CAConstants_h
+
+#include <cstdint>
+
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+
+//#define ONLY_PHICUT
+
+// Cellular automaton constants
+namespace caConstants {
+
+  // capacity constants; the set in use is selected by ONLY_PHICUT and GPU_SMALL_EVENTS
+#ifdef ONLY_PHICUT
+  constexpr uint32_t maxCellNeighbors = 64;
+  constexpr uint32_t maxCellTracks = 64;
+  constexpr uint32_t maxNumberOfTuples = 48 * 1024;  // 49152, still representable by the 16-bit tindex_type below
+  constexpr uint32_t maxNumberOfDoublets = 2 * 1024 * 1024;
+  constexpr uint32_t maxCellsPerHit = 8 * 128;
+#else  // ONLY_PHICUT
+  constexpr uint32_t maxCellNeighbors = 36;
+  constexpr uint32_t maxCellTracks = 48;
+#ifdef GPU_SMALL_EVENTS
+  // kept for testing and debugging
+  constexpr uint32_t maxNumberOfTuples = 3 * 1024;
+  constexpr uint32_t maxNumberOfDoublets = 128 * 1024;
+  constexpr uint32_t maxCellsPerHit = 128 / 2;
+#else   // GPU_SMALL_EVENTS
+  // tested on MC events with 55-75 pileup events
+  constexpr uint32_t maxNumberOfTuples = 24 * 1024;
+  constexpr uint32_t maxNumberOfDoublets = 512 * 1024;
+  constexpr uint32_t maxCellsPerHit = 128;
+#endif  // GPU_SMALL_EVENTS
+#endif  // ONLY_PHICUT
+  constexpr uint32_t maxNumOfActiveDoublets = maxNumberOfDoublets / 8;  // one eighth of the doublet capacity
+  constexpr uint32_t maxNumberOfQuadruplets = maxNumberOfTuples;  // same value as the tuple capacity
+
+  constexpr uint32_t maxNumberOfLayerPairs = 20;
+  constexpr uint32_t maxNumberOfLayers = 10;
+  constexpr uint32_t maxTuples = maxNumberOfTuples;  // short alias used by the container types below
+
+  // Module constants (NOTE(review): lengths are presumably in cm - confirm against the geometry)
+  constexpr uint32_t max_ladder_bpx0 = 12;
+  constexpr uint32_t first_ladder_bpx0 = 0;
+  constexpr float module_length_bpx0 = 6.7f;
+  constexpr float module_tolerance_bpx0 = 0.4f;  // projection to cylinder is inaccurate on BPIX1
+  constexpr uint32_t max_ladder_bpx4 = 64;
+  constexpr uint32_t first_ladder_bpx4 = 84;
+  constexpr float radius_even_ladder = 15.815f;
+  constexpr float radius_odd_ladder = 16.146f;
+  constexpr float module_length_bpx4 = 6.7f;
+  constexpr float module_tolerance_bpx4 = 0.2f;
+  constexpr float barrel_z_length = 26.f;
+  constexpr float forward_z_begin = 32.f;
+
+  // Last detector indices (values only; whether they are inclusive or one-past-the-end is not visible here)
+  constexpr uint32_t last_bpix1_detIndex = 96;
+  constexpr uint32_t last_barrel_detIndex = 1184;
+
+  // index types
+  using hindex_type = uint32_t;  // FIXME from siPixelRecHitsHeterogeneousProduct
+  using tindex_type = uint16_t;  // for tuples
+
+  using CellNeighbors = cms::cuda::VecArray<uint32_t, maxCellNeighbors>;  // fixed capacity: maxCellNeighbors entries
+  using CellTracks = cms::cuda::VecArray<tindex_type, maxCellTracks>;  // fixed capacity: maxCellTracks entries
+
+  using CellNeighborsVector = cms::cuda::SimpleVector<CellNeighbors>;
+  using CellTracksVector = cms::cuda::SimpleVector<CellTracks>;
+
+  using OuterHitOfCell = cms::cuda::VecArray<uint32_t, maxCellsPerHit>;
+  using TuplesContainer = cms::cuda::OneToManyAssoc<hindex_type, maxTuples, 5 * maxTuples>;  // 5 * maxTuples entries
+  using HitToTuple =
+      cms::cuda::OneToManyAssoc<tindex_type, pixelGPUConstants::maxNumberOfHits, 4 * maxTuples>;  // 3.5 should be enough
+  using TupleMultiplicity = cms::cuda::OneToManyAssoc<tindex_type, 8, maxTuples>;  // 8 bins, indexed by hit count
+
+}  // namespace caConstants
+
+#endif  // RecoPixelVertexing_PixelTriplets_plugins_CAConstants_h
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc
new file mode 100644
index 0000000000000..beba54c33f513
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc
@@ -0,0 +1,83 @@
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDProducer.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/RunningAverage.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h"
+
+#include "CAHitNtupletGeneratorOnGPU.h"
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
+
+class CAHitNtupletCUDA : public edm::global::EDProducer<> {
+public:
+  explicit CAHitNtupletCUDA(const edm::ParameterSet& iConfig);
+  ~CAHitNtupletCUDA() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override;
+
+  bool onGPU_;  // value of the "onGPU" parameter; selects the GPU or the CPU code path
+  // Only one (get, put) token pair is set by the constructor: the GPU pair if onGPU_, the CPU pair otherwise.
+  edm::EDGetTokenT<cms::cuda::Product<TrackingRecHit2DGPU>> tokenHitGPU_;
+  edm::EDPutTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenTrackGPU_;
+  edm::EDGetTokenT<TrackingRecHit2DCPU> tokenHitCPU_;
+  edm::EDPutTokenT<PixelTrackHeterogeneous> tokenTrackCPU_;
+
+  CAHitNtupletGeneratorOnGPU gpuAlgo_;  // provides makeTuplesAsync() (GPU path) and makeTuples() (CPU path)
+};
+
+CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig)
+    : onGPU_(iConfig.getParameter<bool>("onGPU")), gpuAlgo_(iConfig, consumesCollector()) {
+  auto const hitSrc = iConfig.getParameter<edm::InputTag>("pixelRecHitSrc");  // same input tag on both paths
+  if (!onGPU_) {  // register only the (input hits, output tracks) token pair of the configured back-end
+    tokenHitCPU_ = consumes<TrackingRecHit2DCPU>(hitSrc);
+    tokenTrackCPU_ = produces<PixelTrackHeterogeneous>();
+  } else {
+    tokenHitGPU_ = consumes<cms::cuda::Product<TrackingRecHit2DGPU>>(hitSrc);
+    tokenTrackGPU_ = produces<cms::cuda::Product<PixelTrackHeterogeneous>>();
+  }
+}
+
+void CAHitNtupletCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription desc;
+  // The two parameters read by this module's constructor, with their default values.
+  desc.add<bool>("onGPU", true);
+  desc.add<edm::InputTag>("pixelRecHitSrc", edm::InputTag("siPixelRecHitsPreSplittingCUDA"));
+  // The generator receives the same description (presumably to add its own parameters) before it is registered.
+  CAHitNtupletGeneratorOnGPU::fillDescriptions(desc);
+  descriptions.add("caHitNtupletCUDA", desc);
+}
+
+void CAHitNtupletCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& es) const {
+  auto bf = 1. / PixelRecoUtilities::fieldInInvGev(es);  // reciprocal of fieldInInvGev(), handed to the generator
+
+  if (onGPU_) {
+    auto hHits = iEvent.getHandle(tokenHitGPU_);
+    // The scoped context is built from the input product; its stream is the one given to the generator below.
+    cms::cuda::ScopedContextProduce ctx{*hHits};
+    auto const& hits = ctx.get(*hHits);
+    // The object returned by makeTuplesAsync() is put into the event through the context.
+    ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream()));
+  } else {
+    auto const& hits = iEvent.get(tokenHitCPU_);
+    iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits, bf));
+  }
+}
+
+DEFINE_FWK_MODULE(CAHitNtupletCUDA);
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
new file mode 100644
index 0000000000000..c4b8a5a54847f
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
@@ -0,0 +1,184 @@
+#include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h"
+
+template <>
+void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters) {
+  kernel_printCounters(counters);  // CPU specialisation: the kernel function is called directly
+}
+
+template <>
+void CAHitNtupletGeneratorKernelsCPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t) {
+  kernel_fillHitDetIndices(&tracks_d->hitIndices, hv, &tracks_d->detIndices);  // plain call, stream unused
+}
+
+template <>
+void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) {
+  auto nhits = hh.nHits();
+
+#ifdef NTUPLE_DEBUG
+  std::cout << "building Doublets out of " << nhits << " Hits" << std::endl;
+#endif
+
+  // Allocate the host-side doublet workspace with malloc (a Traits::make_unique would be overkill here);
+  // "nhits" could in principle be used to dimension it heuristically. Every allocation is checked before use.
+  device_isOuterHitOfCell_.reset(
+      (GPUCACell::OuterHitOfCell *)malloc(std::max(1U, nhits) * sizeof(GPUCACell::OuterHitOfCell)));
+  assert(device_isOuterHitOfCell_.get());
+
+  cellStorage_.reset((unsigned char *)malloc(caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) +
+                                             caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks)));
+  assert(cellStorage_.get());  // the two containers below point into this buffer
+  device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get();
+  device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets *
+                                                                                      sizeof(GPUCACell::CellNeighbors));
+
+  gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(),
+                                 nhits,
+                                 device_theCellNeighbors_.get(),
+                                 device_theCellNeighborsContainer_,
+                                 device_theCellTracks_.get(),
+                                 device_theCellTracksContainer_);
+
+  device_theCells_.reset((GPUCACell *)malloc(sizeof(GPUCACell) * params_.maxNumberOfDoublets_));
+  assert(device_theCells_.get() || 0 == params_.maxNumberOfDoublets_);  // malloc(0) may legitimately return null
+  if (0 == nhits)
+    return;  // protect against empty events
+
+  // FIXME avoid magic numbers
+  auto nActualPairs = gpuPixelDoublets::nPairs;
+  if (!params_.includeJumpingForwardDoublets_)
+    nActualPairs = 15;
+  if (params_.minHitsPerNtuplet_ > 3) {
+    nActualPairs = 13;
+  }
+
+  assert(nActualPairs <= gpuPixelDoublets::nPairs);
+  gpuPixelDoublets::getDoubletsFromHisto(device_theCells_.get(),
+                                         device_nCells_,
+                                         device_theCellNeighbors_.get(),
+                                         device_theCellTracks_.get(),
+                                         hh.view(),
+                                         device_isOuterHitOfCell_.get(),
+                                         nActualPairs,
+                                         params_.idealConditions_,
+                                         params_.doClusterCut_,
+                                         params_.doZ0Cut_,
+                                         params_.doPtCut_,
+                                         params_.maxNumberOfDoublets_);
+}
+
+template <>
+void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
+  auto *tuples_d = &tracks_d->hitIndices;
+  auto *quality_d = tracks_d->qualityData();
+  // CPU specialisation: each kernel_* function below is invoked as an ordinary function call, in sequence.
+  assert(tuples_d && quality_d);
+
+  // zero the tuple container before it is filled
+  cms::cuda::launchZero(tuples_d, cudaStream);
+
+  auto nhits = hh.nHits();
+  assert(nhits <= pixelGPUConstants::maxNumberOfHits);
+
+  // std::cout << "N hits " << nhits << std::endl;
+  // if (nhits<2) std::cout << "too few hits " << nhits << std::endl;
+
+  //
+  // applying combinatorial cleaning such as fishbone at this stage is too expensive
+  //
+
+  kernel_connect(device_hitTuple_apc_,
+                 device_hitToTuple_apc_,  // needed only to be reset, ready for next kernel
+                 hh.view(),
+                 device_theCells_.get(),
+                 device_nCells_,
+                 device_theCellNeighbors_.get(),
+                 device_isOuterHitOfCell_.get(),
+                 params_.hardCurvCut_,
+                 params_.ptmin_,
+                 params_.CAThetaCutBarrel_,
+                 params_.CAThetaCutForward_,
+                 params_.dcaCutInnerTriplet_,
+                 params_.dcaCutOuterTriplet_);
+
+  if (nhits > 1 && params_.earlyFishbone_) {
+    gpuPixelDoublets::fishbone(
+        hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false);
+  }
+
+  kernel_find_ntuplets(hh.view(),
+                       device_theCells_.get(),
+                       device_nCells_,
+                       device_theCellTracks_.get(),
+                       tuples_d,
+                       device_hitTuple_apc_,
+                       quality_d,
+                       params_.minHitsPerNtuplet_);
+  if (params_.doStats_)
+    kernel_mark_used(hh.view(), device_theCells_.get(), device_nCells_);
+
+  cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d);
+
+  // remove duplicates (tracks that share a doublet)
+  kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, quality_d);
+
+  kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get());
+  cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream);
+  kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get());
+
+  if (nhits > 1 && params_.lateFishbone_) {
+    gpuPixelDoublets::fishbone(
+        hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true);
+  }
+
+  if (params_.doStats_) {  // statistics only: the counters_ object is handed to the overflow check
+    kernel_checkOverflows(tuples_d,
+                          device_tupleMultiplicity_.get(),
+                          device_hitToTuple_.get(),
+                          device_hitTuple_apc_,
+                          device_theCells_.get(),
+                          device_nCells_,
+                          device_theCellNeighbors_.get(),
+                          device_theCellTracks_.get(),
+                          device_isOuterHitOfCell_.get(),
+                          nhits,
+                          params_.maxNumberOfDoublets_,
+                          counters_);
+  }
+}
+
+template <>
+void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
+  auto const *tuples_d = &tracks_d->hitIndices;
+  auto *quality_d = tracks_d->qualityData();
+  // CPU specialisation: the classification and cleaning steps run one after the other as plain function calls.
+  // classify tracks based on kinematics
+  kernel_classifyTracks(tuples_d, tracks_d, params_.cuts_, quality_d);
+
+  if (params_.lateFishbone_) {
+    // apply fishbone cleaning to good tracks
+    kernel_fishboneCleaner(device_theCells_.get(), device_nCells_, quality_d);
+  }
+
+  // remove duplicates (tracks that share a doublet)
+  kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, tracks_d);
+
+  // fill hit->track "map" (done unconditionally in this CPU specialisation)
+  kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get());
+  cms::cuda::launchFinalize(device_hitToTuple_.get(), cudaStream);
+  kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get());
+
+  // remove duplicates (tracks that share a hit)
+  kernel_tripletCleaner(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get());
+
+  if (params_.doStats_) {
+    // counters (add flag???)
+    kernel_doStatsForHitInTracks(device_hitToTuple_.get(), counters_);
+    kernel_doStatsForTracks(tuples_d, quality_d, counters_);
+  }
+
+#ifdef DUMP_GPU_TK_TUPLES
+  static std::atomic<int> iev(0);  // call counter, passed as the last argument of the dump below
+  ++iev;
+  kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev);
+#endif
+}
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
new file mode 100644
index 0000000000000..96639e98939f9
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
@@ -0,0 +1,308 @@
+#include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h"
+
+template <>
+void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream) {
+  auto blockSize = 128;
+  auto numberOfBlocks = (HitContainer::capacity() + blockSize - 1) / blockSize;  // ceil(capacity / blockSize)
+
+  kernel_fillHitDetIndices<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+      &tracks_d->hitIndices, hv, &tracks_d->detIndices);
+  cudaCheck(cudaGetLastError());  // catches launch errors; the kernel itself runs asynchronously on cudaStream
+#ifdef GPU_DEBUG
+  cudaDeviceSynchronize();  // with GPU_DEBUG: wait for the device so that execution errors surface here
+  cudaCheck(cudaGetLastError());
+#endif
+}
+
+template <>
+void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
+  // Enqueue the ntuplet-building kernels on cudaStream. tuples_d and quality_d are pointers on the GPU!
+  auto *tuples_d = &tracks_d->hitIndices;
+  auto *quality_d = tracks_d->qualityData();
+
+  // zero tuples
+  cms::cuda::launchZero(tuples_d, cudaStream);
+
+  auto nhits = hh.nHits();
+  assert(nhits <= pixelGPUConstants::maxNumberOfHits);
+
+  // std::cout << "N hits " << nhits << std::endl;
+  // if (nhits<2) std::cout << "too few hits " << nhits << std::endl;
+
+  //
+  // applying combinatorial cleaning such as fishbone at this stage is too expensive
+  //
+
+  auto nthTot = 64;
+  auto stride = 4;
+  auto blockSize = nthTot / stride;
+  auto numberOfBlocks = nDoubletBlocks(blockSize);
+  auto rescale = numberOfBlocks / 65536;  // non-zero only if the grid (laid out along y) would reach 65536 blocks
+  blockSize *= (rescale + 1);             // in that case use larger blocks so that fewer of them are needed
+  numberOfBlocks = nDoubletBlocks(blockSize);
+  assert(numberOfBlocks < 65536);
+  assert(blockSize > 0 && 0 == blockSize % 16);
+  dim3 blks(1, numberOfBlocks, 1);
+  dim3 thrs(stride, blockSize, 1);
+
+  kernel_connect<<<blks, thrs, 0, cudaStream>>>(
+      device_hitTuple_apc_,
+      device_hitToTuple_apc_,  // needed only to be reset, ready for next kernel
+      hh.view(),
+      device_theCells_.get(),
+      device_nCells_,
+      device_theCellNeighbors_.get(),
+      device_isOuterHitOfCell_.get(),
+      params_.hardCurvCut_,
+      params_.ptmin_,
+      params_.CAThetaCutBarrel_,
+      params_.CAThetaCutForward_,
+      params_.dcaCutInnerTriplet_,
+      params_.dcaCutOuterTriplet_);
+  cudaCheck(cudaGetLastError());
+
+  if (nhits > 1 && params_.earlyFishbone_) {
+    auto nthTot = 128;
+    auto stride = 16;
+    auto blockSize = nthTot / stride;
+    auto numberOfBlocks = (nhits + blockSize - 1) / blockSize;
+    dim3 blks(1, numberOfBlocks, 1);
+    dim3 thrs(stride, blockSize, 1);
+    gpuPixelDoublets::fishbone<<<blks, thrs, 0, cudaStream>>>(
+        hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false);
+    cudaCheck(cudaGetLastError());
+  }
+
+  blockSize = 64;
+  numberOfBlocks = nDoubletBlocks(blockSize);  // same value as the explicit 3 * maxNumberOfDoublets_ / 4 formula
+  kernel_find_ntuplets<<<numberOfBlocks, blockSize, 0, cudaStream>>>(hh.view(),
+                                                                     device_theCells_.get(),
+                                                                     device_nCells_,
+                                                                     device_theCellTracks_.get(),
+                                                                     tuples_d,
+                                                                     device_hitTuple_apc_,
+                                                                     quality_d,
+                                                                     params_.minHitsPerNtuplet_);
+  cudaCheck(cudaGetLastError());
+
+  if (params_.doStats_)
+    kernel_mark_used<<<numberOfBlocks, blockSize, 0, cudaStream>>>(hh.view(), device_theCells_.get(), device_nCells_);
+  cudaCheck(cudaGetLastError());
+
+#ifdef GPU_DEBUG
+  cudaDeviceSynchronize();
+  cudaCheck(cudaGetLastError());
+#endif
+
+  blockSize = 128;
+  numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize;
+  cms::cuda::finalizeBulk<<<numberOfBlocks, blockSize, 0, cudaStream>>>(device_hitTuple_apc_, tuples_d);
+  cudaCheck(cudaGetLastError());  // report a failed launch here rather than after the next kernel
+  // remove duplicates (tracks that share a doublet)
+  numberOfBlocks = nDoubletBlocks(blockSize);
+  kernel_earlyDuplicateRemover<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+      device_theCells_.get(), device_nCells_, tuples_d, quality_d);
+  cudaCheck(cudaGetLastError());
+
+  blockSize = 128;
+  numberOfBlocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize;
+  kernel_countMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+      tuples_d, quality_d, device_tupleMultiplicity_.get());
+  cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream);
+  kernel_fillMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+      tuples_d, quality_d, device_tupleMultiplicity_.get());
+  cudaCheck(cudaGetLastError());
+
+  if (nhits > 1 && params_.lateFishbone_) {
+    auto nthTot = 128;
+    auto stride = 16;
+    auto blockSize = nthTot / stride;
+    auto numberOfBlocks = (nhits + blockSize - 1) / blockSize;
+    dim3 blks(1, numberOfBlocks, 1);
+    dim3 thrs(stride, blockSize, 1);
+    gpuPixelDoublets::fishbone<<<blks, thrs, 0, cudaStream>>>(
+        hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true);
+    cudaCheck(cudaGetLastError());
+  }
+
+#ifdef GPU_DEBUG
+  cudaDeviceSynchronize();
+  cudaCheck(cudaGetLastError());
+#endif
+
+  // free space asap
+  // device_isOuterHitOfCell_.reset();
+}
+
+template <>
+void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) {
+  auto nhits = hh.nHits();
+
+#ifdef NTUPLE_DEBUG
+  std::cout << "building Doublets out of " << nhits << " Hits" << std::endl;
+#endif
+
+#ifdef GPU_DEBUG
+  cudaDeviceSynchronize();
+  cudaCheck(cudaGetLastError());
+#endif
+
+  // workspace allocation; in principle we can use "nhits" to heuristically dimension it...
+  device_isOuterHitOfCell_ = cms::cuda::make_device_unique<GPUCACell::OuterHitOfCell[]>(std::max(1U, nhits), stream);
+  assert(device_isOuterHitOfCell_.get());
+  // a single byte buffer holds both containers: the CellNeighbors come first, the CellTracks right after them
+  cellStorage_ = cms::cuda::make_device_unique<unsigned char[]>(
+      caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) +
+          caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks),
+      stream);
+  device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get();
+  device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets *
+                                                                                      sizeof(GPUCACell::CellNeighbors));
+
+  {
+    int threadsPerBlock = 128;
+    // at least one block!
+    int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock;
+    gpuPixelDoublets::initDoublets<<<blocks, threadsPerBlock, 0, stream>>>(device_isOuterHitOfCell_.get(),
+                                                                           nhits,
+                                                                           device_theCellNeighbors_.get(),
+                                                                           device_theCellNeighborsContainer_,
+                                                                           device_theCellTracks_.get(),
+                                                                           device_theCellTracksContainer_);
+    cudaCheck(cudaGetLastError());
+  }
+
+  device_theCells_ = cms::cuda::make_device_unique<GPUCACell[]>(params_.maxNumberOfDoublets_, stream);
+
+#ifdef GPU_DEBUG
+  cudaDeviceSynchronize();
+  cudaCheck(cudaGetLastError());
+#endif
+
+  if (0 == nhits)
+    return;  // protect against empty events
+
+  // FIXME avoid magic numbers (15 and 13 are pair counts; they are checked against nPairs below)
+  auto nActualPairs = gpuPixelDoublets::nPairs;
+  if (!params_.includeJumpingForwardDoublets_)
+    nActualPairs = 15;
+  if (params_.minHitsPerNtuplet_ > 3) {
+    nActualPairs = 13;
+  }
+
+  assert(nActualPairs <= gpuPixelDoublets::nPairs);
+  int stride = 4;
+  int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize / stride;
+  int blocks = (4 * nhits + threadsPerBlock - 1) / threadsPerBlock;  // ceil(4 * nhits / threadsPerBlock)
+  dim3 blks(1, blocks, 1);                                           // the grid extends along y only
+  dim3 thrs(stride, threadsPerBlock, 1);
+  gpuPixelDoublets::getDoubletsFromHisto<<<blks, thrs, 0, stream>>>(device_theCells_.get(),
+                                                                    device_nCells_,
+                                                                    device_theCellNeighbors_.get(),
+                                                                    device_theCellTracks_.get(),
+                                                                    hh.view(),
+                                                                    device_isOuterHitOfCell_.get(),
+                                                                    nActualPairs,
+                                                                    params_.idealConditions_,
+                                                                    params_.doClusterCut_,
+                                                                    params_.doZ0Cut_,
+                                                                    params_.doPtCut_,
+                                                                    params_.maxNumberOfDoublets_);
+  cudaCheck(cudaGetLastError());
+
+#ifdef GPU_DEBUG
+  cudaDeviceSynchronize();
+  cudaCheck(cudaGetLastError());
+#endif
+}
+
+template <>
+void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
+  // Enqueue the track classification and cleaning kernels on cudaStream. These are pointers on the GPU!
+  auto const *tuples_d = &tracks_d->hitIndices;
+  auto *quality_d = tracks_d->qualityData();
+
+  auto blockSize = 64;
+
+  // classify tracks based on kinematics
+  auto numberOfBlocks = nQuadrupletBlocks(blockSize);
+  kernel_classifyTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, tracks_d, params_.cuts_, quality_d);
+  cudaCheck(cudaGetLastError());
+
+  if (params_.lateFishbone_) {
+    // apply fishbone cleaning to good tracks
+    numberOfBlocks = nDoubletBlocks(blockSize);
+    kernel_fishboneCleaner<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+        device_theCells_.get(), device_nCells_, quality_d);
+    cudaCheck(cudaGetLastError());
+  }
+
+  // remove duplicates (tracks that share a doublet)
+  numberOfBlocks = nDoubletBlocks(blockSize);
+  kernel_fastDuplicateRemover<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+      device_theCells_.get(), device_nCells_, tuples_d, tracks_d);
+  cudaCheck(cudaGetLastError());
+
+  if (params_.minHitsPerNtuplet_ < 4 || params_.doStats_) {
+    // fill hit->track "map" (its consumers are the triplet cleaner and the statistics kernels below)
+    numberOfBlocks = nQuadrupletBlocks(blockSize);
+    kernel_countHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+        tuples_d, quality_d, device_hitToTuple_.get());
+    cudaCheck(cudaGetLastError());
+    cms::cuda::launchFinalize(device_hitToTuple_.get(), cudaStream);
+    cudaCheck(cudaGetLastError());
+    kernel_fillHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, quality_d, device_hitToTuple_.get());
+    cudaCheck(cudaGetLastError());
+  }
+  if (params_.minHitsPerNtuplet_ < 4) {
+    // remove duplicates (tracks that share a hit)
+    numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize;
+    kernel_tripletCleaner<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+        hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get());
+    cudaCheck(cudaGetLastError());
+  }
+
+  if (params_.doStats_) {
+    auto nhits = hh.nHits();
+    numberOfBlocks = (std::max(nhits, params_.maxNumberOfDoublets_) + blockSize - 1) / blockSize;
+    kernel_checkOverflows<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d,
+                                                                        device_tupleMultiplicity_.get(),
+                                                                        device_hitToTuple_.get(),
+                                                                        device_hitTuple_apc_,
+                                                                        device_theCells_.get(),
+                                                                        device_nCells_,
+                                                                        device_theCellNeighbors_.get(),
+                                                                        device_theCellTracks_.get(),
+                                                                        device_isOuterHitOfCell_.get(),
+                                                                        nhits,
+                                                                        params_.maxNumberOfDoublets_,
+                                                                        counters_);
+    cudaCheck(cudaGetLastError());
+  }
+
+  if (params_.doStats_) {
+    // counters (add flag???)
+    numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize;
+    kernel_doStatsForHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(device_hitToTuple_.get(), counters_);
+    cudaCheck(cudaGetLastError());
+    numberOfBlocks = nQuadrupletBlocks(blockSize);  // shared helper, same value as the explicit formula
+    kernel_doStatsForTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, quality_d, counters_);
+    cudaCheck(cudaGetLastError());
+  }
+#ifdef GPU_DEBUG
+  cudaDeviceSynchronize();
+  cudaCheck(cudaGetLastError());
+#endif
+
+#ifdef DUMP_GPU_TK_TUPLES
+  static std::atomic<int> iev(0);
+  ++iev;
+  kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>(
+      hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev);
+#endif
+}
+
+template <>
+void CAHitNtupletGeneratorKernelsGPU::printCounters(Counters const *counters) {
+  kernel_printCounters<<<1, 1>>>(counters);  // one thread, no stream given; NOTE(review): launch is not checked
+}
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
new file mode 100644
index 0000000000000..d1a9f3d13a67f
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
@@ -0,0 +1,223 @@
+#ifndef RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h
+#define RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h
+
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "GPUCACell.h"
+
+// #define DUMP_GPU_TK_TUPLES
+
+namespace cAHitNtupletGenerator {
+
+  // counters
+  struct Counters {  // plain aggregate of 64-bit unsigned totals; a pointer to it is given to the doStats_ kernels
+    unsigned long long nEvents;
+    unsigned long long nHits;
+    unsigned long long nCells;
+    unsigned long long nTuples;
+    unsigned long long nFitTracks;
+    unsigned long long nGoodTracks;
+    unsigned long long nUsedHits;
+    unsigned long long nDupHits;
+    unsigned long long nKilledCells;
+    unsigned long long nEmptyCells;
+    unsigned long long nZeroTrackCells;
+  };
+
+  using HitsView = TrackingRecHit2DSOAView;
+  using HitsOnGPU = TrackingRecHit2DSOAView;
+
+  using HitToTuple = caConstants::HitToTuple;
+  using TupleMultiplicity = caConstants::TupleMultiplicity;
+
+  using Quality = pixelTrack::Quality;
+  using TkSoA = pixelTrack::TrackSoA;
+  using HitContainer = pixelTrack::HitContainer;
+
+  struct QualityCuts {
+    // chi2 cut = chi2Scale * (chi2Coeff[0] + pT/GeV * (chi2Coeff[1] + pT/GeV * (chi2Coeff[2] + pT/GeV * chi2Coeff[3])))
+    float chi2Coeff[4];  // coefficients of the cubic polynomial in pT used in the formula above
+    float chi2MaxPt;  // GeV
+    float chi2Scale;  // overall factor multiplying the polynomial in the formula above
+
+    struct Region {
+      float maxTip;  // cm
+      float minPt;   // GeV
+      float maxZip;  // cm
+    };
+
+    Region triplet;     // regional cuts for triplets
+    Region quadruplet;  // regional cuts for quadruplets
+  };
+
+  // params
+  struct Params {
+    Params(bool onGPU,
+           uint32_t minHitsPerNtuplet,
+           uint32_t maxNumberOfDoublets,
+           bool useRiemannFit,
+           bool fit5as4,
+           bool includeJumpingForwardDoublets,
+           bool earlyFishbone,
+           bool lateFishbone,
+           bool idealConditions,
+           bool doStats,
+           bool doClusterCut,
+           bool doZ0Cut,
+           bool doPtCut,
+           float ptmin,
+           float CAThetaCutBarrel,
+           float CAThetaCutForward,
+           float hardCurvCut,
+           float dcaCutInnerTriplet,
+           float dcaCutOuterTriplet,
+           QualityCuts const& cuts)
+        : onGPU_(onGPU),
+          minHitsPerNtuplet_(minHitsPerNtuplet),
+          maxNumberOfDoublets_(maxNumberOfDoublets),
+          useRiemannFit_(useRiemannFit),
+          fit5as4_(fit5as4),
+          includeJumpingForwardDoublets_(includeJumpingForwardDoublets),
+          earlyFishbone_(earlyFishbone),
+          lateFishbone_(lateFishbone),
+          idealConditions_(idealConditions),
+          doStats_(doStats),
+          doClusterCut_(doClusterCut),
+          doZ0Cut_(doZ0Cut),
+          doPtCut_(doPtCut),
+          ptmin_(ptmin),
+          CAThetaCutBarrel_(CAThetaCutBarrel),
+          CAThetaCutForward_(CAThetaCutForward),
+          hardCurvCut_(hardCurvCut),
+          dcaCutInnerTriplet_(dcaCutInnerTriplet),
+          dcaCutOuterTriplet_(dcaCutOuterTriplet),
+          cuts_(cuts) {}
+    // Scalar settings, each copied from the constructor argument of the same name; const, so fixed afterwards.
+    const bool onGPU_;
+    const uint32_t minHitsPerNtuplet_;
+    const uint32_t maxNumberOfDoublets_;
+    const bool useRiemannFit_;
+    const bool fit5as4_;
+    const bool includeJumpingForwardDoublets_;
+    const bool earlyFishbone_;
+    const bool lateFishbone_;
+    const bool idealConditions_;
+    const bool doStats_;
+    const bool doClusterCut_;
+    const bool doZ0Cut_;
+    const bool doPtCut_;
+    const float ptmin_;
+    const float CAThetaCutBarrel_;
+    const float CAThetaCutForward_;
+    const float hardCurvCut_;
+    const float dcaCutInnerTriplet_;
+    const float dcaCutOuterTriplet_;
+
+    // quality cuts; the constructor above always replaces the defaults listed here with its `cuts` argument
+    QualityCuts cuts_{// polynomial coefficients for the pT-dependent chi2 cut
+                      {0.68177776, 0.74609577, -0.08035491, 0.00315399},
+                      // max pT used to determine the chi2 cut
+                      10.,
+                      // chi2 scale factor: 30 for broken line fit, 45 for Riemann fit
+                      30.,
+                      // regional cuts for triplets
+                      {
+                          0.3,  // |Tip| < 0.3 cm
+                          0.5,  // pT > 0.5 GeV
+                          12.0  // |Zip| < 12.0 cm
+                      },
+                      // regional cuts for quadruplets
+                      {
+                          0.5,  // |Tip| < 0.5 cm
+                          0.3,  // pT > 0.3 GeV
+                          12.0  // |Zip| < 12.0 cm
+                      }};
+
+  };  // Params
+
+}  // namespace cAHitNtupletGenerator
+
+template <typename TTraits>
+class CAHitNtupletGeneratorKernels {
+public:
+  using Traits = TTraits;
+  // Traits selects the back-end: it provides the unique_ptr flavour of the workspace and the hit collection type.
+  using QualityCuts = cAHitNtupletGenerator::QualityCuts;
+  using Params = cAHitNtupletGenerator::Params;
+  using Counters = cAHitNtupletGenerator::Counters;
+
+  template <typename T>
+  using unique_ptr = typename Traits::template unique_ptr<T>;
+
+  using HitsView = TrackingRecHit2DSOAView;
+  using HitsOnGPU = TrackingRecHit2DSOAView;
+  using HitsOnCPU = TrackingRecHit2DHeterogeneous<Traits>;
+
+  using HitToTuple = caConstants::HitToTuple;
+  using TupleMultiplicity = caConstants::TupleMultiplicity;
+
+  using Quality = pixelTrack::Quality;
+  using TkSoA = pixelTrack::TrackSoA;
+  using HitContainer = pixelTrack::HitContainer;
+
+  CAHitNtupletGeneratorKernels(Params const& params)
+      : params_(params), paramsMaxDoubletes3Quarters_(3 * params.maxNumberOfDoublets_ / 4) {}
+  ~CAHitNtupletGeneratorKernels() = default;
+
+  TupleMultiplicity const* tupleMultiplicity() const { return device_tupleMultiplicity_.get(); }
+
+  void launchKernels(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream);
+
+  void classifyTuples(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream);
+
+  void fillHitDetIndices(HitsView const* hv, TkSoA* tuples_d, cudaStream_t cudaStream);
+
+  void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream);
+  void allocateOnGPU(cudaStream_t stream);
+  void cleanup(cudaStream_t cudaStream);
+
+  static void printCounters(Counters const* counters);
+  void setCounters(Counters* counters) { counters_ = counters; }
+
+private:
+  Counters* counters_ = nullptr;
+
+  // workspace; the two *Container_ pointers are views into cellStorage_ and are set by buildDoublets()
+  unique_ptr<unsigned char[]> cellStorage_;
+  unique_ptr<caConstants::CellNeighborsVector> device_theCellNeighbors_;
+  caConstants::CellNeighbors* device_theCellNeighborsContainer_ = nullptr;
+  unique_ptr<caConstants::CellTracksVector> device_theCellTracks_;
+  caConstants::CellTracks* device_theCellTracksContainer_ = nullptr;
+
+  unique_ptr<GPUCACell[]> device_theCells_;
+  unique_ptr<GPUCACell::OuterHitOfCell[]> device_isOuterHitOfCell_;
+  uint32_t* device_nCells_ = nullptr;
+
+  unique_ptr<HitToTuple> device_hitToTuple_;
+  cms::cuda::AtomicPairCounter* device_hitToTuple_apc_ = nullptr;
+
+  cms::cuda::AtomicPairCounter* device_hitTuple_apc_ = nullptr;
+
+  unique_ptr<TupleMultiplicity> device_tupleMultiplicity_;
+
+  unique_ptr<cms::cuda::AtomicPairCounter::c_type[]> device_storage_;
+  // params (held by reference, not owned: the Params object must outlive this one)
+  Params const& params_;
+  /// Intermediate result avoiding repeated computations.
+  const uint32_t paramsMaxDoubletes3Quarters_;
+  /// Compute the number of doublet blocks for block size
+  inline uint32_t nDoubletBlocks(uint32_t blockSize) const {
+    // We want (3 * params_.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize, but first part is pre-computed.
+    return (paramsMaxDoubletes3Quarters_ + blockSize - 1) / blockSize;
+  }
+
+  /// Compute the number of quadruplet blocks for block size
+  inline uint32_t nQuadrupletBlocks(uint32_t blockSize) const {
+    // caConstants::maxNumberOfQuadruplets is a constexpr, so the compiler will pre compute the 3*max/4
+    return (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize;
+  }
+};
+
+using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels<cms::cudacompat::GPUTraits>;
+using CAHitNtupletGeneratorKernelsCPU = CAHitNtupletGeneratorKernels<cms::cudacompat::CPUTraits>;
+
+#endif  // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc
new file mode 100644
index 0000000000000..96381673388ca
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc
@@ -0,0 +1 @@
+#include "CAHitNtupletGeneratorKernelsAlloc.h"
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cu
new file mode 100644
index 0000000000000..96381673388ca
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cu
@@ -0,0 +1 @@
+#include "CAHitNtupletGeneratorKernelsAlloc.h"
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h
new file mode 100644
index 0000000000000..1d19aa43d6e1b
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h
@@ -0,0 +1,35 @@
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+#include "CAHitNtupletGeneratorKernels.h"
+
+// Explicit specialization of allocateOnGPU.
+// This header is included from both a .cc and a .cu file: when compiled by the CUDA compiler
+// (__CUDACC__ defined) it defines the GPU variant, otherwise the CPU variant. The body is shared.
+//
+// It allocates the intermediate buffers through Traits::make_unique on the given stream and
+// zeroes the cell counter and the two histogram containers.
+template <>
+#ifdef __CUDACC__
+void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(cudaStream_t stream) {
+#else
+void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) {
+#endif
+  //////////////////////////////////////////////////////////
+  // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER)
+  //////////////////////////////////////////////////////////
+
+  device_theCellNeighbors_ = Traits::template make_unique<caConstants::CellNeighborsVector>(stream);
+  device_theCellTracks_ = Traits::template make_unique<caConstants::CellTracksVector>(stream);
+
+  device_hitToTuple_ = Traits::template make_unique<HitToTuple>(stream);
+
+  device_tupleMultiplicity_ = Traits::template make_unique<TupleMultiplicity>(stream);
+
+  // One buffer of three c_type words backs the two atomic pair counters and the cell counter.
+  device_storage_ = Traits::template make_unique<cms::cuda::AtomicPairCounter::c_type[]>(3, stream);
+
+  // Slot 0 -> hitTuple counter, slot 1 -> hitToTuple counter, slot 2 -> number of cells.
+  // NOTE(review): the second line casts before adding 1, so it steps in units of
+  // AtomicPairCounter rather than c_type; this presumably relies on both having the same size.
+  device_hitTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get();
+  device_hitToTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get() + 1;
+  device_nCells_ = (uint32_t*)(device_storage_.get() + 2);
+
+  // Only the cell counter is zeroed here; the two pair counters are not initialised in this function.
+  // With GPUTraits the reset is enqueued on the stream, otherwise the memory is written directly.
+  if constexpr (std::is_same<Traits, cms::cudacompat::GPUTraits>::value) {
+    cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream));
+  } else {
+    *device_nCells_ = 0;
+  }
+  cms::cuda::launchZero(device_tupleMultiplicity_.get(), stream);
+  cms::cuda::launchZero(device_hitToTuple_.get(), stream);  // we may wish to keep it in the edm...
+}
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
new file mode 100644
index 0000000000000..7c0cec51b8057
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -0,0 +1,593 @@
+//
+// Original Author: Felice Pantaleo, CERN
+//
+
+// #define NTUPLE_DEBUG
+
+#include <cmath>
+#include <cstdint>
+
+#include <cuda_runtime.h>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
+
+#include "CAConstants.h"
+#include "CAHitNtupletGeneratorKernels.h"
+#include "GPUCACell.h"
+#include "gpuFishbone.h"
+#include "gpuPixelDoublets.h"
+
+// Short file-level aliases used by the kernels below.
+
+// Hit collection types.
+using HitsOnGPU = TrackingRecHit2DSOAView;
+using HitsOnCPU = TrackingRecHit2DCUDA;
+
+// Histogram container types declared in caConstants.
+using HitToTuple = caConstants::HitToTuple;
+using TupleMultiplicity = caConstants::TupleMultiplicity;
+
+// Track-related types declared in pixelTrack.
+using Quality = pixelTrack::Quality;
+using TkSoA = pixelTrack::TrackSoA;
+using HitContainer = pixelTrack::HitContainer;
+
+// Per-event bookkeeping kernel: accumulates the statistics counters and reports, via printf,
+// every container that reached its capacity.
+//
+// - The thread with global index 0 adds this event's totals to `counters` and prints the
+//   event-wide overflow messages.
+// - All threads then scan the first *nCells entries of `cells` and the first nHits entries of
+//   `isOuterHitOfCell`, each thread advancing by the total number of launched threads.
+// - `cellNeighbors` and `cellTracks` may be null: they are inspected only when non-null.
+// - When NTUPLE_DEBUG is defined, extra consistency asserts on `foundNtuplets` are compiled in.
+//
+// Parameters:
+//   foundNtuplets       tuple container (only read in the NTUPLE_DEBUG section)
+//   tupleMultiplicity   its size() is accumulated into counters->nFitTracks
+//   hitToTuple          used to count cells whose two hits both have no associated entry
+//   apc                 pair counter; get().m is accumulated as the number of tuples
+//   cells, nCells       cell array and pointer to the number of valid cells
+//   isOuterHitOfCell    per-hit containers checked for full()
+//   nHits               number of hits
+//   maxNumberOfDoublets capacity against which *nCells is compared
+//   counters            accumulated statistics (updated with atomicAdd)
+__global__ void kernel_checkOverflows(HitContainer const *foundNtuplets,
+                                      caConstants::TupleMultiplicity const *tupleMultiplicity,
+                                      CAHitNtupletGeneratorKernelsGPU::HitToTuple const *hitToTuple,
+                                      cms::cuda::AtomicPairCounter *apc,
+                                      GPUCACell const *__restrict__ cells,
+                                      uint32_t const *__restrict__ nCells,
+                                      gpuPixelDoublets::CellNeighborsVector const *cellNeighbors,
+                                      gpuPixelDoublets::CellTracksVector const *cellTracks,
+                                      GPUCACell::OuterHitOfCell const *__restrict__ isOuterHitOfCell,
+                                      uint32_t nHits,
+                                      uint32_t maxNumberOfDoublets,
+                                      CAHitNtupletGeneratorKernelsGPU::Counters *counters) {
+  auto first = threadIdx.x + blockIdx.x * blockDim.x;
+
+  auto &c = *counters;
+  // counters once per event
+  if (0 == first) {
+    atomicAdd(&c.nEvents, 1);
+    atomicAdd(&c.nHits, nHits);
+    atomicAdd(&c.nCells, *nCells);
+    atomicAdd(&c.nTuples, apc->get().m);
+    atomicAdd(&c.nFitTracks, tupleMultiplicity->size());
+  }
+
+#ifdef NTUPLE_DEBUG
+  if (0 == first) {
+    printf("number of found cells %d, found tuples %d with total hits %d out of %d\n",
+           *nCells,
+           apc->get().m,
+           apc->get().n,
+           nHits);
+    // maxNumberOfQuadruplets is a constant, not a function (same usage as the overflow check
+    // below): calling it with () made the debug build fail to compile.
+    if (apc->get().m < caConstants::maxNumberOfQuadruplets) {
+      assert(foundNtuplets->size(apc->get().m) == 0);
+      assert(foundNtuplets->size() == apc->get().n);
+    }
+  }
+
+  // every tuple must hold fewer than 6 hits, and every stored hit index must be valid
+  for (int idx = first, nt = foundNtuplets->nbins(); idx < nt; idx += gridDim.x * blockDim.x) {
+    if (foundNtuplets->size(idx) > 5)
+      printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx));
+    assert(foundNtuplets->size(idx) < 6);
+    for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih)
+      assert(*ih < nHits);
+  }
+#endif
+
+  // event-wide overflow report, printed by a single thread
+  if (0 == first) {
+    if (apc->get().m >= caConstants::maxNumberOfQuadruplets)
+      printf("Tuples overflow\n");
+    if (*nCells >= maxNumberOfDoublets)
+      printf("Cells overflow\n");
+    if (cellNeighbors && cellNeighbors->full())
+      printf("cellNeighbors overflow\n");
+    if (cellTracks && cellTracks->full())
+      printf("cellTracks overflow\n");
+  }
+
+  // per-cell overflow report and cell statistics
+  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
+    auto const &thisCell = cells[idx];
+    if (thisCell.outerNeighbors().full())  //++tooManyNeighbors[thisCell.theLayerPairId];
+      printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.layerPairId());
+    if (thisCell.tracks().full())  //++tooManyTracks[thisCell.theLayerPairId];
+      printf("Tracks overflow %d in %d\n", idx, thisCell.layerPairId());
+    if (thisCell.isKilled())
+      atomicAdd(&c.nKilledCells, 1);
+    if (thisCell.unused())
+      atomicAdd(&c.nEmptyCells, 1);
+    if (0 == hitToTuple->size(thisCell.inner_hit_id()) && 0 == hitToTuple->size(thisCell.outer_hit_id()))
+      atomicAdd(&c.nZeroTrackCells, 1);
+  }
+
+  // per-hit overflow report
+  for (int idx = first, nt = nHits; idx < nt; idx += gridDim.x * blockDim.x) {
+    if (isOuterHitOfCell[idx].full())  // ++tooManyOuterHitOfCell;
+      printf("OuterHitOfCell overflow %d\n", idx);
+  }
+}
+
+// Sets the quality of every track attached to a killed cell to Quality::bad.
+// Each thread starts at its global index and advances by the total number of launched threads.
+__global__ void kernel_fishboneCleaner(GPUCACell const *cells, uint32_t const *__restrict__ nCells, Quality *quality) {
+  auto const firstCell = threadIdx.x + blockIdx.x * blockDim.x;
+  auto const gridStride = gridDim.x * blockDim.x;
+  int const numCells = *nCells;
+
+  for (int cellIdx = firstCell; cellIdx < numCells; cellIdx += gridStride) {
+    auto const &cell = cells[cellIdx];
+    if (cell.isKilled()) {
+      for (auto trackIdx : cell.tracks())
+        quality[trackIdx] = pixelTrack::Quality::bad;
+    }
+  }
+}
+
+// For every cell that has at least two tracks, keeps the tracks with the largest number of
+// hits and flags all the others as Quality::dup.
+__global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells,
+                                             uint32_t const *__restrict__ nCells,
+                                             HitContainer *foundNtuplets,
+                                             Quality *quality) {
+  assert(nCells);
+
+  auto const firstCell = threadIdx.x + blockIdx.x * blockDim.x;
+  auto const gridStride = gridDim.x * blockDim.x;
+  int const numCells = *nCells;
+
+  for (int cellIdx = firstCell; cellIdx < numCells; cellIdx += gridStride) {
+    auto const &cell = cells[cellIdx];
+
+    // nothing to disambiguate unless at least two tracks are attached to this cell
+    if (cell.tracks().size() >= 2) {
+      // first pass: largest hit multiplicity among the tracks of this cell
+      uint32_t longest = 0;
+      for (auto trackIdx : cell.tracks()) {
+        uint32_t const nHitsOnTrack = foundNtuplets->size(trackIdx);
+        if (longest < nHitsOnTrack)
+          longest = nHitsOnTrack;
+      }
+
+      // second pass: anything shorter is a duplicate
+      // (several threads may write the same constant: harmless)
+      for (auto trackIdx : cell.tracks()) {
+        if (foundNtuplets->size(trackIdx) != longest)
+          quality[trackIdx] = pixelTrack::Quality::dup;
+      }
+    }
+  }
+}
+
+// For every cell that has at least two tracks, selects the 'loose' track with the smallest
+// |tip| (below the initial threshold of 10000) and flags every other track that is not 'bad'
+// as Quality::dup.
+__global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells,
+                                            uint32_t const *__restrict__ nCells,
+                                            HitContainer const *__restrict__ foundNtuplets,
+                                            TkSoA *__restrict__ tracks) {
+  assert(nCells);
+
+  auto const firstCell = threadIdx.x + blockIdx.x * blockDim.x;
+  auto const gridStride = gridDim.x * blockDim.x;
+  int const numCells = *nCells;
+
+  for (int cellIdx = firstCell; cellIdx < numCells; cellIdx += gridStride) {
+    auto const &cell = cells[cellIdx];
+    if (cell.tracks().size() < 2)
+      continue;
+
+    float bestScore = 10000.f;
+    uint16_t bestTrack = 60000;  // sentinel: no track selected
+
+    // pick the loose track with the smallest absolute transverse impact parameter
+    for (auto trackIdx : cell.tracks()) {
+      if (tracks->quality(trackIdx) != pixelTrack::Quality::loose)
+        continue;
+      auto const absTip = std::abs(tracks->tip(trackIdx));
+      if (absTip < bestScore) {
+        bestScore = absTip;
+        bestTrack = trackIdx;
+      }
+    }
+
+    // everything else that is not already bad becomes a duplicate
+    // (several threads may write the same constant: harmless)
+    for (auto trackIdx : cell.tracks()) {
+      bool const isBad = tracks->quality(trackIdx) == pixelTrack::Quality::bad;
+      if (!isBad && trackIdx != bestTrack)
+        tracks->quality(trackIdx) = pixelTrack::Quality::dup;
+    }
+  }
+}
+
+// Links compatible cells: for each cell, tests the cells listed in
+// isOuterHitOfCell[inner hit of the cell] and, when the pair passes the alignment and DCA cuts,
+// records the cell as an outer neighbour of the other one.
+//
+// Thread mapping (2D launch):
+//   - cells are distributed along y (threadIdx.y / blockIdx.y, stride gridDim.y * blockDim.y);
+//   - the threads along x of a block work on the same cell and split its candidate list
+//     (start threadIdx.x, stride blockDim.x).
+//
+// Parameters:
+//   apc1, apc2          pair counters; only reset to 0 here, for use by a later kernel
+//   hhp                 hits, passed to the cell accessors
+//   cells, nCells       cell array (modified) and pointer to the number of valid cells
+//   cellNeighbors       storage handed to addOuterNeighbor
+//   isOuterHitOfCell    per-hit list of candidate cell indices
+//   hardCurvCut, ptmin, CAThetaCutBarrel, CAThetaCutForward,
+//   dcaCutInnerTriplet, dcaCutOuterTriplet
+//                       cut values forwarded to areAlignedRZ / dcaCut
+__global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1,
+                               cms::cuda::AtomicPairCounter *apc2,  // just to zero them,
+                               GPUCACell::Hits const *__restrict__ hhp,
+                               GPUCACell *cells,
+                               uint32_t const *__restrict__ nCells,
+                               gpuPixelDoublets::CellNeighborsVector *cellNeighbors,
+                               GPUCACell::OuterHitOfCell const *__restrict__ isOuterHitOfCell,
+                               float hardCurvCut,
+                               float ptmin,
+                               float CAThetaCutBarrel,
+                               float CAThetaCutForward,
+                               float dcaCutInnerTriplet,
+                               float dcaCutOuterTriplet) {
+  auto const &hh = *hhp;
+
+  // y index selects the cell, x index selects the candidate neighbour within that cell
+  auto firstCellIndex = threadIdx.y + blockIdx.y * blockDim.y;
+  auto first = threadIdx.x;
+  auto stride = blockDim.x;
+
+  // the thread with both indices equal to 0 resets the two counters
+  if (0 == (firstCellIndex + first)) {
+    (*apc1) = 0;
+    (*apc2) = 0;
+  }  // ready for next kernel
+
+  for (int idx = firstCellIndex, nt = (*nCells); idx < nt; idx += gridDim.y * blockDim.y) {
+    auto cellIndex = idx;
+    auto &thisCell = cells[idx];
+    auto innerHitId = thisCell.inner_hit_id();
+    // candidate list: the cell indices stored for the inner hit of this cell
+    // (presumably the cells whose outer hit is this hit - confirm against the code filling it)
+    int numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size();
+    auto vi = isOuterHitOfCell[innerHitId].data();
+
+    // r/z of both ends of this cell, computed once per cell
+    auto ri = thisCell.inner_r(hh);
+    auto zi = thisCell.inner_z(hh);
+
+    auto ro = thisCell.outer_r(hh);
+    auto zo = thisCell.outer_z(hh);
+    // selects which of the two theta cuts is applied below
+    auto isBarrel = thisCell.inner_detIndex(hh) < caConstants::last_barrel_detIndex;
+
+    for (int j = first; j < numberOfPossibleNeighbors; j += stride) {
+      // read the candidate index through the read-only data cache
+      auto otherCell = __ldg(vi + j);
+      auto &oc = cells[otherCell];
+      auto r1 = oc.inner_r(hh);
+      auto z1 = oc.inner_z(hh);
+      // alignment in the r-z plane of the candidate's inner point with this cell's two points
+      bool aligned = GPUCACell::areAlignedRZ(
+          r1,
+          z1,
+          ri,
+          zi,
+          ro,
+          zo,
+          ptmin,
+          isBarrel ? CAThetaCutBarrel : CAThetaCutForward);  // 2.f*thetaCut); // FIXME tune cuts
+      // the DCA cut value depends on where the candidate's inner hit sits
+      if (aligned && thisCell.dcaCut(hh,
+                                     oc,
+                                     oc.inner_detIndex(hh) < caConstants::last_bpix1_detIndex ? dcaCutInnerTriplet
+                                                                                              : dcaCutOuterTriplet,
+                                     hardCurvCut)) {  // FIXME tune cuts
+        // record this cell as an outer neighbour of the candidate and flag both as used
+        oc.addOuterNeighbor(cellIndex, *cellNeighbors);
+        thisCell.setUsedBit(1);
+        oc.setUsedBit(1);
+      }
+    }  // loop on inner cells
+  }    // loop on outer cells
+}
+
+// Starts the n-tuplet search (GPUCACell::find_ntuplets) from every cell that is not killed
+// and whose layer-pair id qualifies as a starting point:
+//   - minHitsPerNtuplet > 3 : layer-pair id below 3;
+//   - otherwise             : layer-pair id below 8 or above 12.
+__global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp,
+                                     GPUCACell *__restrict__ cells,
+                                     uint32_t const *nCells,
+                                     gpuPixelDoublets::CellTracksVector *cellTracks,
+                                     HitContainer *foundNtuplets,
+                                     cms::cuda::AtomicPairCounter *apc,
+                                     Quality *__restrict__ quality,
+                                     unsigned int minHitsPerNtuplet) {
+  // the search itself is recursive: not obvious to widen
+  auto const &hits = *hhp;
+
+  auto const firstCell = threadIdx.x + blockIdx.x * blockDim.x;
+  auto const gridStride = gridDim.x * blockDim.x;
+  int const numCells = *nCells;
+
+  for (int cellIdx = firstCell; cellIdx < numCells; cellIdx += gridStride) {
+    auto const &startCell = cells[cellIdx];
+
+    // cells removed by the early fishbone are never used as a starting point
+    if (startCell.isKilled())
+      continue;
+
+    auto const pairId = startCell.layerPairId();
+    bool const isStartingPair = (minHitsPerNtuplet > 3) ? (pairId < 3) : (pairId < 8 || pairId > 12);
+    if (!isStartingPair)
+      continue;
+
+    GPUCACell::TmpTuple tmpStack;
+    tmpStack.reset();
+    startCell.find_ntuplets(
+        hits, cells, *cellTracks, *foundNtuplets, *apc, quality, tmpStack, minHitsPerNtuplet, pairId < 3);
+    // the search must leave the temporary stack empty
+    assert(tmpStack.empty());
+  }
+}
+
+// Sets used-bit 2 on every cell that has at least one track attached.
+// The hhp argument is not read by this kernel.
+__global__ void kernel_mark_used(GPUCACell::Hits const *__restrict__ hhp,
+                                 GPUCACell *__restrict__ cells,
+                                 uint32_t const *nCells) {
+  auto const firstCell = threadIdx.x + blockIdx.x * blockDim.x;
+  auto const gridStride = gridDim.x * blockDim.x;
+  int const numCells = *nCells;
+
+  for (int cellIdx = firstCell; cellIdx < numCells; cellIdx += gridStride) {
+    auto &cell = cells[cellIdx];
+    bool const hasTracks = !cell.tracks().empty();
+    if (hasTracks)
+      cell.setUsedBit(2);
+  }
+}
+
+// Counting pass of the tupleMultiplicity fill: for every tuple with at least 3 hits that is
+// not flagged as duplicate, counts one entry in the bin given by its number of hits.
+__global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets,
+                                         Quality const *__restrict__ quality,
+                                         caConstants::TupleMultiplicity *tupleMultiplicity) {
+  auto const firstTuple = blockIdx.x * blockDim.x + threadIdx.x;
+  auto const gridStride = gridDim.x * blockDim.x;
+  int const nTuples = foundNtuplets->nbins();
+
+  for (int tk = firstTuple; tk < nTuples; tk += gridStride) {
+    auto const nHitsInTuple = foundNtuplets->size(tk);
+    // skip tuples that are too short and duplicates
+    if (nHitsInTuple < 3 || quality[tk] == pixelTrack::Quality::dup)
+      continue;
+
+    // at this stage every other tuple is expected to still be 'bad'
+    assert(quality[tk] == pixelTrack::Quality::bad);
+    if (nHitsInTuple > 5)
+      printf("wrong mult %d %d\n", tk, nHitsInTuple);
+    assert(nHitsInTuple < 8);
+
+    tupleMultiplicity->countDirect(nHitsInTuple);
+  }
+}
+
+// Filling pass of the tupleMultiplicity fill: for every tuple with at least 3 hits that is
+// not flagged as duplicate, stores the tuple index in the bin given by its number of hits.
+__global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets,
+                                        Quality const *__restrict__ quality,
+                                        caConstants::TupleMultiplicity *tupleMultiplicity) {
+  auto const firstTuple = blockIdx.x * blockDim.x + threadIdx.x;
+  auto const gridStride = gridDim.x * blockDim.x;
+  int const nTuples = foundNtuplets->nbins();
+
+  for (int tk = firstTuple; tk < nTuples; tk += gridStride) {
+    auto const nHitsInTuple = foundNtuplets->size(tk);
+    // skip tuples that are too short and duplicates
+    if (nHitsInTuple < 3 || quality[tk] == pixelTrack::Quality::dup)
+      continue;
+
+    // at this stage every other tuple is expected to still be 'bad'
+    assert(quality[tk] == pixelTrack::Quality::bad);
+    if (nHitsInTuple > 5)
+      printf("wrong mult %d %d\n", tk, nHitsInTuple);
+    assert(nHitsInTuple < 8);
+
+    tupleMultiplicity->fillDirect(nHitsInTuple, tk);
+  }
+}
+
+// Promotes fitted tuples to Quality::loose when they pass the chi2 cut and the region cuts.
+// A tuple keeps its current quality when it is a duplicate, has fewer than 3 hits, has a NaN
+// among the five fitted state parameters, or fails one of the cuts.
+// The scan of a thread stops at the first empty tuple it meets.
+__global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
+                                      TkSoA const *__restrict__ tracks,
+                                      CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts,
+                                      Quality *__restrict__ quality) {
+  int const firstTuple = blockDim.x * blockIdx.x + threadIdx.x;
+  auto const gridStride = gridDim.x * blockDim.x;
+  int const nTuples = tuples->nbins();
+
+  for (int tk = firstTuple; tk < nTuples; tk += gridStride) {
+    auto const nHitsInTuple = tuples->size(tk);
+    if (nHitsInTuple == 0)
+      break;  // guard: empty tuple
+
+    // duplicates were not even fitted
+    if (quality[tk] == pixelTrack::Quality::dup)
+      continue;
+
+    assert(quality[tk] == pixelTrack::Quality::bad);
+
+    // doublets stay bad
+    if (nHitsInTuple < 3)
+      continue;
+
+    // a fit with any invalid parameter stays bad
+    bool fitHasNaN = false;
+    for (int par = 0; par < 5; ++par) {
+      fitHasNaN |= std::isnan(tracks->stateAtBS.state(tk)(par));
+    }
+    if (fitHasNaN) {
+#ifdef NTUPLE_DEBUG
+      printf("NaN in fit %d size %d chi2 %f\n", tk, tuples->size(tk), tracks->chi2(tk));
+#endif
+      continue;
+    }
+
+    // pT-dependent chi2 cut: a cubic polynomial in min(pT, chi2MaxPt), scaled by chi2Scale.
+    // Documented defaults (see CAHitNtupletGeneratorGPU.cc):
+    //   - chi2MaxPt = 10 GeV
+    //   - chi2Coeff = { 0.68177776, 0.74609577, -0.08035491, 0.00315399 }
+    //   - chi2Scale = 30 for broken line fit, 45 for Riemann fit
+    float const ptForCut = std::min<float>(tracks->pt(tk), cuts.chi2MaxPt);
+    float const chi2Cut =
+        cuts.chi2Scale *
+        (cuts.chi2Coeff[0] + ptForCut * (cuts.chi2Coeff[1] + ptForCut * (cuts.chi2Coeff[2] + ptForCut * cuts.chi2Coeff[3])));
+    // the numbers above were derived for non-normalised quadruplet chi2, so for the time being
+    // the chi2 is simply multiplied by the quadruplet ndof (triplets to be understood)
+    if (3.f * tracks->chi2(tk) >= chi2Cut) {
+#ifdef NTUPLE_DEBUG
+      printf("Bad fit %d size %d pt %f eta %f chi2 %f\n",
+             tk,
+             tuples->size(tk),
+             tracks->pt(tk),
+             tracks->eta(tk),
+             3.f * tracks->chi2(tk));
+#endif
+      continue;
+    }
+
+    // region cuts on the fit results (Tip, pT, Zip); documented defaults:
+    //   - for triplets:    |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm
+    //   - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm
+    // (see CAHitNtupletGeneratorGPU.cc)
+    auto const &region = (nHitsInTuple > 3) ? cuts.quadruplet : cuts.triplet;
+    bool const tipOk = std::abs(tracks->tip(tk)) < region.maxTip;
+    bool const ptOk = tracks->pt(tk) > region.minPt;
+    bool const zipOk = std::abs(tracks->zip(tk)) < region.maxZip;
+
+    if (tipOk and ptOk and zipOk)
+      quality[tk] = pixelTrack::Quality::loose;
+  }
+}
+
+// Adds one to counters->nGoodTracks for every tuple whose quality is 'loose'.
+// The scan of a thread stops at the first empty tuple it meets.
+__global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples,
+                                        Quality const *__restrict__ quality,
+                                        CAHitNtupletGeneratorKernelsGPU::Counters *counters) {
+  int const firstTuple = blockDim.x * blockIdx.x + threadIdx.x;
+  auto const gridStride = gridDim.x * blockDim.x;
+  int const nTuples = tuples->nbins();
+
+  for (int tk = firstTuple; tk < nTuples; tk += gridStride) {
+    if (tuples->size(tk) == 0)
+      break;  // guard: empty tuple
+    if (quality[tk] == pixelTrack::Quality::loose)
+      atomicAdd(&(counters->nGoodTracks), 1);
+  }
+}
+
+// Counting pass of the hitToTuple fill: for every 'loose' tuple, counts one entry in the bin
+// of each of its hits. The scan of a thread stops at the first empty tuple it meets.
+__global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples,
+                                        Quality const *__restrict__ quality,
+                                        CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) {
+  int const firstTuple = blockDim.x * blockIdx.x + threadIdx.x;
+  auto const gridStride = gridDim.x * blockDim.x;
+  int const nTuples = tuples->nbins();
+
+  for (int tk = firstTuple; tk < nTuples; tk += gridStride) {
+    if (tuples->size(tk) == 0)
+      break;  // guard: empty tuple
+    if (quality[tk] != pixelTrack::Quality::loose)
+      continue;
+
+    auto const hitsEnd = tuples->end(tk);
+    for (auto hit = tuples->begin(tk); hit != hitsEnd; ++hit)
+      hitToTuple->countDirect(*hit);
+  }
+}
+
+// Filling pass of the hitToTuple fill: for every 'loose' tuple, stores the tuple index in the
+// bin of each of its hits. The scan of a thread stops at the first empty tuple it meets.
+__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples,
+                                       Quality const *__restrict__ quality,
+                                       CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) {
+  int const firstTuple = blockDim.x * blockIdx.x + threadIdx.x;
+  auto const gridStride = gridDim.x * blockDim.x;
+  int const nTuples = tuples->nbins();
+
+  for (int tk = firstTuple; tk < nTuples; tk += gridStride) {
+    if (tuples->size(tk) == 0)
+      break;  // guard: empty tuple
+    if (quality[tk] != pixelTrack::Quality::loose)
+      continue;
+
+    auto const hitsEnd = tuples->end(tk);
+    for (auto hit = tuples->begin(tk); hit != hitsEnd; ++hit)
+      hitToTuple->fillDirect(*hit, tk);
+  }
+}
+
+// Builds hitDetIndices as a copy of `tuples` in which every stored hit index is replaced by
+// the detector index of that hit: the offsets are copied as they are, the content is mapped
+// through hh.detectorIndex().
+__global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples,
+                                         TrackingRecHit2DSOAView const *__restrict__ hhp,
+                                         HitContainer *__restrict__ hitDetIndices) {
+  int const firstThread = blockDim.x * blockIdx.x + threadIdx.x;
+  auto const gridStride = gridDim.x * blockDim.x;
+
+  // step 1: copy the offsets
+  int const nOffsets = tuples->totbins();
+  for (int bin = firstThread; bin < nOffsets; bin += gridStride) {
+    hitDetIndices->off[bin] = tuples->off[bin];
+  }
+
+  // step 2: translate every stored hit index into its detector index
+  auto const &hits = *hhp;
+  auto const totalHits = hits.nHits();
+  int const nStored = tuples->size();
+  for (int slot = firstThread; slot < nStored; slot += gridStride) {
+    auto const hitIndex = tuples->bins[slot];
+    assert(hitIndex < totalHits);
+    hitDetIndices->bins[slot] = hits.detectorIndex(hitIndex);
+  }
+}
+
+// Statistics on hit usage: for every non-empty bin of hitToTuple adds one to nUsedHits, and
+// additionally one to nDupHits when the bin holds more than one entry.
+__global__ void kernel_doStatsForHitInTracks(CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ hitToTuple,
+                                             CAHitNtupletGeneratorKernelsGPU::Counters *counters) {
+  int const firstBin = blockDim.x * blockIdx.x + threadIdx.x;
+  auto const gridStride = gridDim.x * blockDim.x;
+  int const nBins = hitToTuple->nbins();
+
+  for (int bin = firstBin; bin < nBins; bin += gridStride) {
+    auto const nEntries = hitToTuple->size(bin);
+    // empty bins can appear anywhere: skip them, do NOT stop the scan
+    if (nEntries == 0)
+      continue;
+
+    atomicAdd(&counters->nUsedHits, 1);
+    if (nEntries > 1)
+      atomicAdd(&counters->nDupHits, 1);
+  }
+}
+
+// Resolves tracks that share a hit. For every bin of hitToTuple holding at least two entries:
+//   1. finds the largest number of hits among the listed tracks;
+//   2. flags every listed track with a different number of hits as 'dup';
+//   3. if that largest number is at most 3, keeps the listed track with the smallest |tip|
+//      among those that are not 'bad' and flags the other non-bad ones as 'dup'.
+//
+// Parameters:
+//   hhp          not read by this kernel
+//   ptuples      tuple container, used for the number of hits of each track
+//   ptracks      track parameters, used for tip()
+//   quality      per-track quality, read and updated
+//   phitToTuple  per-bin list of track indices
+//                (presumably one bin per hit - confirm against the code filling it)
+__global__ void kernel_tripletCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp,
+                                      HitContainer const *__restrict__ ptuples,
+                                      TkSoA const *__restrict__ ptracks,
+                                      Quality *__restrict__ quality,
+                                      CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
+  constexpr auto bad = pixelTrack::Quality::bad;
+  constexpr auto dup = pixelTrack::Quality::dup;
+  // constexpr auto loose = trackQuality::loose;
+
+  auto &hitToTuple = *phitToTuple;
+  auto const &foundNtuplets = *ptuples;
+  auto const &tracks = *ptracks;
+
+  int first = blockDim.x * blockIdx.x + threadIdx.x;
+  for (int idx = first, ntot = hitToTuple.nbins(); idx < ntot; idx += gridDim.x * blockDim.x) {
+    // nothing to resolve unless at least two tracks are listed in this bin
+    if (hitToTuple.size(idx) < 2)
+      continue;
+
+    float mc = 10000.f;    // smallest |tip| found so far
+    uint16_t im = 60000;   // index of the track with that |tip|; 60000 = none selected
+    uint32_t maxNh = 0;
+
+    // find maxNh
+    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
+      uint32_t nh = foundNtuplets.size(*it);
+      maxNh = std::max(nh, maxNh);
+    }
+    // kill all tracks shorter than maxNh (only triplets???)
+    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
+      uint32_t nh = foundNtuplets.size(*it);
+      if (maxNh != nh)
+        quality[*it] = dup;
+    }
+
+    // the tip-based selection below is applied only when the longest track has at most 3 hits
+    if (maxNh > 3)
+      continue;
+    // for triplets choose best tip!
+    // NOTE(review): the test is "!= bad", so a track already flagged 'dup' can still be
+    // selected here - confirm that this is intended.
+    for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
+      auto const it = *ip;
+      if (quality[it] != bad && std::abs(tracks.tip(it)) < mc) {
+        mc = std::abs(tracks.tip(it));
+        im = it;
+      }
+    }
+    // mark duplicates
+    for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
+      auto const it = *ip;
+      if (quality[it] != bad && it != im)
+        quality[it] = dup;  //no race:  simple assignment of the same constant
+    }
+  }  // loop over hits
+}
+
+// Debug printout: one "TK:" line for each of the first min(maxPrint, number of tuples) tuples
+// that has at least 3 hits. The line carries an id built from the event number, the quality,
+// the number of hits, the fitted parameters and the first hit indices (-1 for the missing
+// 4th / 5th hit). The hhp and phitToTuple arguments are not read.
+__global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp,
+                                            HitContainer const *__restrict__ ptuples,
+                                            TkSoA const *__restrict__ ptracks,
+                                            Quality const *__restrict__ quality,
+                                            CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple,
+                                            uint32_t maxPrint,
+                                            int iev) {
+  auto const &foundNtuplets = *ptuples;
+  auto const &tracks = *ptracks;
+
+  int const firstTuple = blockDim.x * blockIdx.x + threadIdx.x;
+  auto const gridStride = blockDim.x * gridDim.x;
+  int const nToPrint = std::min(maxPrint, foundNtuplets.nbins());
+
+  for (int tk = firstTuple; tk < nToPrint; tk += gridStride) {
+    auto const nh = foundNtuplets.size(tk);
+    if (nh < 3)
+      continue;
+
+    auto const hits = foundNtuplets.begin(tk);
+    printf("TK: %d %d %d %f %f %f %f %f %f %f %d %d %d %d %d\n",
+           10000 * iev + tk,
+           int(quality[tk]),
+           nh,
+           tracks.charge(tk),
+           tracks.pt(tk),
+           tracks.eta(tk),
+           tracks.phi(tk),
+           tracks.tip(tk),
+           tracks.zip(tk),
+           tracks.chi2(tk),
+           *hits,
+           *(hits + 1),
+           *(hits + 2),
+           nh > 3 ? int(*(hits + 3)) : -1,
+           nh > 4 ? int(*(hits + 4)) : -1);
+  }
+}
+
+// Prints the accumulated statistics counters in three lines:
+//   - a header naming the columns;
+//   - "Counters Raw": the raw counter values, in the column order of the header;
+//   - "Counters Norm": the number of events followed by per-event averages, except for the
+//     last two columns (nEmptyCells, nZeroTrackCells), which are fractions of nCells.
+// Every thread that runs this kernel prints all three lines.
+// If nEvents (or nCells) is zero the normalised values are the result of a floating-point
+// division by zero.
+__global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *counters) {
+  auto const &c = *counters;
+  printf(
+      "||Counters | nEvents | nHits | nCells | nTuples | nFitTacks  |  nGoodTracks | nUsedHits | nDupHits | "
+      "nKilledCells | "
+      "nEmptyCells | nZeroTrackCells ||\n");
+  // nFitTracks is printed before nGoodTracks so that the raw values follow the header (and
+  // the "Counters Norm" line); they used to be swapped.
+  printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n",
+         c.nEvents,
+         c.nHits,
+         c.nCells,
+         c.nTuples,
+         c.nFitTracks,
+         c.nGoodTracks,
+         c.nUsedHits,
+         c.nDupHits,
+         c.nKilledCells,
+         c.nEmptyCells,
+         c.nZeroTrackCells);
+  printf("Counters Norm %lld ||  %.1f|  %.1f|  %.1f|  %.1f|  %.1f|  %.1f|  %.1f|  %.1f|  %.3f|  %.3f||\n",
+         c.nEvents,
+         c.nHits / double(c.nEvents),
+         c.nCells / double(c.nEvents),
+         c.nTuples / double(c.nEvents),
+         c.nFitTracks / double(c.nEvents),
+         c.nGoodTracks / double(c.nEvents),
+         c.nUsedHits / double(c.nEvents),
+         c.nDupHits / double(c.nEvents),
+         c.nKilledCells / double(c.nEvents),
+         c.nEmptyCells / double(c.nCells),
+         c.nZeroTrackCells / double(c.nCells));
+}
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
new file mode 100644
index 0000000000000..c2c7c2b869752
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
@@ -0,0 +1,229 @@
+//
+// Original Author: Felice Pantaleo, CERN
+//
+
+#include <array>
+#include <cassert>
+#include <functional>
+#include <vector>
+
+#include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "FWCore/Utilities/interface/EDMException.h"
+#include "FWCore/Utilities/interface/isFinite.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "TrackingTools/DetLayers/interface/BarrelDetLayer.h"
+
+#include "CAHitNtupletGeneratorOnGPU.h"
+
+namespace {
+
+  // Square of x, computed in the argument's own type.
+  template <typename T>
+  T sqr(T x) {
+    return x * x;
+  }
+
+  // Converts the "trackQualityCuts" parameter set into the QualityCuts aggregate.
+  // All values are configured as doubles and narrowed to float here.  Throws an
+  // edm::Exception (Configuration) unless chi2Coeff holds exactly 4 coefficients.
+  cAHitNtupletGenerator::QualityCuts makeQualityCuts(edm::ParameterSet const& pset) {
+    // read one double parameter and narrow it to float
+    auto const asFloat = [&pset](char const* label) { return (float)pset.getParameter<double>(label); };
+
+    auto coeff = pset.getParameter<std::vector<double>>("chi2Coeff");
+    if (coeff.size() != 4) {
+      throw edm::Exception(edm::errors::Configuration,
+                           "CAHitNtupletGeneratorOnGPU.trackQualityCuts.chi2Coeff must have 4 elements");
+    }
+    // the braced initialisers are evaluated left to right, in the order written
+    return cAHitNtupletGenerator::QualityCuts{
+        {(float)coeff[0], (float)coeff[1], (float)coeff[2], (float)coeff[3]},  // pT-dependent chi2 cut polynomial
+        asFloat("chi2MaxPt"),  // max pT used to determine the chi2 cut
+        asFloat("chi2Scale"),  // chi2 scale factor: 30 for broken line fit, 45 for Riemann fit
+        {asFloat("tripletMaxTip"), asFloat("tripletMinPt"), asFloat("tripletMaxZip")},  // region cuts, triplets
+        {asFloat("quadrupletMaxTip"), asFloat("quadrupletMinPt"), asFloat("quadrupletMaxZip")}};  // same, quadruplets
+  }
+
+}  // namespace
+
+using namespace std;
+
+CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& cfg, edm::ConsumesCollector& iC)
+    : m_params(cfg.getParameter<bool>("onGPU"),
+               cfg.getParameter<unsigned int>("minHitsPerNtuplet"),
+               cfg.getParameter<unsigned int>("maxNumberOfDoublets"),
+               cfg.getParameter<bool>("useRiemannFit"),
+               cfg.getParameter<bool>("fit5as4"),
+               cfg.getParameter<bool>("includeJumpingForwardDoublets"),
+               cfg.getParameter<bool>("earlyFishbone"),
+               cfg.getParameter<bool>("lateFishbone"),
+               cfg.getParameter<bool>("idealConditions"),
+               cfg.getParameter<bool>("fillStatistics"),
+               cfg.getParameter<bool>("doClusterCut"),
+               cfg.getParameter<bool>("doZ0Cut"),
+               cfg.getParameter<bool>("doPtCut"),
+               cfg.getParameter<double>("ptmin"),
+               cfg.getParameter<double>("CAThetaCutBarrel"),
+               cfg.getParameter<double>("CAThetaCutForward"),
+               cfg.getParameter<double>("hardCurvCut"),
+               cfg.getParameter<double>("dcaCutInnerTriplet"),
+               cfg.getParameter<double>("dcaCutOuterTriplet"),
+               makeQualityCuts(cfg.getParameterSet("trackQualityCuts"))) {
+  // The algorithm parameters are read from the configuration by the positional
+  // arguments above.  The body allocates the statistics counters, zeroed, either in
+  // device memory (onGPU) or on the host heap.  iC is not used here.
+  // With DUMP_GPU_TK_TUPLES defined, the title line of the per-track debug dump is printed too.
+
+#ifdef DUMP_GPU_TK_TUPLES
+  // column titles of the per-track debug dump: 15 labels, hence 15 "%s" conversions
+  printf("TK: %s %s %s %s %s "
+         "%s %s %s %s %s "
+         "%s %s %s %s %s\n",
+         "tid", "qual", "nh", "charge", "pt",
+         "eta", "phi", "tip", "zip", "chi2",
+         "h1", "h2", "h3", "h4", "h5");
+#endif
+
+  if (m_params.onGPU_) {
+    // the counters live in device memory (cudaMalloc), so they are allocated only if
+    // CUDA is available; otherwise m_counters keeps its nullptr default
+    // NOTE(review): with onGPU set but CUDA unavailable m_counters stays null; callers
+    // presumably never run the GPU workflow in that case - confirm
+    edm::Service<CUDAService> cs;
+    if (cs and cs->enabled()) {
+      cudaCheck(cudaMalloc(&m_counters, sizeof(Counters)));
+      cudaCheck(cudaMemset(m_counters, 0, sizeof(Counters)));
+    }
+  } else {
+    // CPU workflow: host allocation, explicitly zeroed
+    m_counters = new Counters();
+    memset(m_counters, 0, sizeof(Counters));
+  }
+}
+
+CAHitNtupletGeneratorOnGPU::~CAHitNtupletGeneratorOnGPU() {
+  // Optionally prints the statistics (doStats_) and releases the counters through the
+  // API that matches the way they were allocated in the constructor.
+  if (not m_params.onGPU_) {
+    // counters on the host heap
+    if (m_params.doStats_)
+      CAHitNtupletGeneratorKernelsCPU::printCounters(m_counters);
+    delete m_counters;
+    return;
+  }
+  // counters in device memory: touched only if CUDA is available
+  edm::Service<CUDAService> cs;
+  if (not(cs and cs->enabled()))
+    return;
+  if (m_params.doStats_)
+    CAHitNtupletGeneratorKernelsGPU::printCounters(m_counters);  // crash on multi-gpu processes
+  cudaFree(m_counters);
+}
+
+void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& desc) {
+  // Declares the configuration parameters of the generator together with their default values.
+  // hardCurvCut default: 87 cm/GeV = 1/(3.8T * 0.3); take less than radius given by the hardPtCut and reject everything below
+  // NOTE(review): "onGPU", read by the constructor, is not declared here - presumably added by the owning module; confirm
+  desc.add<double>("ptmin", 0.9f)->setComment("Cut on minimum pt");  // NOTE(review): float literal as default of a double parameter
+  desc.add<double>("CAThetaCutBarrel", 0.002f)->setComment("Cut on RZ alignement for Barrel");
+  desc.add<double>("CAThetaCutForward", 0.003f)->setComment("Cut on RZ alignment for Forward");
+  desc.add<double>("hardCurvCut", 1.f / (0.35 * 87.f))->setComment("Cut on minimum curvature");
+  desc.add<double>("dcaCutInnerTriplet", 0.15f)->setComment("Cut on origin radius when the inner hit is on BPix1");
+  desc.add<double>("dcaCutOuterTriplet", 0.25f)->setComment("Cut on origin radius when the outer hit is on BPix1");
+  desc.add<bool>("earlyFishbone", true);
+  desc.add<bool>("lateFishbone", false);
+  desc.add<bool>("idealConditions", true);
+  desc.add<bool>("fillStatistics", false);
+  desc.add<unsigned int>("minHitsPerNtuplet", 4);
+  desc.add<unsigned int>("maxNumberOfDoublets", caConstants::maxNumberOfDoublets);  // default is the compile-time constant
+  desc.add<bool>("includeJumpingForwardDoublets", false);
+  desc.add<bool>("fit5as4", true);
+  desc.add<bool>("doClusterCut", true);
+  desc.add<bool>("doZ0Cut", true);
+  desc.add<bool>("doPtCut", true);
+  desc.add<bool>("useRiemannFit", false)->setComment("true for Riemann, false for BrokenLine");
+
+  edm::ParameterSetDescription trackQualityCuts;  // nested parameter set, consumed by makeQualityCuts()
+  trackQualityCuts.add<double>("chi2MaxPt", 10.)->setComment("max pT used to determine the pT-dependent chi2 cut");
+  trackQualityCuts.add<std::vector<double>>("chi2Coeff", {0.68177776, 0.74609577, -0.08035491, 0.00315399})  // exactly 4 values are required by makeQualityCuts()
+      ->setComment("Polynomial coefficients to derive the pT-dependent chi2 cut");
+  trackQualityCuts.add<double>("chi2Scale", 30.)
+      ->setComment(
+          "Factor to multiply the pT-dependent chi2 cut (currently: 30 for the broken line fit, 45 for the Riemann "
+          "fit)");
+  trackQualityCuts.add<double>("tripletMinPt", 0.5)->setComment("Min pT for triplets, in GeV");
+  trackQualityCuts.add<double>("tripletMaxTip", 0.3)->setComment("Max |Tip| for triplets, in cm");
+  trackQualityCuts.add<double>("tripletMaxZip", 12.)->setComment("Max |Zip| for triplets, in cm");
+  trackQualityCuts.add<double>("quadrupletMinPt", 0.3)->setComment("Min pT for quadruplets, in GeV");
+  trackQualityCuts.add<double>("quadrupletMaxTip", 0.5)->setComment("Max |Tip| for quadruplets, in cm");
+  trackQualityCuts.add<double>("quadrupletMaxZip", 12.)->setComment("Max |Zip| for quadruplets, in cm");
+  desc.add<edm::ParameterSetDescription>("trackQualityCuts", trackQualityCuts)  // attach the nested description
+      ->setComment(
+          "Quality cuts based on the results of the track fit:\n  - apply a pT-dependent chi2 cut;\n  - apply \"region "
+          "cuts\" based on the fit results (pT, Tip, Zip).");
+}
+
+PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DCUDA const& hits_d,
+                                                                    float bfield,
+                                                                    cudaStream_t stream) const {
+  // Builds and fits the tuples of one event with the GPU flavour of the kernels; the caller's
+  // `stream` is handed to the kernels and to the fit.  The output SoA is created here and returned.
+  PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique<pixelTrack::TrackSoA>(stream));
+  auto* trackSoA = tracks.get();
+
+  // kernels: build the doublets, run the main kernels, fill the detector index of the hits
+  CAHitNtupletGeneratorKernelsGPU caKernels(m_params);
+  caKernels.setCounters(m_counters);
+  caKernels.allocateOnGPU(stream);
+  caKernels.buildDoublets(hits_d, stream);
+  caKernels.launchKernels(hits_d, trackSoA, stream);
+  caKernels.fillHitDetIndices(hits_d.view(), trackSoA, stream);  // in principle needed only if Hits not "available"
+
+  // helix fit of the tuples: Riemann or broken line, as configured
+  HelixFitOnGPU helixFit(bfield, m_params.fit5as4_);
+  helixFit.allocateOnGPU(&(trackSoA->hitIndices), caKernels.tupleMultiplicity(), trackSoA);
+  auto const launchFit =
+      m_params.useRiemannFit_ ? &HelixFitOnGPU::launchRiemannKernels : &HelixFitOnGPU::launchBrokenLineKernels;
+  (helixFit.*launchFit)(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream);
+
+  // classification of the fitted tuples
+  caKernels.classifyTuples(hits_d, trackSoA, stream);
+  return tracks;
+}
+
+PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const {
+  // Same sequence of steps as makeTuplesAsync, using the CPU flavour of the kernels
+  // and of the fit, with a null stream.
+  PixelTrackHeterogeneous tracks(std::make_unique<pixelTrack::TrackSoA>());
+  auto* trackSoA = tracks.get();
+  assert(trackSoA);
+
+  CAHitNtupletGeneratorKernelsCPU caKernels(m_params);
+  caKernels.setCounters(m_counters);
+  caKernels.allocateOnGPU(nullptr);
+
+  caKernels.buildDoublets(hits_d, nullptr);
+  caKernels.launchKernels(hits_d, trackSoA, nullptr);
+  caKernels.fillHitDetIndices(hits_d.view(), trackSoA, nullptr);  // in principle needed only if Hits not "available"
+
+  // without hits the fit and the classification are skipped
+  bool const noHits = (0 == hits_d.nHits());
+  if (noHits)
+    return tracks;
+
+  // now fit: Riemann or broken line, as configured
+  HelixFitOnGPU helixFit(bfield, m_params.fit5as4_);
+  helixFit.allocateOnGPU(&(trackSoA->hitIndices), caKernels.tupleMultiplicity(), trackSoA);
+  auto const launchFit = m_params.useRiemannFit_ ? &HelixFitOnGPU::launchRiemannKernelsOnCPU
+                                                 : &HelixFitOnGPU::launchBrokenLineKernelsOnCPU;
+  (helixFit.*launchFit)(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets);
+
+  // classification of the fitted tuples
+  caKernels.classifyTuples(hits_d, trackSoA, nullptr);
+  return tracks;
+}
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
new file mode 100644
index 0000000000000..564a870f54796
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
@@ -0,0 +1,65 @@
+#ifndef RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h
+#define RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h
+
+#include <cuda_runtime.h>
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+
+#include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h"
+
+#include "CAHitNtupletGeneratorKernels.h"
+#include "HelixFitOnGPU.h"
+
+#include "GPUCACell.h"
+
+namespace edm {
+  class Event;
+  class EventSetup;
+  class ParameterSetDescription;
+}  // namespace edm
+
+class CAHitNtupletGeneratorOnGPU {
+  // Host-side driver of the pixel tuple building and fitting, for the GPU
+  // (makeTuplesAsync) and the CPU (makeTuples) workflows.  Owns the statistics counters.
+public:
+  using HitsOnGPU = TrackingRecHit2DSOAView;
+  using HitsOnCPU = TrackingRecHit2DCUDA;
+  using hindex_type = TrackingRecHit2DSOAView::hindex_type;
+
+  using Quality = pixelTrack::Quality;
+  using OutputSoA = pixelTrack::TrackSoA;
+  using HitContainer = pixelTrack::HitContainer;
+  using Tuple = HitContainer;
+
+  using QualityCuts = cAHitNtupletGenerator::QualityCuts;
+  using Params = cAHitNtupletGenerator::Params;
+  using Counters = cAHitNtupletGenerator::Counters;
+
+  CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& cfg, edm::ConsumesCollector&& iC)
+      : CAHitNtupletGeneratorOnGPU(cfg, iC) {}  // only forwards to the lvalue-collector overload
+  CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& cfg, edm::ConsumesCollector& iC);
+  // not copyable: m_counters is released by the destructor, so a copy would release it twice
+  CAHitNtupletGeneratorOnGPU(const CAHitNtupletGeneratorOnGPU&) = delete;
+  CAHitNtupletGeneratorOnGPU& operator=(const CAHitNtupletGeneratorOnGPU&) = delete;
+  ~CAHitNtupletGeneratorOnGPU();
+
+  static void fillDescriptions(edm::ParameterSetDescription& desc);
+  static const char* fillDescriptionsLabel() { return "caHitNtupletOnGPU"; }
+
+  // GPU workflow (the caller provides the stream) and CPU workflow
+  PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const;
+  PixelTrackHeterogeneous makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const;
+
+private:
+  void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream) const;
+  void hitNtuplets(HitsOnCPU const& hh, const edm::EventSetup& es, bool useRiemannFit, cudaStream_t cudaStream);
+  void launchKernels(HitsOnCPU const& hh, bool useRiemannFit, cudaStream_t cudaStream) const;
+
+  Params m_params;
+  Counters* m_counters = nullptr;  // device memory when m_params.onGPU_ is set, host heap otherwise
+};
+
+#endif  // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
new file mode 100644
index 0000000000000..0fd514e26d223
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
@@ -0,0 +1,347 @@
+#ifndef RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h
+#define RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h
+
+//
+// Author: Felice Pantaleo, CERN
+//
+
+// #define ONLY_TRIPLETS_IN_HOLE
+
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+#include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h"
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CAConstants.h"
+
+class GPUCACell {  // a "cell": one doublet, i.e. a pair of hits (inner, outer) belonging to a layer pair
+public:
+  using PtrAsInt = unsigned long long;  // integer type used to atomicCAS the container pointers
+
+  static constexpr auto maxCellsPerHit = caConstants::maxCellsPerHit;
+  using OuterHitOfCell = caConstants::OuterHitOfCell;
+  using CellNeighbors = caConstants::CellNeighbors;
+  using CellTracks = caConstants::CellTracks;
+  using CellNeighborsVector = caConstants::CellNeighborsVector;
+  using CellTracksVector = caConstants::CellTracksVector;
+
+  using Hits = TrackingRecHit2DSOAView;
+  using hindex_type = Hits::hindex_type;
+
+  using TmpTuple = cms::cuda::VecArray<uint32_t, 6>;  // scratch stack of cell indices used by find_ntuplets
+
+  using HitContainer = pixelTrack::HitContainer;
+  using Quality = pixelTrack::Quality;
+  static constexpr auto bad = pixelTrack::Quality::bad;
+
+  GPUCACell() = default;  // no member has an initializer: init() sets all of them
+
+  __device__ __forceinline__ void init(CellNeighborsVector& cellNeighbors,  // sets every member of the cell
+                                       CellTracksVector& cellTracks,
+                                       Hits const& hh,
+                                       int layerPairId,
+                                       int doubletId,
+                                       hindex_type innerHitId,
+                                       hindex_type outerHitId) {
+    theInnerHitId = innerHitId;
+    theOuterHitId = outerHitId;
+    theDoubletId_ = doubletId;
+    theLayerPairId_ = layerPairId;
+    theUsed_ = 0;
+
+    // cache z and r of the inner hit (optimization that depends on access pattern)
+    theInnerZ = hh.zGlobal(innerHitId);
+    theInnerR = hh.rGlobal(innerHitId);
+
+    // link to the shared default containers at index 0, which are expected to be empty
+    theOuterNeighbors = &cellNeighbors[0];
+    theTracks = &cellTracks[0];
+    assert(outerNeighbors().empty());
+    assert(tracks().empty());
+  }
+
+  __device__ __forceinline__ int addOuterNeighbor(CellNeighbors::value_t t, CellNeighborsVector& cellNeighbors) {
+    // use smart cache: a new container is requested only while the current one is empty
+    if (outerNeighbors().empty()) {
+      auto i = cellNeighbors.extend();  // maybe wasted....
+      if (i > 0) {
+        cellNeighbors[i].reset();
+#ifdef __CUDACC__
+        auto zero = (PtrAsInt)(&cellNeighbors[0]);  // address of the shared default container
+        atomicCAS((PtrAsInt*)(&theOuterNeighbors),  // install the new container only if still linked to the default
+                  zero,
+                  (PtrAsInt)(&cellNeighbors[i]));  // if fails we cannot give "i" back...
+#else
+        theOuterNeighbors = &cellNeighbors[i];
+#endif
+      } else
+        return -1;  // extend() gave no usable index: nothing is stored
+    }
+    __threadfence();  // NOTE(review): presumably makes the new link visible before the push_back - confirm
+    return outerNeighbors().push_back(t);  // the result of push_back is handed back to the caller
+  }
+
+  __device__ __forceinline__ int addTrack(CellTracks::value_t t, CellTracksVector& cellTracks) {  // same scheme as addOuterNeighbor
+    if (tracks().empty()) {
+      auto i = cellTracks.extend();  // maybe wasted....
+      if (i > 0) {
+        cellTracks[i].reset();
+#ifdef __CUDACC__
+        auto zero = (PtrAsInt)(&cellTracks[0]);  // address of the shared default container
+        atomicCAS((PtrAsInt*)(&theTracks), zero, (PtrAsInt)(&cellTracks[i]));  // if fails we cannot give "i" back...
+#else
+        theTracks = &cellTracks[i];
+#endif
+      } else
+        return -1;  // extend() gave no usable index: nothing is stored
+    }
+    __threadfence();  // NOTE(review): presumably makes the new link visible before the push_back - confirm
+    return tracks().push_back(t);
+  }
+
+  __device__ __forceinline__ CellTracks& tracks() { return *theTracks; }
+  __device__ __forceinline__ CellTracks const& tracks() const { return *theTracks; }
+  __device__ __forceinline__ CellNeighbors& outerNeighbors() { return *theOuterNeighbors; }
+  __device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return *theOuterNeighbors; }
+  __device__ __forceinline__ float inner_x(Hits const& hh) const { return hh.xGlobal(theInnerHitId); }
+  __device__ __forceinline__ float outer_x(Hits const& hh) const { return hh.xGlobal(theOuterHitId); }
+  __device__ __forceinline__ float inner_y(Hits const& hh) const { return hh.yGlobal(theInnerHitId); }
+  __device__ __forceinline__ float outer_y(Hits const& hh) const { return hh.yGlobal(theOuterHitId); }
+  __device__ __forceinline__ float inner_z(Hits const& hh) const { return theInnerZ; }
+  // returns the value cached by init(), read there from hh.zGlobal(theInnerHitId); hh is not used
+  __device__ __forceinline__ float outer_z(Hits const& hh) const { return hh.zGlobal(theOuterHitId); }
+  __device__ __forceinline__ float inner_r(Hits const& hh) const { return theInnerR; }
+  // returns the value cached by init(), read there from hh.rGlobal(theInnerHitId); hh is not used
+  __device__ __forceinline__ float outer_r(Hits const& hh) const { return hh.rGlobal(theOuterHitId); }
+
+  __device__ __forceinline__ auto inner_iphi(Hits const& hh) const { return hh.iphi(theInnerHitId); }
+  __device__ __forceinline__ auto outer_iphi(Hits const& hh) const { return hh.iphi(theOuterHitId); }
+
+  __device__ __forceinline__ float inner_detIndex(Hits const& hh) const { return hh.detectorIndex(theInnerHitId); }  // NOTE(review): index returned as float
+  __device__ __forceinline__ float outer_detIndex(Hits const& hh) const { return hh.detectorIndex(theOuterHitId); }  // NOTE(review): index returned as float
+
+  constexpr unsigned int inner_hit_id() const { return theInnerHitId; }
+  constexpr unsigned int outer_hit_id() const { return theOuterHitId; }
+
+  __device__ void print_cell() const {  // debug printout of the ids stored in the cell
+    printf("printing cell: %d, on layerPair: %d, innerHitId: %d, outerHitId: %d \n",
+           theDoubletId_,
+           theLayerPairId_,
+           theInnerHitId,
+           theOuterHitId);
+  }
+
+  __device__ bool check_alignment(Hits const& hh,  // cuts on the three hits: otherCell inner, this inner, this outer
+                                  GPUCACell const& otherCell,
+                                  const float ptmin,
+                                  const float hardCurvCut,
+                                  const float caThetaCutBarrel,
+                                  const float caThetaCutForward,
+                                  const float dcaCutInnerTriplet,
+                                  const float dcaCutOuterTriplet) const {
+    // detIndex of the layerStart for the Phase1 Pixel Detector:
+    // [BPX1, BPX2, BPX3, BPX4,  FP1,  FP2,  FP3,  FN1,  FN2,  FN3, LAST_VALID]
+    // [   0,   96,  320,  672, 1184, 1296, 1408, 1520, 1632, 1744,       1856]
+    auto ri = inner_r(hh);
+    auto zi = inner_z(hh);
+
+    auto ro = outer_r(hh);
+    auto zo = outer_z(hh);
+
+    auto r1 = otherCell.inner_r(hh);
+    auto z1 = otherCell.inner_z(hh);
+    auto isBarrel = otherCell.outer_detIndex(hh) < caConstants::last_barrel_detIndex;  // selects the barrel or the forward theta cut
+    bool aligned = areAlignedRZ(r1,
+                                z1,
+                                ri,
+                                zi,
+                                ro,
+                                zo,
+                                ptmin,
+                                isBarrel ? caThetaCutBarrel : caThetaCutForward);  // 2.f*thetaCut); // FIXME tune cuts
+    return (aligned && dcaCut(hh,  // dcaCut is evaluated only if the r-z alignment passed
+                              otherCell,
+                              otherCell.inner_detIndex(hh) < caConstants::last_bpix1_detIndex ? dcaCutInnerTriplet
+                                                                                              : dcaCutOuterTriplet,
+                              hardCurvCut));  // FIXME tune cuts
+  }
+
+  __device__ __forceinline__ static bool areAlignedRZ(  // alignment test on three (r, z) points
+      float r1, float z1, float ri, float zi, float ro, float zo, const float ptmin, const float thetaCut) {
+    float radius_diff = std::abs(r1 - ro);
+    float distance_13_squared = radius_diff * radius_diff + (z1 - zo) * (z1 - zo);  // squared distance between points 1 and o
+
+    float pMin = ptmin * std::sqrt(distance_13_squared);  // this needs to be divided by
+                                                          // radius_diff later
+
+    float tan_12_13_half_mul_distance_13_squared = fabs(z1 * (ri - ro) + zi * (ro - r1) + zo * (r1 - ri));
+    return tan_12_13_half_mul_distance_13_squared * pMin <= thetaCut * distance_13_squared * radius_diff;  // division by radius_diff avoided by multiplying the right-hand side
+  }
+
+  __device__ inline bool dcaCut(Hits const& hh,  // cut on curvature and dca0 of the circle through the three hits
+                                GPUCACell const& otherCell,
+                                const float region_origin_radius_plus_tolerance,
+                                const float maxCurv) const {
+    auto x1 = otherCell.inner_x(hh);
+    auto y1 = otherCell.inner_y(hh);
+
+    auto x2 = inner_x(hh);
+    auto y2 = inner_y(hh);
+
+    auto x3 = outer_x(hh);
+    auto y3 = outer_y(hh);
+
+    CircleEq<float> eq(x1, y1, x2, y2, x3, y3);
+
+    if (eq.curvature() > maxCurv)  // NOTE(review): signed comparison, no abs() here unlike the line below
+      return false;
+
+    return std::abs(eq.dca0()) < region_origin_radius_plus_tolerance * std::abs(eq.curvature());
+  }
+
+  __device__ __forceinline__ static bool dcaCutH(float x1,  // same test as dcaCut, taking the coordinates directly
+                                                 float y1,
+                                                 float x2,
+                                                 float y2,
+                                                 float x3,
+                                                 float y3,
+                                                 const float region_origin_radius_plus_tolerance,
+                                                 const float maxCurv) {
+    CircleEq<float> eq(x1, y1, x2, y2, x3, y3);
+
+    if (eq.curvature() > maxCurv)
+      return false;
+
+    return std::abs(eq.dca0()) < region_origin_radius_plus_tolerance * std::abs(eq.curvature());
+  }
+
+  __device__ inline bool hole0(Hits const& hh, GPUCACell const& innerCell) const {  // true if the extrapolation to a bpx0 ladder falls in a module gap
+    using caConstants::first_ladder_bpx0;
+    using caConstants::max_ladder_bpx0;
+    using caConstants::module_length_bpx0;
+    using caConstants::module_tolerance_bpx0;
+    int p = innerCell.inner_iphi(hh);
+    if (p < 0)
+      p += std::numeric_limits<unsigned short>::max();  // shift negative iphi values to the positive range
+    p = (max_ladder_bpx0 * p) / std::numeric_limits<unsigned short>::max();  // scale iphi to a ladder number
+    p %= max_ladder_bpx0;
+    auto il = first_ladder_bpx0 + p;
+    auto r0 = hh.averageGeometry().ladderR[il];
+    auto ri = innerCell.inner_r(hh);
+    auto zi = innerCell.inner_z(hh);
+    auto ro = outer_r(hh);
+    auto zo = outer_z(hh);
+    auto z0 = zi + (r0 - ri) * (zo - zi) / (ro - ri);  // z at radius r0 of the straight line through (ri, zi) and (ro, zo)
+    auto z_in_ladder = std::abs(z0 - hh.averageGeometry().ladderZ[il]);
+    auto z_in_module = z_in_ladder - module_length_bpx0 * int(z_in_ladder / module_length_bpx0);  // remainder modulo the module length
+    auto gap = z_in_module < module_tolerance_bpx0 || z_in_module > (module_length_bpx0 - module_tolerance_bpx0);  // within the tolerance of a module edge
+    return gap;
+  }
+
+  __device__ inline bool hole4(Hits const& hh, GPUCACell const& innerCell) const {  // as hole0 for bpx4, plus the region between ladder end and endcap
+    using caConstants::first_ladder_bpx4;
+    using caConstants::max_ladder_bpx4;
+    using caConstants::module_length_bpx4;
+    using caConstants::module_tolerance_bpx4;
+    int p = outer_iphi(hh);
+    if (p < 0)
+      p += std::numeric_limits<unsigned short>::max();  // shift negative iphi values to the positive range
+    p = (max_ladder_bpx4 * p) / std::numeric_limits<unsigned short>::max();  // scale iphi to a ladder number
+    p %= max_ladder_bpx4;
+    auto il = first_ladder_bpx4 + p;
+    auto r4 = hh.averageGeometry().ladderR[il];
+    auto ri = innerCell.inner_r(hh);
+    auto zi = innerCell.inner_z(hh);
+    auto ro = outer_r(hh);
+    auto zo = outer_z(hh);
+    auto z4 = zo + (r4 - ro) * (zo - zi) / (ro - ri);  // z at radius r4 of the straight line through (ri, zi) and (ro, zo)
+    auto z_in_ladder = std::abs(z4 - hh.averageGeometry().ladderZ[il]);
+    auto z_in_module = z_in_ladder - module_length_bpx4 * int(z_in_ladder / module_length_bpx4);  // remainder modulo the module length
+    auto gap = z_in_module < module_tolerance_bpx4 || z_in_module > (module_length_bpx4 - module_tolerance_bpx4);  // within the tolerance of a module edge
+    auto holeP = z4 > hh.averageGeometry().ladderMaxZ[il] && z4 < hh.averageGeometry().endCapZ[0];  // between ladderMaxZ and endCapZ[0]
+    auto holeN = z4 < hh.averageGeometry().ladderMinZ[il] && z4 > hh.averageGeometry().endCapZ[1];  // between endCapZ[1] and ladderMinZ
+    return gap || holeP || holeN;
+  }
+
+  // trying to free the track building process from hardcoded layers, leaving
+  // the visit of the graph based on the neighborhood connections between cells.
+  __device__ inline void find_ntuplets(Hits const& hh,  // recursive visit starting from this cell
+                                       GPUCACell* __restrict__ cells,
+                                       CellTracksVector& cellTracks,
+                                       HitContainer& foundNtuplets,
+                                       cms::cuda::AtomicPairCounter& apc,
+                                       Quality* __restrict__ quality,
+                                       TmpTuple& tmpNtuplet,
+                                       const unsigned int minHitsPerNtuplet,
+                                       bool startAt0) const {
+    // the building process for a track ends if:
+    // it has no right neighbor
+    // it has no compatible neighbor
+    // the ntuplets is then saved if the number of hits it contains is greater
+    // than a threshold
+
+    tmpNtuplet.push_back_unsafe(theDoubletId_);  // push this cell; it is popped again before returning
+    assert(tmpNtuplet.size() <= 4);
+
+    bool last = true;  // stays true if no live outer neighbor is found
+    for (unsigned int otherCell : outerNeighbors()) {
+      if (cells[otherCell].theDoubletId_ < 0)
+        continue;  // killed by earlyFishbone
+      last = false;
+      cells[otherCell].find_ntuplets(
+          hh, cells, cellTracks, foundNtuplets, apc, quality, tmpNtuplet, minHitsPerNtuplet, startAt0);
+    }
+    if (last) {  // if long enough save...
+      if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet - 1) {  // n cells correspond to n + 1 hits
+#ifdef ONLY_TRIPLETS_IN_HOLE
+        // triplets accepted only pointing to the hole
+        if (tmpNtuplet.size() >= 3 || (startAt0 && hole4(hh, cells[tmpNtuplet[0]])) ||
+            ((!startAt0) && hole0(hh, cells[tmpNtuplet[0]])))
+#endif
+        {
+          hindex_type hits[6];  // inner hit of every cell of the tuple, then the outer hit of this cell
+          auto nh = 0U;
+          for (auto c : tmpNtuplet) {
+            hits[nh++] = cells[c].theInnerHitId;
+          }
+          hits[nh] = theOuterHitId;
+          auto it = foundNtuplets.bulkFill(apc, hits, tmpNtuplet.size() + 1);
+          if (it >= 0) {  // if negative is overflow....
+            for (auto c : tmpNtuplet)
+              cells[c].addTrack(it, cellTracks);  // record the tuple index in every cell that belongs to it
+            quality[it] = bad;  // initialize to bad
+          }
+        }
+      }
+    }
+    tmpNtuplet.pop_back();
+    assert(tmpNtuplet.size() < 4);
+  }
+
+  // Cell status management: a negative doublet id marks a killed cell
+  __device__ __forceinline__ void kill() { theDoubletId_ = -1; }
+  __device__ __forceinline__ bool isKilled() const { return theDoubletId_ < 0; }
+
+  __device__ __forceinline__ int16_t layerPairId() const { return theLayerPairId_; }
+
+  __device__ __forceinline__ bool unused() const { return !theUsed_; }
+  __device__ __forceinline__ void setUsedBit(uint16_t bit) { theUsed_ |= bit; }
+
+private:
+  CellNeighbors* theOuterNeighbors;  // points to the shared default container until addOuterNeighbor installs another one
+  CellTracks* theTracks;  // points to the shared default container until addTrack installs another one
+
+  int32_t theDoubletId_;  // negative once the cell has been killed
+  int16_t theLayerPairId_;
+  uint16_t theUsed_;  // tbd
+
+  float theInnerZ;  // z of the inner hit, cached by init()
+  float theInnerR;  // r of the inner hit, cached by init()
+  hindex_type theInnerHitId;
+  hindex_type theOuterHitId;
+};
+
+#endif  // RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc
new file mode 100644
index 0000000000000..880bdb47dfb5c
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc
@@ -0,0 +1,16 @@
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HelixFitOnGPU.h"
+
+void HelixFitOnGPU::allocateOnGPU(Tuples const *tuples,
+                                  TupleMultiplicity const *tupleMultiplicity,
+                                  OutputSoA *helix_fit_results) {
+  // nothing is allocated here: the three pointers are only recorded, and each must be non-null
+  tuples_ = tuples;
+  assert(tuples_);
+  tupleMultiplicity_ = tupleMultiplicity;
+  assert(tupleMultiplicity_);
+  outputSoa_ = helix_fit_results;
+  assert(outputSoa_);
+}
+
+void HelixFitOnGPU::deallocateOnGPU() {}  // empty: allocateOnGPU() only stores the pointers it is given, so nothing is released here
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
new file mode 100644
index 0000000000000..938994840f8c0
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
@@ -0,0 +1,68 @@
+#ifndef RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h
+#define RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h
+
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
+#include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h"
+
+#include "CAConstants.h"
+
+namespace riemannFit {
+  // number of fits that share one set of work buffers; in case of memory issue can be made smaller
+  constexpr uint32_t maxNumberOfConcurrentFits = caConstants::maxNumberOfTuples;
+  constexpr uint32_t stride = maxNumberOfConcurrentFits;  // element stride used by the Eigen maps below
+  using Matrix3x4d = Eigen::Matrix<double, 3, 4>;
+  using Map3x4d = Eigen::Map<Matrix3x4d, 0, Eigen::Stride<3 * stride, stride> >;  // outer stride 3 * stride, inner stride `stride`
+  using Matrix6x4f = Eigen::Matrix<float, 6, 4>;
+  using Map6x4f = Eigen::Map<Matrix6x4f, 0, Eigen::Stride<6 * stride, stride> >;  // outer stride 6 * stride, inner stride `stride`
+
+  // hits: 3 x N matrix of doubles and the strided map over it
+  template <int N>
+  using Matrix3xNd = Eigen::Matrix<double, 3, N>;
+  template <int N>
+  using Map3xNd = Eigen::Map<Matrix3xNd<N>, 0, Eigen::Stride<3 * stride, stride> >;
+  // errors: 6 x N matrix of floats and the strided map over it
+  template <int N>
+  using Matrix6xNf = Eigen::Matrix<float, 6, N>;
+  template <int N>
+  using Map6xNf = Eigen::Map<Matrix6xNf<N>, 0, Eigen::Stride<6 * stride, stride> >;
+  // fast fit: map over a Vector4d with inner stride `stride`
+  using Map4d = Eigen::Map<Vector4d, 0, Eigen::InnerStride<stride> >;
+
+}  // namespace riemannFit
+
+class HelixFitOnGPU {
+  // Front end of the helix fit (Riemann or broken line), with GPU and CPU entry points.
+  // Inputs and output are pointers registered through allocateOnGPU(); none is owned here.
+public:
+  using HitsView = TrackingRecHit2DSOAView;
+  using Tuples = pixelTrack::HitContainer;
+  using OutputSoA = pixelTrack::TrackSoA;
+  using TupleMultiplicity = caConstants::TupleMultiplicity;
+
+  explicit HelixFitOnGPU(float bf, bool fit5as4) : bField_(bf), fit5as4_(fit5as4) {}
+  ~HelixFitOnGPU() { deallocateOnGPU(); }
+
+  void setBField(double bField) { bField_ = bField; }  // stored as float
+  void launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream);
+  void launchBrokenLineKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream);
+
+  void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples);
+  void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples);
+
+  // registers the input tuples, their multiplicity and the output SoA; all must be non-null
+  void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *outputSoA);
+  void deallocateOnGPU();
+
+private:
+  static constexpr uint32_t maxNumberOfConcurrentFits_ = riemannFit::maxNumberOfConcurrentFits;
+
+  // forwarded from the caller; every pointer is null until allocateOnGPU() is called
+  Tuples const *tuples_ = nullptr;
+  TupleMultiplicity const *tupleMultiplicity_ = nullptr;
+  OutputSoA *outputSoa_ = nullptr;
+  float bField_;
+  const bool fit5as4_;
+};
+
+#endif  // RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc
new file mode 100644
index 0000000000000..491dd0df2004f
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc
@@ -0,0 +1,113 @@
+#include "RiemannFitOnGPU.h"
+
+void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples) {
+  assert(tuples_);
+  // Host version of the Riemann fit: the kernels are called as plain functions, batch by batch (nhits is not used).
+  //  Fit internals: sized for 5 hits, the largest fit run below (kernel_*<5> when fit5as4_ is false)
+  auto hitsGPU =
+      std::make_unique<double[]>(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<5>) / sizeof(double));
+  auto hits_geGPU =
+      std::make_unique<float[]>(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6xNf<5>) / sizeof(float));
+  auto fast_fit_resultsGPU =
+      std::make_unique<double[]>(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double));
+  auto circle_fit_resultsGPU_holder =
+      std::make_unique<char[]>(maxNumberOfConcurrentFits_ * sizeof(riemannFit::CircleFit));
+  riemannFit::CircleFit *circle_fit_resultsGPU = (riemannFit::CircleFit *)(circle_fit_resultsGPU_holder.get());
+
+  for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) {
+    // triplets (each multiplicity runs fast fit -> circle fit -> line fit on the same scratch buffers)
+    kernel_FastFit<3>(
+        tuples_, tupleMultiplicity_, 3, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset);
+
+    kernel_CircleFit<3>(tupleMultiplicity_,
+                        3,
+                        bField_,
+                        hitsGPU.get(),
+                        hits_geGPU.get(),
+                        fast_fit_resultsGPU.get(),
+                        circle_fit_resultsGPU,
+                        offset);
+
+    kernel_LineFit<3>(tupleMultiplicity_,
+                      3,
+                      bField_,
+                      outputSoa_,
+                      hitsGPU.get(),
+                      hits_geGPU.get(),
+                      fast_fit_resultsGPU.get(),
+                      circle_fit_resultsGPU,
+                      offset);
+
+    // quads
+    kernel_FastFit<4>(
+        tuples_, tupleMultiplicity_, 4, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset);
+
+    kernel_CircleFit<4>(tupleMultiplicity_,
+                        4,
+                        bField_,
+                        hitsGPU.get(),
+                        hits_geGPU.get(),
+                        fast_fit_resultsGPU.get(),
+                        circle_fit_resultsGPU,
+                        offset);
+
+    kernel_LineFit<4>(tupleMultiplicity_,
+                      4,
+                      bField_,
+                      outputSoa_,
+                      hitsGPU.get(),
+                      hits_geGPU.get(),
+                      fast_fit_resultsGPU.get(),
+                      circle_fit_resultsGPU,
+                      offset);
+
+    if (fit5as4_) {
+      // penta (5-hit tuples fitted with the 4-hit kernels)
+      kernel_FastFit<4>(
+          tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset);
+
+      kernel_CircleFit<4>(tupleMultiplicity_,
+                          5,
+                          bField_,
+                          hitsGPU.get(),
+                          hits_geGPU.get(),
+                          fast_fit_resultsGPU.get(),
+                          circle_fit_resultsGPU,
+                          offset);
+
+      kernel_LineFit<4>(tupleMultiplicity_,
+                        5,
+                        bField_,
+                        outputSoa_,
+                        hitsGPU.get(),
+                        hits_geGPU.get(),
+                        fast_fit_resultsGPU.get(),
+                        circle_fit_resultsGPU,
+                        offset);
+
+    } else {
+      // penta all 5 (needs the 5-column scratch buffers allocated above)
+      kernel_FastFit<5>(
+          tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset);
+
+      kernel_CircleFit<5>(tupleMultiplicity_,
+                          5,
+                          bField_,
+                          hitsGPU.get(),
+                          hits_geGPU.get(),
+                          fast_fit_resultsGPU.get(),
+                          circle_fit_resultsGPU,
+                          offset);
+
+      kernel_LineFit<5>(tupleMultiplicity_,
+                        5,
+                        bField_,
+                        outputSoa_,
+                        hitsGPU.get(),
+                        hits_geGPU.get(),
+                        fast_fit_resultsGPU.get(),
+                        circle_fit_resultsGPU,
+                        offset);
+    }
+  }
+}
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu
new file mode 100644
index 0000000000000..90af2ac13730b
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu
@@ -0,0 +1,131 @@
+#include "RiemannFitOnGPU.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+
+void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv,
+                                         uint32_t nhits,
+                                         uint32_t maxNumberOfTuples,
+                                         cudaStream_t stream) {
+  assert(tuples_);
+  // Queues on `stream` the fast/circle/line fit kernels for 3-, 4- and 5-hit tuples, batch by batch (nhits unused).
+  auto blockSize = 64;
+  auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize;
+
+  //  Fit internals: sized for 5 hits, the largest fit launched below (kernel_*<5> when fit5as4_ is false)
+  auto hitsGPU = cms::cuda::make_device_unique<double[]>(
+      maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<5>) / sizeof(double), stream);
+  auto hits_geGPU = cms::cuda::make_device_unique<float[]>(
+      maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6xNf<5>) / sizeof(float), stream);
+  auto fast_fit_resultsGPU = cms::cuda::make_device_unique<double[]>(
+      maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), stream);
+  auto circle_fit_resultsGPU_holder =
+      cms::cuda::make_device_unique<char[]>(maxNumberOfConcurrentFits_ * sizeof(riemannFit::CircleFit), stream);
+  riemannFit::CircleFit *circle_fit_resultsGPU_ = (riemannFit::CircleFit *)(circle_fit_resultsGPU_holder.get());
+
+  for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) {
+    // triplets (each multiplicity runs fast fit -> circle fit -> line fit on the same scratch buffers)
+    kernel_FastFit<3><<<numberOfBlocks, blockSize, 0, stream>>>(
+        tuples_, tupleMultiplicity_, 3, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset);
+    cudaCheck(cudaGetLastError());
+
+    kernel_CircleFit<3><<<numberOfBlocks, blockSize, 0, stream>>>(tupleMultiplicity_,
+                                                                  3,
+                                                                  bField_,
+                                                                  hitsGPU.get(),
+                                                                  hits_geGPU.get(),
+                                                                  fast_fit_resultsGPU.get(),
+                                                                  circle_fit_resultsGPU_,
+                                                                  offset);
+    cudaCheck(cudaGetLastError());
+
+    kernel_LineFit<3><<<numberOfBlocks, blockSize, 0, stream>>>(tupleMultiplicity_,
+                                                                3,
+                                                                bField_,
+                                                                outputSoa_,
+                                                                hitsGPU.get(),
+                                                                hits_geGPU.get(),
+                                                                fast_fit_resultsGPU.get(),
+                                                                circle_fit_resultsGPU_,
+                                                                offset);
+    cudaCheck(cudaGetLastError());
+
+    // quads (a quarter of the blocks: the kernels step through the fit slots by the grid size, so all are covered)
+    kernel_FastFit<4><<<numberOfBlocks / 4, blockSize, 0, stream>>>(
+        tuples_, tupleMultiplicity_, 4, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset);
+    cudaCheck(cudaGetLastError());
+
+    kernel_CircleFit<4><<<numberOfBlocks / 4, blockSize, 0, stream>>>(tupleMultiplicity_,
+                                                                      4,
+                                                                      bField_,
+                                                                      hitsGPU.get(),
+                                                                      hits_geGPU.get(),
+                                                                      fast_fit_resultsGPU.get(),
+                                                                      circle_fit_resultsGPU_,
+                                                                      offset);
+    cudaCheck(cudaGetLastError());
+
+    kernel_LineFit<4><<<numberOfBlocks / 4, blockSize, 0, stream>>>(tupleMultiplicity_,
+                                                                    4,
+                                                                    bField_,
+                                                                    outputSoa_,
+                                                                    hitsGPU.get(),
+                                                                    hits_geGPU.get(),
+                                                                    fast_fit_resultsGPU.get(),
+                                                                    circle_fit_resultsGPU_,
+                                                                    offset);
+    cudaCheck(cudaGetLastError());
+
+    if (fit5as4_) {
+      // penta (5-hit tuples fitted with the 4-hit kernels)
+      kernel_FastFit<4><<<numberOfBlocks / 4, blockSize, 0, stream>>>(
+          tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset);
+      cudaCheck(cudaGetLastError());
+
+      kernel_CircleFit<4><<<numberOfBlocks / 4, blockSize, 0, stream>>>(tupleMultiplicity_,
+                                                                        5,
+                                                                        bField_,
+                                                                        hitsGPU.get(),
+                                                                        hits_geGPU.get(),
+                                                                        fast_fit_resultsGPU.get(),
+                                                                        circle_fit_resultsGPU_,
+                                                                        offset);
+      cudaCheck(cudaGetLastError());
+
+      kernel_LineFit<4><<<numberOfBlocks / 4, blockSize, 0, stream>>>(tupleMultiplicity_,
+                                                                      5,
+                                                                      bField_,
+                                                                      outputSoa_,
+                                                                      hitsGPU.get(),
+                                                                      hits_geGPU.get(),
+                                                                      fast_fit_resultsGPU.get(),
+                                                                      circle_fit_resultsGPU_,
+                                                                      offset);
+      cudaCheck(cudaGetLastError());
+    } else {
+      // penta all 5 (needs the 5-column scratch buffers allocated above)
+      kernel_FastFit<5><<<numberOfBlocks / 4, blockSize, 0, stream>>>(
+          tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset);
+      cudaCheck(cudaGetLastError());
+
+      kernel_CircleFit<5><<<numberOfBlocks / 4, blockSize, 0, stream>>>(tupleMultiplicity_,
+                                                                        5,
+                                                                        bField_,
+                                                                        hitsGPU.get(),
+                                                                        hits_geGPU.get(),
+                                                                        fast_fit_resultsGPU.get(),
+                                                                        circle_fit_resultsGPU_,
+                                                                        offset);
+      cudaCheck(cudaGetLastError());
+
+      kernel_LineFit<5><<<numberOfBlocks / 4, blockSize, 0, stream>>>(tupleMultiplicity_,
+                                                                      5,
+                                                                      bField_,
+                                                                      outputSoa_,
+                                                                      hitsGPU.get(),
+                                                                      hits_geGPU.get(),
+                                                                      fast_fit_resultsGPU.get(),
+                                                                      circle_fit_resultsGPU_,
+                                                                      offset);
+      cudaCheck(cudaGetLastError());
+    }
+  }
+}
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h
new file mode 100644
index 0000000000000..5b661bc3be028
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h
@@ -0,0 +1,187 @@
+//
+// Author: Felice Pantaleo, CERN
+//
+
+#include <cstdint>
+
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
+#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h"
+
+#include "HelixFitOnGPU.h"
+
+using HitsOnGPU = TrackingRecHit2DSOAView;
+using Tuples = pixelTrack::HitContainer;
+using OutputSoA = pixelTrack::TrackSoA;
+
+template <int N>  // N: number of hits used in each fit
+__global__ void kernel_FastFit(Tuples const *__restrict__ foundNtuplets,
+                               caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity,
+                               uint32_t nHits,  // multiplicity bin to process (tuples made of exactly nHits hits)
+                               HitsOnGPU const *__restrict__ hhp,
+                               double *__restrict__ phits,      // out: hit positions, one strided 3xN map per fit
+                               float *__restrict__ phits_ge,    // out: hit errors, one strided 6xN map per fit
+                               double *__restrict__ pfast_fit,  // out: fastFit results, one strided 4-vector per fit
+                               uint32_t offset) {  // position in the bin of the first tuple of this batch
+  constexpr uint32_t hitsInFit = N;
+  // only the first N hits of a tuple enter the fit, so the bin must hold tuples of at least N hits
+  assert(hitsInFit <= nHits);
+
+  assert(pfast_fit);
+  assert(foundNtuplets);
+  assert(tupleMultiplicity);
+
+  // look in bin for this hit multiplicity
+  auto local_start = blockIdx.x * blockDim.x + threadIdx.x;
+
+#ifdef RIEMANN_DEBUG
+  if (0 == local_start)
+    printf("%d Ntuple of size %d for %d hits to fit\n", tupleMultiplicity->size(nHits), nHits, hitsInFit);
+#endif
+  // local_idx: slot in the scratch buffers; tuple_idx: position of the tuple inside the multiplicity bin
+  for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt;
+       local_idx += gridDim.x * blockDim.x) {
+    auto tuple_idx = local_idx + offset;
+    if (tuple_idx >= tupleMultiplicity->size(nHits))
+      break;
+
+    // get it from the ntuple container (one to one to helix)
+    auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx);
+    assert(tkid < foundNtuplets->nbins());
+
+    assert(foundNtuplets->size(tkid) == nHits);
+
+    riemannFit::Map3xNd<N> hits(phits + local_idx);
+    riemannFit::Map4d fast_fit(pfast_fit + local_idx);
+    riemannFit::Map6xNf<N> hits_ge(phits_ge + local_idx);
+
+    // Prepare data structure: global position and error (from the local x/y errors) of each hit used in the fit
+    auto const *hitId = foundNtuplets->begin(tkid);
+    for (unsigned int i = 0; i < hitsInFit; ++i) {
+      auto hit = hitId[i];
+      // printf("Hit global: %f,%f,%f\n", hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]);
+      float ge[6];
+      hhp->cpeParams()
+          .detParams(hhp->detectorIndex(hit))
+          .frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge);
+      // printf("Error: %d: %f,%f,%f,%f,%f,%f\n",hhp->detInd_d[hit],ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]);
+
+      hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit);
+      hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5];
+    }
+    riemannFit::fastFit(hits, fast_fit);
+
+    // no NaN here.... (x == x is false only for a NaN)
+    assert(fast_fit(0) == fast_fit(0));
+    assert(fast_fit(1) == fast_fit(1));
+    assert(fast_fit(2) == fast_fit(2));
+    assert(fast_fit(3) == fast_fit(3));
+  }
+}
+
+template <int N>  // N: number of hits used in each fit
+__global__ void kernel_CircleFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity,
+                                 uint32_t nHits,  // multiplicity bin being processed
+                                 double bField,
+                                 double *__restrict__ phits,            // in: hit positions, one strided map per fit
+                                 float *__restrict__ phits_ge,          // in: hit errors, one strided map per fit
+                                 double *__restrict__ pfast_fit_input,  // in: fast-fit results, one per fit
+                                 riemannFit::CircleFit *circle_fit,     // out: circle fits, indexed by local_idx
+                                 uint32_t offset) {  // position in the bin of the first tuple of this batch
+  assert(circle_fit);
+  assert(N <= nHits);
+
+  // same indexing as in kernel_FastFit: local_idx is the slot in the buffers, tuple_idx the entry in the bin
+
+  // look in bin for this hit multiplicity
+  auto local_start = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt;
+       local_idx += gridDim.x * blockDim.x) {
+    auto tuple_idx = local_idx + offset;
+    if (tuple_idx >= tupleMultiplicity->size(nHits))
+      break;
+
+    riemannFit::Map3xNd<N> hits(phits + local_idx);
+    riemannFit::Map4d fast_fit(pfast_fit_input + local_idx);
+    riemannFit::Map6xNf<N> hits_ge(phits_ge + local_idx);
+    // norm of the first two rows of every column, i.e. of the (x, y) part of each hit as filled by kernel_FastFit
+    riemannFit::VectorNd<N> rad = (hits.block(0, 0, 2, N).colwise().norm());
+
+    riemannFit::Matrix2Nd<N> hits_cov = riemannFit::Matrix2Nd<N>::Zero();
+    riemannFit::loadCovariance2D(hits_ge, hits_cov);
+
+    circle_fit[local_idx] = riemannFit::circleFit(hits.block(0, 0, 2, N), hits_cov, fast_fit, rad, bField, true);
+
+#ifdef RIEMANN_DEBUG
+//    auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx);
+//  printf("kernelCircleFit circle.par(0,1,2): %d %f,%f,%f\n", tkid,
+//         circle_fit[local_idx].par(0), circle_fit[local_idx].par(1), circle_fit[local_idx].par(2));
+#endif
+  }
+}
+
+template <int N>  // N: number of hits used in each fit
+__global__ void kernel_LineFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity,
+                               uint32_t nHits,  // multiplicity bin being processed
+                               double bField,
+                               OutputSoA *results,  // out: track SoA, filled at index tkid
+                               double *__restrict__ phits,            // in: hit positions, one strided map per fit
+                               float *__restrict__ phits_ge,          // in: hit errors, one strided map per fit
+                               double *__restrict__ pfast_fit_input,  // in: fast-fit results, one per fit
+                               riemannFit::CircleFit *__restrict__ circle_fit,  // in/out: circle fits, by local_idx
+                               uint32_t offset) {  // position in the bin of the first tuple of this batch
+  assert(results);
+  assert(circle_fit);
+  assert(N <= nHits);
+
+  // same indexing as in kernel_FastFit: local_idx is the slot in the buffers, tuple_idx the entry in the bin
+
+  // look in bin for this hit multiplicity
+  auto local_start = (blockIdx.x * blockDim.x + threadIdx.x);
+  for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt;
+       local_idx += gridDim.x * blockDim.x) {
+    auto tuple_idx = local_idx + offset;
+    if (tuple_idx >= tupleMultiplicity->size(nHits))
+      break;
+
+    // get it for the ntuple container (one to one to helix)
+    auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx);
+
+    riemannFit::Map3xNd<N> hits(phits + local_idx);
+    riemannFit::Map4d fast_fit(pfast_fit_input + local_idx);
+    riemannFit::Map6xNf<N> hits_ge(phits_ge + local_idx);
+
+    auto const &line_fit = riemannFit::lineFit(hits, hits_ge, circle_fit[local_idx], fast_fit, bField, true);
+    // presumably converts the stored circle fit in place to perigee parameters (see RiemannFit.h) - TODO confirm
+    riemannFit::fromCircleToPerigee(circle_fit[local_idx]);
+    // store the fitted state and the derived pt, eta and chi2 of track tkid in the output SoA
+    results->stateAtBS.copyFromCircle(
+        circle_fit[local_idx].par, circle_fit[local_idx].cov, line_fit.par, line_fit.cov, 1.f / float(bField), tkid);
+    results->pt(tkid) = bField / std::abs(circle_fit[local_idx].par(2));
+    results->eta(tkid) = asinhf(line_fit.par(0));
+    results->chi2(tkid) = (circle_fit[local_idx].chi2 + line_fit.chi2) / (2 * N - 5);  // presumably chi2 / ndof
+
+#ifdef RIEMANN_DEBUG
+    printf("kernelLineFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n",
+           N,
+           nHits,
+           tkid,
+           circle_fit[local_idx].par(0),
+           circle_fit[local_idx].par(1),
+           circle_fit[local_idx].par(2));
+    printf("kernelLineFit line.par(0,1): %d %f,%f\n", tkid, line_fit.par(0), line_fit.par(1));
+    printf("kernelLineFit chi2 cov %f/%f %e,%e,%e,%e,%e\n",
+           circle_fit[local_idx].chi2,
+           line_fit.chi2,
+           circle_fit[local_idx].cov(0, 0),
+           circle_fit[local_idx].cov(1, 1),
+           circle_fit[local_idx].cov(2, 2),
+           line_fit.cov(0, 0),
+           line_fit.cov(1, 1));
+#endif
+  }
+}
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h
new file mode 100644
index 0000000000000..09cd5c18e65ae
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h
@@ -0,0 +1,91 @@
+#ifndef RecoPixelVertexing_PixelTriplets_plugins_gpuFishbone_h
+#define RecoPixelVertexing_PixelTriplets_plugins_gpuFishbone_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <limits>
+
+#include "DataFormats/Math/interface/approx_atan2.h"
+#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+#include "GPUCACell.h"
+
+namespace gpuPixelDoublets {
+  // For each hit, takes the cells listed in isOuterHitOfCell for it and kills the farther one of any two aligned cells.
+  //  __device__
+  //  __forceinline__
+  __global__ void fishbone(GPUCACell::Hits const* __restrict__ hhp,
+                           GPUCACell* cells,
+                           uint32_t const* __restrict__ nCells,  // not used in this kernel
+                           GPUCACell::OuterHitOfCell const* __restrict__ isOuterHitOfCell,
+                           uint32_t nHits,
+                           bool checkTrack) {  // if set, cells whose tracks() is empty are skipped
+    constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit;
+
+    auto const& hh = *hhp;
+
+    // x run faster... (y selects the hit, threads along x share it and split the cell pairs)
+    auto firstY = threadIdx.y + blockIdx.y * blockDim.y;
+    auto firstX = threadIdx.x;
+    // per-thread scratch, refilled for every hit with the cells that pass the selection below
+    float x[maxCellsPerHit], y[maxCellsPerHit], z[maxCellsPerHit], n[maxCellsPerHit];  // inner - outer, and its norm^2
+    uint16_t d[maxCellsPerHit];  // inner detector index;  uint8_t l[maxCellsPerHit];
+    uint32_t cc[maxCellsPerHit];  // index of the cell in cells[]
+
+    for (int idy = firstY, nt = nHits; idy < nt; idy += gridDim.y * blockDim.y) {
+      auto const& vc = isOuterHitOfCell[idy];
+      auto size = vc.size();
+      if (size < 2)
+        continue;
+      // if aligned kill one of the two.
+      // in principle one could try to relax the cut (only in r-z?) for jumping-doublets
+      auto const& c0 = cells[vc[0]];
+      auto xo = c0.outer_x(hh);
+      auto yo = c0.outer_y(hh);
+      auto zo = c0.outer_z(hh);
+      auto sg = 0;  // number of selected cells
+      for (int32_t ic = 0; ic < size; ++ic) {
+        auto& ci = cells[vc[ic]];
+        if (ci.unused())
+          continue;  // for triplets equivalent to next
+        if (checkTrack && ci.tracks().empty())
+          continue;
+        cc[sg] = vc[ic];
+        d[sg] = ci.inner_detIndex(hh);
+        x[sg] = ci.inner_x(hh) - xo;
+        y[sg] = ci.inner_y(hh) - yo;
+        z[sg] = ci.inner_z(hh) - zo;
+        n[sg] = x[sg] * x[sg] + y[sg] * y[sg] + z[sg] * z[sg];
+        ++sg;
+      }
+      if (sg < 2)
+        continue;
+      // here we parallelize: each thread along x takes a different ic and compares it with every jc > ic
+      for (int32_t ic = firstX; ic < sg - 1; ic += blockDim.x) {
+        auto& ci = cells[cc[ic]];
+        for (auto jc = ic + 1; jc < sg; ++jc) {
+          auto& cj = cells[cc[jc]];
+          // must be different detectors (in the same layer)
+          //        if (d[ic]==d[jc]) continue;
+          // || l[ic]!=l[jc]) continue;
+          auto cos12 = x[ic] * x[jc] + y[ic] * y[jc] + z[ic] * z[jc];  // dot product, not normalised
+          if (d[ic] != d[jc] && cos12 * cos12 >= 0.99999f * n[ic] * n[jc]) {
+            // aligned:  kill farthest  (prefer consecutive layers)
+            if (n[ic] > n[jc]) {
+              ci.kill();
+              break;
+            } else {
+              cj.kill();
+            }
+          }
+        }  //cj
+      }    // ci
+    }      // hits
+  }
+}  // namespace gpuPixelDoublets
+
+#endif  // RecoPixelVertexing_PixelTriplets_plugins_gpuFishbone_h
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
new file mode 100644
index 0000000000000..6de3f1a51acaa
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
@@ -0,0 +1,130 @@
+#ifndef RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoublets_h
+#define RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoublets_h
+
+#include "RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h"
+
+#define CONSTANT_VAR __constant__
+
+namespace gpuPixelDoublets {
+  // Layer-pair tables and cuts passed to doubletsFromHisto, plus the kernels that initialise and run it.
+  constexpr int nPairs = 13 + 2 + 4;  // 13 + 2 jumping barrel + 4 jumping forward (see layerPairs)
+  static_assert(nPairs <= caConstants::maxNumberOfLayerPairs);
+
+  // start constants
+  // clang-format off
+
+  CONSTANT_VAR const uint8_t layerPairs[2 * nPairs] = {  // two layer indices per pair; (n) = running pair count
+      0, 1, 0, 4, 0, 7,              // BPIX1 (3)
+      1, 2, 1, 4, 1, 7,              // BPIX2 (5)
+      4, 5, 7, 8,                    // FPIX1 (8)
+      2, 3, 2, 4, 2, 7, 5, 6, 8, 9,  // BPIX3 & FPIX2 (13)
+      0, 2, 1, 3,                    // Jumping Barrel (15)
+      0, 5, 0, 8,                    // Jumping Forward (BPIX1,FPIX2)
+      4, 6, 7, 9                     // Jumping Forward (19)
+  };
+
+  constexpr int16_t phi0p05 = 522;  // round(521.52189...) = phi2short(0.05);
+  constexpr int16_t phi0p06 = 626;  // round(625.82270...) = phi2short(0.06);
+  constexpr int16_t phi0p07 = 730;  // round(730.12648...) = phi2short(0.07);
+
+  CONSTANT_VAR const int16_t phicuts[nPairs]{phi0p05,  // one phi cut per layer pair, in phi2short units
+                                             phi0p07,
+                                             phi0p07,
+                                             phi0p05,
+                                             phi0p06,
+                                             phi0p06,
+                                             phi0p05,
+                                             phi0p05,
+                                             phi0p06,
+                                             phi0p06,
+                                             phi0p06,
+                                             phi0p05,
+                                             phi0p05,
+                                             phi0p05,
+                                             phi0p05,
+                                             phi0p05,
+                                             phi0p05,
+                                             phi0p05,
+                                             phi0p05};
+  //   phi0p07, phi0p07, phi0p06,phi0p06, phi0p06,phi0p06};  // relaxed cuts
+  // one entry per layer pair; units are not visible here (presumably cm) - TODO confirm
+  CONSTANT_VAR float const minz[nPairs] = {
+      -20., 0., -30., -22., 10., -30., -70., -70., -22., 15., -30, -70., -70., -20., -22., 0, -30., -70., -70.};
+  CONSTANT_VAR float const maxz[nPairs] = {
+      20., 30., 0., 22., 30., -10., 70., 70., 22., 30., -15., 70., 70., 20., 22., 30., 0., 70., 70.};
+  CONSTANT_VAR float const maxr[nPairs] = {
+      20., 9., 9., 20., 7., 7., 5., 5., 20., 6., 6., 5., 5., 20., 20., 9., 9., 9., 9.};
+
+  // end constants
+  // clang-format on
+
+  using CellNeighbors = caConstants::CellNeighbors;
+  using CellTracks = caConstants::CellTracks;
+  using CellNeighborsVector = caConstants::CellNeighborsVector;
+  using CellTracksVector = caConstants::CellTracksVector;
+  // Resets isOuterHitOfCell for all hits; thread 0 also sets up cellNeighbors/cellTracks and takes their entry 0.
+  __global__ void initDoublets(GPUCACell::OuterHitOfCell* isOuterHitOfCell,
+                               int nHits,
+                               CellNeighborsVector* cellNeighbors,
+                               CellNeighbors* cellNeighborsContainer,
+                               CellTracksVector* cellTracks,
+                               CellTracks* cellTracksContainer) {
+    assert(isOuterHitOfCell);
+    int first = blockIdx.x * blockDim.x + threadIdx.x;
+    for (int i = first; i < nHits; i += gridDim.x * blockDim.x)
+      isOuterHitOfCell[i].reset();
+
+    if (0 == first) {  // done once per launch, by the first thread of the grid
+      cellNeighbors->construct(caConstants::maxNumOfActiveDoublets, cellNeighborsContainer);
+      cellTracks->construct(caConstants::maxNumOfActiveDoublets, cellTracksContainer);
+      auto i = cellNeighbors->extend();
+      assert(0 == i);
+      (*cellNeighbors)[0].reset();
+      i = cellTracks->extend();
+      assert(0 == i);
+      (*cellTracks)[0].reset();
+    }
+  }
+
+  constexpr auto getDoubletsFromHistoMaxBlockSize = 64;  // for both x and y
+  constexpr auto getDoubletsFromHistoMinBlocksPerMP = 16;
+  // Kernel wrapper: forwards its arguments, together with the constant tables above, to doubletsFromHisto.
+  __global__
+#ifdef __CUDACC__
+  __launch_bounds__(getDoubletsFromHistoMaxBlockSize, getDoubletsFromHistoMinBlocksPerMP)
+#endif
+      void getDoubletsFromHisto(GPUCACell* cells,
+                                uint32_t* nCells,
+                                CellNeighborsVector* cellNeighbors,
+                                CellTracksVector* cellTracks,
+                                TrackingRecHit2DSOAView const* __restrict__ hhp,
+                                GPUCACell::OuterHitOfCell* isOuterHitOfCell,
+                                int nActualPairs,  // passed as the nPairs argument of doubletsFromHisto
+                                bool ideal_cond,
+                                bool doClusterCut,
+                                bool doZ0Cut,
+                                bool doPtCut,
+                                uint32_t maxNumOfDoublets) {
+    auto const& __restrict__ hh = *hhp;
+    doubletsFromHisto(layerPairs,
+                      nActualPairs,
+                      cells,
+                      nCells,
+                      cellNeighbors,
+                      cellTracks,
+                      hh,
+                      isOuterHitOfCell,
+                      phicuts,
+                      minz,
+                      maxz,
+                      maxr,
+                      ideal_cond,
+                      doClusterCut,
+                      doZ0Cut,
+                      doPtCut,
+                      maxNumOfDoublets);
+  }
+
+}  // namespace gpuPixelDoublets
+
+#endif  // RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoublets_h
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h
new file mode 100644
index 0000000000000..a12dee0785b36
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h
@@ -0,0 +1,243 @@
+#ifndef RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoubletsAlgos_h
+#define RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoubletsAlgos_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <limits>
+
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
+#include "DataFormats/Math/interface/approx_atan2.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+#include "CAConstants.h"
+#include "GPUCACell.h"
+
+namespace gpuPixelDoublets {
+
+  using CellNeighbors = caConstants::CellNeighbors;
+  using CellTracks = caConstants::CellTracks;
+  using CellNeighborsVector = caConstants::CellNeighborsVector;
+  using CellTracksVector = caConstants::CellTracksVector;
+
+  __device__ __forceinline__ void doubletsFromHisto(uint8_t const* __restrict__ layerPairs,  // two entries per pair: inner layer, outer layer
+                                                    uint32_t nPairs,  // number of layer pairs to process
+                                                    GPUCACell* cells,  // output: one entry initialised per accepted doublet
+                                                    uint32_t* nCells,  // running count of created cells, updated atomically
+                                                    CellNeighborsVector* cellNeighbors,  // forwarded to GPUCACell::init
+                                                    CellTracksVector* cellTracks,  // forwarded to GPUCACell::init
+                                                    TrackingRecHit2DSOAView const& __restrict__ hh,  // hit data, only read here
+                                                    GPUCACell::OuterHitOfCell* isOuterHitOfCell,  // indexed by outer hit: receives the new cell index
+                                                    int16_t const* __restrict__ phicuts,  // per pair: half-width of the phi window
+                                                    float const* __restrict__ minz,  // per pair: lower bound on the inner hit z
+                                                    float const* __restrict__ maxz,  // per pair: upper bound on the inner hit z
+                                                    float const* __restrict__ maxr,  // per pair: largest accepted (outer r - inner r)
+                                                    bool ideal_cond,  // if true every inner hit is treated as being on an outer ladder
+                                                    bool doClusterCut,  // enables the cluster-size (ysize) cuts
+                                                    bool doZ0Cut,  // enables z0cutoff
+                                                    bool doPtCut,  // enables ptcut
+                                                    uint32_t maxNumOfDoublets) {  // no cell is stored at an index >= this value
+    // Pairs each inner-layer hit of every layer pair with outer-layer hits from nearby phi bins and fills cells[] with the doublets passing the enabled cuts.
+    // The ysize cuts below (z in the barrel) are expressed times 8; they are used only if doClusterCut is true.
+    constexpr int minYsizeB1 = 36;
+    constexpr int minYsizeB2 = 28;
+    constexpr int maxDYsize12 = 28;
+    constexpr int maxDYsize = 20;
+    constexpr int maxDYPred = 20;
+    constexpr float dzdrFact = 8 * 0.0285 / 0.015;  // from dz/dr to "DY"
+
+    bool isOuterLadder = ideal_cond;
+
+    using PhiBinner = TrackingRecHit2DSOAView::PhiBinner;
+
+    auto const& __restrict__ phiBinner = hh.phiBinner();
+    uint32_t const* __restrict__ offsets = hh.hitsLayerStart();
+    assert(offsets);
+
+    auto layerSize = [=](uint8_t li) { return offsets[li + 1] - offsets[li]; };  // number of hits on layer li
+
+    // nPairsMax to be optimized later (originally was 64).
+    // If it should be much bigger, consider using a block-wide parallel prefix scan,
+    // e.g. see  https://nvlabs.github.io/cub/classcub_1_1_warp_scan.html
+    const int nPairsMax = caConstants::maxNumberOfLayerPairs;
+    assert(nPairs <= nPairsMax);
+    __shared__ uint32_t innerLayerCumulativeSize[nPairsMax];  // inclusive prefix sum of the inner-layer sizes, one entry per pair
+    __shared__ uint32_t ntot;  // total number of inner hits summed over all pairs
+    if (threadIdx.y == 0 && threadIdx.x == 0) {  // a single thread per block fills the shared values
+      innerLayerCumulativeSize[0] = layerSize(layerPairs[0]);
+      for (uint32_t i = 1; i < nPairs; ++i) {
+        innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i - 1] + layerSize(layerPairs[2 * i]);
+      }
+      ntot = innerLayerCumulativeSize[nPairs - 1];  // NOTE(review): assumes nPairs > 0; only the upper bound is asserted above
+    }
+    __syncthreads();  // the shared values are read by every thread of the block below
+
+    // x runs faster
+    auto idy = blockIdx.y * blockDim.y + threadIdx.y;  // y selects the inner hit
+    auto first = threadIdx.x;  // x strides over the outer hits of a phi bin
+    auto stride = blockDim.x;
+
+    uint32_t pairLayerId = 0;  // cannot go backward
+    for (auto j = idy; j < ntot; j += blockDim.y * gridDim.y) {
+      while (j >= innerLayerCumulativeSize[pairLayerId++])  // advance to the first pair whose cumulative size exceeds j
+        ;
+      --pairLayerId;  // move to lower_bound ??  (undoes the last post-increment of the search above)
+
+      assert(pairLayerId < nPairs);
+      assert(j < innerLayerCumulativeSize[pairLayerId]);
+      assert(0 == pairLayerId || j >= innerLayerCumulativeSize[pairLayerId - 1]);
+
+      uint8_t inner = layerPairs[2 * pairLayerId];
+      uint8_t outer = layerPairs[2 * pairLayerId + 1];
+      assert(outer > inner);
+
+      auto hoff = PhiBinner::histOff(outer);  // offset of the outer layer's bins inside phiBinner
+
+      auto i = (0 == pairLayerId) ? j : j - innerLayerCumulativeSize[pairLayerId - 1];  // index of the hit within its inner layer
+      i += offsets[inner];  // ... turned into a global hit index
+
+      // printf("Hit in Layer %d %d %d %d\n", i, inner, pairLayerId, j);
+
+      assert(i >= offsets[inner]);
+      assert(i < offsets[inner + 1]);
+
+      // found hit corresponding to our cuda thread, now do the job
+      auto mi = hh.detectorIndex(i);
+      if (mi > gpuClustering::maxNumModules)
+        continue;  // invalid
+
+      /* maybe clever, not effective when zoCut is on
+      auto bpos = (mi%8)/4;  // if barrel is 1 for z>0
+      auto fpos = (outer>3) & (outer<7);
+      if ( ((inner<3) & (outer>3)) && bpos!=fpos) continue;
+      */
+
+      auto mez = hh.zGlobal(i);
+
+      if (mez < minz[pairLayerId] || mez > maxz[pairLayerId])  // inner hit outside the z window of this pair
+        continue;
+
+      int16_t mes = -1;  // make compiler happy
+      if (doClusterCut) {
+        // if ideal treat inner ladder as outer
+        if (inner == 0)
+          assert(mi < 96);
+        isOuterLadder = ideal_cond ? true : 0 == (mi / 8) % 2;  // only for B1/B2/B3 B4 is opposite, FPIX:noclue...
+
+        // in any case we always test mes>0 ...
+        mes = inner > 0 || isOuterLadder ? hh.clusterSizeY(i) : -1;
+
+        if (inner == 0 && outer > 3)  // B1 and F1
+          if (mes > 0 && mes < minYsizeB1)
+            continue;                 // only long cluster  (5*8)
+        if (inner == 1 && outer > 3)  // B2 and F1
+          if (mes > 0 && mes < minYsizeB2)
+            continue;
+      }
+      auto mep = hh.iphi(i);
+      auto mer = hh.rGlobal(i);
+
+      // all cuts: true if fails
+      constexpr float z0cut = 12.f;      // cm
+      constexpr float hardPtCut = 0.5f;  // GeV
+      // cm (1 GeV track has 1 GeV/c / (e * 3.8T) ~ 87 cm radius in a 3.8T field)
+      constexpr float minRadius = hardPtCut * 87.78f;
+      constexpr float minRadius2T4 = 4.f * minRadius * minRadius;
+      auto ptcut = [&](int j, int16_t idphi) {  // j: outer hit index, idphi: |delta phi| in iphi units
+        auto r2t4 = minRadius2T4;
+        auto ri = mer;
+        auto ro = hh.rGlobal(j);
+        auto dphi = short2phi(idphi);
+        return dphi * dphi * (r2t4 - ri * ro) > (ro - ri) * (ro - ri);
+      };
+      auto z0cutoff = [&](int j) {  // j: outer hit index
+        auto zo = hh.zGlobal(j);
+        auto ro = hh.rGlobal(j);
+        auto dr = ro - mer;
+        return dr > maxr[pairLayerId] || dr < 0 || std::abs((mez * ro - mer * zo)) > z0cut * dr;
+      };
+
+      auto zsizeCut = [&](int j) {  // j: outer hit index; uses mes, so only meaningful when doClusterCut is set
+        auto onlyBarrel = outer < 4;
+        auto so = hh.clusterSizeY(j);
+        auto dy = inner == 0 ? maxDYsize12 : maxDYsize;
+        // in the barrel cut on difference in size
+        // in the endcap on the prediction on the first layer (actually in the barrel only: happen to be safe for endcap as well)
+        // FIXME move pred cut to z0cutoff to optimize loading of and computation ...
+        auto zo = hh.zGlobal(j);
+        auto ro = hh.rGlobal(j);
+        return onlyBarrel ? mes > 0 && so > 0 && std::abs(so - mes) > dy
+                          : (inner < 4) && mes > 0 &&
+                                std::abs(mes - int(std::abs((mez - zo) / (mer - ro)) * dzdrFact + 0.5f)) > maxDYPred;
+      };
+
+      auto iphicut = phicuts[pairLayerId];
+
+      auto kl = PhiBinner::bin(int16_t(mep - iphicut));  // first phi bin to scan
+      auto kh = PhiBinner::bin(int16_t(mep + iphicut));  // last phi bin to scan (inclusive)
+      auto incr = [](auto& k) { return k = (k + 1) % PhiBinner::nbins(); };  // next bin, wrapping around
+
+#ifdef GPU_DEBUG
+      int tot = 0;
+      int nmin = 0;
+      int tooMany = 0;
+#endif
+
+      auto khh = kh;
+      incr(khh);  // one past the last bin, used as the loop end marker
+      for (auto kk = kl; kk != khh; incr(kk)) {
+#ifdef GPU_DEBUG
+        if (kk != kl && kk != kh)
+          nmin += phiBinner.size(kk + hoff);
+#endif
+        auto const* __restrict__ p = phiBinner.begin(kk + hoff);
+        auto const* __restrict__ e = phiBinner.end(kk + hoff);
+        p += first;
+        for (; p < e; p += stride) {  // threads along x share the outer hits of this bin
+          auto oi = __ldg(p);  // outer hit index
+          assert(oi >= offsets[outer]);
+          assert(oi < offsets[outer + 1]);
+          auto mo = hh.detectorIndex(oi);
+          if (mo > gpuClustering::maxNumModules)
+            continue;  //    invalid
+
+          if (doZ0Cut && z0cutoff(oi))
+            continue;
+
+          auto mop = hh.iphi(oi);
+          uint16_t idphi = std::min(std::abs(int16_t(mop - mep)), std::abs(int16_t(mep - mop)));
+          if (idphi > iphicut)
+            continue;
+
+          if (doClusterCut && zsizeCut(oi))
+            continue;
+          if (doPtCut && ptcut(oi, idphi))
+            continue;
+
+          auto ind = atomicAdd(nCells, 1);  // reserve a slot for the new cell
+          if (ind >= maxNumOfDoublets) {
+            atomicSub(nCells, 1);  // give the slot back: cells[] is full
+            break;  // leaves only this per-bin loop; the loop over phi bins goes on
+          }  // move to SimpleVector??
+          // int layerPairId, int doubletId, int innerHitId, int outerHitId)
+          cells[ind].init(*cellNeighbors, *cellTracks, hh, pairLayerId, ind, i, oi);
+          isOuterHitOfCell[oi].push_back(ind);
+#ifdef GPU_DEBUG
+          if (isOuterHitOfCell[oi].full())
+            ++tooMany;
+          ++tot;
+#endif
+        }
+      }
+#ifdef GPU_DEBUG
+      if (tooMany > 0)
+        printf("OuterHitOfCell full for %d in layer %d/%d, %d,%d %d\n", i, inner, outer, nmin, tot, tooMany);
+#endif
+    }  // loop in block...
+  }
+
+}  // namespace gpuPixelDoublets
+
+#endif  // RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoubletsAlgos_h
diff --git a/RecoPixelVertexing/PixelTriplets/python/caHitQuadrupletEDProducer_cfi.py b/RecoPixelVertexing/PixelTriplets/python/caHitQuadrupletEDProducer_cfi.py
new file mode 100644
index 0000000000000..c72c07ae5a721
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/python/caHitQuadrupletEDProducer_cfi.py
@@ -0,0 +1,4 @@
+import FWCore.ParameterSet.Config as cms
+from RecoPixelVertexing.PixelTriplets.caHitQuadrupletDefaultEDProducer_cfi import caHitQuadrupletDefaultEDProducer as _caHitQuadrupletDefaultEDProducer
+
+caHitQuadrupletEDProducer = _caHitQuadrupletDefaultEDProducer.clone()
diff --git a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml
index 6d6f1553b32f3..d480d7408b9e2 100644
--- a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml
+++ b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml
@@ -16,3 +16,14 @@
 <bin file="PixelTriplets_InvPrbl_prec.cpp">
   <use name="RecoPixelVertexing/PixelTriplets"/>
 </bin>
+
+<bin file="fastDPHI_t.cpp">
+</bin>
+
+<bin file="CircleEq_t.cpp">
+</bin>
+
+<bin file="CAsizes_t.cpp">
+  <use name="cuda"/>
+  <use name="eigen"/>
+</bin>
diff --git a/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp b/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp
new file mode 100644
index 0000000000000..5cf2e6526b860
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp
@@ -0,0 +1,25 @@
+#include "RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h"
+
+#include <typeinfo>
+#include <iostream>
+
+template <typename T>
+void print() {  // writes "size of <typeid name> <sizeof(T)>" to std::cout
+  std::cout << "size of " << typeid(T).name() << ' ' << sizeof(T) << std::endl;
+}
+
+int main() {  // prints the sizes of the CA data structures; nothing is checked
+  using namespace caConstants;
+
+  print<GPUCACell>();
+  print<CellNeighbors>();
+  print<CellTracks>();
+  print<OuterHitOfCell>();
+  print<TuplesContainer>();
+  print<HitToTuple>();
+  print<TupleMultiplicity>();
+
+  print<CellNeighborsVector>();
+
+  return 0;
+}
diff --git a/RecoPixelVertexing/PixelTriplets/test/CircleEq_t.cpp b/RecoPixelVertexing/PixelTriplets/test/CircleEq_t.cpp
new file mode 100644
index 0000000000000..504f9c144b284
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/test/CircleEq_t.cpp
@@ -0,0 +1,77 @@
+#include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h"
+#include <cassert>
+
+struct OriCircle {  // circle through three points, stored as centre + radius; used in main() as the reference for CircleEq
+  using T = float;
+
+  float radius = 0;
+  float x_center = 0;
+  float y_center = 0;
+
+  constexpr OriCircle(T x1, T y1, T x2, T y2, T x3, T y3) { compute(x1, y1, x2, y2, x3, y3); }
+
+  // dca to origin
+  constexpr T dca0() const { return std::sqrt(x_center * x_center + y_center * y_center) - radius; }
+
+  // dca to given point
+  constexpr T dca(T x, T y) const {
+    x -= x_center;
+    y -= y_center;
+    return std::sqrt(x * x + y * y) - radius;
+  }
+
+  constexpr void compute(T x1, T y1, T x2, T y2, T x3, T y3) {  // fills x_center, y_center and radius from the three points
+    auto det = (x1 - x2) * (y2 - y3) - (x2 - x3) * (y1 - y2);
+
+    auto offset = x2 * x2 + y2 * y2;
+
+    auto bc = (x1 * x1 + y1 * y1 - offset) * 0.5f;
+
+    auto cd = (offset - x3 * x3 - y3 * y3) * 0.5f;
+
+    auto idet = 1.f / det;  // NOTE(review): det == 0 is not guarded against
+
+    x_center = (bc * (y2 - y3) - cd * (y1 - y2)) * idet;
+    y_center = (cd * (x1 - x2) - bc * (x2 - x3)) * idet;
+
+    radius = std::sqrt((x2 - x_center) * (x2 - x_center) + (y2 - y_center) * (y2 - y_center));  // distance from the centre to point 2
+  }
+};
+
+#include <iostream>
+
+template <typename T>
+bool equal(T a, T b) {
+  auto const tolerance = std::abs(0.01f * a);  // 1% of |a|; the comparison is strict, so equal(0, 0) is false
+  return std::abs(float(a - b)) < tolerance;
+}
+
+int main() {  // compares CircleEq against OriCircle on a scan of three-point configurations
+  float r1 = 4, r2 = 8, r3 = 15;  // radii of the three points
+  for (float phi = -3; phi < 3.1; phi += 0.5) {  // points 1 and 2 share the same azimuth phi
+    float x1 = r1 * cos(phi);
+    float x2 = r2 * cos(phi);
+    float y1 = r1 * sin(phi);
+    float y2 = r2 * sin(phi);
+    for (float phi3 = phi - 0.31; phi3 < phi + 0.31; phi3 += 0.05) {  // point 3 is scanned around phi
+      float x3 = r3 * cos(phi3);
+      float y3 = r3 * sin(phi3);
+
+      OriCircle ori(x1, y1, x2, y2, x3, y3);
+      CircleEq<float> eq(x1, y1, x2, y2, x3, y3);
+      // std::cout << "r " << ori.radius <<' '<< eq.radius() << std::endl;
+      assert(equal(ori.radius, std::abs(eq.radius())));
+      auto c = eq.center();
+      auto dir = eq.cosdir();
+      assert(equal(1.f, dir.first * dir.first + dir.second * dir.second));  // cosdir() is expected to have unit norm
+      assert(equal(ori.x_center, c.first));
+      assert(equal(ori.y_center, c.second));
+      // std::cout << "dca " << ori.dca0() <<' '<< eq.radius()*eq.dca0() << std::endl;
+      assert(equal(std::abs(ori.dca0()), std::abs(eq.radius() * eq.dca0())));  // CircleEq's dca is multiplied by the radius before comparing
+      // std::cout << "dca " << ori.dca(1.,1.) <<' '<< eq.radius()*eq.dca(1.,1.) << std::endl;
+      assert(equal(std::abs(ori.dca(1., 1.)), std::abs(eq.radius() * eq.dca(1., 1.))));
+    }
+  }
+
+  return 0;
+}
diff --git a/RecoPixelVertexing/PixelTriplets/test/fastDPHI_t.cpp b/RecoPixelVertexing/PixelTriplets/test/fastDPHI_t.cpp
new file mode 100644
index 0000000000000..8538970a196ff
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/test/fastDPHI_t.cpp
@@ -0,0 +1,165 @@
+// this test documents the derivation of the fast deltaphi used in gpu doublet code..
+//
+//
+//
+#include <cmath>
+#include <algorithm>
+#include <numeric>
+#include <cassert>
+
+/**
+| 1) circle is parameterized as:                                              |
+|    C*[(X-Xp)**2+(Y-Yp)**2] - 2*alpha*(X-Xp) - 2*beta*(Y-Yp) = 0             |
+|    Xp,Yp is a point on the track (Yp is at the center of the chamber);      |
+|    C = 1/r0 is the curvature  ( sign of C is charge of particle );          |
+|    alpha & beta are the direction cosines of the radial vector at Xp,Yp     |
+|    i.e.  alpha = C*(X0-Xp),                                                 |
+|          beta  = C*(Y0-Yp),                                                 |
+|    where center of circle is at X0,Y0.                                      |
+|    Alpha > 0                                                                |
+|    Slope dy/dx of tangent at Xp,Yp is -alpha/beta.                          |
+| 2) the z dimension of the helix is parameterized by gamma = dZ/dSperp       |
+|    this is also the tangent of the pitch angle of the helix.                |
+|    with this parameterization, (alpha,beta,gamma) rotate like a vector.     |
+| 3) For tracks going inward at (Xp,Yp), C, alpha, beta, and gamma change sign|
+|
+*/
+
+template <typename T>
+class FastCircle {  // holds the circle parameters (xp, yp, c, alpha, beta) described in the comment block above
+public:
+  FastCircle() {}  // leaves all members uninitialized
+  FastCircle(T x1, T y1, T x2, T y2, T x3, T y3) { compute(x1, y1, x2, y2, x3, y3); }
+
+  void compute(T x1, T y1, T x2, T y2, T x3, T y3);  // fills all five members from three points
+
+  T m_xp;  // set to x1 by compute()
+  T m_yp;  // set to y1 by compute()
+  T m_c;  // curvature term; compared with fastDPHI() in go()
+  T m_alpha;
+  T m_beta;
+};
+
+template <typename T>
+void FastCircle<T>::compute(T x1, T y1, T x2, T y2, T x3, T y3) {  // computes m_xp, m_yp, m_c, m_alpha and m_beta from three points
+  bool flip = std::abs(x3 - x1) > std::abs(y3 - y1);  // true when the 1->3 segment is longer along x than along y
+
+  auto x1p = x1 - x2;  // coordinates relative to point 2
+  auto y1p = y1 - y2;
+  auto d12 = x1p * x1p + y1p * y1p;  // squared distance between points 1 and 2
+  auto x3p = x3 - x2;
+  auto y3p = y3 - y2;
+  auto d32 = x3p * x3p + y3p * y3p;  // squared distance between points 3 and 2
+
+  if (flip) {  // exchange the roles of x and y
+    std::swap(x1p, y1p);
+    std::swap(x3p, y3p);
+  }
+
+  auto num = x1p * y3p - y1p * x3p;  // num also gives correct sign for CT
+  auto det = d12 * y3p - d32 * y1p;
+  if (std::abs(det) == 0) {
+    // and why we flip????  NOTE(review): nothing is done here, so det == 0 still reaches the divisions below
+  }
+  auto ct = num / det;
+  auto sn = det > 0 ? T(1.) : T(-1.);
+  auto st2 = (d12 * x3p - d32 * x1p) / det;
+  auto seq = T(1.) + st2 * st2;
+  auto al2 = sn / std::sqrt(seq);
+  auto be2 = -st2 * al2;
+  ct *= T(2.) * al2;
+
+  if (flip) {  // undo the x/y exchange made above
+    std::swap(x1p, y1p);
+    std::swap(al2, be2);
+    al2 = -al2;
+    be2 = -be2;
+    ct = -ct;
+  }
+
+  m_xp = x1;
+  m_yp = y1;
+  m_c = ct;
+  m_alpha = al2 - ct * x1p;
+  m_beta = be2 - ct * y1p;
+}
+
+// compute curvature given two points (and origin)
+float fastDPHI(float ri, float ro, float dphi) {  // returns -2 / sqrt((ro-ri)^2 / dphi^2 + ri*ro); compared with FastCircle::m_c in go()
+  /*
+  x3=0 y1=0 x1=0;
+  y3=ro
+  */
+
+  // auto x2 = ri*dphi;
+  // auto y2 = ri*(1.f-0.5f*dphi*dphi);
+
+  /*
+  auto x1p = x1-x2;
+  auto y1p = y1-y2;
+  auto d12 = x1p*x1p + y1p*y1p;
+  auto x3p = x3-x2;
+  auto y3p = y3-y2;
+  auto d32 = x3p*x3p + y3p*y3p;
+  */
+
+  /*
+  auto x1p = -x2;
+  auto y1p = -y2;
+  auto d12 = ri*ri;
+  auto x3p = -x2;
+  auto y3p = ro-y2;
+  auto d32 = ri*ri + ro*ro - 2.f*ro*y2;
+  */
+
+  // auto rat = (ro -2.f*y2);
+  // auto det =  ro - ri - (ro - 2.f*ri -0.5f*ro)*dphi*dphi;
+
+  //auto det2 = (ro-ri)*(ro-ri) -2.*(ro-ri)*(ro - 2.f*ri -0.5f*ro)*dphi*dphi;
+  // auto seq = det2 +  dphi*dphi*(ro-2.f*ri)*(ro-2.f*ri);    // *rat2;
+  // auto seq = (ro-ri)*(ro-ri) +  dphi*dphi*ri*ro;
+
+  // and little by little simplifying and removing higher order terms
+  // we get
+  auto r2 = (ro - ri) * (ro - ri) / (dphi * dphi) + ri * ro;  // NOTE(review): divides by dphi^2, so dphi == 0 is not supported
+
+  // d2 = (ro-ri)*(ro-ri)/(4.f*r2 -ri*ro);
+  // return -2.f*dphi/std::sqrt(seq);
+
+  return -1.f / std::sqrt(r2 / 4.f);
+}
+
+#include <iostream>
+
+template <typename T>
+bool equal(T a, T b) {
+  auto const tolerance = std::abs(0.01f * a);  // 1% of |a|; the comparison is strict, so equal(0, 0) is false
+  return std::abs(float(a - b)) < tolerance;
+}
+
+int n = 0;  // number of go() calls made so far
+void go(float ri, float ro, float dphi, bool print = false) {  // asserts that fastDPHI agrees with FastCircle::m_c within equal()'s tolerance
+  ++n;
+  float x3 = 0.f, y3 = ro;  // outer point on the y axis
+  float x2 = ri * sin(dphi);  // inner point at radius ri, rotated by dphi from the y axis
+  float y2 = ri * cos(dphi);
+
+  FastCircle<float> c(0, 0, x2, y2, x3, y3);  // circle through the origin and the two points
+
+  auto cc = fastDPHI(ri, ro, dphi);
+  if (print)
+    std::cout << c.m_c << ' ' << cc << std::endl;
+  assert(equal(c.m_c, cc));
+}
+
+int main() {  // runs go() once with printing, then over a grid of (r1, dr) values
+  go(4., 7., 0.1, true);
+
+  for (float r1 = 2; r1 < 15; r1 += 1)
+    for (float dr = 0.5; dr < 10; dr += 0.5)
+      for (float dphi = 0.02; dphi < 0.2; dphi += 0.2)  // NOTE(review): a 0.2 step gives one iteration (dphi = 0.02); 0.02 may have been intended - confirm
+        go(r1, r1 + dr, dphi);
+
+  std::cout << "done " << n << std::endl;
+  return 0;
+}
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/plugins/BuildFile.xml
index 427799cb122b5..99b91b2587bcf 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/BuildFile.xml
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/BuildFile.xml
@@ -1,3 +1,4 @@
+<use name="CUDADataFormats/Vertex"/>
 <use name="CommonTools/Clustering1D"/>
 <use name="DataFormats/BeamSpot"/>
 <use name="DataFormats/GeometryCommonDetAlgo"/>
@@ -15,10 +16,12 @@
 <use name="FWCore/Utilities"/>
 <use name="Geometry/Records"/>
 <use name="Geometry/TrackerGeometryBuilder"/>
+<use name="HeterogeneousCore/CUDACore"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
 <use name="RecoLocalTracker/ClusterParameterEstimator"/>
 <use name="RecoLocalTracker/Records"/>
 <use name="RecoPixelVertexing/PixelVertexFinding"/>
 <use name="SimDataFormats/PileupSummaryInfo"/>
-<library file="*.cc" name="RecoPixelVertexingPixelVertexFindingPlugins">
+<library file="*.cc *.cu" name="RecoPixelVertexingPixelVertexFindingPlugins">
   <flags EDM_PLUGIN="1"/>
 </library>
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
new file mode 100644
index 0000000000000..e9054dbf17c53
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
@@ -0,0 +1,125 @@
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDProducer.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/RunningAverage.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+
+#include "gpuVertexFinder.h"
+
+class PixelVertexProducerCUDA : public edm::global::EDProducer<> {  // runs gpuVertexFinder::Producer on pixel tracks, through the GPU or the CPU branch
+public:
+  explicit PixelVertexProducerCUDA(const edm::ParameterSet& iConfig);
+  ~PixelVertexProducerCUDA() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override;
+
+  bool m_OnGPU;  // "onGPU" parameter: selects which pair of tokens below is registered and used
+
+  edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenGPUTrack_;  // set only when m_OnGPU is true
+  edm::EDPutTokenT<ZVertexCUDAProduct> tokenGPUVertex_;  // set only when m_OnGPU is true
+  edm::EDGetTokenT<PixelTrackHeterogeneous> tokenCPUTrack_;  // set only when m_OnGPU is false
+  edm::EDPutTokenT<ZVertexHeterogeneous> tokenCPUVertex_;  // set only when m_OnGPU is false
+
+  const gpuVertexFinder::Producer m_gpuAlgo;  // used by both branches (makeAsync / make)
+
+  // Tracking cuts before sending tracks to vertex algo
+  const float m_ptMin;
+};
+
+PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf)  // reads the configuration and registers one consumes/produces pair
+    : m_OnGPU(conf.getParameter<bool>("onGPU")),
+      m_gpuAlgo(conf.getParameter<bool>("oneKernel"),
+                conf.getParameter<bool>("useDensity"),
+                conf.getParameter<bool>("useDBSCAN"),
+                conf.getParameter<bool>("useIterative"),
+                conf.getParameter<int>("minT"),
+                conf.getParameter<double>("eps"),
+                conf.getParameter<double>("errmax"),
+                conf.getParameter<double>("chi2max")),
+      m_ptMin(conf.getParameter<double>("PtMin"))  // 0.5 GeV
+{
+  if (m_OnGPU) {  // GPU branch: input and output are CUDA products
+    tokenGPUTrack_ =
+        consumes<cms::cuda::Product<PixelTrackHeterogeneous>>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
+    tokenGPUVertex_ = produces<ZVertexCUDAProduct>();
+  } else {  // CPU branch: same input tag, read as PixelTrackHeterogeneous
+    tokenCPUTrack_ = consumes<PixelTrackHeterogeneous>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
+    tokenCPUVertex_ = produces<ZVertexHeterogeneous>();
+  }
+}
+
+void PixelVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {  // declares the parameters and their defaults
+  edm::ParameterSetDescription desc;
+
+  // Only one of these three algos can be used at once.
+  // Maybe this should become a Plugin Factory
+  desc.add<bool>("onGPU", true);
+  desc.add<bool>("oneKernel", true);
+  desc.add<bool>("useDensity", true);
+  desc.add<bool>("useDBSCAN", false);
+  desc.add<bool>("useIterative", false);
+
+  desc.add<int>("minT", 2);          // min number of neighbours to be "core"
+  desc.add<double>("eps", 0.07);     // max absolute distance to cluster
+  desc.add<double>("errmax", 0.01);  // max error to be "seed"
+  desc.add<double>("chi2max", 9.);   // max normalized distance to cluster
+
+  desc.add<double>("PtMin", 0.5);  // stored in m_ptMin by the constructor
+  desc.add<edm::InputTag>("pixelTrackSrc", edm::InputTag("caHitNtupletCUDA"));
+
+  auto label = "pixelVertexCUDA";  // label under which the description is registered
+  descriptions.add(label, desc);
+}
+
+void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const {  // GPU: makeAsync on the context stream; CPU: make
+  if (m_OnGPU) {
+    edm::Handle<cms::cuda::Product<PixelTrackHeterogeneous>> hTracks;
+    iEvent.getByToken(tokenGPUTrack_, hTracks);
+
+    cms::cuda::ScopedContextProduce ctx{*hTracks};  // context built from the input product
+    auto const* tracks = ctx.get(*hTracks).get();
+
+    assert(tracks);
+
+    ctx.emplace(iEvent, tokenGPUVertex_, m_gpuAlgo.makeAsync(ctx.stream(), tracks, m_ptMin));
+
+  } else {
+    auto const* tracks = iEvent.get(tokenCPUTrack_).get();
+    assert(tracks);
+
+    /*
+    auto const & tsoa = *tracks;
+    auto maxTracks = tsoa.stride();
+    std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl;
+
+    int32_t nt = 0;
+    for (int32_t it = 0; it < maxTracks; ++it) {
+      auto nHits = tsoa.nHits(it);
+      assert(nHits==int(tsoa.hitIndices.size(it)));
+      if (nHits == 0) break;  // this is a guard: maybe we need to move to nTracks...
+      nt++;
+    }
+    std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl;
+    */
+
+    iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make(tracks, m_ptMin));
+  }
+}
+
+DEFINE_FWK_MODULE(PixelVertexProducerCUDA);
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc
new file mode 100644
index 0000000000000..e642e3fd734f9
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc
@@ -0,0 +1,175 @@
+#include "DataFormats/BeamSpot/interface/BeamSpot.h"
+#include "DataFormats/Common/interface/OrphanHandle.h"
+#include "DataFormats/TrackReco/interface/Track.h"
+#include "DataFormats/TrackReco/interface/TrackExtra.h"
+#include "DataFormats/TrackReco/interface/TrackFwd.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDProducer.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "Geometry/Records/interface/TrackerTopologyRcd.h"
+#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h"
+
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+
+#include "DataFormats/VertexReco/interface/Vertex.h"
+#include "DataFormats/VertexReco/interface/VertexFwd.h"
+
+class PixelVertexProducerFromSoA : public edm::global::EDProducer<> {  // converts a ZVertexHeterogeneous SoA into a reco::VertexCollection
+public:
+  using IndToEdm = std::vector<uint16_t>;  // indexed by SoA track index, value is used as an index into the reco::TrackCollection
+
+  explicit PixelVertexProducerFromSoA(const edm::ParameterSet &iConfig);
+  ~PixelVertexProducerFromSoA() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions &descriptions);
+
+private:
+  void produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const override;
+
+  edm::EDGetTokenT<ZVertexHeterogeneous> tokenVertex_;  // "src"
+  edm::EDGetTokenT<reco::BeamSpot> tokenBeamSpot_;  // "beamSpot"
+  edm::EDGetTokenT<reco::TrackCollection> tokenTracks_;  // "TrackCollection"
+  edm::EDGetTokenT<IndToEdm> tokenIndToEdm_;  // same input tag as tokenTracks_, different product type
+};
+
+PixelVertexProducerFromSoA::PixelVertexProducerFromSoA(const edm::ParameterSet &conf)  // registers the four inputs and the single output
+    : tokenVertex_(consumes<ZVertexHeterogeneous>(conf.getParameter<edm::InputTag>("src"))),
+      tokenBeamSpot_(consumes<reco::BeamSpot>(conf.getParameter<edm::InputTag>("beamSpot"))),
+      tokenTracks_(consumes<reco::TrackCollection>(conf.getParameter<edm::InputTag>("TrackCollection"))),
+      tokenIndToEdm_(consumes<IndToEdm>(conf.getParameter<edm::InputTag>("TrackCollection"))) {  // index map read from the same tag as the tracks
+  produces<reco::VertexCollection>();
+}
+
+void PixelVertexProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions &descriptions) {  // declares the three input tags and their defaults
+  edm::ParameterSetDescription desc;
+
+  desc.add<edm::InputTag>("TrackCollection", edm::InputTag("pixelTracks"));  // used for both the tracks and the index map
+  desc.add<edm::InputTag>("beamSpot", edm::InputTag("offlineBeamSpot"));
+  desc.add<edm::InputTag>("src", edm::InputTag("pixelVertexSoA"));
+
+  descriptions.add("pixelVertexFromSoA", desc);
+}
+
+void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &) const {  // SoA vertices -> reco::VertexCollection
+  auto vertexes = std::make_unique<reco::VertexCollection>();
+
+  edm::Handle<reco::TrackCollection> trackCollection;
+  iEvent.getByToken(tokenTracks_, trackCollection);
+  auto const &tracks = *(trackCollection.product());
+  edm::Handle<IndToEdm> indToEdmH;
+  iEvent.getByToken(tokenIndToEdm_, indToEdmH);
+  auto const &indToEdm = *indToEdmH;
+
+  edm::Handle<reco::BeamSpot> bsHandle;
+  iEvent.getByToken(tokenBeamSpot_, bsHandle);
+
+  float x0 = 0, y0 = 0, z0 = 0, dxdz = 0, dydz = 0;  // beam-spot position and slopes; stay 0 without a beam spot
+  std::vector<int32_t> itrk;  // SoA indices of the tracks of the vertex being built; emptied after each vertex
+  if (!bsHandle.isValid()) {
+    edm::LogWarning("PixelVertexProducer") << "No beamspot found. returning vertexes with (0,0,Z) ";
+  } else {
+    const reco::BeamSpot &bs = *bsHandle;
+    x0 = bs.x0();
+    y0 = bs.y0();
+    z0 = bs.z0();
+    dxdz = bs.dxdz();
+    dydz = bs.dydz();
+  }
+
+  auto const &soa = *(iEvent.get(tokenVertex_).get());
+
+  int nv = soa.nvFinal;
+
+  // std::cout << "converting " << nv << " vertices " << " from " << indToEdm.size() << " tracks" << std::endl;
+
+  std::set<uint16_t> uind;  // for verifying index consistency (only filled here, never read back)
+  for (int j = nv - 1; j >= 0; --j) {  // walk sortInd backwards
+    auto i = soa.sortInd[j];  // on gpu sorted in ascending order....
+    assert(i < nv);
+    uind.insert(i);
+    assert(itrk.empty());
+    auto z = soa.zv[i];
+    auto x = x0 + dxdz * z;  // x and y follow the beam line at the vertex z
+    auto y = y0 + dydz * z;
+    z += z0;
+    reco::Vertex::Error err;
+    err(2, 2) = 1.f / soa.wv[i];
+    err(2, 2) *= 2.;  // artificially inflate error
+    //Copy also the tracks (no intention to be efficient....)
+    for (auto k = 0U; k < indToEdm.size(); ++k) {
+      if (soa.idv[k] == int16_t(i))  // track k is assigned to vertex i
+        itrk.push_back(k);
+    }
+    auto nt = itrk.size();
+    if (nt == 0) {
+      edm::LogWarning("PixelVertexProducer") << "vertex " << i << " with no tracks...";  // logger instead of std::cout in a global module
+      continue;
+    }
+    if (nt < 2) {
+      itrk.clear();
+      continue;
+    }  // remove outliers
+    (*vertexes).emplace_back(reco::Vertex::Point(x, y, z), err, soa.chi2[i], soa.ndof[i], nt);
+    auto &v = (*vertexes).back();
+    for (auto it : itrk) {
+      assert(it < int(indToEdm.size()));
+      auto k = indToEdm[it];
+      if (k >= tracks.size()) {  // k == tracks.size() is already past the last track, so it must be rejected too
+        edm::LogWarning("PixelVertexProducer") << "oops track " << it << " does not exists on CPU " << k;
+        continue;
+      }
+      auto tk = reco::TrackRef(trackCollection, k);
+      v.add(reco::TrackBaseRef(tk));
+    }
+    itrk.clear();
+  }
+
+  LogDebug("PixelVertexProducer") << ": Found " << vertexes->size() << " vertexes\n";
+  for (unsigned int i = 0; i < vertexes->size(); ++i) {
+    LogDebug("PixelVertexProducer") << "Vertex number " << i << " has " << (*vertexes)[i].tracksSize()
+                                    << " tracks with a position of " << (*vertexes)[i].z() << " +- "
+                                    << std::sqrt((*vertexes)[i].covariance(2, 2));
+  }
+
+  // legacy logic....
+  if (vertexes->empty() && bsHandle.isValid()) {  // no vertex found: store one placed at the beam spot
+    const reco::BeamSpot &bs = *bsHandle;
+
+    GlobalError bse(bs.rotatedCovariance3D());
+    if ((bse.cxx() <= 0.) || (bse.cyy() <= 0.) || (bse.czz() <= 0.)) {  // beam-spot errors unusable: use a fixed diagonal of 10000
+      AlgebraicSymMatrix33 we;
+      we(0, 0) = 10000;
+      we(1, 1) = 10000;
+      we(2, 2) = 10000;
+      vertexes->push_back(reco::Vertex(bs.position(), we, 0., 0., 0));
+
+      edm::LogInfo("PixelVertexProducer") << "No vertices found. Beamspot with invalid errors " << bse.matrix()
+                                          << "\nWill put Vertex derived from dummy-fake BeamSpot into Event.\n"
+                                          << (*vertexes)[0].x() << "\n"
+                                          << (*vertexes)[0].y() << "\n"
+                                          << (*vertexes)[0].z() << "\n";
+    } else {
+      vertexes->push_back(reco::Vertex(bs.position(), bs.rotatedCovariance3D(), 0., 0., 0));
+
+      edm::LogInfo("PixelVertexProducer") << "No vertices found. Will put Vertex derived from BeamSpot into Event:\n"
+                                          << (*vertexes)[0].x() << "\n"
+                                          << (*vertexes)[0].y() << "\n"
+                                          << (*vertexes)[0].z() << "\n";
+    }
+  } else if (vertexes->empty() && !bsHandle.isValid()) {
+    edm::LogWarning("PixelVertexProducer") << "No beamspot and no vertex found. No vertex returned.";
+  }
+
+  iEvent.put(std::move(vertexes));
+}
+
+DEFINE_FWK_MODULE(PixelVertexProducerFromSoA);
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc
new file mode 100644
index 0000000000000..0cadf24580cf7
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc
@@ -0,0 +1,65 @@
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/Common/interface/HostProduct.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+
+class PixelVertexSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  explicit PixelVertexSoAFromCUDA(const edm::ParameterSet& iConfig);  // reads the "src" input tag
+  ~PixelVertexSoAFromCUDA() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  void acquire(edm::Event const& iEvent,  // requests the host copy of the vertex SoA (toHostAsync)
+               edm::EventSetup const& iSetup,
+               edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+  void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;  // moves the host copy into the event
+
+  edm::EDGetTokenT<cms::cuda::Product<ZVertexHeterogeneous>> tokenCUDA_;  // input: CUDA-wrapped vertex product
+  edm::EDPutTokenT<ZVertexHeterogeneous> tokenSOA_;                       // output: vertex product
+
+  cms::cuda::host::unique_ptr<ZVertexSoA> m_soa;  // assigned in acquire(), moved out in produce()
+};
+
+PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(const edm::ParameterSet& iConfig)
+    : tokenCUDA_(consumes<cms::cuda::Product<ZVertexHeterogeneous>>(iConfig.getParameter<edm::InputTag>("src"))),
+      tokenSOA_(produces<ZVertexHeterogeneous>()) {}  // declares one consumed ("src") and one produced product
+
+void PixelVertexSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription psd;  // only parameter: the input tag of the CUDA vertex product
+
+  psd.add<edm::InputTag>("src", edm::InputTag("pixelVertexCUDA"));
+  descriptions.add("pixelVertexSoA", psd);
+}
+
+void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent,
+                                     edm::EventSetup const& iSetup,
+                                     edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+  auto const& wrappedVertices = iEvent.get(tokenCUDA_);  // CUDA product as stored in the event
+  cms::cuda::ScopedContextAcquire ctx{wrappedVertices, std::move(waitingTaskHolder)};
+  auto const& deviceVertices = ctx.get(wrappedVertices);  // unwrapped through the acquire context
+
+  m_soa = deviceVertices.toHostAsync(ctx.stream());  // kept in the member until produce() hands it over
+}
+
+void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
+  // No copies: the host buffer assigned in acquire() is moved into the output product.
+  iEvent.emplace(tokenSOA_, ZVertexHeterogeneous(std::move(m_soa)));
+}
+
+DEFINE_FWK_MODULE(PixelVertexSoAFromCUDA);
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h
new file mode 100644
index 0000000000000..b32c7d5b613db
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h
@@ -0,0 +1,234 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksByDensity_h
+#define RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksByDensity_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+#include "gpuVertexFinder.h"
+
+namespace gpuVertexFinder {
+
+  // Density-based clustering of tracks along z. All loops stride by blockDim.x only (blockIdx is never used),
+  // so this algo does not really scale as it works in a single block... enough for the <10K tracks we have.
+  // On exit ws.iv[i] holds the cluster id of track i (9997 for noise), data.ndof[i] the neighbour count
+  // of track i, and nvFinal == nvIntermediate == number of clusters found.
+  // Based on the Rodriguez & Laio algorithm.
+  __device__ __forceinline__ void clusterTracksByDensity(gpuVertexFinder::ZVertices* pdata,
+                                                         gpuVertexFinder::WorkSpace* pws,
+                                                         int minT,      // min number of neighbours to be "seed"
+                                                         float eps,     // max absolute distance to cluster
+                                                         float errmax,  // max error to be "seed"
+                                                         float chi2max  // max normalized distance to cluster
+  ) {
+    using namespace gpuVertexFinder;
+    constexpr bool verbose = false;  // in principle the compiler should optimize out if false
+
+    if (verbose && 0 == threadIdx.x)
+      printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);
+
+    auto er2mx = errmax * errmax;  // threshold applied to ezt2[] in the neighbour count below
+
+    auto& __restrict__ data = *pdata;
+    auto& __restrict__ ws = *pws;
+    auto nt = ws.ntrks;
+    float const* __restrict__ zt = ws.zt;
+    float const* __restrict__ ezt2 = ws.ezt2;
+
+    uint32_t& nvFinal = data.nvFinal;
+    uint32_t& nvIntermediate = ws.nvIntermediate;
+
+    uint8_t* __restrict__ izt = ws.izt;
+    int32_t* __restrict__ nn = data.ndof;  // data.ndof is used here as the per-track neighbour counter
+    int32_t* __restrict__ iv = ws.iv;
+
+    assert(pdata);
+    assert(zt);
+
+    using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
+    __shared__ Hist hist;
+    __shared__ typename Hist::Counter hws[32];
+    for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
+      hist.off[j] = 0;
+    }
+    __syncthreads();
+
+    if (verbose && 0 == threadIdx.x)
+      printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);
+
+    assert(nt <= hist.capacity());
+
+    // fill hist  (bin shall be wider than "eps")
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      assert(i < ZVertices::MAXTRACKS);
+      int iz = int(zt[i] * 10.);  // valid if eps<=0.1
+      // iz = std::clamp(iz, INT8_MIN, INT8_MAX);  // sorry c++17 only
+      iz = std::min(std::max(iz, INT8_MIN), INT8_MAX);
+      izt[i] = iz - INT8_MIN;  // shifted to [0, 255], as checked by the two asserts below
+      assert(iz - INT8_MIN >= 0);
+      assert(iz - INT8_MIN < 256);
+      hist.count(izt[i]);
+      iv[i] = i;  // every track starts by pointing to itself
+      nn[i] = 0;
+    }
+    __syncthreads();
+    if (threadIdx.x < 32)
+      hws[threadIdx.x] = 0;  // used by prefix scan...
+    __syncthreads();
+    hist.finalize(hws);
+    __syncthreads();
+    assert(hist.size() == nt);
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      hist.fill(izt[i], uint16_t(i));
+    }
+    __syncthreads();
+
+    // count neighbours (tracks with ezt2 above er2mx keep nn == 0)
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (ezt2[i] > er2mx)
+        continue;
+      auto loop = [&](uint32_t j) {
+        if (i == j)
+          return;
+        auto dist = std::abs(zt[i] - zt[j]);
+        if (dist > eps)
+          return;
+        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+          return;
+        nn[i]++;
+      };
+
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);  // presumably scans bin izt[i] and its neighbours within 1 bin
+    }
+
+    __syncthreads();
+
+    // find closest above me .... (we ignore the possibility of two j at same distance from i)
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      float mdist = eps;
+      auto loop = [&](uint32_t j) {
+        if (nn[j] < nn[i])
+          return;
+        if (nn[j] == nn[i] && zt[j] >= zt[i])
+          return;  // if equal use natural order...
+        auto dist = std::abs(zt[i] - zt[j]);
+        if (dist > mdist)
+          return;
+        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+          return;  // (break natural order???)
+        mdist = dist;
+        iv[i] = j;  // assign to cluster (better be unique??)
+      };
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
+    }
+
+    __syncthreads();
+
+#ifdef GPU_DEBUG
+    //  mini verification: no two tracks may point to each other
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (iv[i] != int(i))
+        assert(iv[iv[i]] != int(i));
+    }
+    __syncthreads();
+#endif
+
+    // consolidate graph (percolate index of seed): follow the iv links until an entry points to itself
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      auto m = iv[i];
+      while (m != iv[m])
+        m = iv[m];
+      iv[i] = m;
+    }
+
+#ifdef GPU_DEBUG
+    __syncthreads();
+    //  mini verification
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (iv[i] != int(i))
+        assert(iv[iv[i]] != int(i));
+    }
+#endif
+
+#ifdef GPU_DEBUG
+    // and verify that we did not split any cluster...
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      auto minJ = i;
+      auto mdist = eps;
+      auto loop = [&](uint32_t j) {
+        if (nn[j] < nn[i])
+          return;
+        if (nn[j] == nn[i] && zt[j] >= zt[i])
+          return;  // if equal use natural order...
+        auto dist = std::abs(zt[i] - zt[j]);
+        if (dist > mdist)
+          return;
+        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+          return;
+        mdist = dist;
+        minJ = j;
+      };
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
+      // should belong to the same cluster...
+      assert(iv[i] == iv[minJ]);
+      assert(nn[i] <= nn[iv[i]]);
+    }
+    __syncthreads();
+#endif
+
+    __shared__ unsigned int foundClusters;
+    foundClusters = 0;  // every thread stores the same value; the barrier below orders it before the atomics
+    __syncthreads();
+
+    // find the number of different clusters, identified by a tracks with clus[i] == i and density larger than threshold;
+    // mark these tracks with a negative id.
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (iv[i] == int(i)) {
+        if (nn[i] >= minT) {
+          auto old = atomicInc(&foundClusters, 0xffffffff);
+          iv[i] = -(old + 1);  // seeds get the ids -1, -2, ...
+        } else {  // noise
+          iv[i] = -9998;  // becomes 9997 after the final remapping below
+        }
+      }
+    }
+    __syncthreads();
+
+    assert(foundClusters < ZVertices::MAXVTX);
+
+    // propagate the negative id to all the tracks in the cluster.
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (iv[i] >= 0) {
+        // mark each track in a cluster with the same id as the first one
+        iv[i] = iv[iv[i]];
+      }
+    }
+    __syncthreads();
+
+    // adjust the cluster id to be a positive value starting from 0
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      iv[i] = -iv[i] - 1;
+    }
+
+    nvIntermediate = nvFinal = foundClusters;  // written by every thread with the same value
+
+    if (verbose && 0 == threadIdx.x)
+      printf("found %d proto vertices\n", foundClusters);
+  }
+
+  __global__ void clusterTracksByDensityKernel(gpuVertexFinder::ZVertices* pdata,
+                                               gpuVertexFinder::WorkSpace* pws,
+                                               int minT,      // min number of neighbours to be "seed"
+                                               float eps,     // max absolute distance to cluster
+                                               float errmax,  // max error to be "seed"
+                                               float chi2max  // max normalized distance to cluster
+  ) {  // __global__ entry point: forwards all arguments unchanged to the device function
+    clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
+  }
+
+}  // namespace gpuVertexFinder
+
+#endif  // RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksByDensity_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h
new file mode 100644
index 0000000000000..ffd7fdc948bf8
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h
@@ -0,0 +1,242 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksDBSCAN_h
+#define RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksDBSCAN_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+#include "gpuVertexFinder.h"
+
+namespace gpuVertexFinder {
+
+  // DBSCAN-style clustering of tracks along z (core/edge rules on the neighbour count); loops stride by blockDim.x only,
+  // so this algo does not really scale as it works in a single block... enough for the <10K tracks we have
+  __global__ void clusterTracksDBSCAN(ZVertices* pdata,
+                                      WorkSpace* pws,
+                                      int minT,      // min number of neighbours to be "core"
+                                      float eps,     // max absolute distance to cluster
+                                      float errmax,  // max error to be "seed"
+                                      float chi2max  // max normalized distance to cluster
+  ) {
+    constexpr bool verbose = false;  // in principle the compiler should optimize out if false
+
+    if (verbose && 0 == threadIdx.x)
+      printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);
+
+    auto er2mx = errmax * errmax;  // threshold applied to ezt2[] in the neighbour count below
+
+    auto& __restrict__ data = *pdata;
+    auto& __restrict__ ws = *pws;
+    auto nt = ws.ntrks;
+    float const* __restrict__ zt = ws.zt;
+    float const* __restrict__ ezt2 = ws.ezt2;
+
+    uint32_t& nvFinal = data.nvFinal;
+    uint32_t& nvIntermediate = ws.nvIntermediate;
+
+    uint8_t* __restrict__ izt = ws.izt;
+    int32_t* __restrict__ nn = data.ndof;  // data.ndof is used here as the per-track neighbour counter
+    int32_t* __restrict__ iv = ws.iv;
+
+    assert(pdata);
+    assert(zt);
+
+    using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
+    __shared__ Hist hist;
+    __shared__ typename Hist::Counter hws[32];
+    for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
+      hist.off[j] = 0;
+    }
+    __syncthreads();
+
+    if (verbose && 0 == threadIdx.x)
+      printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);
+
+    assert(nt <= hist.capacity());
+
+    // fill hist  (bin shall be wider than "eps")
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      assert(i < ZVertices::MAXTRACKS);
+      int iz = int(zt[i] * 10.);  // valid if eps<=0.1
+      // iz = std::clamp(iz, INT8_MIN, INT8_MAX);  // sorry c++17 only
+      iz = std::min(std::max(iz, INT8_MIN), INT8_MAX);
+      izt[i] = iz - INT8_MIN;  // shifted to [0, 255], as checked by the two asserts below
+      assert(iz - INT8_MIN >= 0);
+      assert(iz - INT8_MIN < 256);
+      hist.count(izt[i]);
+      iv[i] = i;  // every track starts by pointing to itself
+      nn[i] = 0;
+    }
+    __syncthreads();
+    if (threadIdx.x < 32)
+      hws[threadIdx.x] = 0;  // used by prefix scan...
+    __syncthreads();
+    hist.finalize(hws);
+    __syncthreads();
+    assert(hist.size() == nt);
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      hist.fill(izt[i], uint16_t(i));
+    }
+    __syncthreads();
+
+    // count neighbours (tracks with ezt2 above er2mx keep nn == 0; no chi2 cut is applied here)
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (ezt2[i] > er2mx)
+        continue;
+      auto loop = [&](uint32_t j) {
+        if (i == j)
+          return;
+        auto dist = std::abs(zt[i] - zt[j]);
+        if (dist > eps)
+          return;
+        //        if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return;
+        nn[i]++;
+      };
+
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);  // presumably scans bin izt[i] and its neighbours within 1 bin
+    }
+
+    __syncthreads();
+
+    // find NN with smaller z... (core tracks only: link i to the lowest-z core track within eps)
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (nn[i] < minT)
+        continue;  // DBSCAN core rule
+      float mz = zt[i];
+      auto loop = [&](uint32_t j) {
+        if (zt[j] >= mz)
+          return;
+        if (nn[j] < minT)
+          return;  // DBSCAN core rule
+        auto dist = std::abs(zt[i] - zt[j]);
+        if (dist > eps)
+          return;
+        //        if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return;
+        mz = zt[j];
+        iv[i] = j;  // assign to cluster (better be unique??)
+      };
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
+    }
+
+    __syncthreads();
+
+#ifdef GPU_DEBUG
+    //  mini verification: no two tracks may point to each other
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (iv[i] != int(i))
+        assert(iv[iv[i]] != int(i));
+    }
+    __syncthreads();
+#endif
+
+    // consolidate graph (percolate index of seed): follow the iv links until an entry points to itself
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      auto m = iv[i];
+      while (m != iv[m])
+        m = iv[m];
+      iv[i] = m;
+    }
+
+    __syncthreads();
+
+#ifdef GPU_DEBUG
+    //  mini verification
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (iv[i] != int(i))
+        assert(iv[iv[i]] != int(i));
+    }
+    __syncthreads();
+#endif
+
+#ifdef GPU_DEBUG
+    // and verify that we did not split any cluster...
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (nn[i] < minT)
+        continue;  // DBSCAN core rule
+      assert(zt[iv[i]] <= zt[i]);
+      auto loop = [&](uint32_t j) {
+        if (nn[j] < minT)
+          return;  // DBSCAN core rule
+        auto dist = std::abs(zt[i] - zt[j]);
+        if (dist > eps)
+          return;
+        //  if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return;
+        // they should belong to the same cluster, shouldn't they?
+        if (iv[i] != iv[j]) {
+          printf("ERROR %d %d %f %f %d\n", i, iv[i], zt[i], zt[iv[i]], iv[iv[i]]);
+          printf("      %d %d %f %f %d\n", j, iv[j], zt[j], zt[iv[j]], iv[iv[j]]);
+          ;
+        }
+        assert(iv[i] == iv[j]);
+      };
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
+    }
+    __syncthreads();
+#endif
+
+    // collect edges (assign to closest cluster of closest point??? here to closest point)
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      //    if (nn[i]==0 || nn[i]>=minT) continue;    // DBSCAN edge rule
+      if (nn[i] >= minT)
+        continue;  // DBSCAN edge rule
+      float mdist = eps;
+      auto loop = [&](uint32_t j) {
+        if (nn[j] < minT)
+          return;  // DBSCAN core rule
+        auto dist = std::abs(zt[i] - zt[j]);
+        if (dist > mdist)
+          return;
+        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+          return;  // needed?
+        mdist = dist;
+        iv[i] = iv[j];  // assign to cluster (better be unique??)
+      };
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
+    }
+
+    __shared__ unsigned int foundClusters;
+    foundClusters = 0;  // every thread stores the same value; the barrier below orders it before the atomics
+    __syncthreads();
+
+    // find the number of different clusters, identified by a tracks with clus[i] == i;
+    // mark these tracks with a negative id.
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (iv[i] == int(i)) {
+        if (nn[i] >= minT) {
+          auto old = atomicInc(&foundClusters, 0xffffffff);
+          iv[i] = -(old + 1);  // seeds get the ids -1, -2, ...
+        } else {  // noise
+          iv[i] = -9998;  // becomes 9997 after the final remapping below
+        }
+      }
+    }
+    __syncthreads();
+
+    assert(foundClusters < ZVertices::MAXVTX);
+
+    // propagate the negative id to all the tracks in the cluster.
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (iv[i] >= 0) {
+        // mark each track in a cluster with the same id as the first one
+        iv[i] = iv[iv[i]];
+      }
+    }
+    __syncthreads();
+
+    // adjust the cluster id to be a positive value starting from 0
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      iv[i] = -iv[i] - 1;
+    }
+
+    nvIntermediate = nvFinal = foundClusters;  // written by every thread with the same value
+
+    if (verbose && 0 == threadIdx.x)
+      printf("found %d proto vertices\n", foundClusters);
+  }
+
+}  // namespace gpuVertexFinder
+
+#endif  // RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksDBSCAN_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h
new file mode 100644
index 0000000000000..49da86e941867
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h
@@ -0,0 +1,213 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksIterative_h
+#define RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksIterative_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+#include "gpuVertexFinder.h"
+
+namespace gpuVertexFinder {
+
+  // Iterative clustering of tracks along z (DBSCAN-style core/edge rules); loops stride by blockDim.x only,
+  // so this algo does not really scale as it works in a single block... enough for the <10K tracks we have
+  __global__ void clusterTracksIterative(ZVertices* pdata,
+                                         WorkSpace* pws,
+                                         int minT,      // min number of neighbours to be "core"
+                                         float eps,     // max absolute distance to cluster
+                                         float errmax,  // max error to be "seed"
+                                         float chi2max  // max normalized distance to cluster
+  ) {
+    constexpr bool verbose = false;  // in principle the compiler should optimize out if false
+
+    if (verbose && 0 == threadIdx.x)
+      printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);
+
+    auto er2mx = errmax * errmax;  // threshold applied to ezt2[] in the neighbour count below
+
+    auto& __restrict__ data = *pdata;
+    auto& __restrict__ ws = *pws;
+    auto nt = ws.ntrks;
+    float const* __restrict__ zt = ws.zt;
+    float const* __restrict__ ezt2 = ws.ezt2;
+
+    uint32_t& nvFinal = data.nvFinal;
+    uint32_t& nvIntermediate = ws.nvIntermediate;
+
+    uint8_t* __restrict__ izt = ws.izt;
+    int32_t* __restrict__ nn = data.ndof;  // data.ndof is used here as the per-track neighbour counter
+    int32_t* __restrict__ iv = ws.iv;
+
+    assert(pdata);
+    assert(zt);
+
+    using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
+    __shared__ Hist hist;
+    __shared__ typename Hist::Counter hws[32];
+    for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
+      hist.off[j] = 0;
+    }
+    __syncthreads();
+
+    if (verbose && 0 == threadIdx.x)
+      printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);
+
+    assert(nt <= hist.capacity());
+
+    // fill hist  (bin shall be wider than "eps")
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      assert(i < ZVertices::MAXTRACKS);
+      int iz = int(zt[i] * 10.);  // valid if eps<=0.1
+      // iz = std::clamp(iz, INT8_MIN, INT8_MAX);  // sorry c++17 only
+      iz = std::min(std::max(iz, INT8_MIN), INT8_MAX);
+      izt[i] = iz - INT8_MIN;  // shifted to [0, 255], as checked by the two asserts below
+      assert(iz - INT8_MIN >= 0);
+      assert(iz - INT8_MIN < 256);
+      hist.count(izt[i]);
+      iv[i] = i;  // every track starts by pointing to itself
+      nn[i] = 0;
+    }
+    __syncthreads();
+    if (threadIdx.x < 32)
+      hws[threadIdx.x] = 0;  // used by prefix scan...
+    __syncthreads();
+    hist.finalize(hws);
+    __syncthreads();
+    assert(hist.size() == nt);
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      hist.fill(izt[i], uint16_t(i));
+    }
+    __syncthreads();
+
+    // count neighbours (tracks with ezt2 above er2mx keep nn == 0)
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (ezt2[i] > er2mx)
+        continue;
+      auto loop = [&](uint32_t j) {
+        if (i == j)
+          return;
+        auto dist = std::abs(zt[i] - zt[j]);
+        if (dist > eps)
+          return;
+        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+          return;
+        nn[i]++;
+      };
+
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
+    }
+
+    // iteration counter, private to each thread: a shared counter bumped by thread 0 raced with the other
+    int nloops = 0;  // threads' reads of it; the loop condition is block-uniform, so all copies stay equal
+
+    __syncthreads();
+
+    // cluster seeds only: even iterations link neighbouring core tracks, odd iterations compress the iv chains
+    bool more = true;
+    while (__syncthreads_or(more)) {
+      if (1 == nloops % 2) {
+        for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+          auto m = iv[i];
+          while (m != iv[m])
+            m = iv[m];
+          iv[i] = m;
+        }
+      } else {
+        more = false;
+        for (auto k = threadIdx.x; k < hist.size(); k += blockDim.x) {
+          auto p = hist.begin() + k;
+          auto i = (*p);
+          auto be = std::min(Hist::bin(izt[i]) + 1, int(hist.nbins() - 1));  // last bin scanned for track i
+          if (nn[i] < minT)
+            continue;  // DBSCAN core rule
+          auto loop = [&](uint32_t j) {
+            assert(i != j);
+            if (nn[j] < minT)
+              return;  // DBSCAN core rule
+            auto dist = std::abs(zt[i] - zt[j]);
+            if (dist > eps)
+              return;
+            if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+              return;
+            auto old = atomicMin(&iv[j], iv[i]);
+            if (old != iv[i]) {
+              // end the loop only if no changes were applied
+              more = true;
+            }
+            atomicMin(&iv[i], old);
+          };
+          ++p;  // only entries stored after i are visited, so each pair is handled once
+          for (; p < hist.end(be); ++p)
+            loop(*p);
+        }  // for i
+      }
+      // every thread advances its own counter (same trip count for all threads of the block)
+      ++nloops;
+    }  // while
+
+    // collect edges (assign to closest cluster of closest point??? here to closest point)
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      //    if (nn[i]==0 || nn[i]>=minT) continue;    // DBSCAN edge rule
+      if (nn[i] >= minT)
+        continue;  // DBSCAN edge rule
+      float mdist = eps;
+      auto loop = [&](int j) {
+        if (nn[j] < minT)
+          return;  // DBSCAN core rule
+        auto dist = std::abs(zt[i] - zt[j]);
+        if (dist > mdist)
+          return;
+        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+          return;  // needed?
+        mdist = dist;
+        iv[i] = iv[j];  // assign to cluster (better be unique??)
+      };
+      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
+    }
+
+    __shared__ unsigned int foundClusters;
+    foundClusters = 0;  // every thread stores the same value; the barrier below orders it before the atomics
+    __syncthreads();
+
+    // find the number of different clusters, identified by a tracks with clus[i] == i;
+    // mark these tracks with a negative id.
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (iv[i] == int(i)) {
+        if (nn[i] >= minT) {
+          auto old = atomicInc(&foundClusters, 0xffffffff);
+          iv[i] = -(old + 1);  // seeds get the ids -1, -2, ...
+        } else {  // noise
+          iv[i] = -9998;  // becomes 9997 after the final remapping below
+        }
+      }
+    }
+    __syncthreads();
+
+    assert(foundClusters < ZVertices::MAXVTX);
+
+    // propagate the negative id to all the tracks in the cluster.
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (iv[i] >= 0) {
+        // mark each track in a cluster with the same id as the first one
+        iv[i] = iv[iv[i]];
+      }
+    }
+    __syncthreads();
+
+    // adjust the cluster id to be a positive value starting from 0
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      iv[i] = -iv[i] - 1;
+    }
+
+    nvIntermediate = nvFinal = foundClusters;  // written by every thread with the same value
+
+    if (verbose && 0 == threadIdx.x)
+      printf("found %d proto vertices\n", foundClusters);
+  }
+
+}  // namespace gpuVertexFinder
+
+#endif  // RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksIterative_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h
new file mode 100644
index 0000000000000..4487cb12ea17b
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h
@@ -0,0 +1,113 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuFitVertices_h
+#define RecoPixelVertexing_PixelVertexFinding_src_gpuFitVertices_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+#include "gpuVertexFinder.h"
+
+namespace gpuVertexFinder {
+
+  __device__ __forceinline__ void fitVertices(ZVertices* pdata,
+                                              WorkSpace* pws,
+                                              float chi2Max  // for outlier rejection
+  ) {  // per-cluster weighted mean of the track z, then chi2 accumulation; loops stride by blockDim.x only
+    constexpr bool verbose = false;  // in principle the compiler should optimize out if false
+
+    auto& __restrict__ data = *pdata;
+    auto& __restrict__ ws = *pws;
+    auto nt = ws.ntrks;
+    float const* __restrict__ zt = ws.zt;
+    float const* __restrict__ ezt2 = ws.ezt2;
+    float* __restrict__ zv = data.zv;
+    float* __restrict__ wv = data.wv;
+    float* __restrict__ chi2 = data.chi2;
+    uint32_t& nvFinal = data.nvFinal;
+    uint32_t& nvIntermediate = ws.nvIntermediate;
+
+    int32_t* __restrict__ nn = data.ndof;
+    int32_t* __restrict__ iv = ws.iv;
+
+    assert(pdata);
+    assert(zt);
+
+    assert(nvFinal <= nvIntermediate);
+    nvFinal = nvIntermediate;  // the fit always runs over all intermediate clusters
+    auto foundClusters = nvFinal;
+
+    // zero
+    for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x) {
+      zv[i] = 0;
+      wv[i] = 0;
+      chi2[i] = 0;
+    }
+
+    // only for test
+    __shared__ int noise;
+    if (verbose && 0 == threadIdx.x)
+      noise = 0;
+
+    __syncthreads();
+
+    // compute cluster location: accumulate sum(z * w) and sum(w) with w = 1 / ezt2
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (iv[i] > 9990) {  // ids above 9990 mark tracks not assigned to any cluster
+        if (verbose)
+          atomicAdd(&noise, 1);
+        continue;
+      }
+      assert(iv[i] >= 0);
+      assert(iv[i] < int(foundClusters));
+      auto w = 1.f / ezt2[i];
+      atomicAdd(&zv[iv[i]], zt[i] * w);
+      atomicAdd(&wv[iv[i]], w);
+    }
+
+    __syncthreads();
+    // reuse nn
+    for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x) {
+      assert(wv[i] > 0.f);
+      zv[i] /= wv[i];  // weighted mean
+      nn[i] = -1;  // ndof
+    }
+    __syncthreads();
+
+    // compute chi2; tracks contributing more than chi2Max are removed from their cluster (iv set to 9999)
+    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+      if (iv[i] > 9990)
+        continue;
+
+      auto c2 = zv[iv[i]] - zt[i];
+      c2 *= c2 / ezt2[i];  // squared residual divided by ezt2
+      if (c2 > chi2Max) {
+        iv[i] = 9999;
+        continue;
+      }
+      atomicAdd(&chi2[iv[i]], c2);
+      atomicAdd(&nn[iv[i]], 1);
+    }
+    __syncthreads();
+    for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x)
+      if (nn[i] > 0)
+        wv[i] *= float(nn[i]) / chi2[i];  // rescale the weight by ndof / chi2
+
+    if (verbose && 0 == threadIdx.x)
+      printf("found %d proto clusters ", foundClusters);
+    if (verbose && 0 == threadIdx.x)
+      printf("and %d noise\n", noise);
+  }
+
+  __global__ void fitVerticesKernel(ZVertices* pdata,
+                                    WorkSpace* pws,
+                                    float chi2Max  // for outlier rejection
+  ) {  // __global__ entry point: forwards all arguments unchanged to fitVertices
+    fitVertices(pdata, pws, chi2Max);
+  }
+
+}  // namespace gpuVertexFinder
+
+#endif  // RecoPixelVertexing_PixelVertexFinding_src_gpuFitVertices_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h
new file mode 100644
index 0000000000000..89cc9a3844f76
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h
@@ -0,0 +1,73 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuSortByPt2_h
+#define RecoPixelVertexing_PixelVertexFinding_src_gpuSortByPt2_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+#ifdef __CUDA_ARCH__
+#include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h"
+#endif
+
+#include "gpuVertexFinder.h"
+
+namespace gpuVertexFinder {
+
+  __device__ __forceinline__ void sortByPt2(ZVertices* pdata, WorkSpace* pws) {
+    auto& __restrict__ vertices = *pdata;
+    auto& __restrict__ work = *pws;
+    auto nTracks = work.ntrks;
+    float const* __restrict__ trackPt2 = work.ptt2;
+    uint32_t const& nVertices = vertices.nvFinal;
+
+    int32_t const* __restrict__ trackVtx = work.iv;
+    float* __restrict__ vtxPt2 = vertices.ptv2;
+    uint16_t* __restrict__ order = vertices.sortInd;
+
+    // if (threadIdx.x == 0)
+    //    printf("sorting %d vertices\n",nVertices);
+
+    if (0 == nVertices)
+      return;
+
+    // scatter the vertex id of every workspace entry into idv, indexed through itrk
+    for (auto t = threadIdx.x; t < nTracks; t += blockDim.x) {
+      vertices.idv[work.itrk[t]] = trackVtx[t];
+    }
+
+    // reset the accumulators (could be done asynchronously at the end of the previous event)
+    for (auto v = threadIdx.x; v < nVertices; v += blockDim.x) {
+      vtxPt2[v] = 0;
+    }
+    __syncthreads();
+
+    // add the ptt2 of every track to its vertex; entries with an id above 9990 are skipped
+    for (auto t = threadIdx.x; t < nTracks; t += blockDim.x) {
+      if (trackVtx[t] <= 9990)
+        atomicAdd(&vtxPt2[trackVtx[t]], trackPt2[t]);
+    }
+    __syncthreads();
+
+    if (1 == nVertices) {
+      if (threadIdx.x == 0)
+        order[0] = 0;
+      return;
+    }
+#ifdef __CUDA_ARCH__
+    __shared__ uint16_t sws[1024];
+    // sort using only 16 bits
+    radixSort<float, 2>(vtxPt2, order, sws, nVertices);
+#else
+    for (uint16_t v = 0; v < nVertices; ++v)
+      order[v] = v;
+    std::sort(order, order + nVertices, [&](auto lhs, auto rhs) { return vtxPt2[lhs] < vtxPt2[rhs]; });
+#endif
+  }
+
+  __global__ void sortByPt2Kernel(ZVertices* pdata, WorkSpace* pws) { sortByPt2(pdata, pws); }  // kernel entry point
+
+}  // namespace gpuVertexFinder
+
+#endif  // RecoPixelVertexing_PixelVertexFinding_src_gpuSortByPt2_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h
new file mode 100644
index 0000000000000..694915ab02157
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h
@@ -0,0 +1,139 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuSplitVertices_h
+#define RecoPixelVertexing_PixelVertexFinding_src_gpuSplitVertices_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+#include "gpuVertexFinder.h"
+
+namespace gpuVertexFinder {
+
+  // Try to split in two every vertex whose fit chi2 is at least maxChi2 per degree of freedom.
+  //
+  // Each block walks the vertices kv = blockIdx.x, blockIdx.x + gridDim.x, ... ("one vertex per
+  // block"). For a candidate vertex the associated tracks are copied to shared memory and
+  // partitioned in two groups by an iterative nearest-centre assignment in z (at most 20
+  // iterations). If both groups are populated and their weighted separation is large enough,
+  // a new vertex index is taken from ws.nvIntermediate and assigned (in ws.iv) to the tracks of
+  // the second group.
+  //
+  // pdata   : vertex collection (zv, wv, chi2, ndof, nvFinal are read)
+  // pws     : workspace (ntrks, zt, ezt2 are read; iv and nvIntermediate are updated)
+  // maxChi2 : vertices with chi2 < maxChi2 * ndof are left untouched
+  //
+  // Vertices with fewer than 4 degrees of freedom, or with MAXTK (512) or more, are skipped.
+  __device__ __forceinline__ void splitVertices(ZVertices* pdata, WorkSpace* pws, float maxChi2) {
+    constexpr bool verbose = false;  // in principle the compiler should optimize out if false
+
+    // check the inputs before they are dereferenced
+    assert(pdata);
+    assert(pws);
+
+    auto& __restrict__ data = *pdata;
+    auto& __restrict__ ws = *pws;
+    auto nt = ws.ntrks;
+    float const* __restrict__ zt = ws.zt;
+    float const* __restrict__ ezt2 = ws.ezt2;
+    float* __restrict__ zv = data.zv;
+    float* __restrict__ wv = data.wv;
+    float const* __restrict__ chi2 = data.chi2;
+    uint32_t& nvFinal = data.nvFinal;
+
+    int32_t const* __restrict__ nn = data.ndof;
+    int32_t* __restrict__ iv = ws.iv;
+
+    assert(zt);
+
+    // one vertex per block
+    for (auto kv = blockIdx.x; kv < nvFinal; kv += gridDim.x) {
+      // the selections below depend only on kv, so all threads of the block take the same path
+      if (nn[kv] < 4)
+        continue;
+      if (chi2[kv] < maxChi2 * float(nn[kv]))
+        continue;
+
+      constexpr int MAXTK = 512;
+      assert(nn[kv] < MAXTK);
+      if (nn[kv] >= MAXTK)
+        continue;                      // too bad FIXME
+      __shared__ uint32_t it[MAXTK];   // track index
+      __shared__ float zz[MAXTK];      // z pos
+      __shared__ uint8_t newV[MAXTK];  // 0 or 1
+      __shared__ float ww[MAXTK];      // z weight
+
+      __shared__ uint32_t nq;  // number of track for this vertex
+
+      // When a block handles more than one vertex, some threads may still be looping over nq in
+      // the reassignment loop of the previous vertex: wait for them before resetting the counter.
+      __syncthreads();
+      nq = 0;
+      __syncthreads();
+
+      // copy to local
+      for (auto k = threadIdx.x; k < nt; k += blockDim.x) {
+        if (iv[k] == int(kv)) {
+          auto old = atomicInc(&nq, MAXTK);
+          zz[old] = zt[k] - zv[kv];  // z relative to the current vertex position
+          newV[old] = zz[old] < 0 ? 0 : 1;
+          ww[old] = 1.f / ezt2[k];
+          it[old] = k;
+        }
+      }
+
+      __shared__ float znew[2], wnew[2];  // the new vertices
+
+      __syncthreads();
+      assert(int(nq) == nn[kv] + 1);
+
+      int maxiter = 20;
+      // kt-min....
+      bool more = true;
+      // iterate while at least one thread has changed an assignment; the call is also a barrier
+      while (__syncthreads_or(more)) {
+        more = false;
+        if (0 == threadIdx.x) {
+          znew[0] = 0;
+          znew[1] = 0;
+          wnew[0] = 0;
+          wnew[1] = 0;
+        }
+        __syncthreads();
+        // weighted sums of the two groups
+        for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
+          auto i = newV[k];
+          atomicAdd(&znew[i], zz[k] * ww[k]);
+          atomicAdd(&wnew[i], ww[k]);
+        }
+        __syncthreads();
+        if (0 == threadIdx.x) {
+          znew[0] /= wnew[0];
+          znew[1] /= wnew[1];
+        }
+        __syncthreads();
+        // reassign every track to the closest of the two centres
+        for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
+          auto d0 = fabs(zz[k] - znew[0]);
+          auto d1 = fabs(zz[k] - znew[1]);
+          auto newer = d0 < d1 ? 0 : 1;
+          more |= newer != newV[k];
+          newV[k] = newer;
+        }
+        --maxiter;
+        if (maxiter <= 0)
+          more = false;
+      }
+
+      // avoid empty vertices
+      if (0 == wnew[0] || 0 == wnew[1])
+        continue;
+
+      // quality cut
+      auto dist2 = (znew[0] - znew[1]) * (znew[0] - znew[1]);
+
+      auto chi2Dist = dist2 / (1.f / wnew[0] + 1.f / wnew[1]);
+
+      if (verbose && 0 == threadIdx.x)
+        printf("inter %d %f %f\n", 20 - maxiter, chi2Dist, dist2 * wv[kv]);
+
+      if (chi2Dist < 4)
+        continue;
+
+      // get a new global vertex
+      __shared__ uint32_t igv;
+      if (0 == threadIdx.x)
+        igv = atomicAdd(&ws.nvIntermediate, 1);
+      __syncthreads();
+      // tracks of the second group move to the new vertex; the first group keeps index kv
+      for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
+        if (1 == newV[k])
+          iv[it[k]] = igv;
+      }
+
+    }  // loop on vertices
+  }
+
+  // Kernel entry point forwarding to splitVertices(), which distributes the vertices over the
+  // blocks of the grid (vertex index starts at blockIdx.x and advances by gridDim.x).
+  __global__ void splitVerticesKernel(ZVertices* pdata, WorkSpace* pws, float maxChi2) {
+    splitVertices(pdata, pws, maxChi2);
+  }
+
+}  // namespace gpuVertexFinder
+
+#endif  // RecoPixelVertexing_PixelVertexFinding_src_gpuSplitVertices_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
new file mode 100644
index 0000000000000..084763385bdb4
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
@@ -0,0 +1 @@
+#include "gpuVertexFinderImpl.h"
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cu b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cu
new file mode 100644
index 0000000000000..084763385bdb4
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cu
@@ -0,0 +1 @@
+#include "gpuVertexFinderImpl.h"
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
new file mode 100644
index 0000000000000..6cd86c93a6737
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
@@ -0,0 +1,83 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuVertexFinder_h
+#define RecoPixelVertexing_PixelVertexFinding_src_gpuVertexFinder_h
+
+#include <cstddef>
+#include <cstdint>
+
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+
+namespace gpuVertexFinder {
+
+  using ZVertices = ZVertexSoA;
+  using TkSoA = pixelTrack::TrackSoA;
+
+  // workspace used in the vertex reco algos
+  // Scratch data shared by the vertex-finding steps: the selected tracks (one entry per track
+  // in each of the fixed-size arrays), their vertex assignment and the running vertex count.
+  struct WorkSpace {
+    // array capacities, taken from the vertex SoA
+    static constexpr uint32_t MAXTRACKS = ZVertexSoA::MAXTRACKS;
+    static constexpr uint32_t MAXVTX = ZVertexSoA::MAXVTX;
+
+    uint32_t ntrks;            // number of "selected tracks"
+    uint16_t itrk[MAXTRACKS];  // index of original track
+    float zt[MAXTRACKS];       // input track z at bs
+    float ezt2[MAXTRACKS];     // input error^2 on the above
+    float ptt2[MAXTRACKS];     // input pt^2 on the above
+    uint8_t izt[MAXTRACKS];    // integerized z-position of input tracks
+    int32_t iv[MAXTRACKS];     // vertex index for each associated track
+
+    uint32_t nvIntermediate;  // the number of vertices after splitting pruning etc.
+
+    // Reset the two counters; the arrays are left untouched.
+    __host__ __device__ void init() {
+      ntrks = 0;
+      nvIntermediate = 0;
+    }
+  };
+
+  // Reset the vertex collection and the workspace by calling their init() methods.
+  // Every thread that runs this performs the same two calls.
+  __global__ void init(ZVertexSoA* pdata, WorkSpace* pws) {
+    pdata->init();
+    pws->init();
+  }
+
+  // Driver of the vertex reconstruction: stores the choice of clustering algorithm and its
+  // cuts, and declares one entry point taking a CUDA stream (makeAsync) and one without (make).
+  class Producer {
+  public:
+    using ZVertices = ZVertexSoA;
+    using WorkSpace = gpuVertexFinder::WorkSpace;
+    using TkSoA = pixelTrack::TrackSoA;
+
+    // The single-kernel mode is kept only if neither DBSCAN nor the iterative clustering
+    // is requested; all the other arguments are stored as given.
+    Producer(bool oneKernel,
+             bool useDensity,
+             bool useDBSCAN,
+             bool useIterative,
+             int iminT,      // min number of neighbours to be "core"
+             float ieps,     // max absolute distance to cluster
+             float ierrmax,  // max error to be "seed"
+             float ichi2max  // max normalized distance to cluster
+             )
+        : oneKernel_(oneKernel && !(useDBSCAN || useIterative)),
+          useDensity_(useDensity),
+          useDBSCAN_(useDBSCAN),
+          useIterative_(useIterative),
+          minT(iminT),
+          eps(ieps),
+          errmax(ierrmax),
+          chi2max(ichi2max) {}
+
+    ~Producer() = default;
+
+    // Build the vertices from the tracks in tksoa, keeping only tracks with pt >= ptMin.
+    // makeAsync takes the CUDA stream to work on; make takes no stream.
+    ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const;
+    ZVertexHeterogeneous make(TkSoA const* tksoa, float ptMin) const;
+
+  private:
+    // algorithm selection
+    const bool oneKernel_;
+    const bool useDensity_;
+    const bool useDBSCAN_;
+    const bool useIterative_;
+
+    // clustering parameters, forwarded unchanged to the clustering functions
+    int minT;       // min number of neighbours to be "core"
+    float eps;      // max absolute distance to cluster
+    float errmax;   // max error to be "seed"
+    float chi2max;  // max normalized distance to cluster
+  };
+
+}  // namespace gpuVertexFinder
+
+#endif  // RecoPixelVertexing_PixelVertexFinding_src_gpuVertexFinder_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
new file mode 100644
index 0000000000000..ae423dd375e06
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
@@ -0,0 +1,169 @@
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+#include "gpuClusterTracksByDensity.h"
+#include "gpuClusterTracksDBSCAN.h"
+#include "gpuClusterTracksIterative.h"
+#include "gpuFitVertices.h"
+#include "gpuSortByPt2.h"
+#include "gpuSplitVertices.h"
+
+namespace gpuVertexFinder {
+
+  // Select the tracks used for vertexing and copy their parameters into the workspace.
+  //
+  // Each thread visits the track slots idx = global thread index, advancing by the grid size,
+  // and stops at the first slot with no hits. For every visited slot soa->idv is reset to -1.
+  // A track is kept if it has at least 4 hits, "loose" quality and pt >= ptMin; kept tracks are
+  // appended (in no particular order) to itrk, zt, ezt2 and ptt2 through an atomic counter.
+  __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin) {
+    assert(ptracks);
+    assert(soa);
+    auto const& tracks = *ptracks;
+    auto const* quality = tracks.qualityData();
+
+    int const maxTracks = TkSoA::stride();
+    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < maxTracks; idx += gridDim.x * blockDim.x) {
+      auto const nHits = tracks.nHits(idx);
+      if (nHits == 0)
+        break;  // this is a guard: maybe we need to move to nTracks...
+
+      // initialize soa...
+      soa->idv[idx] = -1;
+
+      // require at least 4 hits (no triplets) and loose quality
+      bool const candidate = !(nHits < 4) && quality[idx] == pixelTrack::Quality::loose;
+      if (!candidate)
+        continue;
+
+      auto const pt = tracks.pt(idx);
+      if (pt < ptMin)
+        continue;
+
+      // reserve the next free slot and fill it
+      auto& work = *pws;
+      auto const slot = atomicAdd(&work.ntrks, 1);
+      work.itrk[slot] = idx;
+      work.zt[slot] = tracks.zip(idx);
+      work.ezt2[slot] = tracks.stateAtBS.covariance(idx)(14);
+      work.ptt2[slot] = pt * pt;
+    }
+  }
+
+// #define THREE_KERNELS
+#ifndef THREE_KERNELS
+  // Whole vertex reconstruction chain in a single kernel: density clustering, fit, splitting,
+  // final fit and sorting by pt^2, separated by block-wide barriers.
+  // Only compiled when THREE_KERNELS is not defined.
+  __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,
+                                        gpuVertexFinder::WorkSpace* pws,
+                                        int minT,      // min number of neighbours to be "seed"
+                                        float eps,     // max absolute distance to cluster
+                                        float errmax,  // max error to be "seed"
+                                        float chi2max  // max normalized distance to cluster,
+  ) {
+    clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
+    __syncthreads();
+    // first fit, before splitting
+    fitVertices(pdata, pws, 50.);
+    __syncthreads();
+    // split the vertices with chi2 >= 9 per degree of freedom
+    splitVertices(pdata, pws, 9.f);
+    __syncthreads();
+    // final fit, including the vertices created by the splitting
+    fitVertices(pdata, pws, 5000.);
+    __syncthreads();
+    sortByPt2(pdata, pws);
+  }
+#else
+  // First stage of the THREE_KERNELS variant: density clustering followed by the first fit.
+  // The splitting is not done here.
+  __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata,
+                                      gpuVertexFinder::WorkSpace* pws,
+                                      int minT,      // min number of neighbours to be "seed"
+                                      float eps,     // max absolute distance to cluster
+                                      float errmax,  // max error to be "seed"
+                                      float chi2max  // max normalized distance to cluster,
+  ) {
+    clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
+    __syncthreads();
+    fitVertices(pdata, pws, 50.);
+  }
+
+  // Last stage of the THREE_KERNELS variant: final fit followed by the sorting by pt^2.
+  __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) {
+    fitVertices(pdata, pws, 5000.);
+    __syncthreads();
+    sortByPt2(pdata, pws);
+  }
+#endif
+
+// One body, two signatures: when compiled by nvcc this defines Producer::makeAsync (device
+// memory, kernel launches on the given stream); otherwise it defines Producer::make (host
+// memory, the same functions called directly).
+//
+// tksoa : input tracks (must not be null)
+// ptMin : minimum pt of the tracks used for vertexing
+// Returns the vertex collection; in the CUDA case the kernels are only enqueued on the stream
+// and no synchronization is done here.
+#ifdef __CUDACC__
+  ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const {
+    // std::cout << "producing Vertices on GPU" << std::endl;
+    ZVertexHeterogeneous vertices(cms::cuda::make_device_unique<ZVertexSoA>(stream));
+#else
+  ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const {
+    // std::cout << "producing Vertices on  CPU" <<    std::endl;
+    ZVertexHeterogeneous vertices(std::make_unique<ZVertexSoA>());
+#endif
+    assert(tksoa);
+    auto* soa = vertices.get();
+    assert(soa);
+
+    // temporary workspace, owned by this function
+#ifdef __CUDACC__
+    auto ws_d = cms::cuda::make_device_unique<WorkSpace>(stream);
+#else
+    auto ws_d = std::make_unique<WorkSpace>();
+#endif
+
+    // reset the output and the workspace, then select and load the tracks
+#ifdef __CUDACC__
+    init<<<1, 1, 0, stream>>>(soa, ws_d.get());
+    auto blockSize = 128;
+    // enough blocks to give one thread to each track slot
+    auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize;
+    loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tksoa, soa, ws_d.get(), ptMin);
+    cudaCheck(cudaGetLastError());
+#else
+    init(soa, ws_d.get());
+    loadTracks(tksoa, soa, ws_d.get(), ptMin);
+#endif
+
+#ifdef __CUDACC__
+    if (oneKernel_) {
+      // implemented only for density clusters
+#ifndef THREE_KERNELS
+      vertexFinderOneKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+#else
+      vertexFinderKernel1<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+      cudaCheck(cudaGetLastError());
+      // one block per vertex...
+      splitVerticesKernel<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f);
+      cudaCheck(cudaGetLastError());
+      vertexFinderKernel2<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get());
+#endif
+    } else {  // five kernels
+      // clustering: the first enabled algorithm is used; none is launched if all flags are false
+      if (useDensity_) {
+        clusterTracksByDensityKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+      } else if (useDBSCAN_) {
+        clusterTracksDBSCAN<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+      } else if (useIterative_) {
+        clusterTracksIterative<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+      }
+      cudaCheck(cudaGetLastError());
+      fitVerticesKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 50.);
+      cudaCheck(cudaGetLastError());
+      // one block per vertex...
+      splitVerticesKernel<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f);
+      cudaCheck(cudaGetLastError());
+      fitVerticesKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 5000.);
+      cudaCheck(cudaGetLastError());
+      sortByPt2Kernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get());
+    }
+    cudaCheck(cudaGetLastError());
+#else  // __CUDACC__
+    // host version: same sequence as the five-kernel path; oneKernel_ is not consulted here
+    if (useDensity_) {
+      clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max);
+    } else if (useDBSCAN_) {
+      clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max);
+    } else if (useIterative_) {
+      clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max);
+    }
+    // std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl;
+    fitVertices(soa, ws_d.get(), 50.);
+    // one block per vertex!
+    splitVertices(soa, ws_d.get(), 9.f);
+    fitVertices(soa, ws_d.get(), 5000.);
+    sortByPt2(soa, ws_d.get());
+#endif
+
+    return vertices;
+  }
+
+}  // namespace gpuVertexFinder
+
+#undef FROM
diff --git a/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py b/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py
index 77a9f367b9d9b..903c2a894ff86 100644
--- a/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py
+++ b/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py
@@ -18,5 +18,3 @@
        refToPSet_ = cms.string('pvClusterComparer')
     )
 )
-
-
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
index 0f4f4dee63832..f5c154b298574 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
+++ b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
@@ -2,8 +2,41 @@
 <use name="root"/>
 <use name="FWCore/Framework"/>
 <use name="FWCore/ParameterSet"/>
-<use name="MagneticField/Records"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
 <use name="MagneticField/Engine"/>
-<use name="TrackingTools/TransientTrack"/>
+<use name="MagneticField/Records"/>
 <use name="RecoVertex/KalmanVertexFit"/>
 <use name="SimDataFormats/Track"/>
+<use name="TrackingTools/TransientTrack"/>
+
+<bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderOneKernel_t">
+  <use name="cuda"/>
+  <flags CUDA_FLAGS="-g -DGPU_DEBUG -DONE_KERNEL"/>
+  <flags CXXFLAGS="-g"/>
+</bin>
+
+<bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderByDensity_t">
+  <use name="cuda"/>
+  <flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
+  <flags CXXFLAGS="-g"/>
+</bin>
+
+<bin file="cpuVertexFinder_t.cpp" name="cpuVertexFinderByDensity_t">
+  <flags CXXFLAGS="-g  -DGPU_DEBUG"/>
+</bin>
+
+<bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderDBSCAN_t">
+  <use name="cuda"/>
+  <flags CUDA_FLAGS="-g -DGPU_DEBUG -DUSE_DBSCAN"/>
+  <flags CXXFLAGS="-g"/>
+</bin>
+
+<bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderIterative_t">
+  <use name="cuda"/>
+  <flags CUDA_FLAGS="-g -DGPU_DEBUG -DUSE_ITERATIVE"/>
+  <flags CXXFLAGS="-g"/>
+</bin>
+
+<bin file="cpuVertexFinder_t.cpp" name="cpuVertexFinderIterative_t">
+  <flags CXXFLAGS="-g -DGPU_DEBUG -DUSE_ITERATIVE"/>
+</bin>
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
new file mode 100644
index 0000000000000..e3298f8c5761b
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
@@ -0,0 +1,347 @@
+#include <cmath>
+#include <cstdint>
+#include <iostream>
+#include <random>
+#include <vector>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
+#ifdef USE_DBSCAN
+#include "../plugins/gpuClusterTracksDBSCAN.h"
+#define CLUSTERIZE gpuVertexFinder::clusterTracksDBSCAN
+#elif USE_ITERATIVE
+#include "../plugins/gpuClusterTracksIterative.h"
+#define CLUSTERIZE gpuVertexFinder::clusterTracksIterative
+#else
+#include "../plugins/gpuClusterTracksByDensity.h"
+#define CLUSTERIZE gpuVertexFinder::clusterTracksByDensityKernel
+#endif
+#include "../plugins/gpuFitVertices.h"
+#include "../plugins/gpuSortByPt2.h"
+#include "../plugins/gpuSplitVertices.h"
+
+#ifdef ONE_KERNEL
+#ifdef __CUDACC__
+// Test-local single-kernel chain (compiled only with ONE_KERNEL under nvcc): density
+// clustering, fit, splitting, final fit and sorting by pt^2, separated by block-wide barriers.
+__global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,
+                                      gpuVertexFinder::WorkSpace* pws,
+                                      int minT,      // min number of neighbours to be "seed"
+                                      float eps,     // max absolute distance to cluster
+                                      float errmax,  // max error to be "seed"
+                                      float chi2max  // max normalized distance to cluster,
+) {
+  clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
+  __syncthreads();
+  fitVertices(pdata, pws, 50.);
+  __syncthreads();
+  splitVertices(pdata, pws, 9.f);
+  __syncthreads();
+  fitVertices(pdata, pws, 5000.);
+  __syncthreads();
+  sortByPt2(pdata, pws);
+}
+#endif
+#endif
+
+// One generated toy event, as filled by ClusterGenerator.
+struct Event {
+  std::vector<float> zvert;      // z position of each generated vertex
+  std::vector<uint16_t> itrack;  // per-vertex entry, sized to the number of vertices
+  std::vector<float> ztrack;     // z of each generated track (vertex tracks first, then noise)
+  std::vector<float> eztrack;    // squared z error of each track
+  std::vector<float> pttrack;    // squared pt of each track
+  std::vector<uint16_t> ivert;   // index of the generating vertex, 9999 for noise tracks
+};
+
+// Generator of toy events for the vertex-finder tests.
+//
+// Each call draws a Poisson-distributed number of vertices (mean nvert) with Gaussian z
+// (sigma 3.5), attaches to each a Poisson-distributed number of tracks (mean ntrack) smeared
+// around the vertex by a per-track error drawn uniformly in [0.005, 0.025), and finally adds
+// noise tracks spread uniformly in [-13, 13). The random engine is default-seeded, so the
+// sequence of events is reproducible.
+struct ClusterGenerator {
+  explicit ClusterGenerator(float nvert, float ntrack)
+      : rgen(-13., 13), errgen(0.005, 0.025), clusGen(nvert), trackGen(ntrack), gauss(0., 1.), ptGen(1.) {}
+
+  // Overwrite ev with a new event; all the per-vertex and per-track vectors are rebuilt, so
+  // the same Event object can be reused from one call to the next.
+  void operator()(Event& ev) {
+    int nclus = clusGen(reng);
+    ev.zvert.resize(nclus);
+    ev.itrack.resize(nclus);
+    for (auto& z : ev.zvert) {
+      z = 3.5f * gauss(reng);
+    }
+
+    // drop the tracks of the previous event: every per-track vector must restart from empty,
+    // otherwise their entries no longer refer to the same track
+    ev.ztrack.clear();
+    ev.eztrack.clear();
+    ev.ivert.clear();
+    ev.pttrack.clear();
+    for (int iv = 0; iv < nclus; ++iv) {
+      auto nt = trackGen(reng);
+      ev.itrack[iv] = nt;  // number of tracks generated for this vertex
+      for (int it = 0; it < nt; ++it) {
+        auto err = errgen(reng);  // reality is not flat....
+        ev.ztrack.push_back(ev.zvert[iv] + err * gauss(reng));
+        ev.eztrack.push_back(err * err);
+        ev.ivert.push_back(iv);
+        // vertex 5 gets a harder pt spectrum than the others; the value stored is pt^2
+        ev.pttrack.push_back((iv == 5 ? 1.f : 0.5f) + ptGen(reng));
+        ev.pttrack.back() *= ev.pttrack.back();
+      }
+    }
+    // add noise
+    auto nt = 2 * trackGen(reng);
+    for (int it = 0; it < nt; ++it) {
+      auto err = 0.03f;
+      ev.ztrack.push_back(rgen(reng));
+      ev.eztrack.push_back(err * err);
+      ev.ivert.push_back(9999);  // marker: not associated to any vertex
+      ev.pttrack.push_back(0.5f + ptGen(reng));
+      ev.pttrack.back() *= ev.pttrack.back();
+    }
+  }
+
+  std::mt19937 reng;                             // default-seeded engine shared by all distributions
+  std::uniform_real_distribution<float> rgen;    // z of the noise tracks
+  std::uniform_real_distribution<float> errgen;  // z error of the vertex tracks
+  std::poisson_distribution<int> clusGen;        // number of vertices
+  std::poisson_distribution<int> trackGen;       // number of tracks per vertex
+  std::normal_distribution<float> gauss;         // unit Gaussian
+  std::exponential_distribution<float> ptGen;    // pt above the fixed offset
+};
+
+// a macro SORRY
+#define LOC_ONGPU(M) ((char*)(onGPU_d.get()) + offsetof(gpuVertexFinder::ZVertices, M))
+#define LOC_WS(M) ((char*)(ws_d.get()) + offsetof(gpuVertexFinder::WorkSpace, M))
+
+// Debug helper: print the number of tracks in the workspace and the final and intermediate
+// number of vertices.
+__global__ void print(gpuVertexFinder::ZVertices const* pdata, gpuVertexFinder::WorkSpace const* pws) {
+  printf("nt,nv %d %d,%d\n", pws->ntrks, pdata->nvFinal, pws->nvIntermediate);
+}
+
+// Test driver: for three average vertex multiplicities and twelve parameter sets each, generate
+// a toy event, run clustering, fit, splitting, final fit and sorting (on the GPU when compiled
+// by nvcc, on the host otherwise) and print summary statistics after each step.
+// Always returns 0; failures show up through cudaCheck or the assertions in the algorithms.
+int main() {
+#ifdef __CUDACC__
+  cms::cudatest::requireDevices();
+
+  auto onGPU_d = cms::cuda::make_device_unique<gpuVertexFinder::ZVertices[]>(1, nullptr);
+  auto ws_d = cms::cuda::make_device_unique<gpuVertexFinder::WorkSpace[]>(1, nullptr);
+#else
+  auto onGPU_d = std::make_unique<gpuVertexFinder::ZVertices>();
+  auto ws_d = std::make_unique<gpuVertexFinder::WorkSpace>();
+#endif
+
+  Event ev;
+
+  float eps = 0.1f;
+  std::array<float, 3> par{{eps, 0.01f, 9.0f}};
+  for (int nav = 30; nav < 80; nav += 20) {
+    ClusterGenerator gen(nav, 10);
+
+    for (int i = 8; i < 20; ++i) {
+      auto kk = i / 4;  // M param
+
+      gen(ev);
+
+#ifdef __CUDACC__
+      init<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
+#else
+      onGPU_d->init();
+      ws_d->init();
+#endif
+
+      std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl;
+      // WorkSpace::ntrks is a uint32_t: copy from a variable of exactly that type instead of
+      // the first four bytes of a size_t
+      uint32_t const nt = static_cast<uint32_t>(ev.ztrack.size());
+#ifdef __CUDACC__
+      cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice));
+      cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice));
+      cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
+      cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
+#else
+      ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t));
+      ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size());
+      ::memcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size());
+      ::memcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size());
+#endif
+
+      std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i % 4) << std::endl;
+
+      // clustering parameters {eps, errmax, chi2max} for this pass
+      if ((i % 4) == 0)
+        par = {{eps, 0.02f, 12.0f}};
+      if ((i % 4) == 1)
+        par = {{eps, 0.02f, 9.0f}};
+      if ((i % 4) == 2)
+        par = {{eps, 0.01f, 9.0f}};
+      if ((i % 4) == 3)
+        par = {{0.7f * eps, 0.01f, 9.0f}};
+
+      uint32_t nv = 0;
+#ifdef __CUDACC__
+      print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
+      cudaCheck(cudaGetLastError());
+      // errors raised while the kernel runs are reported by the synchronization
+      cudaCheck(cudaDeviceSynchronize());
+
+#ifdef ONE_KERNEL
+      cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
+#else
+      cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
+#endif
+      print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
+
+      cudaCheck(cudaGetLastError());
+      cudaCheck(cudaDeviceSynchronize());
+
+      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
+      cudaCheck(cudaGetLastError());
+      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+
+#else
+      print(onGPU_d.get(), ws_d.get());
+      CLUSTERIZE(onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
+      print(onGPU_d.get(), ws_d.get());
+      fitVertices(onGPU_d.get(), ws_d.get(), 50.f);
+      nv = onGPU_d->nvFinal;
+#endif
+
+      if (nv == 0) {
+        std::cout << "NO VERTICES???" << std::endl;
+        continue;
+      }
+
+      float* zv = nullptr;
+      float* wv = nullptr;
+      float* ptv2 = nullptr;
+      int32_t* nn = nullptr;
+      uint16_t* ind = nullptr;
+
+      // keep chi2 separated...
+      float chi2[2 * nv];  // make space for splitting...
+
+#ifdef __CUDACC__
+      // host-side copies of the device arrays
+      float hzv[2 * nv];
+      float hwv[2 * nv];
+      float hptv2[2 * nv];
+      int32_t hnn[2 * nv];
+      uint16_t hind[2 * nv];
+
+      zv = hzv;
+      wv = hwv;
+      ptv2 = hptv2;
+      nn = hnn;
+      ind = hind;
+#else
+      // on the host the results can be read in place
+      zv = onGPU_d->zv;
+      wv = onGPU_d->wv;
+      ptv2 = onGPU_d->ptv2;
+      nn = onGPU_d->ndof;
+      ind = onGPU_d->sortInd;
+#endif
+
+#ifdef __CUDACC__
+      cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+#else
+      memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+#endif
+
+      // chi2 per degree of freedom
+      for (auto j = 0U; j < nv; ++j)
+        if (nn[j] > 0)
+          chi2[j] /= float(nn[j]);
+      {
+        auto mx = std::minmax_element(chi2, chi2 + nv);
+        std::cout << "after fit nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl;
+      }
+
+#ifdef __CUDACC__
+      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
+      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+#else
+      fitVertices(onGPU_d.get(), ws_d.get(), 50.f);
+      nv = onGPU_d->nvFinal;
+      memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+#endif
+
+      for (auto j = 0U; j < nv; ++j)
+        if (nn[j] > 0)
+          chi2[j] /= float(nn[j]);
+      {
+        auto mx = std::minmax_element(chi2, chi2 + nv);
+        std::cout << "before splitting nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl;
+      }
+
+#ifdef __CUDACC__
+      // one vertex per block!!!
+      cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
+      cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+#else
+      splitVertices(onGPU_d.get(), ws_d.get(), 9.f);
+      nv = ws_d->nvIntermediate;
+#endif
+      std::cout << "after split " << nv << std::endl;
+
+#ifdef __CUDACC__
+      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
+      cudaCheck(cudaGetLastError());
+
+      cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get());
+      cudaCheck(cudaGetLastError());
+      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+#else
+      fitVertices(onGPU_d.get(), ws_d.get(), 5000.f);
+      sortByPt2(onGPU_d.get(), ws_d.get());
+      nv = onGPU_d->nvFinal;
+      memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+#endif
+
+      if (nv == 0) {
+        std::cout << "NO VERTICES???" << std::endl;
+        continue;
+      }
+
+#ifdef __CUDACC__
+      cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost));
+#endif
+      for (auto j = 0U; j < nv; ++j)
+        if (nn[j] > 0)
+          chi2[j] /= float(nn[j]);
+      {
+        auto mx = std::minmax_element(chi2, chi2 + nv);
+        std::cout << "nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl;
+      }
+
+      {
+        auto mx = std::minmax_element(wv, wv + nv);
+        std::cout << "min max error " << 1. / std::sqrt(*mx.first) << ' ' << 1. / std::sqrt(*mx.second) << std::endl;
+      }
+
+      {
+        auto mx = std::minmax_element(ptv2, ptv2 + nv);
+        std::cout << "min max ptv2 " << *mx.first << ' ' << *mx.second << std::endl;
+        std::cout << "min max ptv2 " << ptv2[ind[0]] << ' ' << ptv2[ind[nv - 1]] << " at " << ind[0] << ' '
+                  << ind[nv - 1] << std::endl;
+      }
+
+      // distance of each reconstructed vertex from the closest generated track
+      float dd[nv];
+      for (auto kv = 0U; kv < nv; ++kv) {
+        auto zr = zv[kv];
+        auto md = 500.0f;
+        for (auto zs : ev.ztrack) {
+          auto d = std::abs(zr - zs);
+          md = std::min(d, md);
+        }
+        dd[kv] = md;
+      }
+      // NOTE: i runs from 8 to 19, so this dump is currently never printed
+      if (i == 6) {
+        for (auto d : dd)
+          std::cout << d << ' ';
+        std::cout << std::endl;
+      }
+      auto mx = std::minmax_element(dd, dd + nv);
+      float rms = 0;
+      for (auto d : dd)
+        rms += d * d;
+      // with a single vertex nv - 1 is zero: report 0 instead of dividing by it
+      rms = nv > 1 ? std::sqrt(rms) / (nv - 1) : 0.f;
+      std::cout << "min max rms " << *mx.first << ' ' << *mx.second << ' ' << rms << std::endl;
+
+    }  // loop on events
+  }    // loop on ave vert
+
+  return 0;
+}
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/cpuVertexFinder_t.cpp b/RecoPixelVertexing/PixelVertexFinding/test/cpuVertexFinder_t.cpp
new file mode 100644
index 0000000000000..a7906fe0d03f5
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/test/cpuVertexFinder_t.cpp
@@ -0,0 +1 @@
+#include "VertexFinder_t.h"
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
new file mode 100644
index 0000000000000..a7906fe0d03f5
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
@@ -0,0 +1 @@
+#include "VertexFinder_t.h"
diff --git a/RecoTracker/Configuration/python/customizePixelOnlyForProfiling.py b/RecoTracker/Configuration/python/customizePixelOnlyForProfiling.py
new file mode 100644
index 0000000000000..24774bbda649c
--- /dev/null
+++ b/RecoTracker/Configuration/python/customizePixelOnlyForProfiling.py
@@ -0,0 +1,59 @@
+import FWCore.ParameterSet.Config as cms
+
+# Customise the Pixel-only reconstruction to run on GPU
+#
+# Run the unpacker, clustering, ntuplets, track fit and vertex reconstruction on GPU.
+def customizePixelOnlyForProfilingGPUOnly(process):
+
+  process.consumer = cms.EDAnalyzer("GenericConsumer",
+      eventProducts = cms.untracked.vstring('caHitNtupletCUDA', 'pixelVertexCUDA')
+  )
+
+  process.consume_step = cms.EndPath(process.consumer)
+
+  process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.consume_step)
+
+  return process
+
+
+# Customise the Pixel-only reconstruction to run on GPU, and copy the data to the host
+#
+# Run the unpacker, clustering, ntuplets, track fit and vertex reconstruction on GPU,
+# and copy all the products to the host in SoA format.
+#
+# The same customisation can be also used on the SoA CPU workflow, running up to the
+# tracks and vertices on the CPU in SoA format, without conversion to legacy format.
+def customizePixelOnlyForProfilingGPUWithHostCopy(process):
+
+  #? process.siPixelRecHitSoAFromLegacy.convertToLegacy = False
+
+  process.consumer = cms.EDAnalyzer("GenericConsumer",
+      eventProducts = cms.untracked.vstring('pixelTrackSoA', 'pixelVertexSoA')
+  )
+
+  process.consume_step = cms.EndPath(process.consumer)
+
+  process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.consume_step)
+
+  return process
+
+
+# Customise the Pixel-only reconstruction to run on GPU, copy the data to the host,
+# and convert to legacy format
+#
+# Run the unpacker, clustering, ntuplets, track fit and vertex reconstruction on GPU;
+# copy all the products to the host in SoA format; and convert them to legacy format.
+#
+# The same customisation can be also used on the CPU workflow, running up to the
+# tracks and vertices on the CPU.
+def customizePixelOnlyForProfiling(process):
+
+  process.consumer = cms.EDAnalyzer("GenericConsumer",
+      eventProducts = cms.untracked.vstring('pixelTracks', 'pixelVertices')
+  )
+
+  process.consume_step = cms.EndPath(process.consumer)
+
+  process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.consume_step)
+
+  return process
diff --git a/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml b/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml
index 4dc0bfafbb439..65c849c69bbdf 100644
--- a/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml
+++ b/RecoTracker/TkSeedGenerator/plugins/BuildFile.xml
@@ -1,19 +1,15 @@
-<use name="PhysicsTools/TensorFlow"/>
+<use name="cuda"/>
 <use name="CommonTools/RecoAlgos"/>
-<use name="DataFormats/L1TrackTrigger"/>
-<use name="RecoTracker/TkSeedGenerator"/>
-<use name="RecoTracker/TkTrackingRegions"/>
-<use name="RecoPixelVertexing/PixelTriplets"/>
-<use name="RecoPixelVertexing/PixelTrackFitting"/>
-<use name="RecoPixelVertexing/PixelLowPtUtilities"/>
 <use name="CommonTools/Utils"/>
 <use name="DataFormats/BeamSpot"/>
 <use name="DataFormats/Common"/>
+<use name="DataFormats/L1TrackTrigger"/>
 <use name="DataFormats/Math"/>
 <use name="DataFormats/SiPixelDetId"/>
 <use name="DataFormats/SiStripDetId"/>
 <use name="DataFormats/TrackReco"/>
 <use name="DataFormats/TrackerCommon"/>
+<use name="DataFormats/TrackerRecHit2D"/>
 <use name="DataFormats/TrackingRecHit"/>
 <use name="DataFormats/TrajectorySeed"/>
 <use name="DataFormats/TrajectoryState"/>
@@ -26,14 +22,21 @@
 <use name="Geometry/CommonDetUnit"/>
 <use name="Geometry/Records"/>
 <use name="Geometry/TrackerGeometryBuilder"/>
+<use name="HeterogeneousCore/CUDACore"/>
 <use name="MagneticField/Engine"/>
 <use name="MagneticField/Records"/>
 <use name="MagneticField/UniformEngine"/>
+<use name="PhysicsTools/TensorFlow"/>
+<use name="RecoPixelVertexing/PixelLowPtUtilities"/>
+<use name="RecoPixelVertexing/PixelTrackFitting"/>
+<use name="RecoPixelVertexing/PixelTriplets"/>
 <use name="RecoTracker/MeasurementDet"/>
 <use name="RecoTracker/Record"/>
 <use name="RecoTracker/SpecialSeedGenerators" source_only="1"/>
 <use name="RecoTracker/TkHitPairs"/>
+<use name="RecoTracker/TkSeedGenerator"/>
 <use name="RecoTracker/TkSeedingLayers"/>
+<use name="RecoTracker/TkTrackingRegions"/>
 <use name="RecoTracker/TransientTrackingRecHit"/>
 <use name="TrackingTools/GeomPropagators"/>
 <use name="TrackingTools/KalmanUpdators"/>
@@ -44,5 +47,4 @@
 <use name="TrackingTools/TransientTrackingRecHit"/>
 <library file="*.cc" name="RecoTrackerTkSeedGeneratorPlugins">
   <flags EDM_PLUGIN="1"/>
-  <use name="DataFormats/TrackerRecHit2D"/>
 </library>
diff --git a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc
new file mode 100644
index 0000000000000..0e5823fc46c46
--- /dev/null
+++ b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc
@@ -0,0 +1,170 @@
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "DataFormats/BeamSpot/interface/BeamSpot.h"
+#include "DataFormats/GeometrySurface/interface/Plane.h"
+#include "DataFormats/TrackerCommon/interface/TrackerTopology.h"
+#include "DataFormats/TrackingRecHit/interface/InvalidTrackingRecHit.h"
+#include "DataFormats/TrajectorySeed/interface/TrajectorySeedCollection.h"
+#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "Geometry/CommonDetUnit/interface/GeomDet.h"
+#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h"
+#include "Geometry/Records/interface/TrackerTopologyRcd.h"
+#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h"
+#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h"
+#include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h"
+#include "TrackingTools/AnalyticalJacobians/interface/JacobianLocalToCurvilinear.h"
+#include "TrackingTools/MaterialEffects/interface/PropagatorWithMaterial.h"
+#include "TrackingTools/Records/interface/TrackingComponentsRecord.h"
+#include "TrackingTools/TrajectoryParametrization/interface/CurvilinearTrajectoryError.h"
+#include "TrackingTools/TrajectoryParametrization/interface/GlobalTrajectoryParameters.h"
+#include "TrackingTools/TrajectoryState/interface/TrajectoryStateTransform.h"
+
+/*
+  produces seeds directly from cuda produced tuples
+*/
+class SeedProducerFromSoA : public edm::global::EDProducer<> {
+public:
+  explicit SeedProducerFromSoA(const edm::ParameterSet& iConfig);
+  ~SeedProducerFromSoA() override = default;
+  // Declares the "beamSpot", "src" and "minNumberOfHits" parameters and their defaults.
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override;
+
+  // Event data tokens
+  const edm::EDGetTokenT<reco::BeamSpot> tBeamSpot_;
+  const edm::EDGetTokenT<PixelTrackHeterogeneous> tokenTrack_;
+  // Event setup tokens
+  const edm::ESGetToken<MagneticField, IdealMagneticFieldRecord> idealMagneticFieldToken_;
+  const edm::ESGetToken<TrackerGeometry, TrackerDigiGeometryRecord> trackerDigiGeometryToken_;
+  const edm::ESGetToken<Propagator, TrackingComponentsRecord> trackerPropagatorToken_;
+  int32_t minNumberOfHits_;  // produce() skips tracks with fewer hits than this
+};
+
+SeedProducerFromSoA::SeedProducerFromSoA(const edm::ParameterSet& iConfig)
+    : tBeamSpot_(consumes<reco::BeamSpot>(iConfig.getParameter<edm::InputTag>("beamSpot"))),
+      tokenTrack_(consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("src"))),
+      idealMagneticFieldToken_(esConsumes()),
+      trackerDigiGeometryToken_(esConsumes()),
+      trackerPropagatorToken_(esConsumes(edm::ESInputTag("PropagatorWithMaterial"))),
+      minNumberOfHits_(iConfig.getParameter<int>("minNumberOfHits"))
+// All inputs are declared in the initializer list; the only product is a TrajectorySeedCollection.
+{
+  produces<TrajectorySeedCollection>();
+}
+
+void SeedProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription desc;
+  desc.add<edm::InputTag>("beamSpot", edm::InputTag("offlineBeamSpot"));
+  desc.add<edm::InputTag>("src", edm::InputTag("pixelTrackSoA"));
+  desc.add<int>("minNumberOfHits", 0);  // default 0: no track is rejected on its hit count
+  // Registered under the module's default label.
+  descriptions.addWithDefaultLabel(desc);
+}
+
+void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const {
+  // Converts every selected track of the input SoA into a TrajectorySeed and puts the collection in the event.
+  auto result = std::make_unique<TrajectorySeedCollection>();
+
+  auto const& fieldESH = iSetup.getHandle(idealMagneticFieldToken_);
+  auto const& tracker = iSetup.getHandle(trackerDigiGeometryToken_);
+  auto const& dus = tracker->detUnits();
+
+  auto const& propagatorHandle = iSetup.getHandle(trackerPropagatorToken_);
+  const Propagator* propagator = &(*propagatorHandle);
+
+  const auto& bsh = iEvent.get(tBeamSpot_);
+  // std::cout << "beamspot " << bsh.x0() << ' ' << bsh.y0() << ' ' << bsh.z0() << std::endl;
+  GlobalPoint bs(bsh.x0(), bsh.y0(), bsh.z0());
+
+  const auto& tsoa = *(iEvent.get(tokenTrack_));
+
+  auto const* quality = tsoa.qualityData();
+  auto const& fit = tsoa.stateAtBS;
+  auto const& detIndices = tsoa.detIndices;
+  auto maxTracks = tsoa.stride();
+
+  int32_t nt = 0;  // number of tracks passing the selection; not read anywhere else in this function
+  for (int32_t it = 0; it < maxTracks; ++it) {
+    auto nHits = tsoa.nHits(it);
+    if (nHits == 0)
+      break;  // this is a guard: maybe we need to move to nTracks...
+
+    auto q = quality[it];
+    if (q != pixelTrack::Quality::loose)
+      continue;  // FIXME
+    if (nHits < minNumberOfHits_)
+      continue;
+    ++nt;
+
+    // fill hits with invalid just to hold the detId
+    auto b = detIndices.begin(it);
+    edm::OwnVector<TrackingRecHit> hits;
+    for (int iHit = 0; iHit < nHits; ++iHit) {
+      auto const* det = dus[*(b + iHit)];
+      // FIXME at some point get a proper type ...
+      hits.push_back(new InvalidTrackingRecHit(*det, TrackingRecHit::bad));
+    }
+
+    // mind: these values are relative to the beamspot!
+
+    float phi = tsoa.phi(it);
+
+    riemannFit::Vector5d ipar, opar;
+    riemannFit::Matrix5d icov, ocov;
+    fit.copyToDense(ipar, icov, it);
+    riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov);
+
+    LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.);
+    AlgebraicSymMatrix55 m;
+    for (int i = 0; i < 5; ++i)
+      for (int j = i; j < 5; ++j)
+        m(i, j) = ocov(i, j);  // only the upper triangle (j >= i) is assigned into the symmetric matrix
+
+    float sp = std::sin(phi);
+    float cp = std::cos(phi);
+    Surface::RotationType rot(sp, -cp, 0, 0, 0, -1.f, cp, sp, 0);
+    // Plane positioned at the beamspot and oriented by the track phi; lpar is expressed on it.
+    Plane impPointPlane(bs, rot);
+    GlobalTrajectoryParameters gp(impPointPlane.toGlobal(lpar.position()),
+                                  impPointPlane.toGlobal(lpar.momentum()),
+                                  lpar.charge(),
+                                  fieldESH.product());
+
+    JacobianLocalToCurvilinear jl2c(impPointPlane, lpar, *fieldESH.product());
+
+    AlgebraicSymMatrix55 mo = ROOT::Math::Similarity(jl2c.jacobian(), m);
+
+    FreeTrajectoryState fts(gp, CurvilinearTrajectoryError(mo));
+
+    auto const& lastHit = hits.back();  // hits is non-empty here: nHits == 0 left the loop above
+    // The seed state is the propagation of the free state to the surface of the last hit.
+    TrajectoryStateOnSurface outerState = propagator->propagate(fts, *lastHit.surface());
+
+    if (!outerState.isValid()) {
+      edm::LogError("SeedFromGPU") << " was trying to create a seed from:\n"
+                                   << fts << "\n propagating to: " << lastHit.geographicalId().rawId();
+      continue;  // a failed propagation is logged and the track yields no seed
+    }
+
+    auto const& pTraj = trajectoryStateTransform::persistentState(outerState, lastHit.geographicalId().rawId());
+
+    result->emplace_back(pTraj, hits, alongMomentum);
+  }
+
+  iEvent.put(std::move(result));
+}
+
+DEFINE_FWK_MODULE(SeedProducerFromSoA);
diff --git a/SimTracker/TrackerHitAssociation/BuildFile.xml b/SimTracker/TrackerHitAssociation/BuildFile.xml
index aa66f443cabb9..5ea8794eda917 100644
--- a/SimTracker/TrackerHitAssociation/BuildFile.xml
+++ b/SimTracker/TrackerHitAssociation/BuildFile.xml
@@ -5,6 +5,7 @@
 <use name="SimDataFormats/TrackingHit"/>
 <use name="SimDataFormats/TrackerDigiSimLink"/>
 <use name="DataFormats/TrackerRecHit2D"/>
+<use name="CUDADataFormats/Common"/>
 <use name="TrackingTools/TransientTrackingRecHit"/>
 <use name="DataFormats/SiPixelDetId"/>
 <use name="DataFormats/DetId"/>
@@ -18,6 +19,7 @@
 <use name="clhep"/>
 <use name="boost"/>
 <use name="root"/>
+<use name="cuda"/>
 <export>
   <lib name="1"/>
 </export>
diff --git a/SimTracker/TrackerHitAssociation/interface/trackerHitAssociationHeterogeneous.h b/SimTracker/TrackerHitAssociation/interface/trackerHitAssociationHeterogeneous.h
new file mode 100644
index 0000000000000..86fe89f05b7d2
--- /dev/null
+++ b/SimTracker/TrackerHitAssociation/interface/trackerHitAssociationHeterogeneous.h
@@ -0,0 +1,69 @@
+#ifndef SimTracker_TrackerHitAssociation_plugins_trackerHitAssociationHeterogeneousProduct_h
+#define SimTracker_TrackerHitAssociation_plugins_trackerHitAssociationHeterogeneousProduct_h
+
+#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+namespace trackerHitAssociationHeterogeneous {
+
+  struct ClusterSLView {
+    using Clus2TP = std::array<uint32_t, 7>;
+    // Non-owning pointers; Product's allocating constructor points them at its own buffers.
+    Clus2TP* links_d;   // nlinks link records
+    uint32_t* tkId_d;   // per hit: index into links_d claimed by the first matching digi (0 = none)
+    uint32_t* tkId2_d;  // per hit: second slot, used when the first holds a different track
+    uint32_t* n1_d;     // per hit: number of digis counted for tkId_d
+    uint32_t* n2_d;     // per hit: number of digis counted for tkId2_d
+  };
+
+  template <typename Traits>
+  class Product {
+  public:
+    template <typename T>
+    using unique_ptr = typename Traits::template unique_ptr<T>;
+
+    Product() = default;  // empty product: value-initialized view, zero links and zero hits
+    ~Product() = default;
+    Product(Product const&) = delete;
+    Product(Product&&) = default;
+    // Allocates the link and per-hit buffers through Traits and points the view at them.
+    Product(int nlinks, int nhits, cudaStream_t stream);
+
+    ClusterSLView& view() { return m_view; }
+    ClusterSLView const& view() const { return m_view; }
+
+    int nLinks() const { return m_nLinks; }
+    int nHits() const { return m_nHits; }
+
+  private:
+    static constexpr uint32_t n32 = 4;  // number of per-hit uint32_t arrays carved out of m_store32
+
+    unique_ptr<uint32_t[]> m_storeTP;  //!
+    unique_ptr<uint32_t[]> m_store32;  //!
+
+    ClusterSLView m_view{};  //!
+    // Default member initializers keep a default-constructed Product well defined.
+    int m_nLinks = 0;
+    int m_nHits = 0;
+  };
+
+  template <typename Traits>
+  Product<Traits>::Product(int nlinks, int nhits, cudaStream_t stream) : m_nLinks(nlinks), m_nHits(nhits) {
+    m_storeTP = Traits::template make_device_unique<uint32_t[]>(m_nLinks * 7, stream);  // 7 words per Clus2TP
+    m_store32 = Traits::template make_device_unique<uint32_t[]>(m_nHits * n32, stream);
+    // m_store32 is split into n32 consecutive arrays of m_nHits entries each.
+    auto get32 = [&](int i) { return m_store32.get() + i * m_nHits; };
+
+    m_view.links_d = (ClusterSLView::Clus2TP*)(m_storeTP.get());
+    m_view.tkId_d = get32(0);
+    m_view.tkId2_d = get32(1);
+    m_view.n1_d = get32(2);
+    m_view.n2_d = get32(3);
+  }
+
+  using ProductCUDA = Product<cms::cudacompat::GPUTraits>;
+
+}  // namespace trackerHitAssociationHeterogeneous
+
+#endif  // SimTracker_TrackerHitAssociation_plugins_trackerHitAssociationHeterogeneousProduct_h
diff --git a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml
index ecda84011006b..186f04cbd611d 100644
--- a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml
+++ b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml
@@ -1,5 +1,10 @@
+<use name="cuda"/>
+<use name="Geometry/Records" />
+<use name="Geometry/TrackerGeometryBuilder" />
+<use name="HeterogeneousCore/CUDACore"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
 <use name="SimTracker/TrackerHitAssociation"/>
 <use name="SimDataFormats/TrackingAnalysis"/>
-<library file="*.cc" name="SimTrackerTrackerHitAssociationPlugins">
+<library file="*.cc *.cu" name="SimTrackerTrackerHitAssociationPlugins">
   <flags EDM_PLUGIN="1"/>
 </library>
diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
new file mode 100644
index 0000000000000..0aab26d9cc091
--- /dev/null
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
@@ -0,0 +1,224 @@
+#include <atomic>
+#include <limits>
+#include <mutex>
+
+#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h"
+#include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h"
+#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
+
+#include "ClusterSLOnGPU.h"
+
+using ClusterSLView = trackerHitAssociationHeterogeneous::ClusterSLView;
+using Clus2TP = ClusterSLView::Clus2TP;
+
+// #define DUMP_TK2
+
+__global__ void simLink(const SiPixelDigisCUDA::DeviceConstView* dd,
+                        uint32_t ndigis,
+                        TrackingRecHit2DSOAView const* hhp,
+                        ClusterSLView sl,
+                        uint32_t n) {
+  constexpr uint32_t invTK = 0;  // std::numeric_limits<int32_t>::max();
+  using gpuClustering::invalidModuleId;
+  using gpuClustering::maxNumModules;
+  // One thread per digi: find the digi's link record and register it on the digi's hit.
+  auto const& hh = *hhp;
+  auto i = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (i >= ndigis)
+    return;
+
+  auto id = dd->moduleInd(i);
+  if (invalidModuleId == id)
+    return;
+  assert(id < maxNumModules);
+
+  auto ch = pixelgpudetails::pixelToChannel(dd->xx(i), dd->yy(i));
+  auto first = hh.hitsModuleStart(id);
+  auto cl = first + dd->clus(i);  // index used below into the per-hit arrays of sl
+  assert(cl < maxNumModules * blockDim.x);
+  // Search key: only entries [0] and [1] are compared by the lambdas below.
+  const Clus2TP me{{id, ch, 0, 0, 0, 0, 0}};
+
+  auto less = [] __host__ __device__(Clus2TP const& a, Clus2TP const& b) -> bool {
+    // lexicographic order on ([0], [1]); the remaining entries are ignored
+    return a[0] < b[0] or ((not(b[0] < a[0])) and (a[1] < b[1]));
+  };
+
+  auto equal = [] __host__ __device__(Clus2TP const& a, Clus2TP const& b) -> bool {
+    // equality on ([0], [1]); the remaining entries are ignored
+    return a[0] == b[0] and a[1] == b[1];
+  };
+
+  auto const* b = sl.links_d;
+  auto const* e = b + n;
+  // Binary search: requires links_d[0..n) to be ordered consistently with `less`.
+  auto p = cuda_std::lower_bound(b, e, me, less);
+  int32_t j = p - sl.links_d;
+  assert(j >= 0);
+
+  auto getTK = [&](int i) {
+    auto const& l = sl.links_d[i];
+    return l[2];
+  };
+  // lower_bound may return the end position: clamp so that links_d[j] is a valid read.
+  j = std::min(int(j), int(n - 1));
+  if (equal(me, sl.links_d[j])) {
+    auto const itk = j;
+    auto const tk = getTK(j);
+    auto old = atomicCAS(&sl.tkId_d[cl], invTK, itk);  // claim the first slot if it still holds invTK
+    if (invTK == old or tk == getTK(old)) {
+      atomicAdd(&sl.n1_d[cl], 1);
+    } else {
+      auto old = atomicCAS(&sl.tkId2_d[cl], invTK, itk);  // first slot holds another track: try the second
+      if (invTK == old or tk == getTK(old))
+        atomicAdd(&sl.n2_d[cl], 1);
+    }
+  }
+}
+
+__global__ void doZero(uint32_t nhits, ClusterSLView sl) {
+  auto i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i >= nhits)  // valid indices are [0, nhits): each per-hit array holds exactly nhits entries
+    return;
+  // One thread per hit: reset both track slots and both counters.
+  sl.tkId_d[i] = 0;
+  sl.n1_d[i] = 0;
+  sl.tkId2_d[i] = 0;
+  sl.n2_d[i] = 0;
+}
+
+__global__ void dumpLink(int first, int ev, TrackingRecHit2DSOAView const* hhp, uint32_t nhits, ClusterSLView sl) {
+  auto i = first + blockIdx.x * blockDim.x + threadIdx.x;
+  if (i >= nhits)  // valid hit indices are [0, nhits); i == nhits would read past the per-hit arrays
+    return;
+  // One thread per hit, starting at `first`: print one "HIT:" line with the hit and its linked track.
+  auto const& hh = *hhp;
+
+  auto const& tk1 = sl.links_d[sl.tkId_d[i]];
+
+#ifdef DUMP_TK2
+  auto const& tk2 = sl.links_d[sl.tkId2_d[i]];
+  // NOTE(review): this format looks one conversion short of its argument list (n2_d) - confirm.
+  printf("HIT: %d %d %d %d %.4f %.4f %.4f %.4f %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
+#else
+  printf("HIT: %d %d %d %d %.4f %.4f %.4f %.4f %d %d %d %d %d %d %d %d %d\n",
+#endif
+         ev,
+         i,
+         hh.detectorIndex(i),
+         hh.charge(i),
+         hh.xGlobal(i),
+         hh.yGlobal(i),
+         hh.zGlobal(i),
+         hh.rGlobal(i),
+         hh.iphi(i),
+         hh.clusterSizeX(i),
+         hh.clusterSizeY(i),
+         tk1[2],
+         tk1[3],
+         tk1[4],
+         tk1[5],
+         tk1[6],
+         sl.n1_d[i]
+#ifdef DUMP_TK2
+         ,
+         tk2[2],
+         tk2[3],
+         tk2[4],
+         tk2[5],
+         tk2[6],
+         sl.n2_d[i]
+#endif
+  );
+}
+
+namespace clusterSLOnGPU {
+
+  void printCSVHeader() {  // prints the column names matching the "HIT:" lines written by dumpLink
+#ifdef DUMP_TK2
+    printf("HIT: %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n",
+#else
+    printf("HIT: %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n",
+#endif
+           "ev",
+           "ind",
+           "det",
+           "charge",
+           "xg",
+           "yg",
+           "zg",
+           "rg",
+           "iphi",
+           "xsize",
+           "ysize",
+           "tkId",
+           "pt",
+           "eta",
+           "z0",
+           "r0",
+           "n1"
+#ifdef DUMP_TK2
+           ,
+           "tkId2",
+           "pt2",
+           "eta2",  // was "eta": a duplicate of the first track's column name
+           "z02",
+           "r02",
+           "n2"
+#endif
+    );
+  }
+
+  std::atomic<int> evId(0);
+  std::once_flag doneCSVHeader;
+
+  Kernel::Kernel(bool dump) : doDump(dump) {
+    if (doDump)  // the header is printed at most once, whichever dumping Kernel is constructed first
+      std::call_once(doneCSVHeader, printCSVHeader);
+  }
+
+  trackerHitAssociationHeterogeneous::ProductCUDA Kernel::makeAsync(SiPixelDigisCUDA const& dd,
+                                                                    uint32_t ndigis,
+                                                                    HitsOnCPU const& hh,
+                                                                    Clus2TP const* digi2tp,
+                                                                    uint32_t nhits,
+                                                                    uint32_t nlinks,
+                                                                    cudaStream_t stream) const {
+    trackerHitAssociationHeterogeneous::ProductCUDA product(nlinks, nhits, stream);
+    auto& csl = product.view();
+    // Copy the nlinks link records into the product, then zero and fill the per-hit arrays on `stream`.
+    cudaCheck(cudaMemcpyAsync(csl.links_d, digi2tp, sizeof(Clus2TP) * nlinks, cudaMemcpyDefault, stream));
+
+    if (0 == nhits)
+      return product;
+
+    int ev = ++evId;
+    int threadsPerBlock = 256;
+    // doZero: one thread per hit.
+    int blocks = (nhits + threadsPerBlock - 1) / threadsPerBlock;
+    doZero<<<blocks, threadsPerBlock, 0, stream>>>(nhits, csl);
+    cudaCheck(cudaGetLastError());
+    // simLink: one thread per digi.
+    blocks = (ndigis + threadsPerBlock - 1) / threadsPerBlock;
+    simLink<<<blocks, threadsPerBlock, 0, stream>>>(dd.view(), ndigis, hh.view(), csl, nlinks);
+    cudaCheck(cudaGetLastError());
+
+    if (doDump) {
+      cudaCheck(cudaStreamSynchronize(stream));  // flush previous printf
+      // one line == 200B so each kernel can print only 5K lines....
+      blocks = 16;
+      for (int first = 0; first < int(nhits); first += blocks * threadsPerBlock) {
+        dumpLink<<<blocks, threadsPerBlock, 0, stream>>>(first, ev, hh.view(), nhits, csl);
+        cudaCheck(cudaGetLastError());
+        cudaCheck(cudaStreamSynchronize(stream));  // report errors from the dump kernel instead of dropping them
+      }
+    }
+    cudaCheck(cudaGetLastError());
+
+    return product;
+  }
+
+}  // namespace clusterSLOnGPU
diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h
new file mode 100644
index 0000000000000..3109e6ed45a76
--- /dev/null
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h
@@ -0,0 +1,36 @@
+#ifndef SimTracker_TrackerHitAssociation_plugins_ClusterSLOnGPU_h
+#define SimTracker_TrackerHitAssociation_plugins_ClusterSLOnGPU_h
+
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "SimTracker/TrackerHitAssociation/interface/trackerHitAssociationHeterogeneous.h"
+
+namespace clusterSLOnGPU {
+
+  using ClusterSLView = trackerHitAssociationHeterogeneous::ClusterSLView;
+  using Clus2TP = ClusterSLView::Clus2TP;
+  using HitsOnGPU = TrackingRecHit2DSOAView;
+  using HitsOnCPU = TrackingRecHit2DCUDA;
+
+  class Kernel {
+  public:
+    explicit Kernel(bool dump);
+    ~Kernel() {}
+    trackerHitAssociationHeterogeneous::ProductCUDA makeAsync(SiPixelDigisCUDA const& dd,
+                                                              uint32_t ndigis,
+                                                              HitsOnCPU const& hh,
+                                                              Clus2TP const* digi2tp,
+                                                              uint32_t nhits,
+                                                              uint32_t nlinks,
+                                                              cudaStream_t stream) const;
+    // NOTE(review): the private section below is empty, so doDump is effectively public.
+  private:
+  public:
+    bool doDump;  // when true, makeAsync also launches dumpLink to print one line per hit
+  };
+}  // namespace clusterSLOnGPU
+
+#endif  // SimTracker_TrackerHitAssociation_plugins_ClusterSLOnGPU_h
diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationProducerCUDA.cc b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationProducerCUDA.cc
new file mode 100644
index 0000000000000..35337151eda91
--- /dev/null
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationProducerCUDA.cc
@@ -0,0 +1,227 @@
+#include <memory>
+#include <vector>
+#include <utility>
+
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
+#include "DataFormats/Common/interface/DetSetVector.h"
+#include "DataFormats/Common/interface/DetSetVectorNew.h"
+#include "DataFormats/Common/interface/Handle.h"
+#include "DataFormats/DetId/interface/DetId.h"
+#include "DataFormats/Phase2TrackerCluster/interface/Phase2TrackerCluster1D.h"
+#include "DataFormats/Phase2TrackerDigi/interface/Phase2TrackerDigi.h"
+#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h"
+#include "DataFormats/SiPixelDetId/interface/PixelChannelIdentifier.h"
+#include "DataFormats/SiStripCluster/interface/SiStripCluster.h"
+#include "DataFormats/TrackerRecHit2D/interface/OmniClusterRef.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/global/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h"
+#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "SimDataFormats/Track/interface/SimTrackContainer.h"
+#include "SimDataFormats/TrackerDigiSimLink/interface/PixelDigiSimLink.h"
+#include "SimDataFormats/TrackerDigiSimLink/interface/StripDigiSimLink.h"
+#include "SimDataFormats/TrackingAnalysis/interface/TrackingParticle.h"
+#include "SimDataFormats/TrackingAnalysis/interface/TrackingParticleFwd.h"
+#include "SimTracker/TrackerHitAssociation/interface/ClusterTPAssociation.h"
+
+#include "ClusterSLOnGPU.h"
+
+class ClusterTPAssociationProducerCUDA : public edm::global::EDProducer<> {
+public:
+  typedef std::vector<OmniClusterRef> OmniClusterCollection;
+
+  using ClusterSLGPU = trackerHitAssociationHeterogeneous::ClusterSLView;
+  using Clus2TP = ClusterSLGPU::Clus2TP;
+  using ProductCUDA = trackerHitAssociationHeterogeneous::ProductCUDA;
+
+  explicit ClusterTPAssociationProducerCUDA(const edm::ParameterSet &);
+  ~ClusterTPAssociationProducerCUDA() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions &descriptions);
+
+private:
+  void produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const override;
+  // Lookup table from (SimTrack id, event id) to the TrackingParticle containing that SimTrack.
+  std::map<std::pair<size_t, EncodedEventId>, TrackingParticleRef> makeMap(const edm::Event &iEvent) const;
+  // Returns the (SimTrack id, event id) of every sim link of `detId` whose channel equals `channel`.
+  template <typename T>
+  std::vector<std::pair<uint32_t, EncodedEventId>> getSimTrackId(const edm::Handle<edm::DetSetVector<T>> &simLinks,
+                                                                 const DetId &detId,
+                                                                 uint32_t channel) const;
+  // Input tokens; in this file only the pixel sim links, tracking particles and CUDA products are read.
+  edm::EDGetTokenT<edm::DetSetVector<PixelDigiSimLink>> sipixelSimLinksToken_;
+  edm::EDGetTokenT<edm::DetSetVector<StripDigiSimLink>> sistripSimLinksToken_;
+  edm::EDGetTokenT<edm::DetSetVector<PixelDigiSimLink>> siphase2OTSimLinksToken_;
+  edm::EDGetTokenT<edmNew::DetSetVector<SiPixelCluster>> pixelClustersToken_;
+  edm::EDGetTokenT<edmNew::DetSetVector<SiStripCluster>> stripClustersToken_;
+  edm::EDGetTokenT<edmNew::DetSetVector<Phase2TrackerCluster1D>> phase2OTClustersToken_;
+  edm::EDGetTokenT<TrackingParticleCollection> trackingParticleToken_;
+
+  edm::EDGetTokenT<cms::cuda::Product<SiPixelDigisCUDA>> tGpuDigis;
+  edm::EDGetTokenT<cms::cuda::Product<TrackingRecHit2DCUDA>> tGpuHits;
+
+  edm::EDPutTokenT<cms::cuda::Product<ProductCUDA>> tokenGPUProd_;
+
+  clusterSLOnGPU::Kernel m_gpuAlgo;  // constructed from the "dumpCSV" parameter
+};
+
+ClusterTPAssociationProducerCUDA::ClusterTPAssociationProducerCUDA(const edm::ParameterSet &cfg)
+    : sipixelSimLinksToken_(
+          consumes<edm::DetSetVector<PixelDigiSimLink>>(cfg.getParameter<edm::InputTag>("pixelSimLinkSrc"))),
+      sistripSimLinksToken_(
+          consumes<edm::DetSetVector<StripDigiSimLink>>(cfg.getParameter<edm::InputTag>("stripSimLinkSrc"))),
+      siphase2OTSimLinksToken_(
+          consumes<edm::DetSetVector<PixelDigiSimLink>>(cfg.getParameter<edm::InputTag>("phase2OTSimLinkSrc"))),
+      pixelClustersToken_(
+          consumes<edmNew::DetSetVector<SiPixelCluster>>(cfg.getParameter<edm::InputTag>("pixelClusterSrc"))),
+      stripClustersToken_(
+          consumes<edmNew::DetSetVector<SiStripCluster>>(cfg.getParameter<edm::InputTag>("stripClusterSrc"))),
+      phase2OTClustersToken_(consumes<edmNew::DetSetVector<Phase2TrackerCluster1D>>(
+          cfg.getParameter<edm::InputTag>("phase2OTClusterSrc"))),
+      trackingParticleToken_(
+          consumes<TrackingParticleCollection>(cfg.getParameter<edm::InputTag>("trackingParticleSrc"))),
+      tGpuDigis(consumes<cms::cuda::Product<SiPixelDigisCUDA>>(
+          cfg.getParameter<edm::InputTag>("heterogeneousPixelDigiClusterSrc"))),
+      tGpuHits(consumes<cms::cuda::Product<TrackingRecHit2DCUDA>>(
+          cfg.getParameter<edm::InputTag>("heterogeneousPixelRecHitSrc"))),
+      m_gpuAlgo(cfg.getParameter<bool>("dumpCSV")) {
+  tokenGPUProd_ = produces<cms::cuda::Product<ProductCUDA>>();  // the only product of this module
+}
+
+void ClusterTPAssociationProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions &descriptions) {
+  edm::ParameterSetDescription desc;
+  desc.add<edm::InputTag>("simTrackSrc", edm::InputTag("g4SimHits"));  // NOTE(review): not read by the constructor
+  desc.add<edm::InputTag>("pixelSimLinkSrc", edm::InputTag("simSiPixelDigis"));
+  desc.add<edm::InputTag>("stripSimLinkSrc", edm::InputTag("simSiStripDigis"));
+  desc.add<edm::InputTag>("phase2OTSimLinkSrc", edm::InputTag("simSiPixelDigis", "Tracker"));
+  desc.add<edm::InputTag>("pixelClusterSrc", edm::InputTag("siPixelClusters"));
+  desc.add<edm::InputTag>("stripClusterSrc", edm::InputTag("siStripClusters"));
+  desc.add<edm::InputTag>("phase2OTClusterSrc", edm::InputTag("siPhase2Clusters"));
+  desc.add<edm::InputTag>("trackingParticleSrc", edm::InputTag("mix", "MergedTrackTruth"));
+  desc.add<edm::InputTag>("heterogeneousPixelDigiClusterSrc", edm::InputTag("siPixelClustersPreSplittingCUDA"));
+  desc.add<edm::InputTag>("heterogeneousPixelRecHitSrc", edm::InputTag("siPixelRecHitsPreSplittingCUDA"));
+  // When true, the GPU algorithm prints a CSV header once and one "HIT:" line per hit.
+  desc.add<bool>("dumpCSV", false);
+  // Registered under an explicit label, which tpClusterProducer_cfi.py imports and clones.
+  descriptions.add("tpClusterProducerCUDADefault", desc);
+}
+
+std::map<std::pair<size_t, EncodedEventId>, TrackingParticleRef> ClusterTPAssociationProducerCUDA::makeMap(
+    const edm::Event &iEvent) const {
+  // TrackingParticle
+  edm::Handle<TrackingParticleCollection> TPCollectionH;
+  iEvent.getByToken(trackingParticleToken_, TPCollectionH);
+
+  // prepare temporary map between SimTrackId and TrackingParticle index
+  std::map<std::pair<size_t, EncodedEventId>, TrackingParticleRef> mapping;
+  for (TrackingParticleCollection::size_type itp = 0; itp < TPCollectionH.product()->size(); ++itp) {
+    TrackingParticleRef trackingParticle(TPCollectionH, itp);
+
+    // SimTracks inside TrackingParticle
+    EncodedEventId eid(trackingParticle->eventId());
+    for (auto itrk = trackingParticle->g4Track_begin(); itrk != trackingParticle->g4Track_end(); ++itrk) {
+      std::pair<uint32_t, EncodedEventId> trkid(itrk->trackId(), eid);
+      // insert() keeps an existing entry, so the first TrackingParticle seen for a key wins
+      mapping.insert(std::make_pair(trkid, trackingParticle));
+    }
+  }
+  return mapping;
+}
+
+void ClusterTPAssociationProducerCUDA::produce(edm::StreamID streamID,
+                                               edm::Event &iEvent,
+                                               const edm::EventSetup &iSetup) const {
+  edm::ESHandle<TrackerGeometry> geom;
+  iSetup.get<TrackerDigiGeometryRecord>().get(geom);
+
+  // Pixel DigiSimLink
+  edm::Handle<edm::DetSetVector<PixelDigiSimLink>> sipixelSimLinks;
+  //  iEvent.getByLabel(_pixelSimLinkSrc, sipixelSimLinks);
+  iEvent.getByToken(sipixelSimLinksToken_, sipixelSimLinks);
+
+  // (SimTrack id, event id) -> TrackingParticleRef; makeMap() reads the TrackingParticle collection itself
+  auto mapping = makeMap(iEvent);
+
+  edm::Handle<cms::cuda::Product<SiPixelDigisCUDA>> gd;
+  iEvent.getByToken(tGpuDigis, gd);
+  edm::Handle<cms::cuda::Product<TrackingRecHit2DCUDA>> gh;
+  iEvent.getByToken(tGpuHits, gh);
+
+  cms::cuda::ScopedContextProduce ctx{*gd};
+  auto const &gDigis = ctx.get(*gd);
+  auto const &gHits = ctx.get(*gh);
+  auto ndigis = gDigis.nDigis();
+  auto nhits = gHits.nHits();
+
+  std::vector<Clus2TP> digi2tp;
+  digi2tp.push_back({{0, 0, 0, 0, 0, 0, 0}});  // put at 0 0
+  for (auto const &links : *sipixelSimLinks) {
+    DetId detId(links.detId());
+    const GeomDetUnit *genericDet = geom->idToDetUnit(detId);
+    uint32_t gind = genericDet->index();
+    for (auto const &link : links) {
+      if (link.fraction() < 0.5f) {
+        continue;
+      }
+      auto tkid = std::make_pair(link.SimTrackId(), link.eventId());
+      auto ipos = mapping.find(tkid);
+      if (ipos != mapping.end()) {
+        // Fixed-point encoding of the TrackingParticle kinematics. eta and vz can be
+        // negative: converting a negative floating-point value directly to uint32_t is
+        // undefined behaviour, so go through int32_t (dumpLink prints these with %d).
+        uint32_t pt = 1000 * (*ipos).second->pt();
+        uint32_t eta = static_cast<int32_t>(10000 * (*ipos).second->eta());
+        uint32_t z0 = static_cast<int32_t>(10000 * (*ipos).second->vz());  // in um
+        uint32_t r0 = 10000 * std::sqrt((*ipos).second->vx() * (*ipos).second->vx() +
+                                        (*ipos).second->vy() * (*ipos).second->vy());  // in um
+        digi2tp.push_back({{gind, uint32_t(link.channel()), (*ipos).second.key(), pt, eta, z0, r0}});
+      }
+    }
+  }
+
+  std::sort(digi2tp.begin(), digi2tp.end());
+
+  ctx.emplace(iEvent,
+              tokenGPUProd_,
+              m_gpuAlgo.makeAsync(gDigis, ndigis, gHits, digi2tp.data(), nhits, digi2tp.size(), ctx.stream()));
+}
+
+template <typename T>
+std::vector<std::pair<uint32_t, EncodedEventId>>
+// Returns (SimTrack id, event id) for every sim link of `detId` on `channel`; empty if there is none.
+ClusterTPAssociationProducerCUDA::getSimTrackId(const edm::Handle<edm::DetSetVector<T>> &simLinks,
+                                                const DetId &detId,
+                                                uint32_t channel) const {
+  //std::pair<uint32_t, EncodedEventId> simTrkId;
+  std::vector<std::pair<uint32_t, EncodedEventId>> simTrkId;
+  auto isearch = simLinks->find(detId);
+  if (isearch != simLinks->end()) {
+    // Loop over DigiSimLink in this det unit (bound by reference: no copy of the DetSet is needed)
+    auto const &link_detset = (*isearch);
+    for (typename edm::DetSet<T>::const_iterator it = link_detset.data.begin(); it != link_detset.data.end(); ++it) {
+      if (channel == it->channel()) {
+        simTrkId.push_back(std::make_pair(it->SimTrackId(), it->eventId()));
+      }
+    }
+  }
+  return simTrkId;
+}
+
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+
+DEFINE_FWK_MODULE(ClusterTPAssociationProducerCUDA);
diff --git a/SimTracker/TrackerHitAssociation/python/tpClusterProducer_cfi.py b/SimTracker/TrackerHitAssociation/python/tpClusterProducer_cfi.py
index 8757a67226fb8..890d05c4fc093 100644
--- a/SimTracker/TrackerHitAssociation/python/tpClusterProducer_cfi.py
+++ b/SimTracker/TrackerHitAssociation/python/tpClusterProducer_cfi.py
@@ -18,3 +18,6 @@
     stripSimLinkSrc = "mixData:StripDigiSimLink",
     phase2OTSimLinkSrc = "mixData:Phase2OTDigiSimLink",
 )
+
+from SimTracker.TrackerHitAssociation.tpClusterProducerCUDADefault_cfi import tpClusterProducerCUDADefault as _tpClusterProducerCUDA
+tpClusterProducerCUDA = _tpClusterProducerCUDA.clone()  # public module name, cloned from tpClusterProducerCUDADefault
diff --git a/SimTracker/TrackerHitAssociation/src/classes.h b/SimTracker/TrackerHitAssociation/src/classes.h
index 457b6683d5cea..c8f98cd38ca81 100644
--- a/SimTracker/TrackerHitAssociation/src/classes.h
+++ b/SimTracker/TrackerHitAssociation/src/classes.h
@@ -5,6 +5,8 @@
 #include "DataFormats/Common/interface/AssociationMap.h"
 #include "DataFormats/TrackerRecHit2D/interface/OmniClusterRef.h"
 #include "SimTracker/TrackerHitAssociation/interface/ClusterTPAssociation.h"
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "SimTracker/TrackerHitAssociation/interface/trackerHitAssociationHeterogeneous.h"
 
 #include "DataFormats/Common/interface/AssociationMap.h"
 namespace SimTracker_TrackerHitAssociation {
diff --git a/SimTracker/TrackerHitAssociation/src/classes_def.xml b/SimTracker/TrackerHitAssociation/src/classes_def.xml
index f801d25b176e0..e9701e768fe75 100644
--- a/SimTracker/TrackerHitAssociation/src/classes_def.xml
+++ b/SimTracker/TrackerHitAssociation/src/classes_def.xml
@@ -20,4 +20,9 @@
   <class name="edm::Wrapper<std::map<TrackingParticleRef, std::vector<OmniClusterRef> > >" persistent="false" /> 
   <class name="std::map<OmniClusterRef, std::vector<TrackingParticleRef> >" persistent="false" /> 
   <class name="edm::Wrapper<std::map<OmniClusterRef, std::vector<TrackingParticleRef> > >" persistent="false" /> 
+
+  <class name="trackerHitAssociationHeterogeneous::ProductCUDA" persistent="false"/>
+  <class name="cms::cuda::Product<trackerHitAssociationHeterogeneous::ProductCUDA>" persistent="false"/>
+  <class name="edm::Wrapper<cms::cuda::Product<trackerHitAssociationHeterogeneous::ProductCUDA>>" persistent="false"/>
+
 </lcgdict>
diff --git a/SimTracker/TrackerHitAssociation/test/BuildFile.xml b/SimTracker/TrackerHitAssociation/test/BuildFile.xml
index a0dc6b61844a0..df2be2331d810 100644
--- a/SimTracker/TrackerHitAssociation/test/BuildFile.xml
+++ b/SimTracker/TrackerHitAssociation/test/BuildFile.xml
@@ -2,12 +2,14 @@
 <use name="FWCore/Framework"/>
 <use name="FWCore/ParameterSet"/>
 <use name="FWCore/Utilities"/>
+<use name="HeterogeneousCore/CUDACore"/>
 <use name="SimTracker/TrackerHitAssociation"/>
 <use name="SimDataFormats/Track"/>
 <use name="SimDataFormats/Vertex"/>
 <use name="Geometry/TrackerGeometryBuilder"/>
 <use name="Geometry/Records"/>
 <use name="boost"/>
+<use name="cuda"/>
 <use name="root"/>
 <library file="*.cc" name="SimTrackerTrackerHitAssociationTestModules">
   <flags EDM_PLUGIN="1"/>
diff --git a/SimTracker/TrackerHitAssociation/test/ClusterTPCUDAdump.cc b/SimTracker/TrackerHitAssociation/test/ClusterTPCUDAdump.cc
new file mode 100644
index 0000000000000..9c7a2e3e4828b
--- /dev/null
+++ b/SimTracker/TrackerHitAssociation/test/ClusterTPCUDAdump.cc
@@ -0,0 +1,66 @@
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDAnalyzer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/Utilities/interface/RunningAverage.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "SimTracker/TrackerHitAssociation/interface/trackerHitAssociationHeterogeneous.h"
+
+class ClusterTPCUDAdump : public edm::global::EDAnalyzer<> {  // test analyzer reading the CUDA cluster-TP association product
+public:
+  using ClusterSLGPU = trackerHitAssociationHeterogeneous::ClusterSLView;
+  using Clus2TP = ClusterSLGPU::Clus2TP;
+  using ProductCUDA = trackerHitAssociationHeterogeneous::ProductCUDA;
+
+  explicit ClusterTPCUDAdump(const edm::ParameterSet& iConfig);  // reads "onGPU" and, when it is true, "clusterTP"
+  ~ClusterTPCUDAdump() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override;
+  const bool m_onGPU;  // value of the "onGPU" configuration parameter
+  edm::EDGetTokenT<cms::cuda::Product<ProductCUDA>> tokenGPU_;  // assigned via consumes() only when m_onGPU is true
+};
+
+// Registers consumption of the CUDA product named by "clusterTP" only when "onGPU" is true.
+ClusterTPCUDAdump::ClusterTPCUDAdump(const edm::ParameterSet& iConfig) : m_onGPU(iConfig.getParameter<bool>("onGPU")) {
+  if (!m_onGPU)
+    return;  // nothing is consumed in the non-GPU configuration
+  tokenGPU_ = consumes<cms::cuda::Product<ProductCUDA>>(iConfig.getParameter<edm::InputTag>("clusterTP"));
+}
+
+// With "onGPU" set, fetches the cluster-TP association product and asserts that the
+// links_d pointer of its view is non-null (the check is compiled out under NDEBUG); otherwise a no-op.
+void ClusterTPCUDAdump::analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const {
+  if (!m_onGPU)
+    return;
+  auto const& hctp = iEvent.get(tokenGPU_);
+  // An analyzer produces nothing, so use the Analyze flavour of the scoped context rather than Produce.
+  cms::cuda::ScopedContextAnalyze ctx{hctp};
+  auto const& soa = ctx.get(hctp).view();
+  assert(soa.links_d);
+}
+
+void ClusterTPCUDAdump::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription psd;  // describes the two parameters read by the constructor
+  psd.add<bool>("onGPU", true);
+  psd.add<edm::InputTag>("clusterTP", edm::InputTag("tpClusterProducerCUDAPreSplitting"));
+  descriptions.add("clusterTPCUDAdump", psd);  // registered under the label "clusterTPCUDAdump"
+}
+
+DEFINE_FWK_MODULE(ClusterTPCUDAdump);
diff --git a/Validation/RecoTrack/python/PostProcessorTracker_cfi.py b/Validation/RecoTrack/python/PostProcessorTracker_cfi.py
index df73b303d5061..54fa0364fe239 100644
--- a/Validation/RecoTrack/python/PostProcessorTracker_cfi.py
+++ b/Validation/RecoTrack/python/PostProcessorTracker_cfi.py
@@ -350,9 +350,9 @@ def _addNoFlow(module):
 
 
 postProcessorTrackTrackingOnly = postProcessorTrack.clone()
-postProcessorTrackTrackingOnly.subDirs.extend(["Tracking/TrackBHadron/*","Tracking/TrackSeeding/*", "Tracking/PixelTrack/*"])
+postProcessorTrackTrackingOnly.subDirs.extend(["Tracking/TrackBHadron/*", "Tracking/TrackSeeding/*", "Tracking/PixelTrack/*", "Tracking/PixelTrackFromPV/*", "Tracking/PixelTrackFromPVAllTP/*", "Tracking/PixelTrackBHadron/*"])  # also covers the FromPV, FromPVAllTP and BHadron pixel-track folders
 postProcessorTrackSummaryTrackingOnly = postProcessorTrackSummary.clone()
-postProcessorTrackSummaryTrackingOnly.subDirs.extend(["Tracking/TrackBHadron","Tracking/TrackSeeding", "Tracking/PixelTrack"])
+postProcessorTrackSummaryTrackingOnly.subDirs.extend(["Tracking/TrackBHadron", "Tracking/TrackSeeding", "Tracking/PixelTrack", "Tracking/PixelTrackFromPV", "Tracking/PixelTrackFromPVAllTP", "Tracking/PixelTrackBHadron"])  # same folders as above, without the trailing wildcard
 
 postProcessorTrackSequenceTrackingOnly = cms.Sequence(
     postProcessorTrackTrackingOnly+
diff --git a/Validation/RecoTrack/python/TrackValidation_cff.py b/Validation/RecoTrack/python/TrackValidation_cff.py
index 9882bf2a9ca7f..21678c35e4b44 100644
--- a/Validation/RecoTrack/python/TrackValidation_cff.py
+++ b/Validation/RecoTrack/python/TrackValidation_cff.py
@@ -522,6 +522,11 @@ def _getMVASelectors(postfix):
 # Built tracks, in the standard sequence mainly for monitoring the track selection MVA
 tpClusterProducerPreSplitting = tpClusterProducer.clone(pixelClusterSrc = "siPixelClustersPreSplitting")
 quickTrackAssociatorByHitsPreSplitting = quickTrackAssociatorByHits.clone(cluster2TPSrc = "tpClusterProducerPreSplitting")
+
+tpClusterProducerCUDAPreSplitting = tpClusterProducerCUDA.clone(  # CUDA counterpart of tpClusterProducerPreSplitting (same pixelClusterSrc)
+   pixelClusterSrc = "siPixelClustersPreSplitting"
+)
+
 _trackValidatorSeedingBuilding = trackValidator.clone( # common for built tracks and seeds (in trackingOnly)
     associators = ["quickTrackAssociatorByHits"],
     UseAssociators = True,
@@ -701,6 +706,16 @@ def _uniqueFirstLayers(layerList):
     VertexAssociatorByPositionAndTracks,
     trackingParticleNumberOfLayersProducer
 )
+
+# With the gpu modifier active, also schedule the CUDA cluster-to-TrackingParticle association producer
+from Configuration.ProcessModifiers.gpu_cff import gpu
+tpClusterProducerPreSplittingCUDA = cms.Task(  # task wrapping only the CUDA pre-splitting producer
+  tpClusterProducerCUDAPreSplitting
+)
+_tracksValidationTruth_gpu = tracksValidationTruth.copy()  # work on a copy so the default task is left untouched
+_tracksValidationTruth_gpu.add(tpClusterProducerPreSplittingCUDA)
+gpu.toReplaceWith(tracksValidationTruth,_tracksValidationTruth_gpu)  # swap in the extended task only when the gpu modifier is active
+
 fastSim.toModify(tracksValidationTruth, lambda x: x.remove(tpClusterProducer))
 
 tracksPreValidation = cms.Task(
@@ -974,9 +989,17 @@ def _uniqueFirstLayers(layerList):
     trackAssociation = "trackingParticlePixelTrackAsssociation"
 )
 
+_pixelTracksCustom = dict(  # overrides shared by the two pixel-track selectors cloned below
+    src = "pixelTracks",
+    vertexTag = "pixelVertices",
+)
+pixelTracksPt09 = generalTracksPt09.clone(quality = ["undefQuality"], **_pixelTracksCustom)
+pixelTracksFromPV = generalTracksFromPV.clone(quality = "undefQuality", **_pixelTracksCustom)
+pixelTracksFromPVPt09 = pixelTracksPt09.clone(src = "pixelTracksFromPV")  # pixelTracksPt09 settings with pixelTracksFromPV as input
+
 trackValidatorPixelTrackingOnly = trackValidator.clone(
     dirName = "Tracking/PixelTrack/",
-    label = ["pixelTracks"],
+    label = ["pixelTracks", "pixelTracksPt09"],
     doResolutionPlotsForLabels = [],
     trackCollectionForDrCalculation = "pixelTracks",
     associators = ["trackingParticlePixelTrackAsssociation"],
@@ -985,16 +1008,59 @@ def _uniqueFirstLayers(layerList):
     dodEdxPlots = False,
     cores = cms.InputTag(""),
 )
+trackValidatorFromPVPixelTrackingOnly = trackValidatorPixelTrackingOnly.clone(  # validates the pixelTracksFromPV collections against trackingParticlesSignal
+    dirName = "Tracking/PixelTrackFromPV/",
+    label = ["pixelTracksFromPV", "pixelTracksFromPVPt09"],
+    label_tp_effic = "trackingParticlesSignal",
+    label_tp_fake = "trackingParticlesSignal",
+    label_tp_effic_refvector = True,
+    label_tp_fake_refvector = True,
+    trackCollectionForDrCalculation = "pixelTracksFromPV",
+    doPlotsOnlyForTruePV = True,
+    doPVAssociationPlots = False,
+    doResolutionPlotsForLabels = ["disabled"],
+)
+trackValidatorFromPVAllTPPixelTrackingOnly = trackValidatorFromPVPixelTrackingOnly.clone(  # same track labels, but TP labels reset to those of trackValidatorPixelTrackingOnly
+    dirName = "Tracking/PixelTrackFromPVAllTP/",
+    label_tp_effic = trackValidatorPixelTrackingOnly.label_tp_effic.value(),
+    label_tp_fake = trackValidatorPixelTrackingOnly.label_tp_fake.value(),
+    label_tp_effic_refvector = False,
+    label_tp_fake_refvector = False,
+    doSimPlots = False,
+    doSimTrackPlots = False,
+)
+trackValidatorBHadronPixelTrackingOnly = trackValidatorPixelTrackingOnly.clone(  # efficiency measured against trackingParticlesBHadron
+    dirName = "Tracking/PixelTrackBHadron/",
+    label_tp_effic = "trackingParticlesBHadron",
+    label_tp_effic_refvector = True,
+    doSimPlots = True,
+    doRecoTrackPlots = False, # Fake rate is defined wrt. all TPs, and that is already included in trackValidator
+    dodEdxPlots = False,  # NOTE(review): already False in trackValidatorPixelTrackingOnly, so this override looks redundant
+)
+
 
 tracksValidationTruthPixelTrackingOnly = tracksValidationTruth.copy()
 tracksValidationTruthPixelTrackingOnly.replace(trackingParticleRecoTrackAsssociation, trackingParticlePixelTrackAsssociation)
 tracksValidationTruthPixelTrackingOnly.replace(VertexAssociatorByPositionAndTracks, PixelVertexAssociatorByPositionAndTracks)
+tracksValidationTruthPixelTrackingOnly.add(trackingParticlesBHadron)  # label_tp_effic of trackValidatorBHadronPixelTrackingOnly
+
+tracksPreValidationPixelTrackingOnly = cms.Task(  # truth task plus the TP and track selectors referenced by the pixel validators
+    tracksValidationTruthPixelTrackingOnly,
+    trackingParticlesSignal,
+    pixelTracksPt09,
+    pixelTracksFromPV, 
+    pixelTracksFromPVPt09,
+)
 tracksValidationPixelTrackingOnly = cms.Sequence(
-    trackValidatorPixelTrackingOnly,
-    tracksValidationTruthPixelTrackingOnly
+    trackValidatorPixelTrackingOnly +
+    trackValidatorFromPVPixelTrackingOnly +
+    trackValidatorFromPVAllTPPixelTrackingOnly +
+    trackValidatorBHadronPixelTrackingOnly,
+    tracksPreValidationPixelTrackingOnly  # the cms.Task defined above, passed as the second Sequence argument
 )
 
 
+
 ### Lite mode (only generalTracks and HP)
 trackValidatorLite = trackValidator.clone(
     label = ["generalTracks", "cutsRecoTracksHp"]
diff --git a/Validation/RecoTrack/python/TrackingParticleSelectionsForEfficiency_cff.py b/Validation/RecoTrack/python/TrackingParticleSelectionsForEfficiency_cff.py
index c020d894c8d4b..04f9e52ee18a7 100644
--- a/Validation/RecoTrack/python/TrackingParticleSelectionsForEfficiency_cff.py
+++ b/Validation/RecoTrack/python/TrackingParticleSelectionsForEfficiency_cff.py
@@ -12,7 +12,7 @@
     ptMin = cms.double(0.9),
     ptMax = cms.double(1e100),
     maxRapidity = cms.double(2.5),
-    tip = cms.double(3.5),
+    tip = cms.double(2.0),
     minPhi = cms.double(-3.2),
     maxPhi = cms.double(3.2),
     invertRapidityCut = cms.bool(False)
diff --git a/Validation/RecoTrack/python/plotting/html.py b/Validation/RecoTrack/python/plotting/html.py
index a9fed5cc12975..3985f8edc9abf 100644
--- a/Validation/RecoTrack/python/plotting/html.py
+++ b/Validation/RecoTrack/python/plotting/html.py
@@ -63,8 +63,14 @@ def _allToHP(s):
     return s.replace("All", "High purity")
 def _allToBTV(s):
     return s.replace("All", "BTV-like")
+def _allPtCut(s):  # every "All tracks" becomes "Tracks pT &gt; 0.9 GeV"
+    return "Tracks pT &gt; 0.9 GeV".join(s.split("All tracks"))
 def _ptCut(s):
     return s.replace("Tracks", "Tracks pT &gt; 0.9 GeV").replace("tracks", "tracks pT &gt; 0.9 GeV")
+def _allToPixel(s):  # every "All" becomes "Pixel"
+    return "Pixel".join(s.split("All"))
+def _toPixel(s):  # every "Tracks" becomes "Pixel tracks"
+    return "Pixel tracks".join(s.split("Tracks"))
 _trackQualityNameOrder = collections.OrderedDict([
     ("seeding_seeds", "Seeds"),
     ("seeding_seedsa", "Seeds A"),
@@ -75,8 +81,8 @@ def _ptCut(s):
     ("building_", "Built tracks"),
     ("", _allName),
     ("highPurity", _allToHP(_allName)),
-    ("Pt09", "Tracks pT &gt; 0.9 GeV"),
-    ("highPurityPt09", "High purity tracks pT &gt; 0.9 GeV"),
+    ("Pt09", _allPtCut(_allName)),
+    ("highPurityPt09", _ptCut(_allToHP(_allName))),
     ("ByOriginalAlgo", _toOriAlgo(_allName)),
     ("highPurityByOriginalAlgo", _toOriAlgo(_toHP(_allName))),
     ("ByAlgoMask", _toAlgoMask(_allName)),
@@ -120,6 +126,15 @@ def _ptCut(s):
     ("displaced_highPurityByOriginalAlgo", _toOriAlgo(_allToHP(_displacedName))),
     ("displaced_ByAlgoMask", _toAlgoMask(_displacedName)),
     ("displaced_highPurityByAlgoMask", _toAlgoMask(_allToHP(_displacedName))),
+    # Pixel tracks
+    ("pixel_", _allToPixel(_allName)),
+    ("pixel_Pt09", _ptCut(_allToPixel(_allName))),
+    ("pixelFromPV_", _toPixel(_fromPVName)),
+    ("pixelFromPV_Pt09", _ptCut(_toPixel(_fromPVName))),
+    ("pixelFromPVAllTP_", _toPixel(_fromPVAllTPName)),
+    ("pixelFromPVAllTP_Pt09", _ptCut(_toPixel(_fromPVAllTPName))),
+    ("pixelbhadron_", _allToPixel(_bhadronName)),
+    ("pixelbhadron_Pt09", _ptCut(_allToPixel(_bhadronName))),
 ])
 
 _trackAlgoName = {
@@ -134,6 +149,7 @@ def _ptCut(s):
     "iter7" : "Iterative Step 7",
     "iter9" : "Iterative Step 9",
     "iter10": "Iterative Step 10",
+    "pixel": "Pixel tracks",
 }
 
 _trackAlgoOrder = [
@@ -169,6 +185,7 @@ def _ptCut(s):
     'iter7',
     'iter9',
     'iter10',
+    "pixel",
 ]
 
 _pageNameMap = {
@@ -186,10 +203,10 @@ def _ptCut(s):
     # These are for the summary page
     ("seeding_seeds", "Seeds"),
     ("building", "Built tracks"),
-    ("", "All tracks"),
-    ("Pt09", "All tracks (pT&gt;0.9 GeV)"),
-    ("highPurity", "High purity tracks"),
-    ("highPurityPt09", "High purity tracks (pT&gt;0.9 GeV)"),
+    ("", _allName),
+    ("Pt09", _allPtCut(_allName)),
+    ("highPurity", _allToHP(_allName)),
+    ("highPurityPt09", _ptCut(_allToHP(_allName))),
     ("tpPtLess09", _tpPtLess09Name),
     ("tpPtLess09_highPurity", _allToHP(_tpPtLess09Name)),
     ("tpEtaGreater2p7", _tpEtaGreater2p7Name),
@@ -209,7 +226,14 @@ def _ptCut(s):
     ("displaced", _displacedName),
     ("displaced_highPurity", _allToHP(_displacedName)),
     # Pixel tracks
-    ("pixel", "Pixel tracks"),
+    ("pixel", _allToPixel(_allName)),
+    ("pixelPt09", _ptCut(_allToPixel(_allName))),
+    ("pixelFromPV", _toPixel(_fromPVName)),
+    ("pixelFromPVPt09", _ptCut(_toPixel(_fromPVName))),
+    ("pixelFromPVAllTP", _toPixel(_fromPVAllTPName)),
+    ("pixelFromPVAllTPPt09", _ptCut(_toPixel(_fromPVAllTPName))),
+    ("pixelbhadron", _allToPixel(_bhadronName)),
+    ("pixelbhadronPt09", _ptCut(_allToPixel(_bhadronName))),
     # These are for vertices
     ("genvertex", "Gen vertices"),
     ("pixelVertices", "Pixel vertices"),
@@ -233,6 +257,7 @@ def _ptCut(s):
 _fromPVAllTP2Legend = "Tracks from reco PV (another method), fake rate numerator contains all TrackingParticles (separates fake tracks from pileup tracks)"
 _fromPVAllTPPt2Legend = "Tracks (pT &gt; 0.9 GeV) from reco PV (another method), fake rate numerator contains all TrackingParticles (separates fake tracks from pileup tracks)"
 _bhadronLegend = "All tracks, efficiency denominator contains only TrackingParticles from B-hadron decays"
+_bhadronPtLegend = "Tracks (pT &gt; 0.9 GeV), efficiency denominator contains only TrackingParticles from B-hadron decays"
 
 def _sectionNameLegend():
     return {
@@ -258,6 +283,12 @@ def _sectionNameLegend():
         "bhadron_": _bhadronLegend,
         "bhadron_highPurity": _allToHP(_bhadronLegend),
         "bhadron_btvLike": _bhadronLegend.replace("All tracks", _btvLegend),
+        "pixelFromPV_": _fromPVLegend,
+        "pixelFromPV_Pt09": _fromPVPtLegend,
+        "pixelFromPVAllTP_": _fromPVAllTPLegend,
+        "pixelFromPVAllTP_Pt09": _fromPVAllTPPtLegend,
+        "pixelbhadron_": _bhadronLegend,
+        "pixelbhadron_Pt09": _bhadronPtLegend,
     }
 
 class Table:
@@ -701,7 +732,7 @@ def __init__(self, sample, title, fastVsFull, pileupComparison):
         self._timingPage = PageSet(*params)
         self._pfPages = PageSet(*params)
         self._hltPages = PageSet(*params, dqmSubFolderTranslatedToSectionName=lambda algoQuality: algoQuality[0])
-        self._pixelPages = PageSet(*params, dqmSubFolderTranslatedToSectionName=lambda algoQuality: algoQuality[0])
+        self._pixelPages = TrackingPageSet(*params)
         self._otherPages = PageSet(*params)
 
         self._purposePageMap = {
diff --git a/Validation/RecoTrack/python/plotting/trackingPlots.py b/Validation/RecoTrack/python/plotting/trackingPlots.py
index 7f5d84738ee81..fda80114d7e8f 100644
--- a/Validation/RecoTrack/python/plotting/trackingPlots.py
+++ b/Validation/RecoTrack/python/plotting/trackingPlots.py
@@ -615,6 +615,8 @@ def _mapCollectionToAlgoQuality(collName):
     prefixes = ["cutsreco", "cutsrecofrompv", "cutsrecofrompv2", "cutsrecofrompvalltp", "cutsrecoetagreater2p7"]
     if collNameLow in ["general", "generalfrompv", "generaletagreater2p7"]+prefixes:
         algo = "ootb"
+    elif collNameLow in ["pixel", "pixelfrompv", "pixelfrompvalltp"]:
+        algo = "pixel"
     else:
         def testColl(coll):
             for pfx in prefixes:
@@ -939,6 +941,7 @@ class HighPurityPt09: pass
     class BTVLike: pass
     class AK4PFJets: pass
     class Pixel: pass
+    class PixelPt09: pass
 
     def __init__(self, section, collection=GeneralTracks):
         self._collection = collection
@@ -981,6 +984,8 @@ def _getN(hname):
                 return _getAlgoQuality(data, "ak4PFJets", "")
             elif self._collection == TrackingSummaryTable.Pixel:
                 return _getAlgoQuality(data, "pixel", "")
+            elif self._collection == TrackingSummaryTable.PixelPt09:
+                return _getAlgoQuality(data, "pixel", "Pt09")
             else:
                 raise Exception("Collection not recognized, %s" % str(self._collection))
         def _formatOrNone(num, func):
@@ -1354,11 +1359,21 @@ def _appendTrackingPlots(lastDirName, name, algoPlots, onlyForPileup=False, only
 _appendTrackingPlots("TrackBHadron", "bhadron", _simBasedPlots+_recoBasedPlots, onlyForBHadron=True)
 _appendTrackingPlots("TrackDisplaced", "displaced", _simBasedPlots+_recoBasedPlots)
 # Pixel tracks
-_common = dict(purpose=PlotPurpose.Pixel, page="pixel")
-plotter.append("pixelTrack", _trackingFolders("PixelTrack"), TrackingPlotFolder(*(_simBasedPlots+_recoBasedPlots), **_common))
-plotterExt.append("pixelTrack", _trackingFolders("PixelTrack"), TrackingPlotFolder(*_extendedPlots, **_common))
-plotter.append("pixelTrack_summary",  _trackingFolders("PixelTrack"), PlotFolder(_summaryRaw, _summaryRawN, loopSubFolders=False, purpose=PlotPurpose.TrackingSummary, page="summary", section="pixel"))
-plotter.appendTable("pixelTrack_summary", _trackingFolders("PixelTrack"), TrackingSummaryTable(section="pixel", collection=TrackingSummaryTable.Pixel))
+def _appendPixelTrackingPlots(lastDirName, name):
+    # Registers plots, summary plots and summary tables (all tracks and the Pt09 variant) for one pixel-track folder.
+    pageArgs = dict(purpose=PlotPurpose.Pixel, page="pixel")
+    dirs = _trackingFolders(lastDirName)
+    plotter.append(name, dirs, TrackingPlotFolder(*(_simBasedPlots+_recoBasedPlots), **pageArgs))
+    plotterExt.append(name, dirs, TrackingPlotFolder(*_extendedPlots, **pageArgs))
+    sections = [(name, TrackingSummaryTable.Pixel), (name+"Pt09", TrackingSummaryTable.PixelPt09)]
+    for sec, _unused in sections:  # summary plot folders first, tables in a second pass
+        plotter.append(name+"_summary",  dirs, PlotFolder(_summaryRaw, _summaryRawN, loopSubFolders=False, purpose=PlotPurpose.TrackingSummary, page="summary", section=sec))
+    for sec, coll in sections:
+        plotter.appendTable(name+"_summary", dirs, TrackingSummaryTable(section=sec, collection=coll))
+_appendPixelTrackingPlots("PixelTrack", "pixel")  # arguments: (folder name given to _trackingFolders, plotter/section name)
+_appendPixelTrackingPlots("PixelTrackFromPV", "pixelFromPV")
+_appendPixelTrackingPlots("PixelTrackFromPVAllTP", "pixelFromPVAllTP")
+_appendPixelTrackingPlots("PixelTrackBHadron", "pixelbhadron")
 
 
 # MiniAOD