diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
index 3f2a551bc320f..8cfa5c9f5ffde 100644
--- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h
+++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
@@ -92,6 +92,11 @@ namespace cms {
       return cms::cuda::make_host_unique<T>(stream);
     }
 
+    template <typename T>
+    static auto make_unique(size_t size, cudaStream_t stream) {
+      return cms::cuda::make_host_unique<T>(size, stream);
+    }
+
     template <typename T>
     static auto make_host_unique(cudaStream_t stream) {
       return cms::cuda::make_host_unique<T>(stream);
diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h
index 70d00ae584279..78406cd241473 100644
--- a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h
+++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h
@@ -12,7 +12,10 @@ class SiPixelDigisCUDASOAView {
 public:
   friend class SiPixelDigisCUDA;
-  friend class SiPixelRecHitSoAFromLegacy;
+
+  template <typename TrackerTraits>
+  friend class SiPixelRecHitSoAFromLegacyT;
+
   enum class StorageLocation {
     kCLUS = 0,
     kPDIGI = 2,
diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
index 3ee5af80353dd..f9e9b3a37c63f 100644
--- a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
+++ b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
@@ -3,7 +3,9 @@
 
 #include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
 #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h"
+#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
 
-using PixelTrackHeterogeneous = HeterogeneousSoA<pixelTrack::TrackSoA>;
+template <typename TrackerTraits>
+using PixelTrackHeterogeneousT = HeterogeneousSoA<TrackSoAHeterogeneousT<TrackerTraits>>;
 
-#endif  // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h
\ No newline at end of file
+#endif  // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h
diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h
index 356ea3eddeb7f..b5b1df0d5118a 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h
@@ -5,12 +5,13 @@
 #include <string>
 
 #include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h"
-#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
-
+#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
 #include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
+#include "DataFormats/Common/interface/CMS_CLASS_VERSION.h"
 
 namespace pixelTrack {
+
   enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity, notQuality };
   constexpr uint32_t qualitySize{uint8_t(Quality::notQuality)};
   const std::string qualityName[qualitySize]{"bad", "edup", "dup", "loose", "strict", "tight", "highPurity"};
@@ -18,20 +19,24 @@ namespace pixelTrack {
     auto qp = std::find(qualityName, qualityName + qualitySize, name) - qualityName;
     return static_cast<Quality>(qp);
   }
+
 }  // namespace pixelTrack
 
-template <int32_t S>
+template <typename TrackerTraits>
 class TrackSoAHeterogeneousT {
 public:
+  static constexpr int32_t S = TrackerTraits::maxNumberOfTuples;
+  static constexpr int32_t H = TrackerTraits::maxHitsOnTrack;  // Average hits rather than max?
   static constexpr int32_t stride() { return S; }
 
+  using hindex_type = uint32_t;  //TrackerTraits::hindex_type ?
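// --- illustrative sketch, not part of the diff -------------------------------
// The change above swaps the value parameter `template <int32_t S>` for a policy
// class: every capacity is now derived from TrackerTraits at compile time, so one
// template instantiation per detector phase replaces the old runtime switches.
// `ToyTraits`/`ToySoA` are invented names; the constant mimics the Phase1
// maxNumber() value (32 * 1024) quoted elsewhere in this diff.
#include <cstdint>

struct ToyTraits {  // stand-in for pixelTopology::Phase1
  static constexpr int32_t maxNumberOfTuples = 32 * 1024;
  static constexpr int32_t maxHitsOnTrack = 10;
};

template <typename TrackerTraits>
class ToySoA {
public:
  static constexpr int32_t S = TrackerTraits::maxNumberOfTuples;  // capacity fixed per geometry
  static constexpr int32_t stride() { return S; }
};

static_assert(ToySoA<ToyTraits>::stride() == 32 * 1024);  // resolved entirely at compile time
// -----------------------------------------------------------------------------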
+ using Quality = pixelTrack::Quality; - using hindex_type = uint32_t; - using HitContainer = cms::cuda::OneToManyAssoc; + using HitContainer = cms::cuda::OneToManyAssoc; // Always check quality is at least loose! // CUDA does not support enums in __lgc ... -private: +protected: eigenSoA::ScalarSoA quality_; public: @@ -56,9 +61,9 @@ class TrackSoAHeterogeneousT { // layers are in order and we assume tracks are either forward or backward auto pdet = detIndices.begin(i); int nl = 1; - auto ol = phase1PixelTopology::getLayer(*pdet); + auto ol = pixelTopology::getLayer(*pdet); for (; pdet < detIndices.end(i); ++pdet) { - auto il = phase1PixelTopology::getLayer(*pdet); + auto il = pixelTopology::getLayer(*pdet); if (il != ol) ++nl; ol = il; @@ -90,17 +95,100 @@ class TrackSoAHeterogeneousT { namespace pixelTrack { -#ifdef GPU_SMALL_EVENTS - // kept for testing and debugging - constexpr uint32_t maxNumber() { return 2 * 1024; } -#else - // tested on MC events with 55-75 pileup events - constexpr uint32_t maxNumber() { return 32 * 1024; } -#endif + template + using TrackSoAT = TrackSoAHeterogeneousT; + + template + using HitContainerT = typename TrackSoAHeterogeneousT::HitContainer; + + //Used only to ease classes definitions + using TrackSoAPhase1 = TrackSoAHeterogeneousT; + using TrackSoAPhase2 = TrackSoAHeterogeneousT; + + template + struct QualityCutsT {}; + + template + struct QualityCutsT> { + // chi2 cut = chi2Scale * (chi2Coeff[0] + pT/GeV * (chi2Coeff[1] + pT/GeV * (chi2Coeff[2] + pT/GeV * chi2Coeff[3]))) + float chi2Coeff[4]; + float chi2MaxPt; // GeV + float chi2Scale; + + struct Region { + float maxTip; // cm + float minPt; // GeV + float maxZip; // cm + }; + + Region triplet; + Region quadruplet; + + __device__ __forceinline__ bool isHP(TrackSoAHeterogeneousT const *__restrict__ tracks, + int nHits, + int it) const { + // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) + // default cuts: + // - for triplets: |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm + // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm + // (see CAHitNtupletGeneratorGPU.cc) + auto const ®ion = (nHits > 3) ? 
quadruplet : triplet; + return (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and + (std::abs(tracks->zip(it)) < region.maxZip); + } - using TrackSoA = TrackSoAHeterogeneousT; - using TrajectoryState = TrajectoryStateSoAT; - using HitContainer = TrackSoA::HitContainer; + __device__ __forceinline__ bool strictCut(TrackSoAHeterogeneousT const *__restrict__ tracks, + int it) const { + auto roughLog = [](float x) { + // max diff [0.5,12] at 1.25 0.16143 + // average diff 0.0662998 + union IF { + uint32_t i; + float f; + }; + IF z; + z.f = x; + uint32_t lsb = 1 < 21; + z.i += lsb; + z.i >>= 21; + auto f = z.i & 3; + int ex = int(z.i >> 2) - 127; + + // log2(1+0.25*f) + // averaged over bins + const float frac[4] = {0.160497f, 0.452172f, 0.694562f, 0.901964f}; + return float(ex) + frac[f]; + }; + + float pt = std::min(tracks->pt(it), chi2MaxPt); + float chi2Cut = chi2Scale * (chi2Coeff[0] + roughLog(pt) * chi2Coeff[1]); + if (tracks->chi2(it) >= chi2Cut) { +#ifdef NTUPLE_FIT_DEBUG + printf("Bad chi2 %d pt %f eta %f chi2 %f\n", it, tracks->pt(it), tracks->eta(it), tracks->chi2(it)); +#endif + return true; + } + return false; + } + }; + + template + struct QualityCutsT> { + float maxChi2; + float minPt; + float maxTip; + float maxZip; + + __device__ __forceinline__ bool isHP(TrackSoAHeterogeneousT const *__restrict__ tracks, + int nHits, + int it) const { + return (std::abs(tracks->tip(it)) < maxTip) and (tracks->pt(it) > minPt) and (std::abs(tracks->zip(it)) < maxZip); + } + __device__ __forceinline__ bool strictCut(TrackSoAHeterogeneousT const *__restrict__ tracks, + int it) const { + return tracks->chi2(it) >= maxChi2; + } + }; } // namespace pixelTrack diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml index 9c80ae91baf29..5216c19dded65 100644 --- a/CUDADataFormats/Track/src/classes_def.xml +++ b/CUDADataFormats/Track/src/classes_def.xml @@ -1,6 +1,15 @@ - - - - + + + + + + + + + + + + + diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h index 8ce37f280ac6c..ad78daa8354e2 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h @@ -4,10 +4,10 @@ #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h" #include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "DataFormats/Common/interface/CMS_CLASS_VERSION.h" -template -class TrackingRecHit2DHeterogeneous { -public: +namespace { enum class Storage32 { kXLocal = 0, kYLocal = 1, @@ -28,37 +28,45 @@ class TrackingRecHit2DHeterogeneous { kXSize = 2, kYSize = 3, }; +} // namespace + +template +class TrackingRecHit2DHeterogeneousT { +public: + template + friend class TrackingRecHit2DHostT; template using unique_ptr = typename Traits::template unique_ptr; - using PhiBinner = TrackingRecHit2DSOAView::PhiBinner; + using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; + using PhiBinner = typename TrackingRecHit2DSOAView::PhiBinner; + using AverageGeometry = typename TrackingRecHit2DSOAView::AverageGeometry; - TrackingRecHit2DHeterogeneous() = default; + TrackingRecHit2DHeterogeneousT() = default; - explicit TrackingRecHit2DHeterogeneous( - uint32_t nHits, - bool isPhase2, - int32_t 
offsetBPIX2, - pixelCPEforGPU::ParamsOnGPU const* cpeParams, - uint32_t const* hitsModuleStart, - cudaStream_t stream, - TrackingRecHit2DHeterogeneous const* input = nullptr); + explicit TrackingRecHit2DHeterogeneousT(uint32_t nHits, + int32_t offsetBPIX2, + pixelCPEforGPU::ParamsOnGPUT const* cpeParams, + uint32_t const* hitsModuleStart, + cudaStream_t stream = nullptr); - explicit TrackingRecHit2DHeterogeneous( - float* store32, uint16_t* store16, uint32_t* modules, int nHits, cudaStream_t stream = nullptr); - ~TrackingRecHit2DHeterogeneous() = default; + explicit TrackingRecHit2DHeterogeneousT(cms::cuda::host::unique_ptr& store32, + cms::cuda::host::unique_ptr& store16, + uint32_t* modules, + int nHits, + cudaStream_t stream = nullptr); + ~TrackingRecHit2DHeterogeneousT() = default; - TrackingRecHit2DHeterogeneous(const TrackingRecHit2DHeterogeneous&) = delete; - TrackingRecHit2DHeterogeneous& operator=(const TrackingRecHit2DHeterogeneous&) = delete; - TrackingRecHit2DHeterogeneous(TrackingRecHit2DHeterogeneous&&) = default; - TrackingRecHit2DHeterogeneous& operator=(TrackingRecHit2DHeterogeneous&&) = default; + TrackingRecHit2DHeterogeneousT(const TrackingRecHit2DHeterogeneousT&) = delete; + TrackingRecHit2DHeterogeneousT& operator=(const TrackingRecHit2DHeterogeneousT&) = delete; + TrackingRecHit2DHeterogeneousT(TrackingRecHit2DHeterogeneousT&&) = default; + TrackingRecHit2DHeterogeneousT& operator=(TrackingRecHit2DHeterogeneousT&&) = default; TrackingRecHit2DSOAView* view() { return m_view.get(); } TrackingRecHit2DSOAView const* view() const { return m_view.get(); } auto nHits() const { return m_nHits; } - auto nMaxModules() const { return m_nMaxModules; } auto offsetBPIX2() const { return m_offsetBPIX2; } auto hitsModuleStart() const { return m_hitsModuleStart; } @@ -74,10 +82,7 @@ class TrackingRecHit2DHeterogeneous { cms::cuda::host::unique_ptr store16ToHostAsync(cudaStream_t stream) const; cms::cuda::host::unique_ptr store32ToHostAsync(cudaStream_t stream) const; - // needs specialization for Host - void copyFromGPU(TrackingRecHit2DHeterogeneous const* input, cudaStream_t stream); - -private: +protected: static constexpr uint32_t n16 = 4; // number of elements in m_store16 static constexpr uint32_t n32 = 10; // number of elements in m_store32 static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious @@ -85,8 +90,8 @@ class TrackingRecHit2DHeterogeneous { unique_ptr m_store16; //! unique_ptr m_store32; //! - unique_ptr m_PhiBinnerStore; //! - unique_ptr m_AverageGeometryStore; //! + unique_ptr m_PhiBinnerStore; //! + unique_ptr m_AverageGeometryStore; //! unique_ptr m_view; //! @@ -95,39 +100,86 @@ class TrackingRecHit2DHeterogeneous { uint32_t const* m_hitsModuleStart; // needed for legacy, this is on GPU! - uint32_t m_nMaxModules; // needed as kernel params... 
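// --- illustrative sketch, not part of the diff -------------------------------
// How the n32 float columns behind m_store32 are addressed: one flat allocation
// of n32 * nHits floats, an enum naming each column, and a small lambda that
// returns the start of a column (the constructors below do exactly this through
// their get32 helper). Plain std::unique_ptr stands in for the Traits-provided
// unique_ptr here; the column names are abbreviated.
#include <cstdint>
#include <memory>

enum class Col : uint32_t { kXLocal = 0, kYLocal = 1, kXerror = 2 };

int main() {
  constexpr uint32_t n32 = 10;  // number of float columns, as in the class above
  const uint32_t nHits = 200;
  auto store32 = std::make_unique<float[]>(n32 * nHits);  // single flat SoA buffer
  auto get32 = [&](Col i) { return store32.get() + static_cast<uint32_t>(i) * nHits; };
  float* xl = get32(Col::kXLocal);  // a column "view": no copy, no extra allocation
  xl[0] = 0.5f;
  return xl[0] == 0.5f ? 0 : 1;
}
// -----------------------------------------------------------------------------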
PhiBinner* m_phiBinner; - PhiBinner::index_type* m_phiBinnerStorage; + typename PhiBinner::index_type* m_phiBinnerStorage; uint32_t* m_hitsLayerStart; int16_t* m_iphi; }; -using TrackingRecHit2DGPU = TrackingRecHit2DHeterogeneous; -using TrackingRecHit2DCPU = TrackingRecHit2DHeterogeneous; -using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous; +//Inherit and overload only what we need to overload, remember to use this-> +//GPU +template +class TrackingRecHit2DGPUT : public TrackingRecHit2DHeterogeneousT { +public: + using TrackingRecHit2DHeterogeneousT::TrackingRecHit2DHeterogeneousT; + + cms::cuda::host::unique_ptr localCoordToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr store16ToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr store32ToHostAsync(cudaStream_t stream) const; +}; + +//CPU +template +class TrackingRecHit2DCPUT : public TrackingRecHit2DHeterogeneousT { +public: + using TrackingRecHit2DHeterogeneousT::TrackingRecHit2DHeterogeneousT; + + cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr store16ToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr store32ToHostAsync(cudaStream_t stream) const; +}; + +//HOST +template +class TrackingRecHit2DHostT : public TrackingRecHit2DHeterogeneousT { +public: + ~TrackingRecHit2DHostT() = default; + TrackingRecHit2DHostT() = default; + + explicit TrackingRecHit2DHostT(uint32_t nHits, + int32_t offsetBPIX2, + pixelCPEforGPU::ParamsOnGPUT const* cpeParams, + uint32_t const* hitsModuleStart, + cudaStream_t stream = nullptr) + : TrackingRecHit2DHeterogeneousT( + nHits, offsetBPIX2, cpeParams, hitsModuleStart, stream) {} + + explicit TrackingRecHit2DHostT(cms::cuda::host::unique_ptr& store32, + cms::cuda::host::unique_ptr& store16, + uint32_t* modules, + int nHits, + cudaStream_t stream = nullptr) + : TrackingRecHit2DHeterogeneousT( + store32, store16, modules, nHits, stream) {} + + explicit TrackingRecHit2DHostT(uint32_t nHits, + int32_t offsetBPIX2, + pixelCPEforGPU::ParamsOnGPUT const* cpeParams, + uint32_t const* hitsModuleStart, + cudaStream_t stream, + TrackingRecHit2DHeterogeneousT const* input); +}; #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -template -TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous( +template +TrackingRecHit2DHeterogeneousT::TrackingRecHit2DHeterogeneousT( uint32_t nHits, - bool isPhase2, int32_t offsetBPIX2, - pixelCPEforGPU::ParamsOnGPU const* cpeParams, + pixelCPEforGPU::ParamsOnGPUT const* cpeParams, uint32_t const* hitsModuleStart, - cudaStream_t stream, - TrackingRecHit2DHeterogeneous const* input) + cudaStream_t stream) : m_nHits(nHits), m_offsetBPIX2(offsetBPIX2), m_hitsModuleStart(hitsModuleStart) { - auto view = Traits::template make_host_unique(stream); + using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; - m_nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; + auto view = Traits::template make_host_unique(stream); view->m_nHits = nHits; - view->m_nMaxModules = m_nMaxModules; m_view = Traits::template make_unique(stream); // leave it on host and pass it by value? 
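// --- illustrative sketch, not part of the diff -------------------------------
// Why the comment above insists on `this->`: members inherited from a base class
// that depends on a template parameter are invisible to unqualified lookup, so
// the derived *ToHostAsync overloads must spell this->m_store32, this->nHits(),
// and so on. Minimal reproduction of the rule:
template <typename T>
struct Base {
  int m_nHits = 42;
};

template <typename T>
struct Derived : Base<T> {
  constexpr int hits() const {
    // return m_nHits;     // would not compile: the name lives in a dependent base
    return this->m_nHits;  // ok: lookup is deferred to instantiation time
  }
};

static_assert(Derived<float>{}.hits() == 42, "dependent-base member reached via this->");
// -----------------------------------------------------------------------------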
- m_AverageGeometryStore = Traits::template make_unique(stream); + m_AverageGeometryStore = Traits::template make_unique(stream); view->m_averageGeometry = m_AverageGeometryStore.get(); view->m_cpeParams = cpeParams; view->m_hitsModuleStart = hitsModuleStart; @@ -148,29 +200,20 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous( // this will break 1to1 correspondence with cluster and module locality // so unless proven VERY inefficient we keep it ordered as generated - // host copy is "reduced" (to be reviewed at some point) - if constexpr (std::is_same_v) { - // it has to compile for ALL cases - copyFromGPU(input, stream); - } else { - assert(input == nullptr); - - auto nL = isPhase2 ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers; - - m_store16 = Traits::template make_unique(nHits * n16, stream); - m_store32 = Traits::template make_unique(nHits * n32 + nL + 1, stream); - m_PhiBinnerStore = Traits::template make_unique(stream); - } + m_store16 = Traits::template make_unique(nHits * n16, stream); + m_store32 = Traits::template make_unique(nHits * n32 + TrackerTraits::numberOfLayers + 1, stream); + m_PhiBinnerStore = Traits::template make_unique(stream); - static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float)); - static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::PhiBinner::index_type)); + static_assert(sizeof(typename TrackingRecHit2DSOAView::hindex_type) == sizeof(float)); + static_assert(sizeof(typename TrackingRecHit2DSOAView::hindex_type) == + sizeof(typename TrackingRecHit2DSOAView::PhiBinner::index_type)); auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast(i) * nHits; }; // copy all the pointers m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); m_phiBinnerStorage = view->m_phiBinnerStorage = - reinterpret_cast(get32(Storage32::kPhiStorage)); + reinterpret_cast(get32(Storage32::kPhiStorage)); view->m_xl = get32(Storage32::kXLocal); view->m_yl = get32(Storage32::kYLocal); @@ -178,23 +221,20 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous( view->m_yerr = get32(Storage32::kYerror); view->m_chargeAndStatus = reinterpret_cast(get32(Storage32::kCharge)); - if constexpr (!std::is_same_v) { - assert(input == nullptr); - view->m_xg = get32(Storage32::kXGlobal); - view->m_yg = get32(Storage32::kYGlobal); - view->m_zg = get32(Storage32::kZGlobal); - view->m_rg = get32(Storage32::kRGlobal); + view->m_xg = get32(Storage32::kXGlobal); + view->m_yg = get32(Storage32::kYGlobal); + view->m_zg = get32(Storage32::kZGlobal); + view->m_rg = get32(Storage32::kRGlobal); - auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast(i) * nHits; }; - m_iphi = view->m_iphi = reinterpret_cast(get16(Storage16::kPhi)); + auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast(i) * nHits; }; + m_iphi = view->m_iphi = reinterpret_cast(get16(Storage16::kPhi)); - view->m_xsize = reinterpret_cast(get16(Storage16::kXSize)); - view->m_ysize = reinterpret_cast(get16(Storage16::kYSize)); - view->m_detInd = get16(Storage16::kDetId); + view->m_xsize = reinterpret_cast(get16(Storage16::kXSize)); + view->m_ysize = reinterpret_cast(get16(Storage16::kYSize)); + view->m_detInd = get16(Storage16::kDetId); - m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); - m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast(get32(Storage32::kLayers)); - } + m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); + m_hitsLayerStart = view->m_hitsLayerStart = 
reinterpret_cast(get32(Storage32::kLayers)); // transfer view if constexpr (std::is_same_v) { @@ -204,10 +244,67 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous( } } +template +TrackingRecHit2DHostT::TrackingRecHit2DHostT( + uint32_t nHits, + int32_t offsetBPIX2, + pixelCPEforGPU::ParamsOnGPUT const* cpeParams, + uint32_t const* hitsModuleStart, + cudaStream_t stream, + TrackingRecHit2DHeterogeneousT const* input) { + using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; + + this->m_nHits = nHits; + this->m_offsetBPIX2 = offsetBPIX2; + this->m_hitsModuleStart = hitsModuleStart; + + auto view = cms::cuda::make_host_unique(stream); + + view->m_nHits = nHits; + this->m_view = + cms::cuda::make_host_unique(stream); // leave it on host and pass it by value? + this->m_AverageGeometryStore = cms::cuda::make_host_unique(stream); + view->m_averageGeometry = this->m_AverageGeometryStore.get(); + view->m_cpeParams = cpeParams; + view->m_hitsModuleStart = hitsModuleStart; + + // if empy do not bother + if (0 == nHits) { + this->m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version + return; + } + + this->m_store32 = cms::cuda::make_host_unique(5 * input->nHits(), stream); + cms::cuda::copyAsync(this->m_store32, input->m_store32, 5 * input->nHits(), stream); + + static_assert(sizeof(typename TrackingRecHit2DSOAView::hindex_type) == sizeof(float)); + static_assert(sizeof(typename TrackingRecHit2DSOAView::hindex_type) == + sizeof(typename TrackingRecHit2DSOAView::PhiBinner::index_type)); + + auto get32 = [&](Storage32 i) { return this->m_store32.get() + static_cast(i) * nHits; }; + + // copy all the pointers + this->m_phiBinner = view->m_phiBinner = this->m_PhiBinnerStore.get(); + this->m_phiBinnerStorage = view->m_phiBinnerStorage = + reinterpret_cast(get32(Storage32::kPhiStorage)); + + view->m_xl = get32(Storage32::kXLocal); + view->m_yl = get32(Storage32::kYLocal); + view->m_xerr = get32(Storage32::kXerror); + view->m_yerr = get32(Storage32::kYerror); + view->m_chargeAndStatus = reinterpret_cast(get32(Storage32::kCharge)); + + this->m_view = std::move(view); +} + //this is intended to be used only for CPU SoA but doesn't hurt to have it for all cases -template -TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous( - float* store32, uint16_t* store16, uint32_t* modules, int nHits, cudaStream_t stream) +template +TrackingRecHit2DHeterogeneousT::TrackingRecHit2DHeterogeneousT( + cms::cuda::host::unique_ptr& store32, + cms::cuda::host::unique_ptr& store16, + uint32_t* modules, + int nHits, + cudaStream_t stream) : m_nHits(nHits), m_hitsModuleStart(modules) { auto view = Traits::template make_host_unique(stream); @@ -226,19 +323,20 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous( m_store16 = Traits::template make_unique(nHits * n16, stream); m_store32 = Traits::template make_unique(nHits * n32, stream); - m_PhiBinnerStore = Traits::template make_unique(stream); - m_AverageGeometryStore = Traits::template make_unique(stream); + m_PhiBinnerStore = Traits::template make_unique(stream); + m_AverageGeometryStore = Traits::template make_unique(stream); view->m_averageGeometry = m_AverageGeometryStore.get(); view->m_hitsModuleStart = m_hitsModuleStart; //store transfer if constexpr (std::is_same_v) { - cms::cuda::copyAsync(m_store16, store16, stream); - cms::cuda::copyAsync(m_store32, store32, stream); + cms::cuda::copyAsync(m_store16, store16, static_cast(n16 * nHits), stream); + cms::cuda::copyAsync(m_store32, store32, static_cast(n32 * nHits), 
stream); + } else { - std::copy(store32, store32 + nHits * n32, m_store32.get()); // want to copy it - std::copy(store16, store16 + nHits * n16, m_store16.get()); + std::copy(store32.get(), store32.get() + nHits * n32, m_store32.get()); // want to copy it + std::copy(store16.get(), store16.get() + nHits * n16, m_store16.get()); } //getters @@ -258,7 +356,7 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous( m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); m_phiBinnerStorage = view->m_phiBinnerStorage = - reinterpret_cast(get32(Storage32::kPhiStorage)); + reinterpret_cast(get32(Storage32::kPhiStorage)); //Store 16 view->m_detInd = get16(Storage16::kDetId); @@ -274,4 +372,13 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous( } } -#endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h +//Classes definition for Phase1/Phase2, to make the classes_def lighter. Not actually used in the code. +using TrackingRecHit2DGPUPhase1 = TrackingRecHit2DGPUT; +using TrackingRecHit2DCPUPhase1 = TrackingRecHit2DCPUT; +using TrackingRecHit2DHostPhase1 = TrackingRecHit2DHostT; + +using TrackingRecHit2DGPUPhase2 = TrackingRecHit2DGPUT; +using TrackingRecHit2DCPUPhase2 = TrackingRecHit2DCPUT; +using TrackingRecHit2DHostPhase2 = TrackingRecHit2DHostT; + +#endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneousT_h diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h index f252ca94d2296..8fd2bc54cfad7 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h @@ -1,17 +1,20 @@ -#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReduced_h -#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReduced_h +#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReducedT_h +#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReducedT_h #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h" #include "CUDADataFormats/Common/interface/HostProduct.h" // a reduced (in content and therefore in size) version to be used on CPU for Legacy reconstruction -class TrackingRecHit2DReduced { +template +class TrackingRecHit2DReducedT { + using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; + public: using HLPstorage = HostProduct; using HIDstorage = HostProduct; template - TrackingRecHit2DReduced(UP32&& istore32, UP16&& istore16, int nhits) + TrackingRecHit2DReducedT(UP32&& istore32, UP16&& istore16, int nhits) : m_store32(std::move(istore32)), m_store16(std::move(istore16)), m_nHits(nhits) { auto get32 = [&](int i) { return const_cast(m_store32.get()) + i * nhits; }; @@ -26,15 +29,15 @@ class TrackingRecHit2DReduced { } // view only! 
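// --- illustrative sketch, not part of the diff -------------------------------
// The store transfer above relies on `if constexpr` so that a single templated
// constructor body serves both products: the GPU specialization takes the
// cms::cuda::copyAsync branch, while the host one compiles down to std::copy.
// GPUTraits/HostTraits below are stand-in tags for the real traits classes.
#include <algorithm>
#include <type_traits>

struct GPUTraits {};
struct HostTraits {};

template <typename Traits>
void transferStore(const float* src, float* dst, int n) {
  if constexpr (std::is_same_v<Traits, GPUTraits>) {
    // the real code issues cms::cuda::copyAsync(dst, src, n, stream) here
    // (asynchronous on a CUDA stream); silence unused warnings in this toy
    (void)src, (void)dst, (void)n;
  } else {
    std::copy(src, src + n, dst);  // host product: synchronous element copy
  }
}
// -----------------------------------------------------------------------------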
- TrackingRecHit2DReduced(TrackingRecHit2DSOAView const& iview, int nhits) : m_view(iview), m_nHits(nhits) {} + TrackingRecHit2DReducedT(TrackingRecHit2DSOAView const& iview, int nhits) : m_view(iview), m_nHits(nhits) {} - TrackingRecHit2DReduced() = default; - ~TrackingRecHit2DReduced() = default; + TrackingRecHit2DReducedT() = default; + ~TrackingRecHit2DReducedT() = default; - TrackingRecHit2DReduced(const TrackingRecHit2DReduced&) = delete; - TrackingRecHit2DReduced& operator=(const TrackingRecHit2DReduced&) = delete; - TrackingRecHit2DReduced(TrackingRecHit2DReduced&&) = default; - TrackingRecHit2DReduced& operator=(TrackingRecHit2DReduced&&) = default; + TrackingRecHit2DReducedT(const TrackingRecHit2DReducedT&) = delete; + TrackingRecHit2DReducedT& operator=(const TrackingRecHit2DReducedT&) = delete; + TrackingRecHit2DReducedT(TrackingRecHit2DReducedT&&) = default; + TrackingRecHit2DReducedT& operator=(TrackingRecHit2DReducedT&&) = default; TrackingRecHit2DSOAView& view() { return m_view; } TrackingRecHit2DSOAView const& view() const { return m_view; } @@ -50,4 +53,7 @@ class TrackingRecHit2DReduced { int m_nHits; }; +using TrackingRecHit2DReducedPhase1 = TrackingRecHit2DReducedT; +using TrackingRecHit2DReducedPhase2 = TrackingRecHit2DReducedT; + #endif diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h index 39ee136189955..59b7cb1337fdf 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h @@ -10,27 +10,34 @@ #include "CUDADataFormats/TrackingRecHit/interface/SiPixelHitStatus.h" namespace pixelCPEforGPU { - struct ParamsOnGPU; + template + struct ParamsOnGPUT; } -class TrackingRecHit2DSOAView { +template +class TrackingRecHit2DSOAViewT { public: using Status = SiPixelHitStatus; static_assert(sizeof(Status) == sizeof(uint8_t)); - using hindex_type = uint32_t; // if above is <=2^32 - - using PhiBinner = cms::cuda:: - HistoContainer; //28 for phase2 geometry - - using AverageGeometry = pixelTopology::AverageGeometry; - + using hindex_type = typename TrackerTraits::hindex_type; + using PhiBinner = cms::cuda::HistoContainer; //28 for phase2 geometry + using AverageGeometry = pixelTopology::AverageGeometryT; + using ParamsOnGPU = pixelCPEforGPU::ParamsOnGPUT; + + template + friend class TrackingRecHit2DHeterogeneousT; template - friend class TrackingRecHit2DHeterogeneous; - friend class TrackingRecHit2DReduced; + friend class TrackingRecHit2DHostT; + // template + // friend class TrackingRecHit2DReducedT; __device__ __forceinline__ uint32_t nHits() const { return m_nHits; } - __device__ __forceinline__ uint32_t nMaxModules() const { return m_nMaxModules; } __device__ __forceinline__ float& xLocal(int i) { return m_xl[i]; } __device__ __forceinline__ float xLocal(int i) const { return __ldg(m_xl + i); } @@ -75,7 +82,7 @@ class TrackingRecHit2DSOAView { __device__ __forceinline__ uint16_t& detectorIndex(int i) { return m_detInd[i]; } __device__ __forceinline__ uint16_t detectorIndex(int i) const { return __ldg(m_detInd + i); } - __device__ __forceinline__ pixelCPEforGPU::ParamsOnGPU const& cpeParams() const { return *m_cpeParams; } + __device__ __forceinline__ ParamsOnGPU const& cpeParams() const { return *m_cpeParams; } __device__ __forceinline__ uint32_t hitsModuleStart(int i) const { return __ldg(m_hitsModuleStart + i); } @@ -88,6 +95,9 @@ class TrackingRecHit2DSOAView { __device__ 
__forceinline__ AverageGeometry& averageGeometry() { return *m_averageGeometry; } __device__ __forceinline__ AverageGeometry const& averageGeometry() const { return *m_averageGeometry; } + __device__ __forceinline__ bool clusterCut(int i, int o, bool debug = false) const { return false; } + __device__ __forceinline__ bool zSizeCut(int i, int o, bool debug = false) const { return false; } + private: // local coord float *m_xl, *m_yl; @@ -106,17 +116,16 @@ class TrackingRecHit2DSOAView { // supporting objects // m_averageGeometry is corrected for beam spot, not sure where to host it otherwise - AverageGeometry* m_averageGeometry; // owned by TrackingRecHit2DHeterogeneous - pixelCPEforGPU::ParamsOnGPU const* m_cpeParams; // forwarded from setup, NOT owned - uint32_t const* m_hitsModuleStart; // forwarded from clusters + AverageGeometry* m_averageGeometry; // owned by TrackingRecHit2DHeterogeneous + ParamsOnGPU const* m_cpeParams; // forwarded from setup, NOT owned + uint32_t const* m_hitsModuleStart; // forwarded from clusters uint32_t* m_hitsLayerStart; PhiBinner* m_phiBinner; - PhiBinner::index_type* m_phiBinnerStorage; + typename PhiBinner::index_type* m_phiBinnerStorage; uint32_t m_nHits; - uint32_t m_nMaxModules; }; #endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DSOAView_h diff --git a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc index fc6a05ba9ed3e..05c3eba3d8bde 100644 --- a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc +++ b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc @@ -4,38 +4,46 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" -template <> -cms::cuda::host::unique_ptr TrackingRecHit2DGPU::localCoordToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(5 * nHits(), stream); - cms::cuda::copyAsync(ret, m_store32, 5 * nHits(), stream); +template +cms::cuda::host::unique_ptr TrackingRecHit2DGPUT::localCoordToHostAsync( + cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(5 * this->nHits(), stream); + cms::cuda::copyAsync(ret, this->m_store32, 5 * this->nHits(), stream); return ret; } -template <> -cms::cuda::host::unique_ptr TrackingRecHit2DGPU::store32ToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(static_cast(n32) * nHits(), stream); - cms::cuda::copyAsync(ret, m_store32, static_cast(n32) * nHits(), stream); +template +cms::cuda::host::unique_ptr TrackingRecHit2DGPUT::store32ToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(static_cast(this->n32) * this->nHits(), stream); + cms::cuda::copyAsync(ret, this->m_store32, static_cast(this->n32) * this->nHits(), stream); return ret; } -template <> -cms::cuda::host::unique_ptr TrackingRecHit2DGPU::store16ToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(static_cast(n16) * nHits(), stream); - cms::cuda::copyAsync(ret, m_store16, static_cast(n16) * nHits(), stream); +template +cms::cuda::host::unique_ptr TrackingRecHit2DGPUT::store16ToHostAsync( + cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(static_cast(this->n16) * this->nHits(), stream); + cms::cuda::copyAsync(ret, this->m_store16, static_cast(this->n16) * this->nHits(), stream); return ret; } -template <> -cms::cuda::host::unique_ptr 
TrackingRecHit2DGPU::hitsModuleStartToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(nMaxModules() + 1, stream); - cudaCheck( - cudaMemcpyAsync(ret.get(), m_hitsModuleStart, sizeof(uint32_t) * (nMaxModules() + 1), cudaMemcpyDefault, stream)); +template +cms::cuda::host::unique_ptr TrackingRecHit2DGPUT::hitsModuleStartToHostAsync( + cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(TrackerTraits::numberOfModules + 1, stream); + cudaCheck(cudaMemcpyAsync(ret.get(), + this->m_hitsModuleStart, + sizeof(uint32_t) * (TrackerTraits::numberOfModules + 1), + cudaMemcpyDefault, + stream)); return ret; } -// the only specialization needed -template <> -void TrackingRecHit2DHost::copyFromGPU(TrackingRecHit2DGPU const* input, cudaStream_t stream) { - assert(input); - m_store32 = input->localCoordToHostAsync(stream); -} +template class TrackingRecHit2DGPUT; +template class TrackingRecHit2DGPUT; + +template class TrackingRecHit2DCPUT; +template class TrackingRecHit2DCPUT; + +template class TrackingRecHit2DHostT; +template class TrackingRecHit2DHostT; diff --git a/CUDADataFormats/TrackingRecHit/src/classes.h b/CUDADataFormats/TrackingRecHit/src/classes.h index abecfb38797de..b9a20695712e3 100644 --- a/CUDADataFormats/TrackingRecHit/src/classes.h +++ b/CUDADataFormats/TrackingRecHit/src/classes.h @@ -1,9 +1,9 @@ -#ifndef CUDADataFormats_SiPixelCluster_src_classes_h -#define CUDADataFormats_SiPixelCluster_src_classes_h +#ifndef CUDADataFormats_TrackingRecHit_src_classes_h +#define CUDADataFormats_TrackingRecHit_src_classes_h #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h" #include "DataFormats/Common/interface/Wrapper.h" -#endif // CUDADataFormats_SiPixelCluster_src_classes_h +#endif // CUDADataFormats_TrackingRecHit_src_classes_h diff --git a/CUDADataFormats/TrackingRecHit/src/classes_def.xml b/CUDADataFormats/TrackingRecHit/src/classes_def.xml index f633d77c48ef7..4287860ee8495 100644 --- a/CUDADataFormats/TrackingRecHit/src/classes_def.xml +++ b/CUDADataFormats/TrackingRecHit/src/classes_def.xml @@ -1,10 +1,22 @@ - - - - - - - - + + + + + + + + + + + + + + + + + + + + diff --git a/CUDADataFormats/TrackingRecHit/test/BuildFile.xml b/CUDADataFormats/TrackingRecHit/test/BuildFile.xml index ce49c46fffba0..f064563aa7051 100644 --- a/CUDADataFormats/TrackingRecHit/test/BuildFile.xml +++ b/CUDADataFormats/TrackingRecHit/test/BuildFile.xml @@ -1,4 +1,5 @@ + diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDAImpl_t.h b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDAImpl_t.h new file mode 100644 index 0000000000000..b2da57c2471ae --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDAImpl_t.h @@ -0,0 +1,26 @@ +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" + +namespace testTrackingRecHit2D { + + template + __global__ void fill(TrackingRecHit2DSOAViewT* phits) { + assert(phits); + auto& hits = *phits; + assert(hits.nHits() == 200); + + int i = threadIdx.x; + if (i > 200) + return; + } + + template + __global__ void verify(TrackingRecHit2DSOAViewT const* phits) { + assert(phits); + auto const& hits = *phits; + assert(hits.nHits() == 200); + + int i = threadIdx.x; + if (i > 200) + return; + } +} // namespace testTrackingRecHit2D diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp 
b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp index 8aca68e294469..0d910273933dc 100644 --- a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp @@ -6,9 +6,9 @@ namespace testTrackingRecHit2D { - void runKernels(TrackingRecHit2DSOAView* hits); - -} + template + void runKernels(TrackingRecHit2DSOAViewT* hits); +} // namespace testTrackingRecHit2D int main() { cms::cudatest::requireDevices(); @@ -19,23 +19,21 @@ int main() { auto nHits = 200; // inner scope to deallocate memory before destroying the stream { - TrackingRecHit2DGPU tkhit(nHits, false, 0, nullptr, nullptr, stream); - testTrackingRecHit2D::runKernels(tkhit.view()); + TrackingRecHit2DGPUT tkhit(nHits, 0, nullptr, nullptr, stream); + testTrackingRecHit2D::runKernels(tkhit.view()); - TrackingRecHit2DGPU tkhitPhase2(nHits, true, 0, nullptr, nullptr, stream); - testTrackingRecHit2D::runKernels(tkhitPhase2.view()); + TrackingRecHit2DGPUT tkhitPhase2(nHits, 0, nullptr, nullptr, stream); + testTrackingRecHit2D::runKernels(tkhitPhase2.view()); - TrackingRecHit2DHost tkhitH(nHits, false, 0, nullptr, nullptr, stream, &tkhit); + TrackingRecHit2DHostT tkhitH(nHits, 0, nullptr, nullptr, stream, &tkhit); cudaStreamSynchronize(stream); assert(tkhitH.view()); assert(tkhitH.view()->nHits() == unsigned(nHits)); - assert(tkhitH.view()->nMaxModules() == phase1PixelTopology::numberOfModules); - TrackingRecHit2DHost tkhitHPhase2(nHits, true, 0, nullptr, nullptr, stream, &tkhit); + TrackingRecHit2DHostT tkhitHPhase2(nHits, 0, nullptr, nullptr, stream, &tkhitPhase2); cudaStreamSynchronize(stream); assert(tkhitHPhase2.view()); assert(tkhitHPhase2.view()->nHits() == unsigned(nHits)); - assert(tkhitHPhase2.view()->nMaxModules() == phase2PixelTopology::numberOfModules); } cudaCheck(cudaStreamDestroy(stream)); diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cu b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cu index 06bd599d074f9..e902ea971edf3 100644 --- a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cu +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cu @@ -1,31 +1,15 @@ #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "TrackingRecHit2DCUDAImpl_t.h" namespace testTrackingRecHit2D { - __global__ void fill(TrackingRecHit2DSOAView* phits) { - assert(phits); - auto& hits = *phits; - assert(hits.nHits() == 200); - - int i = threadIdx.x; - if (i > 200) - return; - } - - __global__ void verify(TrackingRecHit2DSOAView const* phits) { - assert(phits); - auto const& hits = *phits; - assert(hits.nHits() == 200); - - int i = threadIdx.x; - if (i > 200) - return; - } - - void runKernels(TrackingRecHit2DSOAView* hits) { + template + void runKernels(TrackingRecHit2DSOAViewT* hits) { assert(hits); - fill<<<1, 1024>>>(hits); - verify<<<1, 1024>>>(hits); + fill<<<1, 1024>>>(hits); + verify<<<1, 1024>>>(hits); } + template void runKernels(TrackingRecHit2DSOAViewT* hits); + template void runKernels(TrackingRecHit2DSOAViewT* hits); } // namespace testTrackingRecHit2D diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoA.h b/CUDADataFormats/Vertex/interface/ZVertexSoA.h index e31b87f30fa11..95106050f3d7a 100644 --- a/CUDADataFormats/Vertex/interface/ZVertexSoA.h +++ b/CUDADataFormats/Vertex/interface/ZVertexSoA.h @@ -8,7 +8,7 @@ // These vertices are clusterized and fitted only along the beam line (z) // to obtain their global coordinate the beam spot position shall 
be added (eventually correcting for the beam angle as well) struct ZVertexSoA { - static constexpr uint32_t MAXTRACKS = 32 * 1024; + static constexpr uint32_t MAXTRACKS = 128 * 1024; static constexpr uint32_t MAXVTX = 1024; int16_t idv[MAXTRACKS]; // vertex index for each associated (original) track (-1 == not associate) diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareRecHitsSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareRecHitsSoA.cc index fbd5a41d4a898..71abb95dbb4d1 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareRecHitsSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareRecHitsSoA.cc @@ -2,7 +2,7 @@ // Package: SiPixelPhase1CompareRecHitsSoA // Class: SiPixelPhase1CompareRecHitsSoA // -/**\class SiPixelPhase1CompareRecHitsSoA SiPixelPhase1CompareRecHitsSoA.cc +/**\class SiPixelPhase1CompareRecHitsSoA SiPixelPhase1CompareRecHitsSoA.cc */ // // Author: Suvankar Roy Chowdhury, Alessandro Rossi @@ -29,6 +29,9 @@ class SiPixelPhase1CompareRecHitsSoA : public DQMEDAnalyzer { public: + using HitSoA = TrackingRecHit2DSOAViewT; + using HitsOnCPU = TrackingRecHit2DCPUT; + explicit SiPixelPhase1CompareRecHitsSoA(const edm::ParameterSet&); ~SiPixelPhase1CompareRecHitsSoA() override = default; void dqmBeginRun(const edm::Run&, const edm::EventSetup&) override; @@ -39,8 +42,8 @@ class SiPixelPhase1CompareRecHitsSoA : public DQMEDAnalyzer { private: const edm::ESGetToken geomToken_; const edm::ESGetToken topoToken_; - const edm::EDGetTokenT tokenSoAHitsCPU_; - const edm::EDGetTokenT tokenSoAHitsGPU_; + const edm::EDGetTokenT tokenSoAHitsCPU_; + const edm::EDGetTokenT tokenSoAHitsGPU_; const std::string topFolderName_; const float mind2cut_; static constexpr uint32_t invalidHit_ = std::numeric_limits::max(); @@ -77,8 +80,8 @@ class SiPixelPhase1CompareRecHitsSoA : public DQMEDAnalyzer { SiPixelPhase1CompareRecHitsSoA::SiPixelPhase1CompareRecHitsSoA(const edm::ParameterSet& iConfig) : geomToken_(esConsumes()), topoToken_(esConsumes()), - tokenSoAHitsCPU_(consumes(iConfig.getParameter("pixelHitsSrcCPU"))), - tokenSoAHitsGPU_(consumes(iConfig.getParameter("pixelHitsSrcGPU"))), + tokenSoAHitsCPU_(consumes(iConfig.getParameter("pixelHitsSrcCPU"))), + tokenSoAHitsGPU_(consumes(iConfig.getParameter("pixelHitsSrcGPU"))), topFolderName_(iConfig.getParameter("topFolderName")), mind2cut_(iConfig.getParameter("minD2cut")) {} // @@ -106,10 +109,11 @@ void SiPixelPhase1CompareRecHitsSoA::analyze(const edm::Event& iEvent, const edm out << "the comparison will not run."; return; } + auto const& rhsoaCPU = *rhsoaHandleCPU; - const TrackingRecHit2DSOAView* soa2dCPU = rhsoaCPU.view(); + const HitSoA* soa2dCPU = rhsoaCPU.view(); auto const& rhsoaGPU = *rhsoaHandleGPU; - const TrackingRecHit2DSOAView* soa2dGPU = rhsoaGPU.view(); + const HitSoA* soa2dGPU = rhsoaGPU.view(); uint32_t nHitsCPU = soa2dCPU->nHits(); uint32_t nHitsGPU = soa2dGPU->nHits(); diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc index 7b12f694d4e8c..915c2ac1399f5 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc @@ -2,7 +2,7 @@ // Package: SiPixelPhase1CompareTrackSoA // Class: SiPixelPhase1CompareTrackSoA // -/**\class SiPixelPhase1CompareTrackSoA SiPixelPhase1CompareTrackSoA.cc +/**\class SiPixelPhase1CompareTrackSoA SiPixelPhase1CompareTrackSoA.cc 
*/ // // Author: Suvankar Roy Chowdhury @@ -64,6 +64,8 @@ namespace { class SiPixelPhase1CompareTrackSoA : public DQMEDAnalyzer { public: + using PixelTrackSoAPhase1 = PixelTrackHeterogeneousT; + explicit SiPixelPhase1CompareTrackSoA(const edm::ParameterSet&); ~SiPixelPhase1CompareTrackSoA() override = default; void bookHistograms(DQMStore::IBooker& ibooker, edm::Run const& iRun, edm::EventSetup const& iSetup) override; @@ -71,8 +73,8 @@ class SiPixelPhase1CompareTrackSoA : public DQMEDAnalyzer { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - const edm::EDGetTokenT tokenSoATrackCPU_; - const edm::EDGetTokenT tokenSoATrackGPU_; + const edm::EDGetTokenT tokenSoATrackCPU_; + const edm::EDGetTokenT tokenSoATrackGPU_; const std::string topFolderName_; const bool useQualityCut_; const pixelTrack::Quality minQuality_; @@ -113,8 +115,8 @@ class SiPixelPhase1CompareTrackSoA : public DQMEDAnalyzer { // SiPixelPhase1CompareTrackSoA::SiPixelPhase1CompareTrackSoA(const edm::ParameterSet& iConfig) - : tokenSoATrackCPU_(consumes(iConfig.getParameter("pixelTrackSrcCPU"))), - tokenSoATrackGPU_(consumes(iConfig.getParameter("pixelTrackSrcGPU"))), + : tokenSoATrackCPU_(consumes(iConfig.getParameter("pixelTrackSrcCPU"))), + tokenSoATrackGPU_(consumes(iConfig.getParameter("pixelTrackSrcGPU"))), topFolderName_(iConfig.getParameter("topFolderName")), useQualityCut_(iConfig.getParameter("useQualityCut")), minQuality_(pixelTrack::qualityByName(iConfig.getParameter("minQuality"))), diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorRecHitsSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorRecHitsSoA.cc index 4559e57d1482c..231186f88e53f 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorRecHitsSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorRecHitsSoA.cc @@ -3,7 +3,7 @@ // Package: SiPixelPhase1MonitorRecHitsSoA // Class: SiPixelPhase1MonitorRecHitsSoA // -/**\class SiPixelPhase1MonitorRecHitsSoA SiPixelPhase1MonitorRecHitsSoA.cc +/**\class SiPixelPhase1MonitorRecHitsSoA SiPixelPhase1MonitorRecHitsSoA.cc */ // // Author: Suvankar Roy Chowdhury, Alessandro Rossi @@ -30,6 +30,9 @@ class SiPixelPhase1MonitorRecHitsSoA : public DQMEDAnalyzer { public: + using HitSoA = TrackingRecHit2DSOAViewT; + using HitsOnCPU = TrackingRecHit2DCPUT; + explicit SiPixelPhase1MonitorRecHitsSoA(const edm::ParameterSet&); ~SiPixelPhase1MonitorRecHitsSoA() override = default; void dqmBeginRun(const edm::Run&, const edm::EventSetup&) override; @@ -40,7 +43,7 @@ class SiPixelPhase1MonitorRecHitsSoA : public DQMEDAnalyzer { private: const edm::ESGetToken geomToken_; const edm::ESGetToken topoToken_; - const edm::EDGetTokenT tokenSoAHitsCPU_; + const edm::EDGetTokenT tokenSoAHitsCPU_; const std::string topFolderName_; const TrackerGeometry* tkGeom_ = nullptr; const TrackerTopology* tTopo_ = nullptr; @@ -74,7 +77,7 @@ class SiPixelPhase1MonitorRecHitsSoA : public DQMEDAnalyzer { SiPixelPhase1MonitorRecHitsSoA::SiPixelPhase1MonitorRecHitsSoA(const edm::ParameterSet& iConfig) : geomToken_(esConsumes()), topoToken_(esConsumes()), - tokenSoAHitsCPU_(consumes(iConfig.getParameter("pixelHitsSrc"))), + tokenSoAHitsCPU_(consumes(iConfig.getParameter("pixelHitsSrc"))), topFolderName_(iConfig.getParameter("TopFolderName")) {} // // Begin Run @@ -94,7 +97,7 @@ void SiPixelPhase1MonitorRecHitsSoA::analyze(const edm::Event& iEvent, const edm return; } auto const& rhsoa = *rhsoaHandle; - const TrackingRecHit2DSOAView* soa2d = 
rhsoa.view(); + const HitSoA* soa2d = rhsoa.view(); uint32_t nHits_ = soa2d->nHits(); hnHits->Fill(nHits_); diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc index 622895ba07bcc..5d2545b6cdc9f 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc @@ -3,7 +3,7 @@ // Package: SiPixelPhase1MonitorTrackSoA // Class: SiPixelPhase1MonitorTrackSoA // -/**\class SiPixelPhase1MonitorTrackSoA SiPixelPhase1MonitorTrackSoA.cc +/**\class SiPixelPhase1MonitorTrackSoA SiPixelPhase1MonitorTrackSoA.cc */ // // Author: Suvankar Roy Chowdhury @@ -27,6 +27,7 @@ class SiPixelPhase1MonitorTrackSoA : public DQMEDAnalyzer { public: + using PixelTrackHeterogeneousPhase1 = PixelTrackHeterogeneousT; explicit SiPixelPhase1MonitorTrackSoA(const edm::ParameterSet&); ~SiPixelPhase1MonitorTrackSoA() override = default; void bookHistograms(DQMStore::IBooker& ibooker, edm::Run const& iRun, edm::EventSetup const& iSetup) override; @@ -34,7 +35,7 @@ class SiPixelPhase1MonitorTrackSoA : public DQMEDAnalyzer { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - edm::EDGetTokenT tokenSoATrack_; + edm::EDGetTokenT tokenSoATrack_; std::string topFolderName_; bool useQualityCut_; pixelTrack::Quality minQuality_; @@ -62,7 +63,7 @@ class SiPixelPhase1MonitorTrackSoA : public DQMEDAnalyzer { // SiPixelPhase1MonitorTrackSoA::SiPixelPhase1MonitorTrackSoA(const edm::ParameterSet& iConfig) { - tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); + tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); topFolderName_ = iConfig.getParameter("topFolderName"); //"SiPixelHeterogeneous/PixelTrackSoA"; useQualityCut_ = iConfig.getParameter("useQualityCut"); minQuality_ = pixelTrack::qualityByName(iConfig.getParameter("minQuality")); diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc index af6c240a69172..6324cee4372d8 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc @@ -3,7 +3,7 @@ // Package: SiPixelPhase1MonitorVertexSoA // Class: SiPixelPhase1MonitorVertexSoA // -/**\class SiPixelPhase1MonitorVertexSoA SiPixelPhase1MonitorVertexSoA.cc +/**\class SiPixelPhase1MonitorVertexSoA SiPixelPhase1MonitorVertexSoA.cc */ // // Author: Suvankar Roy Chowdhury @@ -81,11 +81,13 @@ void SiPixelPhase1MonitorVertexSoA::analyze(const edm::Event& iEvent, const edm: dxdz = bs.dxdz(); dydz = bs.dydz(); } + for (int iv = 0; iv < nVertices; iv++) { auto si = vsoa.sortInd[iv]; auto z = vsoa.zv[si]; auto x = x0 + dxdz * z; auto y = y0 + dydz * z; + z += z0; hx->Fill(x); hy->Fill(y); diff --git a/Geometry/CommonTopologies/interface/SimplePixelTopology.h b/Geometry/CommonTopologies/interface/SimplePixelTopology.h index d91d9b40e89ce..c991d09666297 100644 --- a/Geometry/CommonTopologies/interface/SimplePixelTopology.h +++ b/Geometry/CommonTopologies/interface/SimplePixelTopology.h @@ -4,104 +4,48 @@ #include #include #include +#include "FWCore/Utilities/interface/HostDeviceConstant.h" namespace pixelTopology { - template - constexpr auto map_to_array_helper(Function f, std::index_sequence) - -> std::array, sizeof...(Indices)> { - return {{f(Indices)...}}; - } - - template - constexpr 
auto map_to_array(Function f) -> std::array, N> { - return map_to_array_helper(f, std::make_index_sequence{}); - } constexpr auto maxNumberOfLadders = 160; constexpr uint32_t maxLayers = 28; - struct AverageGeometry { + template + struct AverageGeometryT { // - float ladderZ[maxNumberOfLadders]; - float ladderX[maxNumberOfLadders]; - float ladderY[maxNumberOfLadders]; - float ladderR[maxNumberOfLadders]; - float ladderMinZ[maxNumberOfLadders]; - float ladderMaxZ[maxNumberOfLadders]; + float ladderZ[TrackerTraits::numberOfLaddersInBarrel]; + float ladderX[TrackerTraits::numberOfLaddersInBarrel]; + float ladderY[TrackerTraits::numberOfLaddersInBarrel]; + float ladderR[TrackerTraits::numberOfLaddersInBarrel]; + float ladderMinZ[TrackerTraits::numberOfLaddersInBarrel]; + float ladderMaxZ[TrackerTraits::numberOfLaddersInBarrel]; float endCapZ[2]; // just for pos and neg Layer1 }; - constexpr inline uint16_t localY(uint16_t py, uint16_t n) { - auto roc = py / n; - auto shift = 2 * roc; - auto yInRoc = py - n * roc; - if (yInRoc > 0) - shift += 1; - return py + shift; - } - -} // namespace pixelTopology - -namespace phase1PixelTopology { - - constexpr uint16_t numberOfModulesInBarrel = 1184; - constexpr uint16_t numberOfModulesInLadder = 8; - constexpr uint16_t numberOfLaddersInBarrel = numberOfModulesInBarrel / numberOfModulesInLadder; - - constexpr uint16_t numRowsInRoc = 80; - constexpr uint16_t numColsInRoc = 52; - constexpr uint16_t lastRowInRoc = numRowsInRoc - 1; - constexpr uint16_t lastColInRoc = numColsInRoc - 1; + constexpr int16_t phi0p05 = 522; // round(521.52189...) = phi2short(0.05); + constexpr int16_t phi0p06 = 626; // round(625.82270...) = phi2short(0.06); + constexpr int16_t phi0p07 = 730; // round(730.12648...) = phi2short(0.07); + constexpr int16_t phi0p09 = 900; - constexpr uint16_t numRowsInModule = 2 * numRowsInRoc; - constexpr uint16_t numColsInModule = 8 * numColsInRoc; - constexpr uint16_t lastRowInModule = numRowsInModule - 1; - constexpr uint16_t lastColInModule = numColsInModule - 1; - - constexpr int16_t xOffset = -81; - constexpr int16_t yOffset = -54 * 4; - - constexpr uint16_t pixelThickness = 285; - constexpr uint16_t pixelPitchX = 100; - constexpr uint16_t pixelPitchY = 150; - - constexpr uint32_t numPixsInModule = uint32_t(numRowsInModule) * uint32_t(numColsInModule); + template + constexpr auto map_to_array_helper(Function f, std::index_sequence) + -> std::array, sizeof...(Indices)> { + return {{f(Indices)...}}; + } - constexpr uint32_t numberOfModules = 1856; - constexpr uint32_t numberOfLayers = 10; -#ifdef __CUDA_ARCH__ - __device__ -#endif - constexpr uint32_t layerStart[numberOfLayers + 1] = {0, - 96, - 320, - 672, // barrel - 1184, - 1296, - 1408, // positive endcap - 1520, - 1632, - 1744, // negative endcap - numberOfModules}; - constexpr char const* layerName[numberOfLayers] = { - "BL1", - "BL2", - "BL3", - "BL4", // barrel - "E+1", - "E+2", - "E+3", // positive endcap - "E-1", - "E-2", - "E-3" // negative endcap - }; + template + constexpr auto map_to_array(Function f) -> std::array, N> { + return map_to_array_helper(f, std::make_index_sequence{}); + } + template constexpr uint16_t findMaxModuleStride() { bool go = true; int n = 2; while (go) { - for (uint8_t i = 1; i < std::size(layerStart); ++i) { - if (layerStart[i] % n != 0) { + for (uint8_t i = 1; i < TrackerTraits::numberOfLayers + 1; ++i) { + if (TrackerTraits::layerStart[i] % n != 0) { go = false; break; } @@ -113,48 +57,62 @@ namespace phase1PixelTopology { return n / 2; } - constexpr 
uint16_t maxModuleStride = findMaxModuleStride(); + template + constexpr uint16_t maxModuleStride = findMaxModuleStride(); + template constexpr uint8_t findLayer(uint32_t detId, uint8_t sl = 0) { - for (uint8_t i = sl; i < std::size(layerStart); ++i) - if (detId < layerStart[i + 1]) + for (uint8_t i = sl; i < TrackerTraits::numberOfLayers + 1; ++i) + if (detId < TrackerTraits::layerStart[i + 1]) return i; - return std::size(layerStart); + return TrackerTraits::numberOfLayers + 1; } + template constexpr uint8_t findLayerFromCompact(uint32_t detId) { - detId *= maxModuleStride; - for (uint8_t i = 0; i < std::size(layerStart); ++i) - if (detId < layerStart[i + 1]) + detId *= maxModuleStride; + for (uint8_t i = 0; i < TrackerTraits::numberOfLayers + 1; ++i) + if (detId < TrackerTraits::layerStart[i + 1]) return i; - return std::size(layerStart); + return TrackerTraits::numberOfLayers + 1; } - constexpr uint32_t layerIndexSize = numberOfModules / maxModuleStride; + template + constexpr uint32_t layerIndexSize = TrackerTraits::numberOfModules / maxModuleStride; + + template #ifdef __CUDA_ARCH__ __device__ #endif - constexpr std::array - layer = pixelTopology::map_to_array(findLayerFromCompact); + constexpr std::array> + layer = map_to_array>(findLayerFromCompact); + template constexpr uint8_t getLayer(uint32_t detId) { - return phase1PixelTopology::layer[detId / phase1PixelTopology::maxModuleStride]; + return layer[detId / maxModuleStride]; } + template constexpr bool validateLayerIndex() { bool res = true; - for (auto i = 0U; i < numberOfModules; ++i) { - auto j = i / maxModuleStride; - res &= (layer[j] < numberOfLayers); - res &= (i >= layerStart[layer[j]]); - res &= (i < layerStart[layer[j] + 1]); + for (auto i = 0U; i < TrackerTraits::numberOfModules; ++i) { + auto j = i / maxModuleStride; + res &= (layer[j] < TrackerTraits::numberOfLayers); + res &= (i >= TrackerTraits::layerStart[layer[j]]); + res &= (i < TrackerTraits::layerStart[layer[j] + 1]); } return res; } - static_assert(validateLayerIndex(), "layer from detIndex algo is buggy"); + template +#ifdef __CUDA_ARCH__ + __device__ +#endif + constexpr inline uint32_t + layerStart(uint32_t i) { + return TrackerTraits::layerStart[i]; + } - // this is for the ROC n<512 (upgrade 1024) constexpr inline uint16_t divu52(uint16_t n) { n = n >> 2; uint16_t q = (n >> 1) + (n >> 4); @@ -163,128 +121,397 @@ namespace phase1PixelTopology { uint16_t r = n - q * 13; return q + ((r + 3) >> 4); } +} // namespace pixelTopology - constexpr inline bool isEdgeX(uint16_t px) { return (px == 0) | (px == lastRowInModule); } +namespace phase1PixelTopology { - constexpr inline bool isEdgeY(uint16_t py) { return (py == 0) | (py == lastColInModule); } + using pixelTopology::phi0p05; + using pixelTopology::phi0p06; + using pixelTopology::phi0p07; + + constexpr uint32_t numberOfLayers = 28; + constexpr int nPairs = 13 + 2 + 4; + constexpr uint16_t numberOfModules = 1856; + + constexpr uint32_t max_ladder_bpx0 = 12; + constexpr uint32_t first_ladder_bpx0 = 0; + constexpr float module_length_bpx0 = 6.7f; + constexpr float module_tolerance_bpx0 = 0.4f; // projection to cylinder is inaccurate on BPIX1 + constexpr uint32_t max_ladder_bpx4 = 64; + constexpr uint32_t first_ladder_bpx4 = 84; + constexpr float radius_even_ladder = 15.815f; + constexpr float radius_odd_ladder = 16.146f; + constexpr float module_length_bpx4 = 6.7f; + constexpr float module_tolerance_bpx4 = 0.2f; + constexpr float barrel_z_length = 26.f; + constexpr float forward_z_begin = 32.f; + + 
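// --- illustrative sketch, not part of the diff -------------------------------
// The map_to_array trick used above builds the compact detId -> layer lookup
// table entirely at compile time via std::index_sequence. Toy version with an
// invented 3-layer, 12-module geometry (names prefixed "toy" are not real):
#include <array>
#include <cstdint>
#include <utility>

constexpr uint32_t toyLayerStart[4] = {0, 4, 8, 12};

constexpr uint8_t toyFindLayer(uint32_t compactId) {
  for (uint8_t i = 0; i < 3; ++i)
    if (compactId < toyLayerStart[i + 1])
      return i;
  return 3;
}

template <typename F, std::size_t... Is>
constexpr auto toyHelper(F f, std::index_sequence<Is...>) -> std::array<uint8_t, sizeof...(Is)> {
  return {{f(Is)...}};  // one call per index, all evaluated by the compiler
}

template <std::size_t N, typename F>
constexpr auto toyMapToArray(F f) {
  return toyHelper(f, std::make_index_sequence<N>{});
}

constexpr auto toyLayer = toyMapToArray<12>(toyFindLayer);
static_assert(toyLayer[5] == 1, "module 5 sits in layer 1: table is fully constexpr");
// -----------------------------------------------------------------------------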
HOST_DEVICE_CONSTANT uint8_t layerPairs[2 * nPairs] = { + 0, 1, 0, 4, 0, 7, // BPIX1 (3) + 1, 2, 1, 4, 1, 7, // BPIX2 (6) + 4, 5, 7, 8, // FPIX1 (8) + 2, 3, 2, 4, 2, 7, 5, 6, 8, 9, // BPIX3 & FPIX2 (13) + 0, 2, 1, 3, // Jumping Barrel (15) + 0, 5, 0, 8, // Jumping Forward (BPIX1,FPIX2) + 4, 6, 7, 9 // Jumping Forward (19) + }; - constexpr inline uint16_t toRocX(uint16_t px) { return (px < numRowsInRoc) ? px : px - numRowsInRoc; } + HOST_DEVICE_CONSTANT int16_t phicuts[nPairs]{phi0p05, + phi0p07, + phi0p07, + phi0p05, + phi0p06, + phi0p06, + phi0p05, + phi0p05, + phi0p06, + phi0p06, + phi0p06, + phi0p05, + phi0p05, + phi0p05, + phi0p05, + phi0p05, + phi0p05, + phi0p05, + phi0p05}; + HOST_DEVICE_CONSTANT float minz[nPairs] = { + -20., 0., -30., -22., 10., -30., -70., -70., -22., 15., -30, -70., -70., -20., -22., 0, -30., -70., -70.}; + HOST_DEVICE_CONSTANT float maxz[nPairs] = { + 20., 30., 0., 22., 30., -10., 70., 70., 22., 30., -15., 70., 70., 20., 22., 30., 0., 70., 70.}; + HOST_DEVICE_CONSTANT float maxr[nPairs] = { + 20., 9., 9., 20., 7., 7., 5., 5., 20., 6., 6., 5., 5., 20., 20., 9., 9., 9., 9.}; + + static constexpr uint32_t layerStart[numberOfLayers + 1] = {0, + 96, + 320, + 672, // barrel + 1184, + 1296, + 1408, // positive endcap + 1520, + 1632, + 1744, // negative endcap + numberOfModules}; +} // namespace phase1PixelTopology - constexpr inline uint16_t toRocY(uint16_t py) { - auto roc = divu52(py); - return py - 52 * roc; - } +namespace phase2PixelTopology { - constexpr inline bool isBigPixX(uint16_t px) { return (px == 79) | (px == 80); } + using pixelTopology::phi0p05; + using pixelTopology::phi0p06; + using pixelTopology::phi0p07; + using pixelTopology::phi0p09; - constexpr inline bool isBigPixY(uint16_t py) { - auto ly = toRocY(py); - return (ly == 0) | (ly == lastColInRoc); - } + constexpr uint32_t numberOfLayers = 28; + constexpr int nPairs = 23 + 6 + 14 + 8 + 4; // include far forward layer pairs + constexpr uint16_t numberOfModules = 3892; - constexpr inline uint16_t localX(uint16_t px) { - auto shift = 0; - if (px > lastRowInRoc) - shift += 1; - if (px > numRowsInRoc) - shift += 1; - return px + shift; - } + HOST_DEVICE_CONSTANT uint8_t layerPairs[2 * nPairs] = { - constexpr inline uint16_t localY(uint16_t py) { - auto roc = divu52(py); - auto shift = 2 * roc; - auto yInRoc = py - 52 * roc; - if (yInRoc > 0) - shift += 1; - return py + shift; - } + 0, 1, 0, 4, 0, 16, //BPIX1 (3) + 1, 2, 1, 4, 1, 16, //BPIX2 (6) + 2, 3, 2, 4, 2, 16, //BPIX3 & Forward (9) -} // namespace phase1PixelTopology + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, //POS (16) + 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, //NEG (23) -namespace phase2PixelTopology { + 0, 2, 0, 5, 0, 17, 0, 6, 0, 18, // BPIX1 Jump (28) + 1, 3, 1, 5, 1, 17, 1, 6, 1, 18, // BPIX2 Jump (33) - constexpr uint32_t numberOfModulesInBarrel = 756; - constexpr uint32_t numberOfModulesInLadder = 9; - constexpr uint32_t numberOfLaddersInBarrel = numberOfModulesInBarrel / numberOfModulesInLadder; - - constexpr uint32_t numberOfModules = 3892; - constexpr uint8_t numberOfLayers = 28; - - constexpr uint32_t layerStart[numberOfLayers + 1] = {0, - 108, - 324, - 504, //Barrel - 756, - 864, - 972, - 1080, - 1188, - 1296, - 1404, - 1512, - 1620, - 1796, - 1972, - 2148, //Fp - 2324, - 2432, - 2540, - 2648, - 2756, - 2864, - 2972, - 3080, - 3188, - 3364, - 3540, - 3716, //Np - numberOfModules}; + 11, 12, 12, 13, 13, 14, 14, 15, //Late POS (37) + 23, 24, 24, 25, 25, 26, 26, 27, //Late NEG (41) - constexpr uint16_t 
findMaxModuleStride() { - bool go = true; - int n = 2; - while (go) { - for (uint8_t i = 1; i < numberOfLayers + 1; ++i) { - if (layerStart[i] % n != 0) { - go = false; - break; - } - } - if (!go) - break; - n *= 2; - } - return n / 2; - } + 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, //POS Jump (48) + 16, 18, 17, 19, 18, 20, 19, 21, 20, 22, 21, 23, 22, 24, //NEG Jump (55) + }; + HOST_DEVICE_CONSTANT uint32_t layerStart[numberOfLayers + 1] = {0, + 108, + 324, + 504, //Barrel + 756, + 864, + 972, + 1080, + 1188, + 1296, + 1404, + 1512, + 1620, + 1796, + 1972, + 2148, //Fp + 2324, + 2432, + 2540, + 2648, + 2756, + 2864, + 2972, + 3080, + 3188, + 3364, + 3540, + 3716, //Np + numberOfModules}; + + HOST_DEVICE_CONSTANT int16_t phicuts[nPairs]{ + phi0p05, phi0p05, phi0p05, phi0p06, phi0p07, phi0p07, phi0p06, phi0p07, phi0p07, phi0p05, phi0p05, + phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, + phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, phi0p07, phi0p07, phi0p07, phi0p07, + phi0p07, phi0p07, phi0p07, phi0p07, phi0p07, phi0p07, phi0p07, phi0p07, phi0p07, phi0p07, phi0p07, + phi0p07, phi0p07, phi0p07, phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, phi0p05, phi0p05}; + + HOST_DEVICE_CONSTANT float minz[nPairs] = { + -16.0, 4.0, -22.0, -17.0, 6.0, -22.0, -18.0, 11.0, -22.0, 23.0, 30.0, 39.0, 50.0, 65.0, + 82.0, 109.0, -28.0, -35.0, -44.0, -55.0, -70.0, -87.0, -113.0, -16., 7.0, -22.0, 11.0, -22.0, + -17.0, 9.0, -22.0, 13.0, -22.0, 137.0, 173.0, 199.0, 229.0, -142.0, -177.0, -203.0, -233.0, 23.0, + 30.0, 39.0, 50.0, 65.0, 82.0, 109.0, -28.0, -35.0, -44.0, -55.0, -70.0, -87.0, -113.0}; + + HOST_DEVICE_CONSTANT float maxz[nPairs] = { + + 17.0, 22.0, -4.0, 17.0, 22.0, -6.0, 18.0, 22.0, -11.0, 28.0, 35.0, 44.0, 55.0, 70.0, + 87.0, 113.0, -23.0, -30.0, -39.0, -50.0, -65.0, -82.0, -109.0, 17.0, 22.0, -7.0, 22.0, -10.0, + 17.0, 22.0, -9.0, 22.0, -13.0, 142.0, 177.0, 203.0, 233.0, -137.0, -173.0, -199.0, -229.0, 28.0, + 35.0, 44.0, 55.0, 70.0, 87.0, 113.0, -23.0, -30.0, -39.0, -50.0, -65.0, -82.0, -109.0}; + + HOST_DEVICE_CONSTANT float maxr[nPairs] = {5.0, 5.0, 5.0, 7.0, 8.0, 8.0, 7.0, 7.0, 7.0, 6.0, 6.0, 6.0, 6.0, 5.0, + 6.0, 5.0, 6.0, 6.0, 6.0, 6.0, 5.0, 6.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, + 5.0, 8.0, 8.0, 8.0, 8.0, 6.0, 5.0, 5.0, 5.0, 6.0, 5.0, 5.0, 5.0, 9.0, + 9.0, 9.0, 8.0, 8.0, 8.0, 11.0, 9.0, 9.0, 9.0, 8.0, 8.0, 8.0, 11.0}; +} // namespace phase2PixelTopology + +namespace pixelTopology { - constexpr uint16_t maxModuleStride = findMaxModuleStride(); + struct Phase2 { + // types + using hindex_type = uint32_t; // FIXME from siPixelRecHitsHeterogeneousProduct + using tindex_type = uint32_t; // for tuples + using cindex_type = uint32_t; // for cells - constexpr uint8_t findLayerFromCompact(uint32_t detId) { - detId *= maxModuleStride; - for (uint8_t i = 0; i < numberOfLayers + 1; ++i) - if (detId < layerStart[i + 1]) - return i; - return numberOfLayers + 1; - } + static constexpr uint32_t maxCellNeighbors = 64; + static constexpr uint32_t maxCellTracks = 302; + static constexpr uint32_t maxHitsOnTrack = 15; + static constexpr uint32_t maxHitsOnTrackForFullFit = 6; + static constexpr uint32_t avgHitsPerTrack = 9; + static constexpr uint32_t maxCellsPerHit = 256; + static constexpr uint32_t avgTracksPerHit = 10; + static constexpr uint32_t maxNumberOfTuples = 256 * 1024; + static constexpr uint32_t maxHitsForContainers = avgHitsPerTrack * maxNumberOfTuples; + static constexpr uint32_t maxNumberOfDoublets = 5 * 512 * 1024; + 
static constexpr uint32_t maxNumOfActiveDoublets = maxNumberOfDoublets / 8; + static constexpr uint32_t maxNumberOfQuadruplets = maxNumberOfTuples; + static constexpr uint32_t maxDepth = 12; + static constexpr uint32_t numberOfLayers = 28; - constexpr uint16_t layerIndexSize = numberOfModules / maxModuleStride; - constexpr std::array layer = - pixelTopology::map_to_array(findLayerFromCompact); + static constexpr uint32_t maxSizeCluster = 2047; - constexpr bool validateLayerIndex() { - bool res = true; - for (auto i = 0U; i < numberOfModules; ++i) { - auto j = i / maxModuleStride; - res &= (layer[j] < numberOfLayers); - res &= (i >= layerStart[layer[j]]); - res &= (i < layerStart[layer[j] + 1]); + static constexpr uint32_t getDoubletsFromHistoMaxBlockSize = 64; // for both x and y + static constexpr uint32_t getDoubletsFromHistoMinBlocksPerMP = 16; + + static constexpr uint32_t last_bpix1_detIndex = 108; + static constexpr uint32_t last_bpix2_detIndex = 324; + static constexpr uint32_t last_barrel_detIndex = 504; + + static constexpr uint32_t maxPixInModule = 6000; + + static constexpr float moduleLength = 4.345f; + static constexpr float endcapCorrection = 0.0f; + + static constexpr float xerr_barrel_l1_def = 0.00035f; + static constexpr float yerr_barrel_l1_def = 0.00125f; + static constexpr float xerr_barrel_ln_def = 0.00035f; + static constexpr float yerr_barrel_ln_def = 0.00125f; + static constexpr float xerr_endcap_def = 0.00060f; + static constexpr float yerr_endcap_def = 0.00180f; + + static constexpr float bigPixXCorrection = 0.0f; + static constexpr float bigPixYCorrection = 0.0f; + + static constexpr float dzdrFact = 8 * 0.0285 / 0.015; // from dz/dr to "DY" + static constexpr float z0Cut = 7.5f; + static constexpr float doubletHardPt = 0.8f; + + static constexpr int minYsizeB1 = 25; + static constexpr int minYsizeB2 = 15; + + static constexpr int nPairsMinimal = 33; + static constexpr int nPairsFarForwards = nPairsMinimal + 8; // include barrel "jumping" layer pairs + static constexpr int nPairs = phase2PixelTopology::nPairs; // include far forward layer pairs + + static constexpr int maxDYsize12 = 12; + static constexpr int maxDYsize = 10; + static constexpr int maxDYPred = 20; + + static constexpr uint16_t numberOfModules = 3892; + + static constexpr uint16_t clusterBinning = 1024; + static constexpr uint16_t clusterBits = 10; + + static constexpr uint16_t numberOfModulesInBarrel = 756; + static constexpr uint16_t numberOfModulesInLadder = 9; + static constexpr uint16_t numberOfLaddersInBarrel = numberOfModulesInBarrel / numberOfModulesInLadder; + + static constexpr uint16_t firstEndcapPos = 4; + static constexpr uint16_t firstEndcapNeg = 16; + + static constexpr int16_t xOffset = -1e4; //not used actually, to suppress static analyzer warnings + + static constexpr char const *nameModifier = "Phase2"; + + static constexpr uint32_t const *layerStart = phase2PixelTopology::layerStart; + static constexpr float const *minz = phase2PixelTopology::minz; + static constexpr float const *maxz = phase2PixelTopology::maxz; + static constexpr float const *maxr = phase2PixelTopology::maxr; + + static constexpr uint8_t const *layerPairs = phase2PixelTopology::layerPairs; + static constexpr int16_t const *phicuts = phase2PixelTopology::phicuts; + + static constexpr inline bool isBigPixX(uint16_t px) { return false; } + static constexpr inline bool isBigPixY(uint16_t py) { return false; } + + static constexpr inline uint16_t localX(uint16_t px) { return px; } + static constexpr inline uint16_t 
localY(uint16_t py) { return py; } + }; + + struct Phase1 { + // types + using hindex_type = uint32_t; // FIXME from siPixelRecHitsHeterogeneousProduct + using tindex_type = uint16_t; // for tuples + using cindex_type = uint32_t; // for cells + + static constexpr uint32_t maxCellNeighbors = 36; + static constexpr uint32_t maxCellTracks = 48; + static constexpr uint32_t maxHitsOnTrack = 10; + static constexpr uint32_t maxHitsOnTrackForFullFit = 6; + static constexpr uint32_t avgHitsPerTrack = 4; + static constexpr uint32_t maxCellsPerHit = 256; + static constexpr uint32_t avgTracksPerHit = 6; + static constexpr uint32_t maxNumberOfTuples = 32 * 1024; + static constexpr uint32_t maxHitsForContainers = avgHitsPerTrack * maxNumberOfTuples; + static constexpr uint32_t maxNumberOfDoublets = 512 * 1024; + static constexpr uint32_t maxNumOfActiveDoublets = maxNumberOfDoublets / 8; + static constexpr uint32_t maxNumberOfQuadruplets = maxNumberOfTuples; + static constexpr uint32_t maxDepth = 6; + static constexpr uint32_t numberOfLayers = 10; + + static constexpr uint32_t maxSizeCluster = 1023; + + static constexpr uint32_t getDoubletsFromHistoMaxBlockSize = 64; // for both x and y + static constexpr uint32_t getDoubletsFromHistoMinBlocksPerMP = 16; + + static constexpr uint32_t last_bpix1_detIndex = 96; + static constexpr uint32_t last_bpix2_detIndex = 320; + static constexpr uint32_t last_barrel_detIndex = 1184; + + static constexpr uint32_t maxPixInModule = 6000; + + static constexpr float moduleLength = 6.7f; + static constexpr float endcapCorrection = 1.5f; + + static constexpr float xerr_barrel_l1_def = 0.00200f; + static constexpr float yerr_barrel_l1_def = 0.00210f; + static constexpr float xerr_barrel_ln_def = 0.00200f; + static constexpr float yerr_barrel_ln_def = 0.00210f; + static constexpr float xerr_endcap_def = 0.0020f; + static constexpr float yerr_endcap_def = 0.00210f; + + static constexpr float bigPixXCorrection = 1.0f; + static constexpr float bigPixYCorrection = 8.0f; + + static constexpr float dzdrFact = 8 * 0.0285 / 0.015; // from dz/dr to "DY" + static constexpr float z0Cut = 12.f; + static constexpr float doubletHardPt = 0.5f; + + static constexpr int minYsizeB1 = 36; + static constexpr int minYsizeB2 = 28; + + static constexpr int nPairsForQuadruplets = 13; // quadruplets require hits in all layers + static constexpr int nPairsForTriplets = nPairsForQuadruplets + 2; // include barrel "jumping" layer pairs + static constexpr int nPairs = nPairsForTriplets + 4; // include forward "jumping" layer pairs + + static constexpr int maxDYsize12 = 28; + static constexpr int maxDYsize = 20; + static constexpr int maxDYPred = 20; + + static constexpr uint16_t numberOfModules = 1856; + + static constexpr uint16_t numRowsInRoc = 80; + static constexpr uint16_t numColsInRoc = 52; + static constexpr uint16_t lastRowInRoc = numRowsInRoc - 1; + static constexpr uint16_t lastColInRoc = numColsInRoc - 1; + + static constexpr uint16_t numRowsInModule = 2 * numRowsInRoc; + static constexpr uint16_t numColsInModule = 8 * numColsInRoc; + static constexpr uint16_t lastRowInModule = numRowsInModule - 1; + static constexpr uint16_t lastColInModule = numColsInModule - 1; + + static constexpr uint16_t clusterBinning = numColsInModule + 2; + static constexpr uint16_t clusterBits = 9; + + static constexpr uint16_t numberOfModulesInBarrel = 1184; + static constexpr uint16_t numberOfModulesInLadder = 8; + static constexpr uint16_t numberOfLaddersInBarrel = numberOfModulesInBarrel / numberOfModulesInLadder; 
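Because the two topologies are plain structs of static constexpr members, a variant only has to override the constants that differ, as the commented-out HIonPhase1 sketch near the end of this namespace hints. A minimal illustration (hypothetical, not part of the patch; the value mirrors that commented-out line):

#include <cstdint>

// The shadowing constant wins for templated code instantiated with the derived
// type; every other constant and helper is inherited from Phase1 unchanged.
struct HIonPhase1 : public pixelTopology::Phase1 {
  static constexpr uint32_t maxNumberOfDoublets = 3 * 1024 * 1024;  // larger doublet budget for the much busier heavy-ion events
};

A kernel templated on TrackerTraits then picks up the larger budget simply by being instantiated with HIonPhase1 instead of Phase1.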
+
+    static constexpr uint16_t firstEndcapPos = 4;
+    static constexpr uint16_t firstEndcapNeg = 7;
+
+    static constexpr int16_t xOffset = -81;
+
+    static constexpr char const *nameModifier = "";
+
+    static constexpr uint32_t const *layerStart = phase1PixelTopology::layerStart;
+    static constexpr float const *minz = phase1PixelTopology::minz;
+    static constexpr float const *maxz = phase1PixelTopology::maxz;
+    static constexpr float const *maxr = phase1PixelTopology::maxr;
+
+    static constexpr uint8_t const *layerPairs = phase1PixelTopology::layerPairs;
+    static constexpr int16_t const *phicuts = phase1PixelTopology::phicuts;
+
+    static constexpr inline bool isEdgeX(uint16_t px) { return (px == 0) | (px == lastRowInModule); }
+
+    static constexpr inline bool isEdgeY(uint16_t py) { return (py == 0) | (py == lastColInModule); }
+
+    static constexpr inline uint16_t toRocX(uint16_t px) { return (px < numRowsInRoc) ? px : px - numRowsInRoc; }
+
+    static constexpr inline uint16_t toRocY(uint16_t py) {
+      auto roc = divu52(py);
+      return py - 52 * roc;
    }
-      res &= (layer[j] < numberOfLayers);
-      res &= (i >= layerStart[layer[j]]);
-      res &= (i < layerStart[layer[j] + 1]);
+    static constexpr inline bool isBigPixX(uint16_t px) { return (px == 79) | (px == 80); }
+    static constexpr inline bool isBigPixY(uint16_t py) {
+      auto ly = toRocY(py);
+      return (ly == 0) | (ly == lastColInRoc);
    }
-}  // namespace phase2PixelTopology
+    static constexpr inline uint16_t localX(uint16_t px) {
+      auto shift = 0;
+      if (px > lastRowInRoc)
+        shift += 1;
+      if (px > numRowsInRoc)
+        shift += 1;
+      return px + shift;
+    }
+
+    static constexpr inline uint16_t localY(uint16_t py) {
+      auto roc = divu52(py);
+      auto shift = 2 * roc;
+      auto yInRoc = py - 52 * roc;
+      if (yInRoc > 0)
+        shift += 1;
+      return py + shift;
+    }
+  };
+
+  template <typename T>
+  using isPhase1Topology = typename std::enable_if<std::is_base_of<Phase1, T>::value>::type;
+
+  template <typename T>
+  using isPhase2Topology = typename std::enable_if<std::is_base_of<Phase2, T>::value>::type;
+
+  // struct HIonPhase1 : public Phase1 {
+  //   static constexpr uint32_t maxNumberOfDoublets = 3 * 1024 * 1024;};
+
}  // namespace pixelTopology

#endif  // Geometry_CommonTopologies_SimplePixelTopology_h
diff --git a/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cu b/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cu
index ed906de004bcf..cfb6784a6c1fb 100644
--- a/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cu
+++ b/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cu
@@ -10,6 +10,7 @@
 namespace {
   // original code from CMSSW_4_4
+  using namespace pixelTopology;
   std::tuple localXori(int mpx) {
     const float m_pitchx = 1.f;
@@ -127,14 +128,14 @@
}  // namespace
constexpr void testLayer() {
-  for (auto i = 0U; i < phase1PixelTopology::numberOfModules; ++i) {
-    uint32_t layer = phase1PixelTopology::getLayer(i);
-    uint32_t tLayer = phase1PixelTopology::findLayer(i);
+  for (auto i = 0U; i < Phase1::numberOfModules; ++i) {
+    uint32_t layer = getLayer<Phase1>(i);
+    uint32_t tLayer = findLayer<Phase1>(i);
     assert(tLayer == layer);
-    //std::cout << "module " << i << ": " << "layer " << layer << ", \"" << phase1PixelTopology::layerName[layer] << "\", [" << phase1PixelTopology::layerStart[layer] << ", " << phase1PixelTopology::layerStart[layer+1] << ")" << std::endl;
-    assert(layer < phase1PixelTopology::numberOfLayers);
-    assert(i >= phase1PixelTopology::layerStart[layer]);
-    assert(i < phase1PixelTopology::layerStart[layer + 1]);
+
+    assert(layer < Phase1::numberOfLayers);
+    assert(i >= Phase1::layerStart[layer]);
+    assert(i < Phase1::layerStart[layer + 1]);
   }
}
@@ -145,8 +146,8 @@ int main() {
   for (uint16_t ix = 0; ix < 80 * 2; ++ix) {
     auto ori = localXori(ix);
-    auto xl = phase1PixelTopology::localX(ix);
-    auto bp = phase1PixelTopology::isBigPixX(ix);
+    auto xl = Phase1::localX(ix);
+    auto bp = Phase1::isBigPixX(ix);
     if (std::get<0>(ori) != xl)
       std::cout << "Error " << std::get<0>(ori) << "!=" << xl << std::endl;
     assert(std::get<1>(ori) == bp);
@@ -154,21 +155,20 @@ int main() {
   for (uint16_t iy = 0; iy < 52 * 8; ++iy) {
     auto ori = localYori(iy);
-    auto yl = phase1PixelTopology::localY(iy);
-    auto bp = phase1PixelTopology::isBigPixY(iy);
+    auto yl = Phase1::localY(iy);
+    auto bp = Phase1::isBigPixY(iy);
     if (std::get<0>(ori) != yl)
       std::cout << "Error " << std::get<0>(ori) << "!=" << yl << std::endl;
     assert(std::get<1>(ori) == bp);
   }
-  for (auto i = 0U; i < phase1PixelTopology::numberOfLayers; ++i) {
-    std::cout << "layer " << i << ", \"" << phase1PixelTopology::layerName[i] << "\", ["
-              << phase1PixelTopology::layerStart[i] << ", " << phase1PixelTopology::layerStart[i + 1] << ") "
-              << phase1PixelTopology::layerStart[i + 1] - phase1PixelTopology::layerStart[i] << std::endl;
+  for (auto i = 0U; i < Phase1::numberOfLayers; ++i) {
+    std::cout << "layer " << i << ", [" << Phase1::layerStart[i] << ", " << Phase1::layerStart[i + 1] << ") "
+              << Phase1::layerStart[i + 1] - Phase1::layerStart[i] << std::endl;
   }
-  std::cout << "maxModuleStide layerIndexSize " << phase1PixelTopology::maxModuleStride << ' '
-            << phase1PixelTopology::layerIndexSize << std::endl;
+  std::cout << "maxModuleStride layerIndexSize " << maxModuleStride<Phase1> << ' '
+            << layerIndexSize<Phase1> << std::endl;
   testLayer();
diff --git a/HLTrigger/Configuration/python/customizeHLTforCMSSW.py b/HLTrigger/Configuration/python/customizeHLTforCMSSW.py
index 53b007a50b775..b778daa63677f 100644
--- a/HLTrigger/Configuration/python/customizeHLTforCMSSW.py
+++ b/HLTrigger/Configuration/python/customizeHLTforCMSSW.py
@@ -210,6 +210,34 @@ def customiseForOffline(process):
     return process

+# Customization for the TrackerTraits templates, enabling Phase2 in the Inner Tracker reconstruction (#38761)
+def customizeHLTfor38761(process):
+
+    for producer in producers_by_type(process, "SiPixelRecHitSoAFromLegacy"):
+        if hasattr(producer, "isPhase2"):
+            delattr(producer, "isPhase2")
+    for producer in producers_by_type(process, "SiPixelDigisClustersFromSoA"):
+        if hasattr(producer, "isPhase2"):
+            delattr(producer, "isPhase2")
+
+    if 'hltSiPixelRecHitsSoA' in process.__dict__:
+        process.hltSiPixelRecHitsSoA.cpu = cms.EDAlias(hltSiPixelRecHitsFromLegacy = cms.VPSet(
+            cms.PSet(
+                type = cms.string('pixelTopologyPhase1TrackingRecHit2DCPUT')
+            ),
+            cms.PSet(
+                type = cms.string('uintAsHostProduct')
+            )))
+
+    for producer in esproducers_by_type(process, "PixelCPEFastESProducer"):
+        if hasattr(producer, "isPhase2"):
+            delattr(producer, "isPhase2")
+    for producer in esproducers_by_type(process, "PixelCPEGenericESProducer"):
+        if hasattr(producer, "Upgrade"):
+            setattr(producer, "isPhase2", getattr(producer, "Upgrade"))
+            delattr(producer, "Upgrade")
+
+    return process

# CMSSW version specific customizations
def customizeHLTforCMSSW(process, menuType="GRun"):
@@ -218,5 +246,9 @@ def customizeHLTforCMSSW(process, menuType="GRun"):

     # add call to action function in proper order: newest last!
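Stepping back to the localX/localY checks in phase1PixelTopology_t.cu above: the mapping inserts gaps for the double-width pixels at the ROC borders, which is what the localXori/localYori reference code from CMSSW_4_4 recomputes. A standalone restatement with a few spot checks (assumed to mirror Phase1::localY; divu52 in the header is just a shift-and-add division by 52):

#include <cstdint>

// Standalone restatement of the column mapping (assumption: matches
// Phase1::localY above), used only for these spot checks.
constexpr uint16_t localYref(uint16_t py) {
  uint16_t roc = py / 52;    // which ROC this column belongs to
  uint16_t shift = 2 * roc;  // two extra slots per crossed ROC boundary...
  if (py - 52 * roc > 0)
    ++shift;                 // ...plus one once past the first column inside the ROC
  return py + shift;
}

static_assert(localYref(0) == 0);
static_assert(localYref(51) == 52);    // last column of ROC 0 is shifted once
static_assert(localYref(52) == 54);    // first column of ROC 1 skips the two big-pixel slots
static_assert(localYref(415) == 430);  // last column of the module (8 ROCs of 52 columns)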
# process = customiseFor12718(process) + + process = customizeHLTfor38761(process) + + return process diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/PixelThresholdClusterizer.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/PixelThresholdClusterizer.cc index 10cd09502cf9a..363c4b7635b70 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/PixelThresholdClusterizer.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/PixelThresholdClusterizer.cc @@ -294,7 +294,7 @@ void PixelThresholdClusterizer::copy_to_buffer(DigiIterator begin, DigiIterator if (adc < 100) adc = 100; // put all negative pixel charges into the 100 elec bin - /* This is semi-random good number. The exact number (in place of 100) is irrelevant from the point + /* This is semi-random good number. The exact number (in place of 100) is irrelevant from the point of view of the final cluster charge since these are typically >= 20000. */ @@ -444,7 +444,7 @@ SiPixelCluster PixelThresholdClusterizer::make_cluster(const SiPixelCluster::Pix /* this is not possible as dead and noisy pixel cannot make it into a seed... if ( doMissCalibrate && - (theSiPixelGainCalibrationService_->isDead(theDetid,pix.col(),pix.row()) || + (theSiPixelGainCalibrationService_->isDead(theDetid,pix.col(),pix.row()) || theSiPixelGainCalibrationService_->isNoisy(theDetid,pix.col(),pix.row())) ) { std::cout << "IMPOSSIBLE" << std::endl; @@ -489,15 +489,15 @@ SiPixelCluster PixelThresholdClusterizer::make_cluster(const SiPixelCluster::Pix } /* //Commenting out the addition of dead pixels to the cluster until further testing -- dfehling 06/09 - //Check on the bounds of the module; this is to keep the isDead and isNoisy modules from returning errors - else if(r>= 0 && c >= 0 && (r <= (theNumOfRows-1.)) && (c <= (theNumOfCols-1.))){ + //Check on the bounds of the module; this is to keep the isDead and isNoisy modules from returning errors + else if(r>= 0 && c >= 0 && (r <= (theNumOfRows-1.)) && (c <= (theNumOfCols-1.))){ //Check for dead/noisy pixels check that the buffer is not -1 (already considered). Check whether we want to split clusters separated by dead pixels or not. if((theSiPixelGainCalibrationService_->isDead(theDetid,c,r) || theSiPixelGainCalibrationService_->isNoisy(theDetid,c,r)) && theBuffer(r,c) != 1){ - - //If a pixel is dead or noisy, check to see if we want to split the clusters or not. + + //If a pixel is dead or noisy, check to see if we want to split the clusters or not. //Push it into a dead pixel stack in case we want to split the clusters. Otherwise add it to the cluster. //If we are splitting the clusters, we will iterate over the dead pixel stack later. 
- + SiPixelCluster::PixelPos newpix(r,c); if(!doSplitClusters){ @@ -505,10 +505,10 @@ SiPixelCluster PixelThresholdClusterizer::make_cluster(const SiPixelCluster::Pix else if(doSplitClusters){ dead_pixel_stack.push(newpix); dead_flag = true;} - + theBuffer.set_adc(newpix, 1); - } - + } + } */ } diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/PixelThresholdClusterizer.h b/RecoLocalTracker/SiPixelClusterizer/plugins/PixelThresholdClusterizer.h index 77ac94bb2b7e0..77cc0e6491fcd 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/PixelThresholdClusterizer.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/PixelThresholdClusterizer.h @@ -110,8 +110,10 @@ class dso_hidden PixelThresholdClusterizer : public PixelClusterizerBase { const double theElectronPerADCGain; // ADC to electrons conversion - const bool doPhase2Calibration; // The ADC --> electrons calibration is for phase-2 tracker - const bool dropDuplicates; // Enabling dropping duplicate pixels + const bool doPhase2Calibration; // The ADC --> electrons calibration is for phase-2 tracker + + const bool dropDuplicates; // Enabling dropping duplicate pixels + const int thePhase2ReadoutMode; // Readout mode of the phase-2 IT digitizer const double thePhase2DigiBaseline; // Threshold above which digis are measured in the phase-2 IT const int thePhase2KinkADC; // ADC count at which the kink in the dual slop kicks in diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc index d36c345ecf02a..538e0356630a0 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc @@ -21,10 +21,11 @@ #include "PixelClusterizerBase.h" #include "SiPixelClusterThresholds.h" -class SiPixelDigisClustersFromSoA : public edm::global::EDProducer<> { +template +class SiPixelDigisClustersFromSoAT : public edm::global::EDProducer<> { public: - explicit SiPixelDigisClustersFromSoA(const edm::ParameterSet& iConfig); - ~SiPixelDigisClustersFromSoA() override = default; + explicit SiPixelDigisClustersFromSoAT(const edm::ParameterSet& iConfig); + ~SiPixelDigisClustersFromSoAT() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); @@ -42,38 +43,42 @@ class SiPixelDigisClustersFromSoA : public edm::global::EDProducer<> { const bool produceDigis_; const bool storeDigis_; - const bool isPhase2_; }; -SiPixelDigisClustersFromSoA::SiPixelDigisClustersFromSoA(const edm::ParameterSet& iConfig) +template +SiPixelDigisClustersFromSoAT::SiPixelDigisClustersFromSoAT(const edm::ParameterSet& iConfig) : topoToken_(esConsumes()), digiGetToken_(consumes(iConfig.getParameter("src"))), clusterPutToken_(produces()), clusterThresholds_{iConfig.getParameter("clusterThreshold_layer1"), iConfig.getParameter("clusterThreshold_otherLayers")}, produceDigis_(iConfig.getParameter("produceDigis")), - storeDigis_(iConfig.getParameter("produceDigis") & iConfig.getParameter("storeDigis")), - isPhase2_(iConfig.getParameter("isPhase2")) { + storeDigis_(iConfig.getParameter("produceDigis") & iConfig.getParameter("storeDigis")) { if (produceDigis_) digiPutToken_ = produces>(); } -void SiPixelDigisClustersFromSoA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { +template +void SiPixelDigisClustersFromSoAT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; desc.add("src", 
edm::InputTag("siPixelDigisSoA")); desc.add("clusterThreshold_layer1", kSiPixelClusterThresholdsDefaultPhase1.layer1); desc.add("clusterThreshold_otherLayers", kSiPixelClusterThresholdsDefaultPhase1.otherLayers); desc.add("produceDigis", true); desc.add("storeDigis", true); - desc.add("isPhase2", false); + descriptions.addWithDefaultLabel(desc); } -void SiPixelDigisClustersFromSoA::produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { +template +void SiPixelDigisClustersFromSoAT::produce(edm::StreamID, + edm::Event& iEvent, + const edm::EventSetup& iSetup) const { const auto& digis = iEvent.get(digiGetToken_); const uint32_t nDigis = digis.size(); const auto& ttopo = iSetup.getData(topoToken_); - auto maxModules = isPhase2_ ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; + constexpr auto maxModules = TrackerTraits::numberOfModules; + std::unique_ptr> collection; if (produceDigis_) collection = std::make_unique>(); @@ -117,7 +122,7 @@ void SiPixelDigisClustersFromSoA::produce(edm::StreamID, edm::Event& iEvent, con for (int32_t ic = 0; ic < nclus + 1; ++ic) { auto const& acluster = aclusters[ic]; // in any case we cannot go out of sync with gpu... - if (acluster.charge < clusterThreshold and !isPhase2_) + if (!std::is_base_of::value and acluster.charge < clusterThreshold) edm::LogWarning("SiPixelDigisClustersFromSoA") << "cluster below charge Threshold " << "Layer/DetId/clusId " << layer << '/' << detId << '/' << ic << " size/charge " << acluster.isize << '/' << acluster.charge; @@ -200,4 +205,9 @@ void SiPixelDigisClustersFromSoA::produce(edm::StreamID, edm::Event& iEvent, con iEvent.put(clusterPutToken_, std::move(outputClusters)); } +using SiPixelDigisClustersFromSoA = SiPixelDigisClustersFromSoAT; DEFINE_FWK_MODULE(SiPixelDigisClustersFromSoA); +using SiPixelDigisClustersFromSoAPhase1 = SiPixelDigisClustersFromSoAT; +DEFINE_FWK_MODULE(SiPixelDigisClustersFromSoAPhase1); +using SiPixelDigisClustersFromSoAPhase2 = SiPixelDigisClustersFromSoAT; +DEFINE_FWK_MODULE(SiPixelDigisClustersFromSoAPhase2); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 48dfa98839d36..bc9be260deb20 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -443,15 +443,15 @@ namespace pixelgpudetails { } // end of Raw to Digi kernel - template + template __global__ void fillHitsModuleStart(uint32_t const *__restrict__ clusInModule, uint32_t *__restrict__ moduleStart, uint32_t const *__restrict__ nModules, uint32_t *__restrict__ nModules_Clusters) { - constexpr int nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; - constexpr int startBPIX2 = isPhase2 ? 
phase2PixelTopology::layerStart[1] : phase1PixelTopology::layerStart[1]; + constexpr int nMaxModules = TrackerTraits::numberOfModules; + constexpr int startBPIX2 = TrackerTraits::layerStart[1]; - assert(nMaxModules < phase2PixelTopology::numberOfModules); + assert(nMaxModules < TrackerTraits::numberOfModules); assert(startBPIX2 < nMaxModules); assert(nMaxModules < 4096); // easy to extend at least till 32*1024 assert(nMaxModules > 1024); @@ -466,7 +466,8 @@ namespace pixelgpudetails { moduleStart[i + 1] = std::min(gpuClustering::maxHitsInModule(), clusInModule[i]); } - __shared__ uint32_t ws[64]; + constexpr bool isPhase2 = std::is_base_of::value; + __shared__ uint32_t ws[32]; cms::cuda::blockPrefixScan(moduleStart + 1, moduleStart + 1, 1024, ws); constexpr int lastModules = isPhase2 ? 1024 : nMaxModules - 1024; cms::cuda::blockPrefixScan(moduleStart + 1024 + 1, moduleStart + 1024 + 1, lastModules, ws); @@ -510,10 +511,8 @@ namespace pixelgpudetails { assert(moduleStart[maxH + 1] >= moduleStart[maxH]); assert(moduleStart[nMaxModules] >= moduleStart[maxH + 1]); - constexpr int startFP1 = - isPhase2 ? phase2PixelTopology::numberOfModulesInBarrel : phase1PixelTopology::numberOfModulesInBarrel; - constexpr int startLastFwd = isPhase2 ? phase2PixelTopology::layerStart[phase2PixelTopology::numberOfLayers] - : phase1PixelTopology::layerStart[phase1PixelTopology::numberOfLayers]; + constexpr int startFP1 = TrackerTraits::numberOfModulesInBarrel; + constexpr int startLastFwd = TrackerTraits::layerStart[TrackerTraits::numberOfLayers]; for (int i = first, iend = nMaxModules + 1; i < iend; i += blockDim.x) { if (0 != i) assert(moduleStart[i] >= moduleStart[i - i]); @@ -540,6 +539,7 @@ namespace pixelgpudetails { bool includeErrors, bool debug, cudaStream_t stream) { + using pixelTopology::Phase1; // we're not opting for calling this function in case of early events assert(wordCounter != 0); nDigis = wordCounter; @@ -553,7 +553,7 @@ namespace pixelgpudetails { if (includeErrors) { digiErrors_d = SiPixelDigiErrorsCUDA(wordCounter, std::move(errors), stream); } - clusters_d = SiPixelClustersCUDA(phase1PixelTopology::numberOfModules, stream); + clusters_d = SiPixelClustersCUDA(Phase1::numberOfModules, stream); // Begin Raw2Digi block { @@ -618,8 +618,7 @@ namespace pixelgpudetails { // clusterizer ... 
using namespace gpuClustering; int threadsPerBlock = 256; - int blocks = (std::max(int(wordCounter), int(phase1PixelTopology::numberOfModules)) + threadsPerBlock - 1) / - threadsPerBlock; + int blocks = (std::max(int(wordCounter), int(Phase1::numberOfModules)) + threadsPerBlock - 1) / threadsPerBlock; if (isRun2) gpuCalibPixel::calibDigis<<>>(digis_d.view().moduleInd(), @@ -652,7 +651,7 @@ namespace pixelgpudetails { << " threads\n"; #endif - countModules<<>>( + countModules<<>>( digis_d.view().moduleInd(), clusters_d.moduleStart(), digis_d.view().clus(), wordCounter); cudaCheck(cudaGetLastError()); @@ -662,29 +661,30 @@ namespace pixelgpudetails { std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; #endif - findClus<<>>(digis_d.view().rawIdArr(), - digis_d.view().moduleInd(), - digis_d.view().xx(), - digis_d.view().yy(), - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.moduleId(), - digis_d.view().clus(), - wordCounter); + findClus<<>>(digis_d.view().rawIdArr(), + digis_d.view().moduleInd(), + digis_d.view().xx(), + digis_d.view().yy(), + clusters_d.moduleStart(), + clusters_d.clusInModule(), + clusters_d.moduleId(), + digis_d.view().clus(), + wordCounter); + cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaStreamSynchronize(stream)); #endif // apply charge cut - clusterChargeCut<<>>(clusterThresholds, - digis_d.view().moduleInd(), - digis_d.view().adc(), - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.moduleId(), - digis_d.view().clus(), - wordCounter); + clusterChargeCut<<>>(clusterThresholds, + digis_d.view().moduleInd(), + digis_d.view().adc(), + clusters_d.moduleStart(), + clusters_d.clusInModule(), + clusters_d.moduleId(), + digis_d.view().clus(), + wordCounter); cudaCheck(cudaGetLastError()); @@ -694,7 +694,7 @@ namespace pixelgpudetails { // synchronization/ExternalWork auto nModules_Clusters_d = cms::cuda::make_device_unique(3, stream); // MUST be ONE block - fillHitsModuleStart<<<1, 1024, 0, stream>>>( + fillHitsModuleStart<<<1, 1024, 0, stream>>>( clusters_d.clusInModule(), clusters_d.clusModuleStart(), clusters_d.moduleStart(), nModules_Clusters_d.get()); // copy to host @@ -719,6 +719,7 @@ namespace pixelgpudetails { const uint32_t numDigis, cudaStream_t stream) { using namespace gpuClustering; + using pixelTopology::Phase2; nDigis = numDigis; digis_d = SiPixelDigisCUDA(numDigis, stream); @@ -732,7 +733,7 @@ namespace pixelgpudetails { cudaCheck( cudaMemcpyAsync(digis_d.view().rawIdArr(), rawIds, sizeof(uint32_t) * numDigis, cudaMemcpyDefault, stream)); - clusters_d = SiPixelClustersCUDA(phase2PixelTopology::numberOfModules, stream); + clusters_d = SiPixelClustersCUDA(Phase2::numberOfModules, stream); nModules_Clusters_h = cms::cuda::make_host_unique(2, stream); @@ -750,13 +751,10 @@ namespace pixelgpudetails { #ifdef GPU_DEBUG cudaCheck(cudaStreamSynchronize(stream)); -#endif - -#ifdef GPU_DEBUG std::cout << "CUDA countModules kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; #endif - countModules<<>>( + countModules<<>>( digis_d.view().moduleInd(), clusters_d.moduleStart(), digis_d.view().clus(), numDigis); cudaCheck(cudaGetLastError()); @@ -765,37 +763,49 @@ namespace pixelgpudetails { &(nModules_Clusters_h[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream)); threadsPerBlock = 256; - blocks = phase2PixelTopology::numberOfModules; - - findClus<<>>(digis_d.view().rawIdArr(), - 
digis_d.view().moduleInd(), - digis_d.view().xx(), - digis_d.view().yy(), - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.moduleId(), - digis_d.view().clus(), - numDigis); + blocks = Phase2::numberOfModules; + +#ifdef GPU_DEBUG + cudaCheck(cudaStreamSynchronize(stream)); + std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; +#endif + findClus<<>>(digis_d.view().rawIdArr(), + digis_d.view().moduleInd(), + digis_d.view().xx(), + digis_d.view().yy(), + clusters_d.moduleStart(), + clusters_d.clusInModule(), + clusters_d.moduleId(), + digis_d.view().clus(), + numDigis); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaStreamSynchronize(stream)); + std::cout << "CUDA clusterChargeCut kernel launch with " << blocks << " blocks of " << threadsPerBlock + << " threads\n"; #endif // apply charge cut - clusterChargeCut<<>>(clusterThresholds, - digis_d.view().moduleInd(), - digis_d.view().adc(), - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.moduleId(), - digis_d.view().clus(), - numDigis); + clusterChargeCut<<>>(clusterThresholds, + digis_d.view().moduleInd(), + digis_d.view().adc(), + clusters_d.moduleStart(), + clusters_d.clusInModule(), + clusters_d.moduleId(), + digis_d.view().clus(), + numDigis); cudaCheck(cudaGetLastError()); auto nModules_Clusters_d = cms::cuda::make_device_unique(3, stream); // MUST be ONE block - fillHitsModuleStart<<<1, 1024, 0, stream>>>( + +#ifdef GPU_DEBUG + cudaCheck(cudaStreamSynchronize(stream)); + std::cout << "CUDA fillHitsModuleStart kernel launch \n"; +#endif + + fillHitsModuleStart<<<1, 1024, 0, stream>>>( clusters_d.clusInModule(), clusters_d.clusModuleStart(), clusters_d.moduleStart(), nModules_Clusters_d.get()); nModules_Clusters_h = cms::cuda::make_host_unique(3, stream); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h index d46fe76f3e81d..75e8389513b68 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h @@ -23,9 +23,9 @@ namespace gpuCalibPixel { constexpr float VCaltoElectronOffset_L1 = -670; // L1: -670 +- 220 constexpr int VCalChargeThreshold = 100; //for phase2 - constexpr float ElectronPerADCGain = 600; + constexpr float ElectronPerADCGain = 1500; constexpr int8_t Phase2ReadoutMode = 3; - constexpr uint16_t Phase2DigiBaseline = 1500; + constexpr uint16_t Phase2DigiBaseline = 1000; constexpr uint8_t Phase2KinkADC = 8; template @@ -104,21 +104,20 @@ namespace gpuCalibPixel { adc_int = int(adc_int * ElectronPerADCGain); else { if (adc_int < Phase2KinkADC) - adc_int = int((adc_int - 0.5) * ElectronPerADCGain); + adc_int = int((adc_int + 0.5) * ElectronPerADCGain); else { constexpr int8_t dspp = (Phase2ReadoutMode < 10 ? Phase2ReadoutMode : 10); constexpr int8_t ds = int8_t(dspp <= 1 ? 
1 : (dspp - 1) * (dspp - 1)); - adc_int -= (Phase2KinkADC - 1); + adc_int -= Phase2KinkADC; adc_int *= ds; - adc_int += (Phase2KinkADC - 1); + adc_int += Phase2KinkADC; adc_int = ((adc_int + 0.5 * ds) * ElectronPerADCGain); } adc_int += int(Phase2DigiBaseline); } - adc[i] = std::min(adc_int, int(std::numeric_limits::max())); } } diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h index a7dd8ac3752c2..fced5675e5c29 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h @@ -14,7 +14,7 @@ namespace gpuClustering { - template + template __global__ void clusterChargeCut( SiPixelClusterThresholds clusterThresholds, // charge cut on cluster in electrons (for layer 1 and for other layers) @@ -29,9 +29,8 @@ namespace gpuClustering { __shared__ uint8_t ok[maxNumClustersPerModules]; __shared__ uint16_t newclusId[maxNumClustersPerModules]; - constexpr int startBPIX2 = isPhase2 ? phase2PixelTopology::layerStart[1] : phase1PixelTopology::layerStart[1]; - [[maybe_unused]] constexpr int nMaxModules = - isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; + constexpr int startBPIX2 = TrackerTraits::layerStart[1]; + [[maybe_unused]] constexpr int nMaxModules = TrackerTraits::numberOfModules; assert(nMaxModules < maxNumModules); assert(startBPIX2 < nMaxModules); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h index ed3510e4918f8..675eae8938236 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h @@ -10,6 +10,8 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +//#define GPU_DEBUG + namespace gpuClustering { // Phase-1 pixel modules @@ -65,14 +67,15 @@ namespace gpuClustering { __device__ uint32_t gMaxHit = 0; #endif - template + template __global__ void countModules(uint16_t const* __restrict__ id, uint32_t* __restrict__ moduleStart, int32_t* __restrict__ clusterId, int numElements) { int first = blockDim.x * blockIdx.x + threadIdx.x; - [[maybe_unused]] constexpr int nMaxModules = - isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; + + [[maybe_unused]] constexpr int nMaxModules = TrackerTraits::numberOfModules; + assert(nMaxModules < maxNumModules); for (int i = first; i < numElements; i += gridDim.x * blockDim.x) { clusterId[i] = i; @@ -89,7 +92,7 @@ namespace gpuClustering { } } - template + template __global__ void findClus(uint32_t* __restrict__ rawIdArr, uint16_t* __restrict__ id, // module id of each pixel uint16_t const* __restrict__ x, // local coordinates of each pixel @@ -101,6 +104,7 @@ namespace gpuClustering { int numElements) { // status is only used for Phase-1, but it cannot be declared conditionally only if isPhase2 is false; // to minimize the impact on Phase-2 reconstruction it is declared with a very small size. + constexpr bool isPhase2 = std::is_base_of::value; constexpr const uint32_t pixelStatusSize = isPhase2 ? 
1 : pixelStatus::size;
     __shared__ uint32_t status[pixelStatusSize];  // packed words array used to store the PixelStatus of each pixel
     __shared__ int msize;

     auto firstModule = blockIdx.x;
     auto endModule = moduleStart[0];

-    [[maybe_unused]] constexpr int nMaxModules =
-        isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
-    assert(nMaxModules < maxNumModules);
+    assert(TrackerTraits::numberOfModules < maxNumModules);

     for (auto module = firstModule; module < endModule; module += gridDim.x) {
       auto firstPixel = moduleStart[1 + module];
       auto thisModuleId = id[firstPixel];
-      assert(thisModuleId < nMaxModules);
+      assert(thisModuleId < TrackerTraits::numberOfModules);

#ifdef GPU_DEBUG
       if (thisModuleId % 100 == 1)
@@ -141,9 +143,10 @@
       //init hist  (ymax=416 < 512 : 9bits)
       //6000 max pixels required for HI operations with no measurable impact on pp performance
-      constexpr uint32_t maxPixInModule = 6000;
-      constexpr auto nbins = isPhase2 ? 1024 : phase1PixelTopology::numColsInModule + 2;  //2+2;
-      constexpr auto nbits = isPhase2 ? 10 : 9;                                           //2+2;
+      constexpr uint32_t maxPixInModule = TrackerTraits::maxPixInModule;
+      constexpr auto nbins = TrackerTraits::clusterBinning;
+      constexpr auto nbits = TrackerTraits::clusterBits;
+
       using Hist = cms::cuda::HistoContainer<uint16_t, nbins, maxPixInModule, nbits, uint16_t>;
       __shared__ Hist hist;
       __shared__ typename Hist::Counter ws[32];
diff --git a/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizer_cfi.py b/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizer_cfi.py
index 91c5cecb848bf..2126235c353bb 100644
--- a/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizer_cfi.py
+++ b/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizer_cfi.py
@@ -16,15 +16,15 @@
  ClusterThreshold_L1 = 2000
)

-# Run3, changes in the gain calibration scheme
+# Run3, changes in the gain calibration scheme
#from Configuration.Eras.Era_Run3_cff import Run3
#Run3.toModify(siPixelClusters,
from Configuration.Eras.Modifier_run3_common_cff import run3_common
run3_common.toModify(siPixelClusters,
                     VCaltoElectronGain = 1,  # all gains=1, pedestals=0
-                     VCaltoElectronGain_L1 = 1,
-                     VCaltoElectronOffset = 0,
-                     VCaltoElectronOffset_L1 = 0,
+                     VCaltoElectronGain_L1 = 1,
+                     VCaltoElectronOffset = 0,
+                     VCaltoElectronOffset_L1 = 0,
                     ClusterThreshold_L1 = 4000
)

@@ -45,4 +45,7 @@
(premix_stage2 & phase2_tracker).toModify(siPixelClusters,
  src = "mixData:Pixel"
)
-
+from Configuration.ProcessModifiers.pixelNtupletFit_cff import pixelNtupletFit
+(phase2_tracker & pixelNtupletFit).toModify(siPixelClusters,  # at the moment duplicate dropping is not implemented in Phase2
+    DropDuplicates = False
+)
diff --git a/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py b/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py
index 2916f5f8d037b..21b641cae9819 100644
--- a/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py
+++ b/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py
@@ -25,6 +25,8 @@
# convert the pixel digis (except errors) and clusters to the legacy format
from RecoLocalTracker.SiPixelClusterizer.siPixelDigisClustersFromSoA_cfi import siPixelDigisClustersFromSoA as _siPixelDigisClustersFromSoA
+from RecoLocalTracker.SiPixelClusterizer.siPixelDigisClustersFromSoAPhase2_cfi import siPixelDigisClustersFromSoAPhase2 as _siPixelDigisClustersFromSoAPhase2
+
siPixelDigisClustersPreSplitting = 
_siPixelDigisClustersFromSoA.clone() run3_common.toModify(siPixelDigisClustersPreSplitting, @@ -32,7 +34,7 @@ from Configuration.Eras.Modifier_phase2_tracker_cff import phase2_tracker -(gpu & ~phase2_tracker).toReplaceWith(siPixelClustersPreSplittingTask, cms.Task( +gpu.toReplaceWith(siPixelClustersPreSplittingTask, cms.Task( # conditions used *only* by the modules running on GPU siPixelROCsStatusAndMappingWrapperESProducer, siPixelGainCalibrationForHLTGPU, @@ -55,12 +57,13 @@ src = "siPixelClustersPreSplittingCUDA" ) -phase2_tracker.toModify(siPixelDigisClustersPreSplitting, +phase2_tracker.toReplaceWith(siPixelDigisClustersPreSplitting, _siPixelDigisClustersFromSoAPhase2.clone( clusterThreshold_layer1 = 4000, clusterThreshold_otherLayers = 4000, src = "siPixelDigisPhase2SoA", #produceDigis = False - ) + )) + (gpu & phase2_tracker).toReplaceWith(siPixelClustersPreSplittingTask, cms.Task( # reconstruct the pixel clusters on the gpu from copied digis siPixelClustersPreSplittingCUDA, diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h index 8f997722f35ab..6aff7aa15196e 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h @@ -20,12 +20,15 @@ #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelClusterThresholds.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" + int main(void) { #ifdef __CUDACC__ cms::cudatest::requireDevices(); #endif // __CUDACC__ using namespace gpuClustering; + using pixelTopology::Phase1; constexpr int numElements = 256 * maxNumModules; constexpr SiPixelClusterThresholds clusterThresholds(kSiPixelClusterThresholdsDefaultPhase1); @@ -257,7 +260,7 @@ int main(void) { << " threads\n"; cms::cuda::launch( - countModules, {blocksPerGrid, threadsPerBlock}, d_id.get(), d_moduleStart.get(), d_clus.get(), n); + countModules, {blocksPerGrid, threadsPerBlock}, d_id.get(), d_moduleStart.get(), d_clus.get(), n); blocksPerGrid = maxNumModules; //nModules; @@ -265,7 +268,7 @@ int main(void) { << " threads\n"; cudaCheck(cudaMemset(d_clusInModule.get(), 0, maxNumModules * sizeof(uint32_t))); - cms::cuda::launch(findClus, + cms::cuda::launch(findClus, {blocksPerGrid, threadsPerBlock}, d_raw.get(), d_id.get(), @@ -292,7 +295,7 @@ int main(void) { if (ncl != std::accumulate(nclus, nclus + maxNumModules, 0)) std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl; - cms::cuda::launch(clusterChargeCut, + cms::cuda::launch(clusterChargeCut, {blocksPerGrid, threadsPerBlock}, clusterThresholds, d_id.get(), @@ -306,17 +309,18 @@ int main(void) { cudaDeviceSynchronize(); #else // __CUDACC__ h_moduleStart[0] = nModules; - countModules(h_id.get(), h_moduleStart.get(), h_clus.get(), n); + countModules(h_id.get(), h_moduleStart.get(), h_clus.get(), n); memset(h_clusInModule.get(), 0, maxNumModules * sizeof(uint32_t)); - findClus(h_raw.get(), - h_id.get(), - h_x.get(), - h_y.get(), - h_moduleStart.get(), - h_clusInModule.get(), - h_moduleId.get(), - h_clus.get(), - n); + + findClus(h_raw.get(), + h_id.get(), + h_x.get(), + h_y.get(), + h_moduleStart.get(), + h_clusInModule.get(), + h_moduleId.get(), + h_clus.get(), + n); nModules = h_moduleStart[0]; auto nclus = h_clusInModule.get(); @@ -331,14 +335,14 @@ int main(void) { if (ncl != std::accumulate(nclus, nclus + maxNumModules, 0)) std::cout << "ERROR!!!!! 
wrong number of cluster found" << std::endl; - clusterChargeCut(clusterThresholds, - h_id.get(), - h_adc.get(), - h_moduleStart.get(), - h_clusInModule.get(), - h_moduleId.get(), - h_clus.get(), - n); + clusterChargeCut(clusterThresholds, + h_id.get(), + h_adc.get(), + h_moduleStart.get(), + h_clusInModule.get(), + h_moduleId.get(), + h_clus.get(), + n); #endif // __CUDACC__ std::cout << "found " << nModules << " Modules active" << std::endl; diff --git a/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h b/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h index b373e8d0c7ec1..15c24dfefb420 100644 --- a/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h +++ b/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h @@ -9,8 +9,10 @@ #include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h" #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEGenericBase.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" class MagneticField; +template class PixelCPEFast final : public PixelCPEGenericBase { public: PixelCPEFast(edm::ParameterSet const &conf, @@ -27,11 +29,13 @@ class PixelCPEFast final : public PixelCPEGenericBase { // The return value can only be used safely in kernels launched on // the same cudaStream, or after cudaStreamSynchronize. - const pixelCPEforGPU::ParamsOnGPU *getGPUProductAsync(cudaStream_t cudaStream) const; + using ParamsOnGPU = pixelCPEforGPU::ParamsOnGPUT; + using LayerGeometry = pixelCPEforGPU::LayerGeometryT; + using AverageGeometry = pixelTopology::AverageGeometryT; - pixelCPEforGPU::ParamsOnGPU const &getCPUProduct() const { return cpuData_; } + const ParamsOnGPU *getGPUProductAsync(cudaStream_t cudaStream) const; - bool isPhase2() const { return isPhase2_; }; + ParamsOnGPU const &getCPUProduct() const { return cpuData_; } private: LocalPoint localPosition(DetParam const &theDetParam, ClusterParam &theClusterParam) const override; @@ -45,17 +49,15 @@ class PixelCPEFast final : public PixelCPEGenericBase { // allocate this with posix malloc to be compatible with the cpu workflow std::vector detParamsGPU_; pixelCPEforGPU::CommonParams commonParamsGPU_; - pixelCPEforGPU::LayerGeometry layerGeometry_; - pixelCPEforGPU::AverageGeometry averageGeometry_; - pixelCPEforGPU::ParamsOnGPU cpuData_; - - bool isPhase2_; + LayerGeometry layerGeometry_; + AverageGeometry averageGeometry_; + ParamsOnGPU cpuData_; struct GPUData { ~GPUData(); // not needed if not used on CPU... 
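The GPUData members just below follow the usual two-level copy: paramsOnGPU_h is a host-side mirror whose pointer members (m_commonParams, m_detParams, ...) already point at device buffers, and paramsOnGPU_d is a device-resident copy of that mirror so kernels can take a single ParamsOnGPU const*. A generic sketch of the pattern (hypothetical names, plain CUDA rather than cms::cuda::ESProduct, error checking omitted):

#include <cuda_runtime.h>

// Host mirror whose pointer members refer to device memory (hypothetical).
struct Params {
  const float* detParams;  // device buffer
  int nDet;
};

// Upload the payload first, then the struct-of-pointers itself; kernels can
// then dereference a single Params const* argument.
Params* uploadParams(const float* hostDetParams, int nDet, cudaStream_t stream) {
  Params h;
  h.nDet = nDet;
  float* d = nullptr;
  cudaMalloc(&d, nDet * sizeof(float));
  cudaMemcpyAsync(d, hostDetParams, nDet * sizeof(float), cudaMemcpyHostToDevice, stream);
  h.detParams = d;
  Params* dParams = nullptr;
  cudaMalloc(&dParams, sizeof(Params));
  // small synchronous copy, so the local mirror is safe to drop immediately
  cudaMemcpy(dParams, &h, sizeof(Params), cudaMemcpyHostToDevice);
  return dParams;  // dereference only in kernels ordered after the copies
}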
- pixelCPEforGPU::ParamsOnGPU paramsOnGPU_h; - pixelCPEforGPU::ParamsOnGPU *paramsOnGPU_d = nullptr; // copy of the above on the Device + ParamsOnGPU paramsOnGPU_h; + ParamsOnGPU *paramsOnGPU_d = nullptr; // copy of the above on the Device }; cms::cuda::ESProduct gpuData_; diff --git a/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h b/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h index 063a761b9d1d8..e7c8ad5554f36 100644 --- a/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h +++ b/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h @@ -34,7 +34,6 @@ namespace pixelCPEforGPU { float thePitchX; float thePitchY; - bool isPhase2; uint16_t maxModuleStride; uint8_t numberOfLaddersInBarrel; }; @@ -71,15 +70,21 @@ namespace pixelCPEforGPU { Frame frame; }; - using pixelTopology::AverageGeometry; - - struct LayerGeometry { - uint32_t layerStart[phase2PixelTopology::numberOfLayers + 1]; - uint8_t layer[phase2PixelTopology::layerIndexSize]; + template + struct LayerGeometryT { + uint32_t layerStart[TrackerTopology::numberOfLayers + 1]; + uint8_t layer[pixelTopology::layerIndexSize]; uint16_t maxModuleStride; }; - struct ParamsOnGPU { + // using LayerGeometry = LayerGeometryT; + // using LayerGeometryPhase2 = LayerGeometryT; + + template + struct ParamsOnGPUT { + using LayerGeometry = LayerGeometryT; + using AverageGeometry = pixelTopology::AverageGeometryT; + CommonParams const* m_commonParams; DetParams const* m_detParams; LayerGeometry const* m_layerGeometry; @@ -202,10 +207,12 @@ namespace pixelCPEforGPU { return 0.5f * (qdiff / qsum) * w_eff; } + template constexpr inline void position(CommonParams const& __restrict__ comParams, DetParams const& __restrict__ detParams, ClusParams& cp, uint32_t ic) { + constexpr int maxSize = TrackerTraits::maxSizeCluster; //--- Upper Right corner of Lower Left pixel -- in measurement frame uint16_t llx = cp.minRow[ic] + 1; uint16_t lly = cp.minCol[ic] + 1; @@ -215,56 +222,52 @@ namespace pixelCPEforGPU { uint16_t ury = cp.maxCol[ic]; uint16_t llxl = llx, llyl = lly, urxl = urx, uryl = ury; - if (!comParams.isPhase2) //only in Phase1 - { - llxl = phase1PixelTopology::localX(llx); - llyl = phase1PixelTopology::localY(lly); - urxl = phase1PixelTopology::localX(urx); - uryl = phase1PixelTopology::localY(ury); - } + + llxl = TrackerTraits::localX(llx); + llyl = TrackerTraits::localY(lly); + urxl = TrackerTraits::localX(urx); + uryl = TrackerTraits::localY(ury); auto mx = llxl + urxl; auto my = llyl + uryl; - auto xsize = int(urxl) + 2 - int(llxl); - auto ysize = int(uryl) + 2 - int(llyl); + int xsize = int(urxl) + 2 - int(llxl); + int ysize = int(uryl) + 2 - int(llyl); assert(xsize >= 0); // 0 if bixpix... 
assert(ysize >= 0); - if (!comParams.isPhase2) //Phase 1 big pixels - { - if (phase1PixelTopology::isBigPixX(cp.minRow[ic])) - ++xsize; - if (phase1PixelTopology::isBigPixX(cp.maxRow[ic])) - ++xsize; - if (phase1PixelTopology::isBigPixY(cp.minCol[ic])) - ++ysize; - if (phase1PixelTopology::isBigPixY(cp.maxCol[ic])) - ++ysize; - } + if (TrackerTraits::isBigPixX(cp.minRow[ic])) + ++xsize; + if (TrackerTraits::isBigPixX(cp.maxRow[ic])) + ++xsize; + if (TrackerTraits::isBigPixY(cp.minCol[ic])) + ++ysize; + if (TrackerTraits::isBigPixY(cp.maxCol[ic])) + ++ysize; int unbalanceX = 8.f * std::abs(float(cp.q_f_X[ic] - cp.q_l_X[ic])) / float(cp.q_f_X[ic] + cp.q_l_X[ic]); int unbalanceY = 8.f * std::abs(float(cp.q_f_Y[ic] - cp.q_l_Y[ic])) / float(cp.q_f_Y[ic] + cp.q_l_Y[ic]); + xsize = 8 * xsize - unbalanceX; ysize = 8 * ysize - unbalanceY; - cp.xsize[ic] = std::min(xsize, comParams.isPhase2 ? 2047 : 1023); - cp.ysize[ic] = std::min(ysize, comParams.isPhase2 ? 2047 : 1023); + cp.xsize[ic] = std::min(xsize, maxSize); + cp.ysize[ic] = std::min(ysize, maxSize); - if (cp.minRow[ic] == 0 || cp.maxRow[ic] == phase1PixelTopology::lastRowInModule) + if (cp.minRow[ic] == 0 || cp.maxRow[ic] == uint32_t(detParams.nRows - 1)) cp.xsize[ic] = -cp.xsize[ic]; - if (cp.minCol[ic] == 0 || cp.maxCol[ic] == phase1PixelTopology::lastColInModule) + + if (cp.minCol[ic] == 0 || cp.maxCol[ic] == uint32_t(detParams.nCols - 1)) cp.ysize[ic] = -cp.ysize[ic]; // apply the lorentz offset correction float xoff = 0.5f * float(detParams.nRows) * comParams.thePitchX; float yoff = 0.5f * float(detParams.nCols) * comParams.thePitchY; - if (!comParams.isPhase2) //correction for bigpixels for phase1 - { - xoff = xoff + comParams.thePitchX; - yoff = yoff + 8.0f * comParams.thePitchY; - } + //correction for bigpixels for phase1 + xoff = xoff + TrackerTraits::bigPixXCorrection * comParams.thePitchX; + yoff = yoff + TrackerTraits::bigPixYCorrection * comParams.thePitchY; + // apply the lorentz offset correction auto xPos = detParams.shiftX + (comParams.thePitchX * 0.5f * float(mx)) - xoff; auto yPos = detParams.shiftY + (comParams.thePitchY * 0.5f * float(my)) - yoff; @@ -284,8 +287,8 @@ namespace pixelCPEforGPU { thickness, cotalpha, comParams.thePitchX, - comParams.isPhase2 ? false : phase1PixelTopology::isBigPixX(cp.minRow[ic]), - comParams.isPhase2 ? false : phase1PixelTopology::isBigPixX(cp.maxRow[ic])); + TrackerTraits::isBigPixX(cp.minRow[ic]), + TrackerTraits::isBigPixX(cp.maxRow[ic])); auto ycorr = correction(cp.maxCol[ic] - cp.minCol[ic], cp.q_f_Y[ic], @@ -296,13 +299,14 @@ namespace pixelCPEforGPU { thickness, cotbeta, comParams.thePitchY, - comParams.isPhase2 ? false : phase1PixelTopology::isBigPixY(cp.minCol[ic]), - comParams.isPhase2 ? false : phase1PixelTopology::isBigPixY(cp.maxCol[ic])); + TrackerTraits::isBigPixY(cp.minCol[ic]), + TrackerTraits::isBigPixY(cp.maxCol[ic])); cp.xpos[ic] = xPos + xcorr; cp.ypos[ic] = yPos + ycorr; } + template constexpr inline void errorFromSize(CommonParams const& __restrict__ comParams, DetParams const& __restrict__ detParams, ClusParams& cp, @@ -312,17 +316,14 @@ namespace pixelCPEforGPU { cp.yerr[ic] = 0.0085; // FIXME these are errors form Run1 - - bool isPhase2 = comParams.isPhase2; - // FIXME these are errors form Run1 - float xerr_barrel_l1_def = isPhase2 ? 0.00035 : 0.00200; // 0.01030; - float yerr_barrel_l1_def = isPhase2 ? 0.00125 : 0.00210; - float xerr_barrel_ln_def = isPhase2 ? 0.00035 : 0.00200; // 0.01030; - float yerr_barrel_ln_def = isPhase2 ? 
0.00125 : 0.00210; - float xerr_endcap_def = isPhase2 ? 0.00060 : 0.0020; - float yerr_endcap_def = isPhase2 ? 0.00180 : 0.00210; - - constexpr float xerr_barrel_l1[] = {0.00115, 0.00120, 0.00088}; + float xerr_barrel_l1_def = TrackerTraits::xerr_barrel_l1_def; + float yerr_barrel_l1_def = TrackerTraits::yerr_barrel_l1_def; + float xerr_barrel_ln_def = TrackerTraits::xerr_barrel_ln_def; + float yerr_barrel_ln_def = TrackerTraits::yerr_barrel_ln_def; + float xerr_endcap_def = TrackerTraits::xerr_endcap_def; + float yerr_endcap_def = TrackerTraits::yerr_endcap_def; + + constexpr float xerr_barrel_l1[] = {0.00115, 0.00120, 0.00088}; //TODO MOVE THESE SOMEWHERE ELSE constexpr float yerr_barrel_l1[] = { 0.00375, 0.00230, 0.00250, 0.00250, 0.00230, 0.00230, 0.00210, 0.00210, 0.00240}; constexpr float xerr_barrel_ln[] = {0.00115, 0.00120, 0.00088}; @@ -339,52 +340,31 @@ namespace pixelCPEforGPU { bool isEdgeY = cp.ysize[ic] < 1; // is one and big? - bool isBig1X = isPhase2 ? false : ((0 == sx) && phase1PixelTopology::isBigPixX(cp.minRow[ic])); - bool isBig1Y = isPhase2 ? false : ((0 == sy) && phase1PixelTopology::isBigPixY(cp.minCol[ic])); - - if (!isPhase2) { - if (!isEdgeX && !isBig1X) { - if (not detParams.isBarrel) { - cp.xerr[ic] = sx < std::size(xerr_endcap) ? xerr_endcap[sx] : xerr_endcap_def; - } else if (detParams.layer == 1) { - cp.xerr[ic] = sx < std::size(xerr_barrel_l1) ? xerr_barrel_l1[sx] : xerr_barrel_l1_def; - } else { - cp.xerr[ic] = sx < std::size(xerr_barrel_ln) ? xerr_barrel_ln[sx] : xerr_barrel_ln_def; - } - } - - if (!isEdgeY && !isBig1Y) { - if (not detParams.isBarrel) { - cp.yerr[ic] = sy < std::size(yerr_endcap) ? yerr_endcap[sy] : yerr_endcap_def; - } else if (detParams.layer == 1) { - cp.yerr[ic] = sy < std::size(yerr_barrel_l1) ? yerr_barrel_l1[sy] : yerr_barrel_l1_def; - } else { - cp.yerr[ic] = sy < std::size(yerr_barrel_ln) ? yerr_barrel_ln[sy] : yerr_barrel_ln_def; - } - } - } else { - if (!isEdgeX) { - if (not detParams.isBarrel) { - cp.xerr[ic] = sx < std::size(xerr_endcap) ? xerr_endcap[sx] : xerr_endcap_def; - } else if (detParams.layer == 1) { - cp.xerr[ic] = sx < std::size(xerr_barrel_l1) ? xerr_barrel_l1[sx] : xerr_barrel_l1_def; - } else { - cp.xerr[ic] = sx < std::size(xerr_barrel_ln) ? xerr_barrel_ln[sx] : xerr_barrel_ln_def; - } + bool isBig1X = ((0 == sx) && TrackerTraits::isBigPixX(cp.minRow[ic])); + bool isBig1Y = ((0 == sy) && TrackerTraits::isBigPixY(cp.minCol[ic])); + + if (!isEdgeX && !isBig1X) { + if (not detParams.isBarrel) { + cp.xerr[ic] = sx < std::size(xerr_endcap) ? xerr_endcap[sx] : xerr_endcap_def; + } else if (detParams.layer == 1) { + cp.xerr[ic] = sx < std::size(xerr_barrel_l1) ? xerr_barrel_l1[sx] : xerr_barrel_l1_def; + } else { + cp.xerr[ic] = sx < std::size(xerr_barrel_ln) ? xerr_barrel_ln[sx] : xerr_barrel_ln_def; } + } - if (!isEdgeY) { - if (not detParams.isBarrel) { - cp.yerr[ic] = sy < std::size(yerr_endcap) ? yerr_endcap[sy] : yerr_endcap_def; - } else if (detParams.layer == 1) { - cp.yerr[ic] = sy < std::size(yerr_barrel_l1) ? yerr_barrel_l1[sy] : yerr_barrel_l1_def; - } else { - cp.yerr[ic] = sy < std::size(yerr_barrel_ln) ? yerr_barrel_ln[sy] : yerr_barrel_ln_def; - } + if (!isEdgeY && !isBig1Y) { + if (not detParams.isBarrel) { + cp.yerr[ic] = sy < std::size(yerr_endcap) ? yerr_endcap[sy] : yerr_endcap_def; + } else if (detParams.layer == 1) { + cp.yerr[ic] = sy < std::size(yerr_barrel_l1) ? yerr_barrel_l1[sy] : yerr_barrel_l1_def; + } else { + cp.yerr[ic] = sy < std::size(yerr_barrel_ln) ? 
yerr_barrel_ln[sy] : yerr_barrel_ln_def; } } } + template constexpr inline void errorFromDB(CommonParams const& __restrict__ comParams, DetParams const& __restrict__ detParams, ClusParams& cp, @@ -402,8 +382,8 @@ namespace pixelCPEforGPU { // is one and big? bool isOneX = (0 == sx); bool isOneY = (0 == sy); - bool isBigX = comParams.isPhase2 ? false : phase1PixelTopology::isBigPixX(cp.minRow[ic]); - bool isBigY = comParams.isPhase2 ? false : phase1PixelTopology::isBigPixY(cp.minCol[ic]); + bool isBigX = TrackerTraits::isBigPixX(cp.minRow[ic]); + bool isBigY = TrackerTraits::isBigPixY(cp.minCol[ic]); auto ch = cp.charge[ic]; auto bin = 0; @@ -421,14 +401,14 @@ namespace pixelCPEforGPU { cp.status[ic].isOneY = isOneY; cp.status[ic].isBigY = (isOneY & isBigY) | isEdgeY; - auto xoff = -float(phase1PixelTopology::xOffset) * comParams.thePitchX; + auto xoff = -float(TrackerTraits::xOffset) * comParams.thePitchX; int low_value = 0; int high_value = CPEFastParametrisation::kNumErrorBins - 1; int bin_value = float(CPEFastParametrisation::kNumErrorBins) * (cp.xpos[ic] + xoff) / (2 * xoff); // return estimated bin value truncated to [0, 15] int jx = std::clamp(bin_value, low_value, high_value); - auto toCM = [](uint8_t x) { return float(x) * 1.e-4; }; + auto toCM = [](uint8_t x) { return float(x) * 1.e-4f; }; if (not isEdgeX) { cp.xerr[ic] = isOneX ? toCM(isBigX ? detParams.sx2 : detParams.sigmax1[jx]) @@ -441,6 +421,15 @@ namespace pixelCPEforGPU { } } + //for Phase2 -> fallback to error from size + template <> + constexpr inline void errorFromDB(CommonParams const& __restrict__ comParams, + DetParams const& __restrict__ detParams, + ClusParams& cp, + uint32_t ic) { + errorFromSize(comParams, detParams, cp, ic); + } + } // namespace pixelCPEforGPU #endif // RecoLocalTracker_SiPixelRecHits_pixelCPEforGPU_h diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelCPEFastESProducer.cc b/RecoLocalTracker/SiPixelRecHits/plugins/PixelCPEFastESProducer.cc index cd08eac535372..6044f2a5b9ad4 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelCPEFastESProducer.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelCPEFastESProducer.cc @@ -16,9 +16,10 @@ #include "RecoLocalTracker/Records/interface/TkPixelCPERecord.h" #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h" -class PixelCPEFastESProducer : public edm::ESProducer { +template +class PixelCPEFastESProducerT : public edm::ESProducer { public: - PixelCPEFastESProducer(const edm::ParameterSet& p); + PixelCPEFastESProducerT(const edm::ParameterSet& p); std::unique_ptr produce(const TkPixelCPERecord&); static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); @@ -36,7 +37,8 @@ class PixelCPEFastESProducer : public edm::ESProducer { using namespace edm; -PixelCPEFastESProducer::PixelCPEFastESProducer(const edm::ParameterSet& p) : pset_(p) { +template +PixelCPEFastESProducerT::PixelCPEFastESProducerT(const edm::ParameterSet& p) : pset_(p) { auto const& myname = p.getParameter("ComponentName"); auto const& magname = p.getParameter("MagneticFieldRecord"); useErrorsFromTemplates_ = p.getParameter("UseErrorsFromTemplates"); @@ -52,7 +54,9 @@ PixelCPEFastESProducer::PixelCPEFastESProducer(const edm::ParameterSet& p) : pse } } -std::unique_ptr PixelCPEFastESProducer::produce(const TkPixelCPERecord& iRecord) { +template +std::unique_ptr PixelCPEFastESProducerT::produce( + const TkPixelCPERecord& iRecord) { // add the new la width object const SiPixelLorentzAngle* lorentzAngleWidthProduct = nullptr; lorentzAngleWidthProduct = 
&iRecord.get(lorentzAngleWidthToken_); @@ -65,23 +69,24 @@ std::unique_ptr PixelCPEFastESProducer::produce( //} else { //std::cout<<" pass an empty GenError pointer"<(pset_, - &iRecord.get(magfieldToken_), - iRecord.get(pDDToken_), - iRecord.get(hTTToken_), - &iRecord.get(lorentzAngleToken_), - genErrorDBObjectProduct, - lorentzAngleWidthProduct); + return std::make_unique>(pset_, + &iRecord.get(magfieldToken_), + iRecord.get(pDDToken_), + iRecord.get(hTTToken_), + &iRecord.get(lorentzAngleToken_), + genErrorDBObjectProduct, + lorentzAngleWidthProduct); } -void PixelCPEFastESProducer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { +template +void PixelCPEFastESProducerT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; // from PixelCPEBase PixelCPEBase::fillPSetDescription(desc); // from PixelCPEFast - PixelCPEFast::fillPSetDescription(desc); + PixelCPEFast::fillPSetDescription(desc); // used by PixelCPEFast desc.add("EdgeClusterErrorX", 50.0); @@ -89,11 +94,17 @@ void PixelCPEFastESProducer::fillDescriptions(edm::ConfigurationDescriptions& de desc.add("UseErrorsFromTemplates", true); desc.add("TruncatePixelCharge", true); - // specific to PixelCPEFastESProducer - desc.add("ComponentName", "PixelCPEFast"); + std::string name = "PixelCPEFast"; + name += TrackerTraits::nameModifier; + desc.add("ComponentName", name); desc.add("MagneticFieldRecord", edm::ESInputTag()); - descriptions.add("PixelCPEFastESProducer", desc); + descriptions.addWithDefaultLabel(desc); } +using PixelCPEFastESProducer = PixelCPEFastESProducerT; DEFINE_FWK_EVENTSETUP_MODULE(PixelCPEFastESProducer); +using PixelCPEFastESProducerPhase1 = PixelCPEFastESProducerT; +DEFINE_FWK_EVENTSETUP_MODULE(PixelCPEFastESProducerPhase1); +using PixelCPEFastESProducerPhase2 = PixelCPEFastESProducerT; +DEFINE_FWK_EVENTSETUP_MODULE(PixelCPEFastESProducerPhase2); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu index 135254fa6e9f2..cb5b4b2f2c387 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu @@ -12,21 +12,28 @@ #include "PixelRecHitGPUKernel.h" #include "gpuPixelRecHits.h" +// #define GPU_DEBUG 1 namespace { + template __global__ void setHitsLayerStart(uint32_t const* __restrict__ hitsModuleStart, - pixelCPEforGPU::ParamsOnGPU const* cpeParams, + pixelCPEforGPU::ParamsOnGPUT const* cpeParams, uint32_t* hitsLayerStart) { auto i = blockIdx.x * blockDim.x + threadIdx.x; - auto m = - cpeParams->commonParams().isPhase2 ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers; + constexpr auto m = TrackerTraits::numberOfLayers; assert(0 == hitsModuleStart[0]); if (i <= m) { hitsLayerStart[i] = hitsModuleStart[cpeParams->layerGeometry().layerStart[i]]; #ifdef GPU_DEBUG - printf("LayerStart %d/%d at module %d: %d\n", i, m, cpeParams->layerGeometry().layerStart[i], hitsLayerStart[i]); + int old = i == 0 ? 
0 : hitsModuleStart[cpeParams->layerGeometry().layerStart[i - 1]]; + printf("LayerStart %d/%d at module %d: %d - %d\n", + i, + m, + cpeParams->layerGeometry().layerStart[i], + hitsLayerStart[i], + hitsLayerStart[i] - old); #endif } } @@ -34,18 +41,18 @@ namespace { namespace pixelgpudetails { - TrackingRecHit2DGPU PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d, - SiPixelClustersCUDA const& clusters_d, - BeamSpotCUDA const& bs_d, - pixelCPEforGPU::ParamsOnGPU const* cpeParams, - bool isPhase2, - cudaStream_t stream) const { + template + TrackingRecHit2DGPUT PixelRecHitGPUKernel::makeHitsAsync( + SiPixelDigisCUDA const& digis_d, + SiPixelClustersCUDA const& clusters_d, + BeamSpotCUDA const& bs_d, + pixelCPEforGPU::ParamsOnGPUT const* cpeParams, + cudaStream_t stream) const { + using namespace gpuPixelRecHits; auto nHits = clusters_d.nClusters(); - TrackingRecHit2DGPU hits_d( - nHits, isPhase2, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream); - assert(hits_d.nMaxModules() == isPhase2 ? phase2PixelTopology::numberOfModules - : phase1PixelTopology::numberOfModules); + TrackingRecHit2DGPUT hits_d( + nHits, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream); int activeModulesWithDigis = digis_d.nModules(); // protect from empty events @@ -54,9 +61,10 @@ namespace pixelgpudetails { int blocks = activeModulesWithDigis; #ifdef GPU_DEBUG + std::cout << "launching getHits kernel for " << blocks << " blocks" << std::endl; #endif - gpuPixelRecHits::getHits<<>>( + getHits<<>>( cpeParams, bs_d.data(), digis_d.view(), digis_d.nDigis(), clusters_d.view(), hits_d.view()); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG @@ -65,9 +73,10 @@ namespace pixelgpudetails { // assuming full warp of threads is better than a smaller number... if (nHits) { - setHitsLayerStart<<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart()); + setHitsLayerStart + <<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart()); cudaCheck(cudaGetLastError()); - auto nLayers = isPhase2 ? 
phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers; + constexpr auto nLayers = TrackerTraits::numberOfLayers; cms::cuda::fillManyFromVector(hits_d.phiBinner(), nLayers, hits_d.iphi(), @@ -87,4 +96,6 @@ namespace pixelgpudetails { return hits_d; } + template class PixelRecHitGPUKernel; + template class PixelRecHitGPUKernel; } // namespace pixelgpudetails diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h index 8289c8db7f2f4..0a3c2b647f22e 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h @@ -9,9 +9,11 @@ #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" - +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +//#define GPU_DEBUG 1 namespace pixelgpudetails { + template class PixelRecHitGPUKernel { public: PixelRecHitGPUKernel() = default; @@ -22,13 +24,15 @@ namespace pixelgpudetails { PixelRecHitGPUKernel& operator=(const PixelRecHitGPUKernel&) = delete; PixelRecHitGPUKernel& operator=(PixelRecHitGPUKernel&&) = delete; - TrackingRecHit2DGPU makeHitsAsync(SiPixelDigisCUDA const& digis_d, - SiPixelClustersCUDA const& clusters_d, - BeamSpotCUDA const& bs_d, - pixelCPEforGPU::ParamsOnGPU const* cpeParams, - bool isPhase2, - cudaStream_t stream) const; + using ParamsOnGPU = pixelCPEforGPU::ParamsOnGPUT; + + TrackingRecHit2DGPUT makeHitsAsync(SiPixelDigisCUDA const& digis_d, + SiPixelClustersCUDA const& clusters_d, + BeamSpotCUDA const& bs_d, + ParamsOnGPU const* cpeParams, + cudaStream_t stream) const; }; + } // namespace pixelgpudetails #endif // RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHitGPUKernel_h diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc index 8112e9ebd19c8..b23fa7dcc11ed 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc @@ -20,13 +20,15 @@ #include "RecoLocalTracker/Records/interface/TkPixelCPERecord.h" #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEBase.h" #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "PixelRecHitGPUKernel.h" -class SiPixelRecHitCUDA : public edm::global::EDProducer<> { +template +class SiPixelRecHitCUDAT : public edm::global::EDProducer<> { public: - explicit SiPixelRecHitCUDA(const edm::ParameterSet& iConfig); - ~SiPixelRecHitCUDA() override = default; + explicit SiPixelRecHitCUDAT(const edm::ParameterSet& iConfig); + ~SiPixelRecHitCUDAT() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); @@ -37,31 +39,40 @@ class SiPixelRecHitCUDA : public edm::global::EDProducer<> { const edm::EDGetTokenT> tBeamSpot; const edm::EDGetTokenT> token_; const edm::EDGetTokenT> tokenDigi_; - const edm::EDPutTokenT> tokenHit_; + const edm::EDPutTokenT>> tokenHit_; - const pixelgpudetails::PixelRecHitGPUKernel gpuAlgo_; + const pixelgpudetails::PixelRecHitGPUKernel gpuAlgo_; }; -SiPixelRecHitCUDA::SiPixelRecHitCUDA(const edm::ParameterSet& iConfig) +template +SiPixelRecHitCUDAT::SiPixelRecHitCUDAT(const edm::ParameterSet& iConfig) : 
cpeToken_(esConsumes(edm::ESInputTag("", iConfig.getParameter("CPE")))), tBeamSpot(consumes>(iConfig.getParameter("beamSpot"))), token_(consumes>(iConfig.getParameter("src"))), tokenDigi_(consumes>(iConfig.getParameter("src"))), - tokenHit_(produces>()) {} + tokenHit_(produces>>()) {} -void SiPixelRecHitCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { +template +void SiPixelRecHitCUDAT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; desc.add("beamSpot", edm::InputTag("offlineBeamSpotCUDA")); desc.add("src", edm::InputTag("siPixelClustersPreSplittingCUDA")); - desc.add("CPE", "PixelCPEFast"); - descriptions.add("siPixelRecHitCUDA", desc); + + std::string cpe = "PixelCPEFast"; + cpe += TrackerTraits::nameModifier; + desc.add("CPE", cpe); + + descriptions.addWithDefaultLabel(desc); } -void SiPixelRecHitCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& es) const { - PixelCPEFast const* fcpe = dynamic_cast(&es.getData(cpeToken_)); +template +void SiPixelRecHitCUDAT::produce(edm::StreamID streamID, + edm::Event& iEvent, + const edm::EventSetup& es) const { + PixelCPEFast const* fcpe = dynamic_cast*>(&es.getData(cpeToken_)); if (not fcpe) { - throw cms::Exception("Configuration") << "SiPixelRecHitSoAFromLegacy can only use a CPE of type PixelCPEFast"; + throw cms::Exception("Configuration") << "SiPixelRecHitCUDA can only use a CPE of type PixelCPEFast"; } edm::Handle> hclusters; @@ -80,8 +91,14 @@ void SiPixelRecHitCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, cons ctx.emplace(iEvent, tokenHit_, - gpuAlgo_.makeHitsAsync( - digis, clusters, bs, fcpe->getGPUProductAsync(ctx.stream()), fcpe->isPhase2(), ctx.stream())); + gpuAlgo_.makeHitsAsync(digis, clusters, bs, fcpe->getGPUProductAsync(ctx.stream()), ctx.stream())); } +using SiPixelRecHitCUDA = SiPixelRecHitCUDAT; DEFINE_FWK_MODULE(SiPixelRecHitCUDA); + +using SiPixelRecHitCUDAPhase1 = SiPixelRecHitCUDAT; +DEFINE_FWK_MODULE(SiPixelRecHitCUDAPhase1); + +using SiPixelRecHitCUDAPhase2 = SiPixelRecHitCUDAT; +DEFINE_FWK_MODULE(SiPixelRecHitCUDAPhase2); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc index 7ff2da5552e6d..1428efe06a1d1 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc @@ -24,14 +24,16 @@ #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" -class SiPixelRecHitFromCUDA : public edm::stream::EDProducer { +template +class SiPixelRecHitFromCUDAT : public edm::stream::EDProducer { public: - explicit SiPixelRecHitFromCUDA(const edm::ParameterSet& iConfig); - ~SiPixelRecHitFromCUDA() override = default; + explicit SiPixelRecHitFromCUDAT(const edm::ParameterSet& iConfig); + ~SiPixelRecHitFromCUDAT() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); using HMSstorage = HostProduct; + using HitsOnGPU = TrackingRecHit2DGPUT; private: void acquire(edm::Event const& iEvent, @@ -40,64 +42,70 @@ class SiPixelRecHitFromCUDA : public edm::stream::EDProducer void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; const edm::ESGetToken geomToken_; - const edm::EDGetTokenT> hitsToken_; // CUDA hits - const edm::EDGetTokenT clusterToken_; // legacy clusters - const edm::EDPutTokenT rechitsPutToken_; // 
legacy rechits + const edm::EDGetTokenT> hitsToken_; // CUDA hits + const edm::EDGetTokenT clusterToken_; // legacy clusters + const edm::EDPutTokenT rechitsPutToken_; // legacy rechits const edm::EDPutTokenT hostPutToken_; uint32_t nHits_; - uint32_t nMaxModules_; cms::cuda::host::unique_ptr store32_; cms::cuda::host::unique_ptr hitsModuleStart_; }; -SiPixelRecHitFromCUDA::SiPixelRecHitFromCUDA(const edm::ParameterSet& iConfig) +template +SiPixelRecHitFromCUDAT::SiPixelRecHitFromCUDAT(const edm::ParameterSet& iConfig) : geomToken_(esConsumes()), - hitsToken_( - consumes>(iConfig.getParameter("pixelRecHitSrc"))), + hitsToken_(consumes>(iConfig.getParameter("pixelRecHitSrc"))), clusterToken_(consumes(iConfig.getParameter("src"))), rechitsPutToken_(produces()), hostPutToken_(produces()) {} -void SiPixelRecHitFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { +template +void SiPixelRecHitFromCUDAT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; desc.add("pixelRecHitSrc", edm::InputTag("siPixelRecHitsPreSplittingCUDA")); desc.add("src", edm::InputTag("siPixelClustersPreSplitting")); + descriptions.addWithDefaultLabel(desc); } -void SiPixelRecHitFromCUDA::acquire(edm::Event const& iEvent, - edm::EventSetup const& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product const& inputDataWrapped = iEvent.get(hitsToken_); +template +void SiPixelRecHitFromCUDAT::acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + cms::cuda::Product const& inputDataWrapped = iEvent.get(hitsToken_); + cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; + auto const& inputData = ctx.get(inputDataWrapped); nHits_ = inputData.nHits(); - nMaxModules_ = inputData.nMaxModules(); LogDebug("SiPixelRecHitFromCUDA") << "converting " << nHits_ << " Hits"; if (0 == nHits_) return; store32_ = inputData.localCoordToHostAsync(ctx.stream()); + hitsModuleStart_ = inputData.hitsModuleStartToHostAsync(ctx.stream()); } -void SiPixelRecHitFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& es) { +template +void SiPixelRecHitFromCUDAT::produce(edm::Event& iEvent, edm::EventSetup const& es) { // allocate a buffer for the indices of the clusters - auto hmsp = std::make_unique(nMaxModules_ + 1); + constexpr auto nMaxModules = TrackerTraits::numberOfModules; + auto hmsp = std::make_unique(nMaxModules + 1); SiPixelRecHitCollection output; - output.reserve(nMaxModules_, nHits_); + output.reserve(nMaxModules, nHits_); if (0 == nHits_) { iEvent.emplace(rechitsPutToken_, std::move(output)); iEvent.emplace(hostPutToken_, std::move(hmsp)); return; } - output.reserve(nMaxModules_, nHits_); + output.reserve(nMaxModules, nHits_); - std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + nMaxModules_ + 1, hmsp.get()); + std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + nMaxModules + 1, hmsp.get()); // wrap the buffer in a HostProduct, and move it to the Event, without reallocating the buffer or affecting hitsModuleStart iEvent.emplace(hostPutToken_, std::move(hmsp)); @@ -130,7 +138,7 @@ void SiPixelRecHitFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& e assert(lc > fc); LogDebug("SiPixelRecHitFromCUDA") << "in det " << gind << ": conv " << nhits << " hits from " << dsv.size() - << " legacy clusters" << ' ' << fc << ',' << lc; + << " legacy clusters" << ' ' << fc << ',' << lc << "\n"; if (nhits > maxHitsInModule) 
edm::LogWarning("SiPixelRecHitFromCUDA") << fmt::sprintf( "Too many clusters %d in module %d. Only the first %d hits will be converted", nhits, gind, maxHitsInModule); @@ -185,4 +193,11 @@ void SiPixelRecHitFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& e iEvent.emplace(rechitsPutToken_, std::move(output)); } +using SiPixelRecHitFromCUDA = SiPixelRecHitFromCUDAT; DEFINE_FWK_MODULE(SiPixelRecHitFromCUDA); + +using SiPixelRecHitFromCUDAPhase1 = SiPixelRecHitFromCUDAT; +DEFINE_FWK_MODULE(SiPixelRecHitFromCUDAPhase1); + +using SiPixelRecHitFromCUDAPhase2 = SiPixelRecHitFromCUDAT; +DEFINE_FWK_MODULE(SiPixelRecHitFromCUDAPhase2); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc index 7532470ebd3d4..8bcb218255548 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc @@ -24,13 +24,15 @@ #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" -class SiPixelRecHitSoAFromCUDA : public edm::stream::EDProducer { +template +class SiPixelRecHitSoAFromCUDAT : public edm::stream::EDProducer { public: - explicit SiPixelRecHitSoAFromCUDA(const edm::ParameterSet& iConfig); - ~SiPixelRecHitSoAFromCUDA() override = default; + explicit SiPixelRecHitSoAFromCUDAT(const edm::ParameterSet& iConfig); + ~SiPixelRecHitSoAFromCUDAT() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); using HMSstorage = HostProduct; + using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; private: void acquire(edm::Event const& iEvent, @@ -38,34 +40,36 @@ class SiPixelRecHitSoAFromCUDA : public edm::stream::EDProducer> hitsTokenGPU_; // CUDA hits - const edm::EDPutTokenT hitsPutTokenCPU_; + const edm::EDGetTokenT>> hitsTokenGPU_; // CUDA hits + const edm::EDPutTokenT> hitsPutTokenCPU_; const edm::EDPutTokenT hostPutToken_; uint32_t nHits_; - uint32_t nMaxModules_; cms::cuda::host::unique_ptr store32_; cms::cuda::host::unique_ptr store16_; cms::cuda::host::unique_ptr hitsModuleStart_; }; -SiPixelRecHitSoAFromCUDA::SiPixelRecHitSoAFromCUDA(const edm::ParameterSet& iConfig) - : hitsTokenGPU_( - consumes>(iConfig.getParameter("pixelRecHitSrc"))), - hitsPutTokenCPU_(produces()), +template +SiPixelRecHitSoAFromCUDAT::SiPixelRecHitSoAFromCUDAT(const edm::ParameterSet& iConfig) + : hitsTokenGPU_(consumes(iConfig.getParameter("pixelRecHitSrc"))), + hitsPutTokenCPU_(produces>()), hostPutToken_(produces()) {} -void SiPixelRecHitSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { +template +void SiPixelRecHitSoAFromCUDAT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; desc.add("pixelRecHitSrc", edm::InputTag("siPixelRecHitsPreSplittingCUDA")); + descriptions.addWithDefaultLabel(desc); } -void SiPixelRecHitSoAFromCUDA::acquire(edm::Event const& iEvent, - edm::EventSetup const& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product const& inputDataWrapped = iEvent.get(hitsTokenGPU_); +template +void SiPixelRecHitSoAFromCUDAT::acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + cms::cuda::Product> const& inputDataWrapped = iEvent.get(hitsTokenGPU_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto 
const& inputData = ctx.get(inputDataWrapped); @@ -74,20 +78,27 @@ void SiPixelRecHitSoAFromCUDA::acquire(edm::Event const& iEvent, if (0 == nHits_) return; - nMaxModules_ = inputData.nMaxModules(); store32_ = inputData.store32ToHostAsync(ctx.stream()); store16_ = inputData.store16ToHostAsync(ctx.stream()); hitsModuleStart_ = inputData.hitsModuleStartToHostAsync(ctx.stream()); } -void SiPixelRecHitSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& es) { - auto hmsp = std::make_unique(nMaxModules_ + 1); +template +void SiPixelRecHitSoAFromCUDAT::produce(edm::Event& iEvent, edm::EventSetup const& es) { + auto hmsp = std::make_unique(TrackerTraits::numberOfModules + 1); if (nHits_ > 0) - std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + nMaxModules_ + 1, hmsp.get()); + std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + TrackerTraits::numberOfModules + 1, hmsp.get()); iEvent.emplace(hostPutToken_, std::move(hmsp)); - iEvent.emplace(hitsPutTokenCPU_, store32_.get(), store16_.get(), hitsModuleStart_.get(), nHits_); + iEvent.emplace(hitsPutTokenCPU_, store32_, store16_, hitsModuleStart_.get(), nHits_); } +using SiPixelRecHitSoAFromCUDA = SiPixelRecHitSoAFromCUDAT; DEFINE_FWK_MODULE(SiPixelRecHitSoAFromCUDA); + +using SiPixelRecHitSoAFromCUDAPhase1 = SiPixelRecHitSoAFromCUDAT; +DEFINE_FWK_MODULE(SiPixelRecHitSoAFromCUDAPhase1); + +using SiPixelRecHitSoAFromCUDAPhase2 = SiPixelRecHitSoAFromCUDAT; +DEFINE_FWK_MODULE(SiPixelRecHitSoAFromCUDAPhase2); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc index d23ecec66fea0..1edc7870f4800 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc @@ -27,10 +27,11 @@ #include "gpuPixelRecHits.h" -class SiPixelRecHitSoAFromLegacy : public edm::global::EDProducer<> { +template +class SiPixelRecHitSoAFromLegacyT : public edm::global::EDProducer<> { public: - explicit SiPixelRecHitSoAFromLegacy(const edm::ParameterSet& iConfig); - ~SiPixelRecHitSoAFromLegacy() override = default; + explicit SiPixelRecHitSoAFromLegacyT(const edm::ParameterSet& iConfig); + ~SiPixelRecHitSoAFromLegacyT() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); @@ -44,39 +45,44 @@ class SiPixelRecHitSoAFromLegacy : public edm::global::EDProducer<> { const edm::ESGetToken cpeToken_; const edm::EDGetTokenT bsGetToken_; const edm::EDGetTokenT clusterToken_; // Legacy Clusters - const edm::EDPutTokenT tokenHit_; + const edm::EDPutTokenT> tokenHit_; const edm::EDPutTokenT tokenModuleStart_; const bool convert2Legacy_; - const bool isPhase2_; }; -SiPixelRecHitSoAFromLegacy::SiPixelRecHitSoAFromLegacy(const edm::ParameterSet& iConfig) +template +SiPixelRecHitSoAFromLegacyT::SiPixelRecHitSoAFromLegacyT(const edm::ParameterSet& iConfig) : geomToken_(esConsumes()), cpeToken_(esConsumes(edm::ESInputTag("", iConfig.getParameter("CPE")))), bsGetToken_{consumes(iConfig.getParameter("beamSpot"))}, clusterToken_{consumes(iConfig.getParameter("src"))}, - tokenHit_{produces()}, + tokenHit_{produces>()}, tokenModuleStart_{produces()}, - convert2Legacy_(iConfig.getParameter("convertToLegacy")), - isPhase2_(iConfig.getParameter("isPhase2")) { + convert2Legacy_(iConfig.getParameter("convertToLegacy")) { if (convert2Legacy_) produces(); } -void SiPixelRecHitSoAFromLegacy::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { 
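// For reference, the plugin-registration idiom used above for
// SiPixelRecHitSoAFromCUDA and repeated for every module in this PR: each
// topology instantiation gets its own type alias, so the plugin factory sees a
// plain identifier and the configuration can select the phase by module name.
// A condensed sketch with a stand-in producer name:
//
//   template <typename TrackerTraits>
//   class MyProducerT : public edm::global::EDProducer<> { /* ... */ };
//
//   using MyProducerPhase1 = MyProducerT<pixelTopology::Phase1>;
//   DEFINE_FWK_MODULE(MyProducerPhase1);
//
//   using MyProducerPhase2 = MyProducerT<pixelTopology::Phase2>;
//   DEFINE_FWK_MODULE(MyProducerPhase2);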
+template +void SiPixelRecHitSoAFromLegacyT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; desc.add("beamSpot", edm::InputTag("offlineBeamSpot")); desc.add("src", edm::InputTag("siPixelClustersPreSplitting")); - desc.add("CPE", "PixelCPEFast"); + std::string cpeName = "PixelCPEFast"; + cpeName += TrackerTraits::nameModifier; + desc.add("CPE", cpeName); desc.add("convertToLegacy", false); - desc.add("isPhase2", false); + descriptions.addWithDefaultLabel(desc); } -void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& es) const { +template +void SiPixelRecHitSoAFromLegacyT::produce(edm::StreamID streamID, + edm::Event& iEvent, + const edm::EventSetup& es) const { const TrackerGeometry* geom_ = &es.getData(geomToken_); - PixelCPEFast const* fcpe = dynamic_cast(&es.getData(cpeToken_)); + PixelCPEFast const* fcpe = dynamic_cast*>(&es.getData(cpeToken_)); if (not fcpe) { throw cms::Exception("Configuration") << "SiPixelRecHitSoAFromLegacy can only use a CPE of type PixelCPEFast"; } @@ -93,14 +99,11 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv iEvent.getByToken(clusterToken_, hclusters); auto const& input = *hclusters; - const int nMaxModules = isPhase2_ ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules; - const int startBPIX2 = isPhase2_ ? phase2PixelTopology::layerStart[1] : phase1PixelTopology::layerStart[1]; - - assert(nMaxModules < gpuClustering::maxNumModules); - assert(startBPIX2 < nMaxModules); + constexpr int maxModules = TrackerTraits::numberOfModules; + constexpr int startBPIX2 = pixelTopology::layerStart(1); // allocate a buffer for the indices of the clusters - auto hmsp = std::make_unique(nMaxModules + 1); + auto hmsp = std::make_unique(maxModules + 1); // hitsModuleStart is a non-owning pointer to the buffer auto hitsModuleStart = hmsp.get(); // wrap the buffer in a HostProduct @@ -141,24 +144,23 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv DetId detIdObject(detid); const GeomDetUnit* genericDet = geom_->idToDetUnit(detIdObject); auto gind = genericDet->index(); - assert(gind < nMaxModules); + assert(gind < maxModules); auto const nclus = dsv.size(); clusInModule_[gind] = nclus; numberOfClusters += nclus; } hitsModuleStart[0] = 0; - for (int i = 1, n = nMaxModules + 1; i < n; ++i) + for (int i = 1, n = maxModules + 1; i < n; ++i) hitsModuleStart[i] = hitsModuleStart[i - 1] + clusInModule_[i - 1]; - assert(numberOfClusters == int(hitsModuleStart[nMaxModules])); + assert(numberOfClusters == int(hitsModuleStart[maxModules])); // output SoA // element 96 is the start of BPIX2 (i.e. 
the number of clusters in BPIX1) - auto output = std::make_unique( - numberOfClusters, isPhase2_, hitsModuleStart[startBPIX2], &cpeView, hitsModuleStart, nullptr); - assert(output->nMaxModules() == uint32_t(nMaxModules)); + auto output = std::make_unique>( + numberOfClusters, hitsModuleStart[startBPIX2], &cpeView, hitsModuleStart, nullptr); if (0 == numberOfClusters) { iEvent.put(std::move(output)); @@ -168,7 +170,7 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv } if (convert2Legacy_) - legacyOutput->reserve(nMaxModules, numberOfClusters); + legacyOutput->reserve(maxModules, numberOfClusters); int numberOfDetUnits = 0; int numberOfHits = 0; @@ -178,7 +180,7 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv DetId detIdObject(detid); const GeomDetUnit* genericDet = geom_->idToDetUnit(detIdObject); auto const gind = genericDet->index(); - assert(gind < nMaxModules); + assert(gind < maxModules); const PixelGeomDetUnit* pixDet = dynamic_cast(genericDet); assert(pixDet); auto const nclus = dsv.size(); @@ -249,6 +251,7 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv if (ih >= maxHitsInModule) break; + assert(ih < clusterRef.size()); LocalPoint lp(output->view()->xLocal(h), output->view()->yLocal(h)); LocalError le(output->view()->xerrLocal(h), 0, output->view()->yerrLocal(h)); @@ -262,7 +265,7 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv assert(numberOfHits == numberOfClusters); // fill data structure to support CA - const auto nLayers = isPhase2_ ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers; + constexpr auto nLayers = TrackerTraits::numberOfLayers; for (auto i = 0U; i < nLayers + 1; ++i) { output->hitsLayerStart()[i] = hitsModuleStart[cpeView.layerGeometry().layerStart[i]]; LogDebug("SiPixelRecHitSoAFromLegacy") @@ -279,10 +282,18 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv output->phiBinnerStorage()); LogDebug("SiPixelRecHitSoAFromLegacy") << "created HitSoa for " << numberOfClusters << " clusters in " - << numberOfDetUnits << " Dets"; + << numberOfDetUnits << " Dets" + << "\n"; iEvent.put(std::move(output)); if (convert2Legacy_) iEvent.put(std::move(legacyOutput)); } +using SiPixelRecHitSoAFromLegacy = SiPixelRecHitSoAFromLegacyT; DEFINE_FWK_MODULE(SiPixelRecHitSoAFromLegacy); + +using SiPixelRecHitSoAFromLegacyPhase1 = SiPixelRecHitSoAFromLegacyT; +DEFINE_FWK_MODULE(SiPixelRecHitSoAFromLegacyPhase1); + +using SiPixelRecHitSoAFromLegacyPhase2 = SiPixelRecHitSoAFromLegacyT; +DEFINE_FWK_MODULE(SiPixelRecHitSoAFromLegacyPhase2); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h index 5b862b2cf63b9..f0798cc74a975 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h @@ -13,30 +13,31 @@ #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h" +//#define GPU_DEBUG 1 namespace gpuPixelRecHits { - __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const* __restrict__ cpeParams, + template + __global__ void getHits(pixelCPEforGPU::ParamsOnGPUT const* __restrict__ cpeParams, BeamSpotPOD const* __restrict__ bs, SiPixelDigisCUDASOAView const digis, int numElements, SiPixelClustersCUDA::SiPixelClustersCUDASOAView const* __restrict__ pclusters, - 
TrackingRecHit2DSOAView* phits) {
+                          TrackingRecHit2DSOAViewT<TrackerTraits>* phits) {
    // FIXME
    // the compiler seems NOT to optimize loads from views (even in a simple test case)
    // The whole gymnastics here of copying (or not) is a pure heuristic exercise that seems to produce the fastest code with the above signature
    // not using views (passing a gazillion of array pointers) seems to produce the fastest code (but it is harder to maintain)
+   assert(phits);
    assert(cpeParams);
    auto& hits = *phits;
    auto const& clusters = *pclusters;
-   auto isPhase2 = cpeParams->commonParams().isPhase2;

    // copy average geometry corrected by beamspot. FIXME (move it somewhere else???)
    if (0 == blockIdx.x) {
      auto& agc = hits.averageGeometry();
      auto const& ag = cpeParams->averageGeometry();
-     auto nLadders =
-         isPhase2 ? phase2PixelTopology::numberOfLaddersInBarrel : phase1PixelTopology::numberOfLaddersInBarrel;
+     auto nLadders = TrackerTraits::numberOfLaddersInBarrel;

      for (int il = threadIdx.x, nl = nLadders; il < nl; il += blockDim.x) {
        agc.ladderZ[il] = ag.ladderZ[il] - bs->z;
@@ -68,19 +69,20 @@ namespace gpuPixelRecHits {
    if (0 == nclus)
      return;

-#ifdef GPU_DEBUG
-    if (threadIdx.x == 0) {
-      auto k = clusters.moduleStart(1 + blockIdx.x);
-      while (digis.moduleInd(k) == invalidModuleId)
-        ++k;
-      assert(digis.moduleInd(k) == me);
-    }
-#endif
+// #ifdef GPU_DEBUG
+//     if (threadIdx.x == 0) {
+//       auto k = clusters.moduleStart(1 + blockIdx.x);
+//       while (digis.moduleInd(k) == invalidModuleId)
+//         ++k;
+//       assert(digis.moduleInd(k) == me);
+//     }
+// #endif

#ifdef GPU_DEBUG
    if (me % 100 == 1)
      if (threadIdx.x == 0)
        printf("hitbuilder: %d clusters in module %d. will write at %d\n", nclus, me, clusters.clusModuleStart(me));
#endif
+
    for (int startClus = 0, endClus = nclus; startClus < endClus; startClus += MaxHitsInIter) {
      int nClusInIter = std::min(MaxHitsInIter, endClus - startClus);
      int lastClus = startClus + nClusInIter;
@@ -168,11 +170,9 @@ namespace gpuPixelRecHits {
        assert(h < hits.nHits());
        assert(h < clusters.clusModuleStart(me + 1));

-       pixelCPEforGPU::position(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
-       if (!isPhase2)
-         pixelCPEforGPU::errorFromDB(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
-       else
-         pixelCPEforGPU::errorFromSize(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
+       pixelCPEforGPU::position<TrackerTraits>(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
+
+       pixelCPEforGPU::errorFromDB<TrackerTraits>(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);

        // store it
        hits.setChargeAndStatus(h, clusParams.charge[ic], clusParams.status[ic]);
diff --git a/RecoLocalTracker/SiPixelRecHits/python/PixelCPEESProducers_cff.py b/RecoLocalTracker/SiPixelRecHits/python/PixelCPEESProducers_cff.py
index 5fa4e0ffaf68c..cf48b22d02a10 100644
--- a/RecoLocalTracker/SiPixelRecHits/python/PixelCPEESProducers_cff.py
+++ b/RecoLocalTracker/SiPixelRecHits/python/PixelCPEESProducers_cff.py
@@ -10,12 +10,11 @@
 # 2.
Pixel Generic CPE # from RecoLocalTracker.SiPixelRecHits.PixelCPEGeneric_cfi import * -from RecoLocalTracker.SiPixelRecHits.PixelCPEFastESProducer_cfi import * +from RecoLocalTracker.SiPixelRecHits.pixelCPEFastESProducer_cfi import pixelCPEFastESProducer as PixelCPEFastESProducer +#from RecoLocalTracker.SiPixelRecHits.pixelCPEFastESProducerPhase1_cfi import pixelCPEFastESProducerPhase1 as PixelCPEFastESProducerPhase1 +from RecoLocalTracker.SiPixelRecHits.pixelCPEFastESProducerPhase2_cfi import pixelCPEFastESProducerPhase2 as PixelCPEFastESProducerPhase2 # # 3. ESProducer for the Magnetic-field dependent template records # from CalibTracker.SiPixelESProducers.SiPixelTemplateDBObjectESProducer_cfi import * from CalibTracker.SiPixelESProducers.SiPixel2DTemplateDBObjectESProducer_cfi import * - -from Configuration.Eras.Modifier_phase2_tracker_cff import phase2_tracker -phase2_tracker.toModify(PixelCPEFastESProducer, isPhase2 = True) diff --git a/RecoLocalTracker/SiPixelRecHits/python/PixelCPEGeneric_cfi.py b/RecoLocalTracker/SiPixelRecHits/python/PixelCPEGeneric_cfi.py index 43a61651c53a3..370eabae2b06d 100644 --- a/RecoLocalTracker/SiPixelRecHits/python/PixelCPEGeneric_cfi.py +++ b/RecoLocalTracker/SiPixelRecHits/python/PixelCPEGeneric_cfi.py @@ -11,13 +11,13 @@ # customize the Pixel CPE generic producer for phase2 from Configuration.Eras.Modifier_phase2_tracker_cff import phase2_tracker phase2_tracker.toModify(PixelCPEGenericESProducer, - UseErrorsFromTemplates = True, - LoadTemplatesFromDB = True, + UseErrorsFromTemplates = True, + LoadTemplatesFromDB = True, NoTemplateErrorsWhenNoTrkAngles = True, TruncatePixelCharge = False, IrradiationBiasCorrection = False, # set IBC off DoCosmics = False, - Upgrade = True, # use 'upgrade' version of hardcoded CPE errors + isPhase2 = True, # use 'Phase2' version of hardcoded CPE errors xerr_barrel_ln = [0.00025, 0.00030, 0.00035, 0.00035], xerr_barrel_ln_def = 0.00035, yerr_barrel_ln = [0.00210, 0.00115, 0.00125], diff --git a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py index 4af0238682abb..ec3e068bca422 100644 --- a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py +++ b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py @@ -16,16 +16,15 @@ ) ) -# convert the pixel rechits from legacy to SoA format -from RecoLocalTracker.SiPixelRecHits.siPixelRecHitSoAFromLegacy_cfi import siPixelRecHitSoAFromLegacy as _siPixelRecHitsPreSplittingSoA -from RecoLocalTracker.SiPixelRecHits.siPixelRecHitSoAFromCUDA_cfi import siPixelRecHitSoAFromCUDA as _siPixelRecHitSoAFromCUDA - -siPixelRecHitsPreSplittingCPU = _siPixelRecHitsPreSplittingSoA.clone(convertToLegacy=True) - # phase 2 tracker modifier from Configuration.Eras.Modifier_phase2_tracker_cff import phase2_tracker -phase2_tracker.toModify(siPixelRecHitsPreSplittingCPU, - isPhase2 = True) + +# convert the pixel rechits from legacy to SoA format on CPU +from RecoLocalTracker.SiPixelRecHits.siPixelRecHitSoAFromLegacyPhase1_cfi import siPixelRecHitSoAFromLegacyPhase1 as _siPixelRecHitsPreSplittingSoA +from RecoLocalTracker.SiPixelRecHits.siPixelRecHitSoAFromLegacyPhase2_cfi import siPixelRecHitSoAFromLegacyPhase2 as _siPixelRecHitsPreSplittingSoAPhase2 + +siPixelRecHitsPreSplittingCPU = _siPixelRecHitsPreSplittingSoA.clone(convertToLegacy=True) +phase2_tracker.toReplaceWith(siPixelRecHitsPreSplittingCPU, _siPixelRecHitsPreSplittingSoAPhase2.clone(convertToLegacy=True, CPE = cms.string('PixelCPEFastPhase2'))) # modifier 
used to prompt patatrack pixel tracks reconstruction on cpu from Configuration.ProcessModifiers.pixelNtupletFit_cff import pixelNtupletFit @@ -45,25 +44,44 @@ # reconstruct the pixel rechits on the gpu from RecoLocalTracker.SiPixelRecHits.siPixelRecHitCUDA_cfi import siPixelRecHitCUDA as _siPixelRecHitCUDA +from RecoLocalTracker.SiPixelRecHits.siPixelRecHitCUDAPhase2_cfi import siPixelRecHitCUDAPhase2 as _siPixelRecHitCUDAPhase2 siPixelRecHitsPreSplittingCUDA = _siPixelRecHitCUDA.clone( beamSpot = "offlineBeamSpotToCUDA" ) - -# transfer the pixel rechits to the host and convert them from SoA -from RecoLocalTracker.SiPixelRecHits.siPixelRecHitFromCUDA_cfi import siPixelRecHitFromCUDA as _siPixelRecHitFromCUDA +phase2_tracker.toReplaceWith(siPixelRecHitsPreSplittingCUDA,_siPixelRecHitCUDAPhase2.clone( + beamSpot = "offlineBeamSpotToCUDA" +)) #this is an alias for the SoA on GPU or CPU to be used for DQM siPixelRecHitsPreSplittingSoA = SwitchProducerCUDA( cpu = cms.EDAlias( siPixelRecHitsPreSplittingCPU = cms.VPSet( - cms.PSet(type = cms.string("cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous")), + cms.PSet(type = cms.string("pixelTopologyPhase1TrackingRecHit2DCPUT")), cms.PSet(type = cms.string("uintAsHostProduct")) )), ) -(gpu & pixelNtupletFit).toModify(siPixelRecHitsPreSplittingSoA,cuda = _siPixelRecHitSoAFromCUDA.clone()) +phase2_tracker.toModify(siPixelRecHitsPreSplittingSoA, +cpu = cms.EDAlias( + siPixelRecHitsPreSplittingCPU = cms.VPSet( + cms.PSet(type = cms.string("pixelTopologyPhase2TrackingRecHit2DCPUT")), + cms.PSet(type = cms.string("uintAsHostProduct")) + ))) + +from RecoLocalTracker.SiPixelRecHits.siPixelRecHitSoAFromCUDAPhase1_cfi import siPixelRecHitSoAFromCUDAPhase1 as _siPixelRecHitSoAFromCUDA +from RecoLocalTracker.SiPixelRecHits.siPixelRecHitSoAFromCUDAPhase2_cfi import siPixelRecHitSoAFromCUDAPhase2 as _siPixelRecHitSoAFromCUDAPhase2 + +(gpu & pixelNtupletFit).toModify(siPixelRecHitsPreSplittingSoA, cuda = _siPixelRecHitSoAFromCUDA.clone()) +(gpu & pixelNtupletFit & phase2_tracker).toModify(siPixelRecHitsPreSplittingSoA, cuda = _siPixelRecHitSoAFromCUDAPhase2.clone()) + +# transfer the pixel rechits to the host and convert them from SoA +from RecoLocalTracker.SiPixelRecHits.siPixelRecHitFromCUDAPhase1_cfi import siPixelRecHitFromCUDAPhase1 as _siPixelRecHitFromCUDA +from RecoLocalTracker.SiPixelRecHits.siPixelRecHitFromCUDAPhase2_cfi import siPixelRecHitFromCUDAPhase2 as _siPixelRecHitFromCUDAPhase2 (gpu & pixelNtupletFit).toModify(siPixelRecHitsPreSplitting, cuda = _siPixelRecHitFromCUDA.clone()) +(gpu & pixelNtupletFit & phase2_tracker).toModify(siPixelRecHitsPreSplitting, cuda = _siPixelRecHitFromCUDAPhase2.clone()) + + pixelNtupletFit.toReplaceWith(siPixelRecHitsPreSplittingTask, cms.Task( cms.Task( @@ -76,6 +94,9 @@ ) ) + +#(gpu & pixelNtupletFit & phase2_tracker).toReplaceWith(siPixelRecHitsPreSplitting , cuda = _siPixelRecHitFromCUDAPhase2.clone()) + (gpu & pixelNtupletFit).toReplaceWith(siPixelRecHitsPreSplittingTask, cms.Task( # reconstruct the pixel rechits on the gpu or on the cpu # (normally only one of the two is run because only one is consumed from later stages) diff --git a/RecoLocalTracker/SiPixelRecHits/src/PixelCPEFast.cc b/RecoLocalTracker/SiPixelRecHits/src/PixelCPEFast.cc index 4b26153cc72c1..9e30bfe50a1ce 100644 --- a/RecoLocalTracker/SiPixelRecHits/src/PixelCPEFast.cc +++ b/RecoLocalTracker/SiPixelRecHits/src/PixelCPEFast.cc @@ -20,13 +20,14 @@ namespace { //----------------------------------------------------------------------------- //! 
The constructor. //----------------------------------------------------------------------------- -PixelCPEFast::PixelCPEFast(edm::ParameterSet const& conf, - const MagneticField* mag, - const TrackerGeometry& geom, - const TrackerTopology& ttopo, - const SiPixelLorentzAngle* lorentzAngle, - const SiPixelGenErrorDBObject* genErrorDBObject, - const SiPixelLorentzAngle* lorentzAngleWidth) +template +PixelCPEFast::PixelCPEFast(edm::ParameterSet const& conf, + const MagneticField* mag, + const TrackerGeometry& geom, + const TrackerTopology& ttopo, + const SiPixelLorentzAngle* lorentzAngle, + const SiPixelGenErrorDBObject* genErrorDBObject, + const SiPixelLorentzAngle* lorentzAngleWidth) : PixelCPEGenericBase(conf, mag, geom, ttopo, lorentzAngle, genErrorDBObject, lorentzAngleWidth) { // Use errors from templates or from GenError if (useErrorsFromTemplates_) { @@ -36,8 +37,6 @@ PixelCPEFast::PixelCPEFast(edm::ParameterSet const& conf, << (*genErrorDBObject_).version(); } - isPhase2_ = conf.getParameter("isPhase2"); - fillParamsForGpu(); cpuData_ = { @@ -48,18 +47,23 @@ PixelCPEFast::PixelCPEFast(edm::ParameterSet const& conf, }; } -const pixelCPEforGPU::ParamsOnGPU* PixelCPEFast::getGPUProductAsync(cudaStream_t cudaStream) const { +template +const pixelCPEforGPU::ParamsOnGPUT* PixelCPEFast::getGPUProductAsync( + cudaStream_t cudaStream) const { + using ParamsOnGPU = pixelCPEforGPU::ParamsOnGPUT; + using LayerGeometry = pixelCPEforGPU::LayerGeometryT; + using AverageGeometry = pixelTopology::AverageGeometryT; + const auto& data = gpuData_.dataForCurrentDeviceAsync(cudaStream, [this](GPUData& data, cudaStream_t stream) { // and now copy to device... cudaCheck(cudaMalloc((void**)&data.paramsOnGPU_h.m_commonParams, sizeof(pixelCPEforGPU::CommonParams))); cudaCheck(cudaMalloc((void**)&data.paramsOnGPU_h.m_detParams, this->detParamsGPU_.size() * sizeof(pixelCPEforGPU::DetParams))); - cudaCheck(cudaMalloc((void**)&data.paramsOnGPU_h.m_averageGeometry, sizeof(pixelCPEforGPU::AverageGeometry))); - cudaCheck(cudaMalloc((void**)&data.paramsOnGPU_h.m_layerGeometry, sizeof(pixelCPEforGPU::LayerGeometry))); - cudaCheck(cudaMalloc((void**)&data.paramsOnGPU_d, sizeof(pixelCPEforGPU::ParamsOnGPU))); - cudaCheck(cudaMemcpyAsync( - data.paramsOnGPU_d, &data.paramsOnGPU_h, sizeof(pixelCPEforGPU::ParamsOnGPU), cudaMemcpyDefault, stream)); + cudaCheck(cudaMalloc((void**)&data.paramsOnGPU_h.m_averageGeometry, sizeof(AverageGeometry))); + cudaCheck(cudaMalloc((void**)&data.paramsOnGPU_h.m_layerGeometry, sizeof(LayerGeometry))); + cudaCheck(cudaMalloc((void**)&data.paramsOnGPU_d, sizeof(ParamsOnGPU))); + cudaCheck(cudaMemcpyAsync(data.paramsOnGPU_d, &data.paramsOnGPU_h, sizeof(ParamsOnGPU), cudaMemcpyDefault, stream)); cudaCheck(cudaMemcpyAsync((void*)data.paramsOnGPU_h.m_commonParams, &this->commonParamsGPU_, sizeof(pixelCPEforGPU::CommonParams), @@ -67,12 +71,12 @@ const pixelCPEforGPU::ParamsOnGPU* PixelCPEFast::getGPUProductAsync(cudaStream_t stream)); cudaCheck(cudaMemcpyAsync((void*)data.paramsOnGPU_h.m_averageGeometry, &this->averageGeometry_, - sizeof(pixelCPEforGPU::AverageGeometry), + sizeof(AverageGeometry), cudaMemcpyDefault, stream)); cudaCheck(cudaMemcpyAsync((void*)data.paramsOnGPU_h.m_layerGeometry, &this->layerGeometry_, - sizeof(pixelCPEforGPU::LayerGeometry), + sizeof(LayerGeometry), cudaMemcpyDefault, stream)); cudaCheck(cudaMemcpyAsync((void*)data.paramsOnGPU_h.m_detParams, @@ -84,7 +88,8 @@ const pixelCPEforGPU::ParamsOnGPU* PixelCPEFast::getGPUProductAsync(cudaStream_t return data.paramsOnGPU_d; } 
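// The allocation dance above follows one pattern: the host keeps a mirror of
// ParamsOnGPU whose pointer members already hold device addresses, and the
// mirror itself is then copied to the device. A condensed sketch of the idiom,
// with stand-in names (Params, uploadParams) and no error checking; the real
// code keeps the mirror alive in the cached ESProduct rather than on the stack:

struct Params {
  const float* coeffs;  // device pointer, dereferenced only in kernels
};

Params* uploadParams(const float* hostCoeffs, int nCoeffs, cudaStream_t stream) {
  Params hostMirror;  // host object holding device pointers
  Params* deviceCopy = nullptr;
  cudaMalloc((void**)&hostMirror.coeffs, nCoeffs * sizeof(float));
  cudaMalloc((void**)&deviceCopy, sizeof(Params));
  // the pointer values are final right after cudaMalloc, so the mirror can be
  // shipped before its pointees are filled; the stream orders the copies
  cudaMemcpyAsync(deviceCopy, &hostMirror, sizeof(Params), cudaMemcpyDefault, stream);
  cudaMemcpyAsync((void*)hostMirror.coeffs, hostCoeffs, nCoeffs * sizeof(float), cudaMemcpyDefault, stream);
  return deviceCopy;
}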
-void PixelCPEFast::fillParamsForGpu() {
+template <typename TrackerTraits>
+void PixelCPEFast<TrackerTraits>::fillParamsForGpu() {
   //
   // this code executes only once per job, computation inefficiency is not an issue
   // many code blocks are repeated: better keep the computation local and self consistent as blocks may in future move around, be deleted ...
   //
@@ -95,15 +100,13 @@ void PixelCPEFast::fillParamsForGpu() {
   commonParamsGPU_.thePitchX = m_DetParams[0].thePitchX;
   commonParamsGPU_.thePitchY = m_DetParams[0].thePitchY;

-  commonParamsGPU_.numberOfLaddersInBarrel =
-      isPhase2_ ? phase2PixelTopology::numberOfLaddersInBarrel : phase1PixelTopology::numberOfLaddersInBarrel;
-  commonParamsGPU_.isPhase2 = isPhase2_;
+  commonParamsGPU_.numberOfLaddersInBarrel = TrackerTraits::numberOfLaddersInBarrel;

   LogDebug("PixelCPEFast") << "pitch & thickness " << commonParamsGPU_.thePitchX << ' ' << commonParamsGPU_.thePitchY
                            << " " << commonParamsGPU_.theThicknessB << ' ' << commonParamsGPU_.theThicknessE;

   // zero average geometry
-  memset(&averageGeometry_, 0, sizeof(pixelCPEforGPU::AverageGeometry));
+  memset(&averageGeometry_, 0, sizeof(pixelTopology::AverageGeometryT<TrackerTraits>));

   uint32_t oldLayer = 0;
   uint32_t oldLadder = 0;
@@ -118,22 +121,12 @@ void PixelCPEFast::fillParamsForGpu() {
     auto& p = m_DetParams[i];
     auto& g = detParamsGPU_[i];

-    if (!isPhase2_) {
-      g.nRowsRoc = phase1PixelTopology::numRowsInRoc;
-      g.nColsRoc = phase1PixelTopology::numColsInRoc;
-      g.nRows = phase1PixelTopology::numRowsInModule;
-      g.nCols = phase1PixelTopology::numColsInModule;
-
-      g.numPixsInModule = g.nRows * g.nCols;
+    g.nRowsRoc = p.theDet->specificTopology().rowsperroc();
+    g.nColsRoc = p.theDet->specificTopology().colsperroc();
+    g.nRows = p.theDet->specificTopology().rocsX() * g.nRowsRoc;
+    g.nCols = p.theDet->specificTopology().rocsY() * g.nColsRoc;

-    } else {
-      g.nRowsRoc = p.theDet->specificTopology().rowsperroc();
-      g.nColsRoc = p.theDet->specificTopology().colsperroc();
-      g.nRows = p.theDet->specificTopology().rocsX() * g.nRowsRoc;
-      g.nCols = p.theDet->specificTopology().rocsY() * g.nColsRoc;
-
-      g.numPixsInModule = g.nRows * g.nCols;
-    }
+    g.numPixsInModule = g.nRows * g.nCols;

     assert(p.theDet->index() == int(i));
     assert(commonParamsGPU_.thePitchY == p.thePitchY);
@@ -164,7 +157,7 @@ void PixelCPEFast::fillParamsForGpu() {
       rl = 0;
       zl = 0;
       pl = 0;
-      miz = isPhase2_ ? 500 : 90;
+      miz = 500;
       mxz = 0;
       nl++;
     }
@@ -213,10 +206,7 @@ void PixelCPEFast::fillParamsForGpu() {
       cp.cotalpha = gvx * gvz;
       cp.cotbeta = gvy * gvz;

-      if (!isPhase2_)
-        errorFromTemplates(p, cp, 20000.);
-      else
-        cp.qBin_ = 0.f;
+      errorFromTemplates(p, cp, 20000.);
     }

 #ifdef EDM_ML_DEBUG
@@ -234,8 +224,11 @@ void PixelCPEFast::fillParamsForGpu() {
     g.sy1 = std::max(21, toMicron(cp.sy1));  // for some angles sy1 is very small
     g.sy2 = std::max(55, toMicron(cp.sy2));  // sometimes sy2 is smaller than others (due to angle?)
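// As a consistency check of the offset formula introduced just below: with the
// Phase1 numbers (g.nRows = 160 and bigPixXCorrection = 1) one gets
//   moduleOffsetX = -(0.5 * 160 + 1) = -81,
// which reproduces the hard-coded phase1PixelTopology::xOffset = -81 that the
// removed code multiplied by the pitch.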
-  // sample xerr as function of position
-  auto const xoff = float(phase1PixelTopology::xOffset) * commonParamsGPU_.thePitchX;
+  // sample xerr as function of position
+  // moduleOffsetX is the definition of TrackerTraits::xOffset;
+  // it needs to be calculated here because for Phase2 the modules are not uniform
+  float moduleOffsetX = -(0.5f * float(g.nRows) + TrackerTraits::bigPixXCorrection);
+  auto const xoff = moduleOffsetX * commonParamsGPU_.thePitchX;

   for (int ix = 0; ix < CPEFastParametrisation::kNumErrorBins; ++ix) {
     auto x = xoff * (1.f - (0.5f + float(ix)) / 8.f);
@@ -252,7 +245,10 @@ }
 #ifdef EDM_ML_DEBUG
   // sample yerr as function of position
-  auto const yoff = float(phase1PixelTopology::yOffset) * commonParamsGPU_.thePitchY;
+  // moduleOffsetY is the definition of the former TrackerTraits::yOffset (now removed)
+  float moduleOffsetY = 0.5f * float(g.nCols) + TrackerTraits::bigPixYCorrection;
+  auto const yoff = -moduleOffsetY * commonParamsGPU_.thePitchY;
+
   for (int ix = 0; ix < CPEFastParametrisation::kNumErrorBins; ++ix) {
     auto y = yoff * (1.f - (0.5f + float(ix)) / 8.f);
     auto gvx = p.theOrigin.x() + 40.f * commonParamsGPU_.thePitchY;
@@ -320,14 +316,14 @@ }
   }
   }  // loop over det

-  const int numberOfModulesInLadder =
-      isPhase2_ ? int(phase2PixelTopology::numberOfModulesInLadder) : int(phase1PixelTopology::numberOfModulesInLadder);
-  const int numberOfModulesInBarrel =
-      isPhase2_ ? int(phase2PixelTopology::numberOfModulesInBarrel) : int(phase1PixelTopology::numberOfModulesInBarrel);
-  const int numberOfLaddersInBarrel = commonParamsGPU_.numberOfLaddersInBarrel;
+  constexpr int numberOfModulesInLadder = TrackerTraits::numberOfModulesInLadder;
+  constexpr int numberOfLaddersInBarrel = TrackerTraits::numberOfLaddersInBarrel;
+  constexpr int numberOfModulesInBarrel = TrackerTraits::numberOfModulesInBarrel;
+
+  constexpr float ladderFactor = 1.f / float(numberOfModulesInLadder);

-  const int firstEndcapPos = 4, firstEndcapNeg = isPhase2_ ? 16 : 7;
-  const float ladderFactor = 1.f / float(numberOfModulesInLadder);
+  constexpr int firstEndcapPos = TrackerTraits::firstEndcapPos;
+  constexpr int firstEndcapNeg = TrackerTraits::firstEndcapNeg;

   // compute ladder barycenter (only in global z) for the barrel
   //
@@ -347,44 +343,25 @@ }
   assert(il + 1 == int(numberOfLaddersInBarrel));

   // add half_module and tolerance
-  const float module_length = isPhase2_ ? 4.345f : 6.7f;
+  constexpr float moduleLength = TrackerTraits::moduleLength;
   constexpr float module_tolerance = 0.2f;
   for (int il = 0, nl = numberOfLaddersInBarrel; il < nl; ++il) {
-    aveGeom.ladderMinZ[il] -= (0.5f * module_length - module_tolerance);
-    aveGeom.ladderMaxZ[il] += (0.5f * module_length - module_tolerance);
+    aveGeom.ladderMinZ[il] -= (0.5f * moduleLength - module_tolerance);
+    aveGeom.ladderMaxZ[il] += (0.5f * moduleLength - module_tolerance);
   }

   // compute "max z" for first layer in endcap (should we restrict to the outermost ring?)
- if (!isPhase2_) { - for (auto im = phase1PixelTopology::layerStart[firstEndcapPos]; - im < phase1PixelTopology::layerStart[firstEndcapPos + 1]; - ++im) { - auto const& g = detParamsGPU_[im]; - aveGeom.endCapZ[0] = std::max(aveGeom.endCapZ[0], g.frame.z()); - } - for (auto im = phase1PixelTopology::layerStart[firstEndcapNeg]; - im < phase1PixelTopology::layerStart[firstEndcapNeg + 1]; - ++im) { - auto const& g = detParamsGPU_[im]; - aveGeom.endCapZ[1] = std::min(aveGeom.endCapZ[1], g.frame.z()); - } - // correct for outer ring being closer - aveGeom.endCapZ[0] -= 1.5f; - aveGeom.endCapZ[1] += 1.5f; - } else { - for (auto im = phase2PixelTopology::layerStart[firstEndcapPos]; - im < phase2PixelTopology::layerStart[firstEndcapPos + 1]; - ++im) { - auto const& g = detParamsGPU_[im]; - aveGeom.endCapZ[0] = std::max(aveGeom.endCapZ[0], g.frame.z()); - } - for (auto im = phase2PixelTopology::layerStart[firstEndcapNeg]; - im < phase2PixelTopology::layerStart[firstEndcapNeg + 1]; - ++im) { - auto const& g = detParamsGPU_[im]; - aveGeom.endCapZ[1] = std::min(aveGeom.endCapZ[1], g.frame.z()); - } + for (auto im = TrackerTraits::layerStart[firstEndcapPos]; im < TrackerTraits::layerStart[firstEndcapPos + 1]; ++im) { + auto const& g = detParamsGPU_[im]; + aveGeom.endCapZ[0] = std::max(aveGeom.endCapZ[0], g.frame.z()); + } + for (auto im = TrackerTraits::layerStart[firstEndcapNeg]; im < TrackerTraits::layerStart[firstEndcapNeg + 1]; ++im) { + auto const& g = detParamsGPU_[im]; + aveGeom.endCapZ[1] = std::min(aveGeom.endCapZ[1], g.frame.z()); } + // correct for outer ring being closer + aveGeom.endCapZ[0] -= TrackerTraits::endcapCorrection; + aveGeom.endCapZ[1] += TrackerTraits::endcapCorrection; #ifdef EDM_ML_DEBUG for (int jl = 0, nl = numberOfLaddersInBarrel; jl < nl; ++jl) { LogDebug("PixelCPEFast") << jl << ':' << aveGeom.ladderR[jl] << '/' @@ -397,19 +374,16 @@ void PixelCPEFast::fillParamsForGpu() { #endif // EDM_ML_DEBUG // fill Layer and ladders geometry - memset(&layerGeometry_, 0, sizeof(pixelCPEforGPU::LayerGeometry)); - if (!isPhase2_) { - memcpy(layerGeometry_.layerStart, phase1PixelTopology::layerStart, sizeof(phase1PixelTopology::layerStart)); - memcpy(layerGeometry_.layer, phase1PixelTopology::layer.data(), phase1PixelTopology::layer.size()); - layerGeometry_.maxModuleStride = phase1PixelTopology::maxModuleStride; - } else { - memcpy(layerGeometry_.layerStart, phase2PixelTopology::layerStart, sizeof(phase2PixelTopology::layerStart)); - memcpy(layerGeometry_.layer, phase2PixelTopology::layer.data(), phase2PixelTopology::layer.size()); - layerGeometry_.maxModuleStride = phase2PixelTopology::maxModuleStride; - } + memset(&layerGeometry_, 0, sizeof(pixelCPEforGPU::LayerGeometryT)); + memcpy(layerGeometry_.layerStart, + TrackerTraits::layerStart, + sizeof(pixelCPEforGPU::LayerGeometryT::layerStart)); + memcpy(layerGeometry_.layer, pixelTopology::layer.data(), pixelTopology::layer.size()); + layerGeometry_.maxModuleStride = pixelTopology::maxModuleStride; } -PixelCPEFast::GPUData::~GPUData() { +template +PixelCPEFast::GPUData::~GPUData() { if (paramsOnGPU_d != nullptr) { cudaFree((void*)paramsOnGPU_h.m_commonParams); cudaFree((void*)paramsOnGPU_h.m_detParams); @@ -419,9 +393,10 @@ PixelCPEFast::GPUData::~GPUData() { } } -void PixelCPEFast::errorFromTemplates(DetParam const& theDetParam, - ClusterParamGeneric& theClusterParam, - float qclus) const { +template +void PixelCPEFast::errorFromTemplates(DetParam const& theDetParam, + ClusterParamGeneric& theClusterParam, + float qclus) const { float 
locBz = theDetParam.bz; float locBx = theDetParam.bx; LogDebug("PixelCPEFast") << "PixelCPEFast::localPosition(...) : locBz = " << locBz; @@ -470,12 +445,21 @@ void PixelCPEFast::errorFromTemplates(DetParam const& theDetParam, theClusterParam.sy2 = theClusterParam.sy2 * micronsToCm; } +template <> +void PixelCPEFast::errorFromTemplates(DetParam const& theDetParam, + ClusterParamGeneric& theClusterParam, + float qclus) const { + theClusterParam.qBin_ = 0.0f; +} + //----------------------------------------------------------------------------- //! Hit position in the local frame (in cm). Unlike other CPE's, this //! one converts everything from the measurement frame (in channel numbers) //! into the local frame (in centimeters). //----------------------------------------------------------------------------- -LocalPoint PixelCPEFast::localPosition(DetParam const& theDetParam, ClusterParam& theClusterParamBase) const { +template +LocalPoint PixelCPEFast::localPosition(DetParam const& theDetParam, + ClusterParam& theClusterParamBase) const { ClusterParamGeneric& theClusterParam = static_cast(theClusterParamBase); assert(!theClusterParam.with_track_angle); @@ -508,12 +492,12 @@ LocalPoint PixelCPEFast::localPosition(DetParam const& theDetParam, ClusterParam cp.charge[0] = theClusterParam.theCluster->charge(); auto ind = theDetParam.theDet->index(); - pixelCPEforGPU::position(commonParamsGPU_, detParamsGPU_[ind], cp, 0); + pixelCPEforGPU::position(commonParamsGPU_, detParamsGPU_[ind], cp, 0); auto xPos = cp.xpos[0]; auto yPos = cp.ypos[0]; // set the error (mind ape....) - pixelCPEforGPU::errorFromDB(commonParamsGPU_, detParamsGPU_[ind], cp, 0); + pixelCPEforGPU::errorFromDB(commonParamsGPU_, detParamsGPU_[ind], cp, 0); theClusterParam.sigmax = cp.xerr[0]; theClusterParam.sigmay = cp.yerr[0]; @@ -530,7 +514,9 @@ LocalPoint PixelCPEFast::localPosition(DetParam const& theDetParam, ClusterParam //------------------------------------------------------------------------- // Hit error in the local frame //------------------------------------------------------------------------- -LocalError PixelCPEFast::localError(DetParam const& theDetParam, ClusterParam& theClusterParamBase) const { +template +LocalError PixelCPEFast::localError(DetParam const& theDetParam, + ClusterParam& theClusterParamBase) const { ClusterParamGeneric& theClusterParam = static_cast(theClusterParamBase); auto xerr = theClusterParam.sigmax; @@ -544,8 +530,11 @@ LocalError PixelCPEFast::localError(DetParam const& theDetParam, ClusterParam& t return LocalError(xerr_sq, 0, yerr_sq); } -void PixelCPEFast::fillPSetDescription(edm::ParameterSetDescription& desc) { +template +void PixelCPEFast::fillPSetDescription(edm::ParameterSetDescription& desc) { // call PixelCPEGenericBase fillPSetDescription to add common rechit errors PixelCPEGenericBase::fillPSetDescription(desc); - desc.add("isPhase2", false); } + +template class PixelCPEFast; +template class PixelCPEFast; diff --git a/RecoLocalTracker/SiPixelRecHits/src/PixelCPEGeneric.cc b/RecoLocalTracker/SiPixelRecHits/src/PixelCPEGeneric.cc index efffc63015b45..707b2c15d79c6 100644 --- a/RecoLocalTracker/SiPixelRecHits/src/PixelCPEGeneric.cc +++ b/RecoLocalTracker/SiPixelRecHits/src/PixelCPEGeneric.cc @@ -54,8 +54,7 @@ PixelCPEGeneric::PixelCPEGeneric(edm::ParameterSet const& conf, IrradiationBiasCorrection_ = conf.getParameter("IrradiationBiasCorrection"); DoCosmics_ = conf.getParameter("DoCosmics"); - // Upgrade means phase 2 - isPhase2_ = conf.getParameter("Upgrade"); + isPhase2_ = 
conf.getParameter("isPhase2"); // For cosmics force the use of simple errors if ((DoCosmics_)) @@ -450,6 +449,6 @@ void PixelCPEGeneric::fillPSetDescription(edm::ParameterSetDescription& desc) { desc.add("TruncatePixelCharge", true); desc.add("IrradiationBiasCorrection", false); desc.add("DoCosmics", false); - desc.add("Upgrade", false); + desc.add("isPhase2", false); desc.add("SmallPitch", false); } diff --git a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py index 6954b536aba1f..e941ffb207fce 100644 --- a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py +++ b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py @@ -14,7 +14,9 @@ from Configuration.Eras.Modifier_phase2_tracker_cff import phase2_tracker # build the pixel vertices in SoA format on the CPU -from RecoPixelVertexing.PixelVertexFinding.pixelVerticesCUDA_cfi import pixelVerticesCUDA as _pixelVerticesCUDA +from RecoPixelVertexing.PixelVertexFinding.pixelVertexProducerCUDAPhase1_cfi import pixelVertexProducerCUDAPhase1 as _pixelVerticesCUDA +from RecoPixelVertexing.PixelVertexFinding.pixelVertexProducerCUDAPhase2_cfi import pixelVertexProducerCUDAPhase2 as _pixelVerticesCUDAPhase2 + pixelVerticesSoA = SwitchProducerCUDA( cpu = _pixelVerticesCUDA.clone( pixelTrackSrc = "pixelTracksSoA", @@ -22,13 +24,20 @@ ) ) +phase2_tracker.toModify(pixelVerticesSoA,cpu = _pixelVerticesCUDAPhase2.clone( + pixelTrackSrc = "pixelTracksSoA", + onGPU = False, + PtMin = 2.0 +)) + # convert the pixel vertices from SoA to legacy format from RecoPixelVertexing.PixelVertexFinding.pixelVertexFromSoA_cfi import pixelVertexFromSoA as _pixelVertexFromSoA -(pixelNtupletFit & ~phase2_tracker).toReplaceWith(pixelVertices, _pixelVertexFromSoA.clone( + +(pixelNtupletFit).toReplaceWith(pixelVertices, _pixelVertexFromSoA.clone( src = "pixelVerticesSoA" )) -(pixelNtupletFit & ~phase2_tracker).toReplaceWith(pixelVerticesTask, cms.Task( +(pixelNtupletFit).toReplaceWith(pixelVerticesTask, cms.Task( # build the pixel vertices in SoA format on the CPU pixelVerticesSoA, # convert the pixel vertices from SoA to legacy format @@ -45,6 +54,12 @@ onGPU = True ) +phase2_tracker.toReplaceWith(pixelVerticesCUDA,_pixelVerticesCUDAPhase2.clone( + pixelTrackSrc = "pixelTracksCUDA", + onGPU = True, + PtMin = 2.0 +)) + # transfer the pixel vertices in SoA format to the CPU from RecoPixelVertexing.PixelVertexFinding.pixelVerticesSoA_cfi import pixelVerticesSoA as _pixelVerticesSoA gpu.toModify(pixelVerticesSoA, diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksForTriplets.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksForTriplets.py index 51abcd3ea7982..141a999e4979f 100644 --- a/RecoPixelVertexing/Configuration/python/customizePixelTracksForTriplets.py +++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksForTriplets.py @@ -3,8 +3,10 @@ def customizePixelTracksForTriplets(process): from HLTrigger.Configuration.common import producers_by_type - for producer in producers_by_type(process, 'CAHitNtupletCUDA'): - producer.includeJumpingForwardDoublets = True - producer.minHitsPerNtuplet = 3 + producers = ['CAHitNtupletCUDA','CAHitNtupletCUDAPhase1','CAHitNtupletCUDAPhase2'] + for name in producers: + for producer in producers_by_type(process, name): + producer.includeJumpingForwardDoublets = True + producer.minHitsPerNtuplet = 3 return process diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc 
b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc index f3d6022e21654..ef73c625ebfa8 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc @@ -20,10 +20,12 @@ #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" -class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> { +template +class PixelTrackDumpCUDAT : public edm::global::EDAnalyzer<> { public: - explicit PixelTrackDumpCUDA(const edm::ParameterSet& iConfig); - ~PixelTrackDumpCUDA() override = default; + using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + explicit PixelTrackDumpCUDAT(const edm::ParameterSet& iConfig); + ~PixelTrackDumpCUDAT() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); @@ -36,7 +38,8 @@ class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> { edm::EDGetTokenT tokenSoAVertex_; }; -PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig) +template +PixelTrackDumpCUDAT::PixelTrackDumpCUDAT(const edm::ParameterSet& iConfig) : m_onGPU(iConfig.getParameter("onGPU")) { if (m_onGPU) { tokenGPUTrack_ = @@ -44,23 +47,25 @@ PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig) tokenGPUVertex_ = consumes>(iConfig.getParameter("pixelVertexSrc")); } else { - tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); + tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); } } -void PixelTrackDumpCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { +template +void PixelTrackDumpCUDAT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; desc.add("onGPU", true); desc.add("pixelTrackSrc", edm::InputTag("pixelTracksCUDA")); desc.add("pixelVertexSrc", edm::InputTag("pixelVerticesCUDA")); - descriptions.add("pixelTrackDumpCUDA", desc); + descriptions.addWithDefaultLabel(desc); } -void PixelTrackDumpCUDA::analyze(edm::StreamID streamID, - edm::Event const& iEvent, - const edm::EventSetup& iSetup) const { +template +void PixelTrackDumpCUDAT::analyze(edm::StreamID streamID, + edm::Event const& iEvent, + const edm::EventSetup& iSetup) const { if (m_onGPU) { auto const& hTracks = iEvent.get(tokenGPUTrack_); cms::cuda::ScopedContextProduce ctx{hTracks}; @@ -82,4 +87,11 @@ void PixelTrackDumpCUDA::analyze(edm::StreamID streamID, } } +using PixelTrackDumpCUDA = PixelTrackDumpCUDAT; DEFINE_FWK_MODULE(PixelTrackDumpCUDA); + +using PixelTrackDumpCUDAPhase1 = PixelTrackDumpCUDAT; +DEFINE_FWK_MODULE(PixelTrackDumpCUDAPhase1); + +using PixelTrackDumpCUDAPhase2 = PixelTrackDumpCUDAT; +DEFINE_FWK_MODULE(PixelTrackDumpCUDAPhase2); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc index 59ba877e9e626..6a0f918b0d979 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc @@ -29,20 +29,24 @@ #include "CUDADataFormats/Common/interface/HostProduct.h" #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "storeTracks.h" 
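The pattern just applied to PixelTrackDumpCUDA, and repeated for every plugin touched by this PR, is worth spelling out: the concrete module becomes a class template over a TrackerTraits type, and each instantiation is registered with the framework under its own module label. A minimal sketch under stated assumptions — the trait names and constants below are illustrative stand-ins for the pixelTopology::Phase1/Phase2 structs in SimplePixelTopology.h, and DEFINE_FWK_MODULE is only quoted in comments:

```cpp
#include <cstdint>

// Hypothetical stand-ins for pixelTopology::Phase1 / pixelTopology::Phase2;
// the real traits also carry layer tables, hit-index types, and cut defaults.
struct Phase1TraitsSketch {
  static constexpr int32_t maxNumberOfTuples = 32 * 1024;  // illustrative value
};
struct Phase2TraitsSketch {
  static constexpr int32_t maxNumberOfTuples = 48 * 1024;  // illustrative value
};

// One template replaces the old concrete plugin: every phase-dependent size or
// layer table is read from the traits instead of an isPhase2_ flag or #ifdef.
template <typename TrackerTraits>
class PixelPluginSketchT {
public:
  static constexpr int32_t maxTuples = TrackerTraits::maxNumberOfTuples;
};

// Each alias then becomes an independent framework module, as in the hunks above:
//   using PixelPluginSketchPhase1 = PixelPluginSketchT<Phase1TraitsSketch>;
//   DEFINE_FWK_MODULE(PixelPluginSketchPhase1);
//   using PixelPluginSketchPhase2 = PixelPluginSketchT<Phase2TraitsSketch>;
//   DEFINE_FWK_MODULE(PixelPluginSketchPhase2);
```

Because fillDescriptions now calls descriptions.addWithDefaultLabel(desc), each registration also generates its own cfi file (pixelTrackDumpCUDAPhase1_cfi and so on), which is what the Python fragments later in this diff import.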
#include "CUDADataFormats/Common/interface/HostProduct.h" /** * This class creates "leagcy" reco::Track - * objects from the output of SoA CA. + * objects from the output of SoA CA. */ -class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { +template +class PixelTrackProducerFromSoAT : public edm::global::EDProducer<> { + using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + public: - using IndToEdm = std::vector; + using IndToEdm = std::vector; - explicit PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig); - ~PixelTrackProducerFromSoA() override = default; + explicit PixelTrackProducerFromSoAT(const edm::ParameterSet &iConfig); + ~PixelTrackProducerFromSoAT() override = default; static void fillDescriptions(edm::ConfigurationDescriptions &descriptions); @@ -65,9 +69,10 @@ class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { pixelTrack::Quality const minQuality_; }; -PixelTrackProducerFromSoA::PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig) +template +PixelTrackProducerFromSoAT::PixelTrackProducerFromSoAT(const edm::ParameterSet &iConfig) : tBeamSpot_(consumes(iConfig.getParameter("beamSpot"))), - tokenTrack_(consumes(iConfig.getParameter("trackSrc"))), + tokenTrack_(consumes(iConfig.getParameter("trackSrc"))), cpuHits_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), hmsToken_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), idealMagneticFieldToken_(esConsumes()), @@ -91,7 +96,8 @@ PixelTrackProducerFromSoA::PixelTrackProducerFromSoA(const edm::ParameterSet &iC produces(); } -void PixelTrackProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions &descriptions) { +template +void PixelTrackProducerFromSoAT::fillDescriptions(edm::ConfigurationDescriptions &descriptions) { edm::ParameterSetDescription desc; desc.add("beamSpot", edm::InputTag("offlineBeamSpot")); desc.add("trackSrc", edm::InputTag("pixelTracksSoA")); @@ -101,9 +107,10 @@ void PixelTrackProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions descriptions.addWithDefaultLabel(desc); } -void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, - edm::Event &iEvent, - const edm::EventSetup &iSetup) const { +template +void PixelTrackProducerFromSoAT::produce(edm::StreamID streamID, + edm::Event &iEvent, + const edm::EventSetup &iSetup) const { // enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity }; reco::TrackBase::TrackQuality recoQuality[] = {reco::TrackBase::undefQuality, reco::TrackBase::undefQuality, @@ -175,9 +182,10 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, auto nHits = tsoa.nHits(it); assert(nHits >= 3); auto q = quality[it]; + if (q < minQuality_) continue; - if (nHits < minNumberOfHits_) + if (tsoa.nLayers(it) < minNumberOfHits_) continue; indToEdm[it] = nt; ++nt; @@ -244,4 +252,11 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, iEvent.put(std::move(indToEdmP)); } +using PixelTrackProducerFromSoA = PixelTrackProducerFromSoAT; DEFINE_FWK_MODULE(PixelTrackProducerFromSoA); + +using PixelTrackProducerFromSoAPhase1 = PixelTrackProducerFromSoAT; +DEFINE_FWK_MODULE(PixelTrackProducerFromSoAPhase1); + +using PixelTrackProducerFromSoAPhase2 = PixelTrackProducerFromSoAT; +DEFINE_FWK_MODULE(PixelTrackProducerFromSoAPhase2); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc index 5cf4aac491901..0675effd091e8 100644 --- 
a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -19,10 +19,14 @@ // Switch on to enable checks and printout for found tracks // #define PIXEL_DEBUG_PRODUCE -class PixelTrackSoAFromCUDA : public edm::stream::EDProducer { +template +class PixelTrackSoAFromCUDAT : public edm::stream::EDProducer { + using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using TrackSoA = pixelTrack::TrackSoAT; + public: - explicit PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig); - ~PixelTrackSoAFromCUDA() override = default; + explicit PixelTrackSoAFromCUDAT(const edm::ParameterSet& iConfig); + ~PixelTrackSoAFromCUDAT() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); @@ -35,23 +39,26 @@ class PixelTrackSoAFromCUDA : public edm::stream::EDProducer edm::EDGetTokenT> tokenCUDA_; edm::EDPutTokenT tokenSOA_; - cms::cuda::host::unique_ptr soa_; + cms::cuda::host::unique_ptr soa_; }; -PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig) +template +PixelTrackSoAFromCUDAT::PixelTrackSoAFromCUDAT(const edm::ParameterSet& iConfig) : tokenCUDA_(consumes>(iConfig.getParameter("src"))), tokenSOA_(produces()) {} -void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { +template +void PixelTrackSoAFromCUDAT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; desc.add("src", edm::InputTag("pixelTracksCUDA")); - descriptions.add("pixelTracksSoA", desc); + descriptions.addWithDefaultLabel(desc); } -void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, - edm::EventSetup const& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { +template +void PixelTrackSoAFromCUDAT::acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) { cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto const& inputData = ctx.get(inputDataWrapped); @@ -59,10 +66,11 @@ void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, soa_ = inputData.toHostAsync(ctx.stream()); } -void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { - // check that the fixed-size SoA does not overflow +template +void PixelTrackSoAFromCUDAT::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { auto const& tsoa = *soa_; auto maxTracks = tsoa.stride(); + auto nTracks = tsoa.nTracks(); assert(nTracks < maxTracks); if (nTracks == maxTracks - 1) { @@ -91,4 +99,11 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i assert(!soa_); } +using PixelTrackSoAFromCUDA = PixelTrackSoAFromCUDAT; DEFINE_FWK_MODULE(PixelTrackSoAFromCUDA); + +using PixelTrackSoAFromCUDAPhase1 = PixelTrackSoAFromCUDAT; +DEFINE_FWK_MODULE(PixelTrackSoAFromCUDAPhase1); + +using PixelTrackSoAFromCUDAPhase2 = PixelTrackSoAFromCUDAT; +DEFINE_FWK_MODULE(PixelTrackSoAFromCUDAPhase2); diff --git a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py index 4f0f6f93cab62..7aeb0e80c60b0 100644 --- a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py +++ b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py @@ -91,7 +91,10 @@ # "Patatrack" pixel ntuplets, fishbone cleaning, Broken Line fit, and 
density-based vertex reconstruction from Configuration.ProcessModifiers.pixelNtupletFit_cff import pixelNtupletFit -from RecoPixelVertexing.PixelTriplets.pixelTracksCUDA_cfi import pixelTracksCUDA as _pixelTracksCUDA +from RecoPixelVertexing.PixelTriplets.caHitNtupletCUDAPhase1_cfi import caHitNtupletCUDAPhase1 as _pixelTracksCUDA +from RecoPixelVertexing.PixelTriplets.caHitNtupletCUDAPhase2_cfi import caHitNtupletCUDAPhase2 as _pixelTracksCUDAPhase2 + +from Configuration.Eras.Modifier_phase2_tracker_cff import phase2_tracker # SwitchProducer providing the pixel tracks in SoA format on the CPU pixelTracksSoA = SwitchProducerCUDA( @@ -102,25 +105,31 @@ onGPU = False ) ) + # use quality cuts tuned for Run 2 ideal conditions for all Run 3 workflows run3_common.toModify(pixelTracksSoA.cpu, idealConditions = True ) # convert the pixel tracks from SoA to legacy format -from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromSoA_cfi import pixelTrackProducerFromSoA as _pixelTrackProducerFromSoA -(pixelNtupletFit & ~phase2_tracker).toReplaceWith(pixelTracks, _pixelTrackProducerFromSoA.clone( +from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromSoAPhase1_cfi import pixelTrackProducerFromSoAPhase1 as _pixelTrackProducerFromSoA +from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromSoAPhase2_cfi import pixelTrackProducerFromSoAPhase2 as _pixelTrackProducerFromSoAPhase2 + +pixelNtupletFit.toReplaceWith(pixelTracks, _pixelTrackProducerFromSoA.clone( pixelRecHitLegacySrc = "siPixelRecHitsPreSplitting", )) -(pixelNtupletFit & ~phase2_tracker).toReplaceWith(pixelTracksTask, cms.Task( +(pixelNtupletFit & phase2_tracker).toReplaceWith(pixelTracks, _pixelTrackProducerFromSoAPhase2.clone( + pixelRecHitLegacySrc = "siPixelRecHitsPreSplitting", +)) + +pixelNtupletFit.toReplaceWith(pixelTracksTask, cms.Task( # build the pixel ntuplets and the pixel tracks in SoA format on the GPU pixelTracksSoA, # convert the pixel tracks from SoA to legacy format pixelTracks )) - # "Patatrack" sequence running on GPU (or CPU if not available) from Configuration.ProcessModifiers.gpu_cff import gpu @@ -128,23 +137,37 @@ pixelTracksCUDA = _pixelTracksCUDA.clone( pixelRecHitSrc = "siPixelRecHitsPreSplittingCUDA", idealConditions = False, - onGPU = True + onGPU = True, ) + # use quality cuts tuned for Run 2 ideal conditions for all Run 3 workflows run3_common.toModify(pixelTracksCUDA, idealConditions = True ) # SwitchProducer providing the pixel tracks in SoA format on the CPU -from RecoPixelVertexing.PixelTrackFitting.pixelTracksSoA_cfi import pixelTracksSoA as _pixelTracksSoA +from RecoPixelVertexing.PixelTrackFitting.pixelTrackSoAFromCUDAPhase1_cfi import pixelTrackSoAFromCUDAPhase1 as _pixelTracksSoA +from RecoPixelVertexing.PixelTrackFitting.pixelTrackSoAFromCUDAPhase2_cfi import pixelTrackSoAFromCUDAPhase2 as _pixelTracksSoAPhase2 + gpu.toModify(pixelTracksSoA, # transfer the pixel tracks in SoA format to the host cuda = _pixelTracksSoA.clone() ) -from Configuration.Eras.Modifier_phase2_tracker_cff import phase2_tracker +(gpu & phase2_tracker).toModify(pixelTracksSoA,cuda = _pixelTracksSoAPhase2.clone( +)) + +phase2_tracker.toModify(pixelTracksSoA,cpu = _pixelTracksCUDAPhase2.clone( + pixelRecHitSrc = "siPixelRecHitsPreSplittingSoA", + onGPU = False +)) + +phase2_tracker.toReplaceWith(pixelTracksCUDA,_pixelTracksCUDAPhase2.clone( + pixelRecHitSrc = "siPixelRecHitsPreSplittingCUDA", + onGPU = True, +)) -(pixelNtupletFit & gpu & ~phase2_tracker).toReplaceWith(pixelTracksTask, cms.Task( 
+(pixelNtupletFit & gpu).toReplaceWith(pixelTracksTask, cms.Task( # build the pixel ntuplets and pixel tracks in SoA format on the GPU pixelTracksCUDA, # transfer the pixel tracks in SoA format to the CPU, and convert them to legacy format diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc index 1523640e2ef8f..d6a9db4953be1 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc @@ -1,6 +1,9 @@ #include "BrokenLineFitOnGPU.h" -void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitsView const* hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples) { +template +void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitsView const* hv, + uint32_t hitsInFit, + uint32_t maxNumberOfTuples) { assert(tuples_); #ifdef BROKENLINE_DEBUG @@ -8,7 +11,7 @@ void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitsView const* hv, uint32_t hi #endif // Fit internals - auto tkidGPU = std::make_unique(maxNumberOfConcurrentFits_); + auto tkidGPU = std::make_unique(maxNumberOfConcurrentFits_); auto hitsGPU = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<6>) / sizeof(double)); auto hits_geGPU = @@ -18,104 +21,97 @@ void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitsView const* hv, uint32_t hi for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // fit triplets - kernel_BLFastFit<3>(tuples_, - tupleMultiplicity_, - hv, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - 3, - 3, - offset); - - kernel_BLFit<3>(tupleMultiplicity_, - bField_, - outputSoa_, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get()); + kernel_BLFastFit<3, TrackerTraits>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 3, + 3, + offset); + kernel_BLFit<3, TrackerTraits>(tupleMultiplicity_, + bField_, + outputSoa_, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); if (fitNas4_) { - // fit all as 4 - kernel_BLFastFit<4>(tuples_, - tupleMultiplicity_, - hv, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - 4, - 8, - offset); + riemannFit::rolling_fits<4, TrackerTraits::maxHitsOnTrack, 1>( + [this, &hv, &tkidGPU, &hitsGPU, &hits_geGPU, &fast_fit_resultsGPU, &offset](auto i) { + kernel_BLFastFit<4, TrackerTraits>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 4, + i, + offset); + + kernel_BLFit<4, TrackerTraits>(tupleMultiplicity_, + bField_, + outputSoa_, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); + }); - kernel_BLFit<4>(tupleMultiplicity_, - bField_, - outputSoa_, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get()); } else { - // fit quads - kernel_BLFastFit<4>(tuples_, - tupleMultiplicity_, - hv, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - 4, - 4, - offset); + //Fit these using all the hits they have + riemannFit::rolling_fits<4, TrackerTraits::maxHitsOnTrackForFullFit, 1>( + [this, &hv, &tkidGPU, &hitsGPU, &hits_geGPU, &fast_fit_resultsGPU, &offset](auto i) { + kernel_BLFastFit(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + i, + i, + offset); + + 
kernel_BLFit(tupleMultiplicity_, + bField_, + outputSoa_, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); + }); - kernel_BLFit<4>(tupleMultiplicity_, - bField_, - outputSoa_, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get()); - // fit penta (all 5) - kernel_BLFastFit<5>(tuples_, - tupleMultiplicity_, - hv, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - 5, - 5, - offset); + static_assert(TrackerTraits::maxHitsOnTrackForFullFit < TrackerTraits::maxHitsOnTrack); - kernel_BLFit<5>(tupleMultiplicity_, - bField_, - outputSoa_, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get()); - // fit sexta and above (as 6) - kernel_BLFastFit<6>(tuples_, - tupleMultiplicity_, - hv, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - 6, - 8, - offset); + //Fit all the rest using the maximum from previous call - kernel_BLFit<6>(tupleMultiplicity_, - bField_, - outputSoa_, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get()); + kernel_BLFastFit(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + TrackerTraits::maxHitsOnTrackForFullFit, + TrackerTraits::maxHitsOnTrack - 1, + offset); + + kernel_BLFit(tupleMultiplicity_, + bField_, + outputSoa_, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); } } // loop on concurrent fits } + +template class HelixFitOnGPU; +template class HelixFitOnGPU; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index d99a96b705451..b1ee028b8863e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -1,17 +1,19 @@ #include "BrokenLineFitOnGPU.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, - uint32_t hitsInFit, - uint32_t maxNumberOfTuples, - cudaStream_t stream) { +template +void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, + uint32_t hitsInFit, + uint32_t maxNumberOfTuples, + cudaStream_t stream) { assert(tuples_); auto blockSize = 64; auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; // Fit internals - auto tkidGPU = cms::cuda::make_device_unique(maxNumberOfConcurrentFits_, stream); + auto tkidGPU = + cms::cuda::make_device_unique(maxNumberOfConcurrentFits_, stream); auto hitsGPU = cms::cuda::make_device_unique( maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<6>) / sizeof(double), stream); auto hits_geGPU = cms::cuda::make_device_unique( @@ -21,112 +23,122 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // fit triplets - kernel_BLFastFit<3><<>>(tuples_, - tupleMultiplicity_, - hv, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - 3, - 3, - offset); + + kernel_BLFastFit<3, TrackerTraits><<>>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 3, + 3, + offset); cudaCheck(cudaGetLastError()); - kernel_BLFit<3><<>>(tupleMultiplicity_, - bField_, - outputSoa_, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get()); + 
kernel_BLFit<3, TrackerTraits><<>>(tupleMultiplicity_, + bField_, + outputSoa_, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); cudaCheck(cudaGetLastError()); if (fitNas4_) { // fit all as 4 - kernel_BLFastFit<4><<>>(tuples_, - tupleMultiplicity_, - hv, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - 4, - 8, - offset); - cudaCheck(cudaGetLastError()); - - kernel_BLFit<4><<>>(tupleMultiplicity_, - bField_, - outputSoa_, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get()); + riemannFit::rolling_fits<4, TrackerTraits::maxHitsOnTrack, 1>([this, + &hv, + &tkidGPU, + &hitsGPU, + &hits_geGPU, + &fast_fit_resultsGPU, + &offset, + &numberOfBlocks, + &blockSize, + &stream](auto i) { + kernel_BLFastFit<4, TrackerTraits><<>>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 4, + 4, + offset); + + cudaCheck(cudaGetLastError()); + + kernel_BLFit<4, TrackerTraits><<>>(tupleMultiplicity_, + bField_, + outputSoa_, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); + + cudaCheck(cudaGetLastError()); + }); + } else { - // fit quads - kernel_BLFastFit<4><<>>(tuples_, - tupleMultiplicity_, - hv, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - 4, - 4, - offset); - cudaCheck(cudaGetLastError()); - - kernel_BLFit<4><<>>(tupleMultiplicity_, + riemannFit::rolling_fits<4, TrackerTraits::maxHitsOnTrackForFullFit, 1>([this, + &hv, + &tkidGPU, + &hitsGPU, + &hits_geGPU, + &fast_fit_resultsGPU, + &offset, + &numberOfBlocks, + &blockSize, + &stream](auto i) { + kernel_BLFastFit<<>>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + i, + i, + offset); + + kernel_BLFit<<<8, blockSize, 0, stream>>>(tupleMultiplicity_, bField_, outputSoa_, tkidGPU.get(), hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get()); - // fit penta (all 5) - kernel_BLFastFit<5><<>>(tuples_, - tupleMultiplicity_, - hv, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - 5, - 5, - offset); - cudaCheck(cudaGetLastError()); - - kernel_BLFit<5><<<8, blockSize, 0, stream>>>(tupleMultiplicity_, - bField_, - outputSoa_, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get()); - cudaCheck(cudaGetLastError()); - // fit sexta and above (as 6) - kernel_BLFastFit<6><<<4, blockSize, 0, stream>>>(tuples_, - tupleMultiplicity_, - hv, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - 6, - 8, - offset); - cudaCheck(cudaGetLastError()); - - kernel_BLFit<6><<<4, blockSize, 0, stream>>>(tupleMultiplicity_, - bField_, - outputSoa_, - tkidGPU.get(), - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get()); - cudaCheck(cudaGetLastError()); + }); + + static_assert(TrackerTraits::maxHitsOnTrackForFullFit < TrackerTraits::maxHitsOnTrack); + + //Fit all the rest using the maximum from previous call + kernel_BLFastFit + <<>>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + TrackerTraits::maxHitsOnTrackForFullFit, + TrackerTraits::maxHitsOnTrack - 1, + offset); + + kernel_BLFit + <<<8, blockSize, 0, stream>>>(tupleMultiplicity_, + bField_, + outputSoa_, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); } } // loop on concurrent fits } + +template class 
HelixFitOnGPU; +template class HelixFitOnGPU; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index 6ec6afb83cba1..4d1d57c4e27a8 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -2,8 +2,8 @@ // Author: Felice Pantaleo, CERN // -// #define BROKENLINE_DEBUG - +//#define BROKENLINE_DEBUG +//#define BL_DUMP_HITS #include #include @@ -16,19 +16,25 @@ #include "HelixFitOnGPU.h" -using HitsOnGPU = TrackingRecHit2DSOAView; -using Tuples = pixelTrack::HitContainer; -using OutputSoA = pixelTrack::TrackSoA; -using tindex_type = caConstants::tindex_type; -constexpr auto invalidTkId = std::numeric_limits::max(); +template +using HitsOnGPU = TrackingRecHit2DSOAViewT; +template +using Tuples = pixelTrack::HitContainerT; +template +using OutputSoA = pixelTrack::TrackSoAT; +template +using TupleMultiplicity = caStructures::TupleMultiplicityT; + +// using tindex_type = typename TrackerTraits::tindex_type; +// constexpr auto invalidTkId = std::numeric_limits::max(); // #define BL_DUMP_HITS -template -__global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, - caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, - HitsOnGPU const *__restrict__ hhp, - tindex_type *__restrict__ ptkids, +template +__global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, + TupleMultiplicity const *__restrict__ tupleMultiplicity, + HitsOnGPU const *__restrict__ hhp, + typename TrackerTraits::tindex_type *__restrict__ ptkids, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit, @@ -36,6 +42,7 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, uint32_t nHitsH, int32_t offset) { constexpr uint32_t hitsInFit = N; + constexpr auto invalidTkId = std::numeric_limits::max(); assert(hitsInFit <= nHitsL); assert(nHitsL <= nHitsH); @@ -67,7 +74,7 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, } // get it from the ntuple container (one to one to helix) auto tkid = *(tupleMultiplicity->begin(nHitsL) + tuple_idx); - assert(tkid < foundNtuplets->nOnes()); + assert(int(tkid) < foundNtuplets->nOnes()); ptkids[local_idx] = tkid; @@ -166,29 +173,28 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, } } -template -__global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, +template +__global__ void kernel_BLFit(TupleMultiplicity const *__restrict__ tupleMultiplicity, double bField, - OutputSoA *results, - tindex_type const *__restrict__ ptkids, + OutputSoA *results, + typename TrackerTraits::tindex_type const *__restrict__ ptkids, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit) { assert(results); assert(pfast_fit); + constexpr auto invalidTkId = std::numeric_limits::max(); // same as above... 
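The riemannFit::rolling_fits<Start, End, Step> calls introduced in the hunks above replace the hand-unrolled <4>/<5>/<6> launches with a compile-time loop over hit multiplicities. As a plausible reading only — this sketch is an assumption, not the helper's actual implementation in the fit utilities — it can be written as a constexpr recursion that hands the functor each value as a compile-time constant, so the generic lambda can use it as the template argument of kernel_BLFastFit:

```cpp
#include <type_traits>
#include <utility>

// Sketch of rolling_fits-like semantics: invoke f once for each value
// Start, Start + Step, ... while the value stays below End.
template <int Start, int End, int Step, typename F>
constexpr void rollingFitsSketch(F&& f) {
  if constexpr (Start < End) {
    // std::integral_constant keeps the value usable as a template argument
    // inside a generic lambda taking `auto i`.
    f(std::integral_constant<int, Start>{});
    rollingFitsSketch<Start + Step, End, Step>(std::forward<F>(f));
  }
}

// Usage mirroring the hunks above (launchFitFor is a hypothetical helper):
//   rollingFitsSketch<4, 7, 1>([&](auto i) { launchFitFor<i()>(offset); });
```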
- // look in bin for this hit multiplicity auto local_start = blockIdx.x * blockDim.x + threadIdx.x; for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt; local_idx += gridDim.x * blockDim.x) { if (invalidTkId == ptkids[local_idx]) break; - auto tkid = ptkids[local_idx]; - assert(tkid < caConstants::maxTuples); + assert(tkid < TrackerTraits::maxNumberOfTuples); riemannFit::Map3xNd hits(phits + local_idx); riemannFit::Map4d fast_fit(pfast_fit + local_idx); @@ -213,7 +219,7 @@ __global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__ printf("kernelBLFit failed! %f/%f\n", circle.chi2, line.chi2); printf("kernelBLFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", N, - nHits, + N, tkid, circle.par(0), circle.par(1), diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h deleted file mode 100644 index 127831e0e2eb7..0000000000000 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef RecoPixelVertexing_PixelTriplets_plugins_CAConstants_h -#define RecoPixelVertexing_PixelTriplets_plugins_CAConstants_h - -#include - -#include - -#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" -#include "HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h" -#include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" - -//#define ONLY_PHICUT - -// Cellular automaton constants -namespace caConstants { - - // constants -#ifdef ONLY_PHICUT - constexpr uint32_t maxCellNeighbors = 64; - constexpr uint32_t maxCellTracks = 64; - constexpr uint32_t maxNumberOfTuples = 48 * 1024; - constexpr uint32_t maxNumberOfDoublets = 2 * 1024 * 1024; - constexpr uint32_t maxCellsPerHit = 8 * 128; -#else // ONLY_PHICUT - constexpr uint32_t maxCellNeighbors = 36; - constexpr uint32_t maxCellTracks = 48; -#ifdef GPU_SMALL_EVENTS - // kept for testing and debugging - constexpr uint32_t maxNumberOfTuples = 3 * 1024; - constexpr uint32_t maxNumberOfDoublets = 128 * 1024; - constexpr uint32_t maxCellsPerHit = 128 / 2; -#else // GPU_SMALL_EVENTS - // tested on MC events with 55-75 pileup events - // and extended for Heavy Ions operations (24k -> 32k tuples, 128 -> 256 cells) - constexpr uint32_t maxNumberOfTuples = 32 * 1024; - constexpr uint32_t maxNumberOfDoublets = 512 * 1024; - constexpr uint32_t maxCellsPerHit = 256; -#endif // GPU_SMALL_EVENTS -#endif // ONLY_PHICUT - constexpr uint32_t maxNumOfActiveDoublets = maxNumberOfDoublets / 8; - constexpr uint32_t maxNumberOfQuadruplets = maxNumberOfTuples; - - constexpr uint32_t maxNumberOfLayerPairs = 20; - constexpr uint32_t maxNumberOfLayers = 10; - constexpr uint32_t maxTuples = maxNumberOfTuples; - constexpr int32_t maxHitsOnTrack = 10; - - // Modules constants - constexpr uint32_t max_ladder_bpx0 = 12; - constexpr uint32_t first_ladder_bpx0 = 0; - constexpr float module_length_bpx0 = 6.7f; - constexpr float module_tolerance_bpx0 = 0.4f; // projection to cylinder is inaccurate on BPIX1 - constexpr uint32_t max_ladder_bpx4 = 64; - constexpr uint32_t first_ladder_bpx4 = 84; - constexpr float radius_even_ladder = 15.815f; - constexpr float radius_odd_ladder = 16.146f; - constexpr float module_length_bpx4 = 6.7f; - constexpr float module_tolerance_bpx4 = 0.2f; - constexpr float barrel_z_length = 26.f; - constexpr float forward_z_begin = 32.f; - - // Last indexes - constexpr uint32_t last_bpix1_detIndex = 
96; - constexpr uint32_t last_barrel_detIndex = 1184; - - // types - using hindex_type = uint32_t; // FIXME from siPixelRecHitsHeterogeneousProduct - using tindex_type = uint16_t; // for tuples - - using CellNeighbors = cms::cuda::VecArray; - using CellTracks = cms::cuda::VecArray; - - using CellNeighborsVector = cms::cuda::SimpleVector; - using CellTracksVector = cms::cuda::SimpleVector; - - using OuterHitOfCellContainer = cms::cuda::VecArray; - using TuplesContainer = cms::cuda::OneToManyAssoc; - using HitToTuple = cms::cuda::OneToManyAssoc; // 3.5 should be enough - using TupleMultiplicity = cms::cuda::OneToManyAssoc; - - struct OuterHitOfCell { - OuterHitOfCellContainer* container; - int32_t offset; - constexpr auto& operator[](int i) { return container[i - offset]; } - constexpr auto const& operator[](int i) const { return container[i - offset]; } - }; - -} // namespace caConstants - -#endif // RecoPixelVertexing_PixelTriplets_plugins_CAConstants_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index 72c482c6189db..fade739410e2f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -23,10 +23,18 @@ #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -class CAHitNtupletCUDA : public edm::global::EDProducer<> { +template +class CAHitNtupletCUDAT : public edm::global::EDProducer<> { + using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + + using HitsView = TrackingRecHit2DSOAViewT; + using HitsOnGPU = TrackingRecHit2DGPUT; + using HitsOnCPU = TrackingRecHit2DCPUT; + using GPUAlgo = CAHitNtupletGeneratorOnGPU; + public: - explicit CAHitNtupletCUDA(const edm::ParameterSet& iConfig); - ~CAHitNtupletCUDA() override = default; + explicit CAHitNtupletCUDAT(const edm::ParameterSet& iConfig); + ~CAHitNtupletCUDAT() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); @@ -39,49 +47,57 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> { bool onGPU_; edm::ESGetToken tokenField_; - edm::EDGetTokenT> tokenHitGPU_; + edm::EDGetTokenT> tokenHitGPU_; edm::EDPutTokenT> tokenTrackGPU_; - edm::EDGetTokenT tokenHitCPU_; + edm::EDGetTokenT tokenHitCPU_; edm::EDPutTokenT tokenTrackCPU_; - CAHitNtupletGeneratorOnGPU gpuAlgo_; + GPUAlgo gpuAlgo_; }; -CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig) +template +CAHitNtupletCUDAT::CAHitNtupletCUDAT(const edm::ParameterSet& iConfig) : onGPU_(iConfig.getParameter("onGPU")), tokenField_(esConsumes()), gpuAlgo_(iConfig, consumesCollector()) { if (onGPU_) { - tokenHitGPU_ = - consumes>(iConfig.getParameter("pixelRecHitSrc")); + tokenHitGPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); tokenTrackGPU_ = produces>(); } else { - tokenHitCPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); + tokenHitCPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); tokenTrackCPU_ = produces(); } } -void CAHitNtupletCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { +template +void CAHitNtupletCUDAT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; desc.add("onGPU", true); desc.add("pixelRecHitSrc", edm::InputTag("siPixelRecHitsPreSplittingCUDA")); - CAHitNtupletGeneratorOnGPU::fillDescriptions(desc); - descriptions.add("pixelTracksCUDA", desc); + 
GPUAlgo::fillDescriptions(desc); + descriptions.addWithDefaultLabel(desc); } -void CAHitNtupletCUDA::beginJob() { gpuAlgo_.beginJob(); } +template +void CAHitNtupletCUDAT::beginJob() { + gpuAlgo_.beginJob(); +} -void CAHitNtupletCUDA::endJob() { gpuAlgo_.endJob(); } +template +void CAHitNtupletCUDAT::endJob() { + gpuAlgo_.endJob(); +} -void CAHitNtupletCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& es) const { +template +void CAHitNtupletCUDAT::produce(edm::StreamID streamID, + edm::Event& iEvent, + const edm::EventSetup& es) const { auto bf = 1. / es.getData(tokenField_).inverseBzAtOriginInGeV(); if (onGPU_) { auto hHits = iEvent.getHandle(tokenHitGPU_); - cms::cuda::ScopedContextProduce ctx{*hHits}; auto const& hits = ctx.get(*hHits); - ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); } else { auto const& hits = iEvent.get(tokenHitCPU_); @@ -89,4 +105,11 @@ void CAHitNtupletCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const } } +using CAHitNtupletCUDA = CAHitNtupletCUDAT; DEFINE_FWK_MODULE(CAHitNtupletCUDA); + +using CAHitNtupletCUDAPhase1 = CAHitNtupletCUDAT; +DEFINE_FWK_MODULE(CAHitNtupletCUDAPhase1); + +using CAHitNtupletCUDAPhase2 = CAHitNtupletCUDAT; +DEFINE_FWK_MODULE(CAHitNtupletCUDAPhase2); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 66208debdc98d..75fbbffb49190 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -8,13 +8,21 @@ namespace { std::mutex lock_stat; } // namespace -template <> -void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters) { - kernel_printCounters(counters); +template +void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters) { + caHitNtupletGeneratorKernels::kernel_printCounters(counters); } -template <> -void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { +template +void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { + using namespace gpuPixelDoublets; + + using GPUCACell = GPUCACellT; + using OuterHitOfCell = typename GPUCACell::OuterHitOfCell; + using CellNeighbors = typename GPUCACell::CellNeighbors; + using CellTracks = typename GPUCACell::CellTracks; + using OuterHitOfCellContainer = typename GPUCACell::OuterHitOfCellContainer; + auto nhits = hh.nHits(); #ifdef NTUPLE_DEBUG @@ -24,61 +32,54 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr // use "nhits" to heuristically dimension the workspace // no need to use the Traits allocations, since we know this is being compiled for the CPU - //device_isOuterHitOfCell_ = Traits::template make_unique(std::max(1U, nhits), stream); - device_isOuterHitOfCell_ = std::make_unique(std::max(1U, nhits)); - assert(device_isOuterHitOfCell_.get()); - isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; + //this->device_isOuterHitOfCell_ = Traits::template make_unique(std::max(1U, nhits), stream); + this->device_isOuterHitOfCell_ = std::make_unique(std::max(1U, nhits)); + assert(this->device_isOuterHitOfCell_.get()); + this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; - auto cellStorageSize = caConstants::maxNumOfActiveDoublets * 
sizeof(GPUCACell::CellNeighbors) + - caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks); + auto cellStorageSize = TrackerTraits::maxNumOfActiveDoublets * sizeof(CellNeighbors) + + TrackerTraits::maxNumOfActiveDoublets * sizeof(CellTracks); // no need to use the Traits allocations, since we know this is being compiled for the CPU //cellStorage_ = Traits::template make_unique(cellStorageSize, stream); - cellStorage_ = std::make_unique(cellStorageSize); - device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); - device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets * - sizeof(GPUCACell::CellNeighbors)); - - gpuPixelDoublets::initDoublets(isOuterHitOfCell_, - nhits, - device_theCellNeighbors_.get(), - device_theCellNeighborsContainer_, - device_theCellTracks_.get(), - device_theCellTracksContainer_); + this->cellStorage_ = std::make_unique(cellStorageSize); + this->device_theCellNeighborsContainer_ = (CellNeighbors *)this->cellStorage_.get(); + this->device_theCellTracksContainer_ = + (CellTracks *)(this->cellStorage_.get() + TrackerTraits::maxNumOfActiveDoublets * sizeof(CellNeighbors)); + + initDoublets(this->isOuterHitOfCell_, + nhits, + this->device_theCellNeighbors_.get(), + this->device_theCellNeighborsContainer_, + this->device_theCellTracks_.get(), + this->device_theCellTracksContainer_); // no need to use the Traits allocations, since we know this is being compiled for the CPU - //device_theCells_ = Traits::template make_unique(params_.maxNumberOfDoublets_, stream); - device_theCells_ = std::make_unique(params_.maxNumberOfDoublets_); + //this->device_theCells_ = Traits::template make_unique(this->params_.cellCuts_.maxNumberOfDoublets_, stream); + this->device_theCells_ = std::make_unique(this->params_.cellCuts_.maxNumberOfDoublets_); if (0 == nhits) return; // protect against empty events // take all layer pairs into account - auto nActualPairs = gpuPixelDoublets::nPairs; - if (not params_.includeJumpingForwardDoublets_) { - // exclude forward "jumping" layer pairs - nActualPairs = gpuPixelDoublets::nPairsForTriplets; - } - if (params_.minHitsPerNtuplet_ > 3) { - // for quadruplets, exclude all "jumping" layer pairs - nActualPairs = gpuPixelDoublets::nPairsForQuadruplets; - } - - assert(nActualPairs <= gpuPixelDoublets::nPairs); - gpuPixelDoublets::getDoubletsFromHisto(device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_.get(), - device_theCellTracks_.get(), - hh.view(), - isOuterHitOfCell_, - nActualPairs, - params_.idealConditions_, - params_.doClusterCut_, - params_.doZ0Cut_, - params_.doPtCut_, - params_.maxNumberOfDoublets_); + auto nActualPairs = this->params_.nPairs(); + + assert(nActualPairs <= TrackerTraits::nPairs); + + getDoubletsFromHisto(this->device_theCells_.get(), + this->device_nCells_, + this->device_theCellNeighbors_.get(), + this->device_theCellTracks_.get(), + hh.view(), + this->isOuterHitOfCell_, + nActualPairs, + this->params_.cellCuts_); } -template <> -void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { +template +void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, + TkSoA *tracks_d, + cudaStream_t cudaStream) { + using namespace caHitNtupletGeneratorKernels; + auto *tuples_d = &tracks_d->hitIndices; auto *detId_d = &tracks_d->detIndices; auto *quality_d = tracks_d->qualityData(); @@ -90,125 +91,139 @@ void 
CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * auto nhits = hh.nHits(); - // std::cout << "N hits " << nhits << std::endl; - // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; +#ifdef NTUPLE_DEBUG + std::cout << "start tuple building. N hits " << nhits << std::endl; + if (nhits < 2) + std::cout << "too few hits " << nhits << std::endl; +#endif // // applying combinatoric cleaning such as fishbone at this stage is too expensive // - kernel_connect(device_hitTuple_apc_, - device_hitToTuple_apc_, // needed only to be reset, ready for next kernel - hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_.get(), - isOuterHitOfCell_, - params_.hardCurvCut_, - params_.ptmin_, - params_.CAThetaCutBarrel_, - params_.CAThetaCutForward_, - params_.dcaCutInnerTriplet_, - params_.dcaCutOuterTriplet_); - - if (nhits > 1 && params_.earlyFishbone_) { - gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, false); + kernel_connect(this->device_hitTuple_apc_, + this->device_hitToTuple_apc_, // needed only to be reset, ready for next kernel + hh.view(), + this->device_theCells_.get(), + this->device_nCells_, + this->device_theCellNeighbors_.get(), + this->isOuterHitOfCell_, + this->params_.caParams_); + + if (nhits > 1 && this->params_.earlyFishbone_) { + gpuPixelDoublets::fishbone( + hh.view(), this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, false); } - kernel_find_ntuplets(hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellTracks_.get(), - tuples_d, - device_hitTuple_apc_, - quality_d, - params_.minHitsPerNtuplet_); - if (params_.doStats_) - kernel_mark_used(device_theCells_.get(), device_nCells_); + kernel_find_ntuplets(hh.view(), + this->device_theCells_.get(), + this->device_nCells_, + this->device_theCellTracks_.get(), + tuples_d, + this->device_hitTuple_apc_, + quality_d, + this->params_.caParams_); + if (this->params_.doStats_) + kernel_mark_used(this->device_theCells_.get(), this->device_nCells_); - cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d); + cms::cuda::finalizeBulk(this->device_hitTuple_apc_, tuples_d); - kernel_fillHitDetIndices(tuples_d, hh.view(), detId_d); - kernel_fillNLayers(tracks_d, device_hitTuple_apc_); + kernel_fillHitDetIndices(tuples_d, hh.view(), detId_d); + kernel_fillNLayers(tracks_d, this->device_hitTuple_apc_); // remove duplicates (tracks that share a doublet) - kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d, quality_d, params_.dupPassThrough_); + kernel_earlyDuplicateRemover( + this->device_theCells_.get(), this->device_nCells_, tracks_d, quality_d, this->params_.dupPassThrough_); - kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); - cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); - kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); + kernel_countMultiplicity(tuples_d, quality_d, this->device_tupleMultiplicity_.get()); + cms::cuda::launchFinalize(this->device_tupleMultiplicity_.get(), cudaStream); + kernel_fillMultiplicity(tuples_d, quality_d, this->device_tupleMultiplicity_.get()); - if (nhits > 1 && params_.lateFishbone_) { - gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, true); + if (nhits > 1 && this->params_.lateFishbone_) { + gpuPixelDoublets::fishbone( + hh.view(), this->device_theCells_.get(), this->device_nCells_, 
this->isOuterHitOfCell_, nhits, true); } } -template <> -void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { +template +void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, + TkSoA *tracks_d, + cudaStream_t cudaStream) { + using namespace caHitNtupletGeneratorKernels; + int32_t nhits = hh.nHits(); auto const *tuples_d = &tracks_d->hitIndices; auto *quality_d = tracks_d->qualityData(); // classify tracks based on kinematics - kernel_classifyTracks(tuples_d, tracks_d, params_.cuts_, quality_d); - - if (params_.lateFishbone_) { + kernel_classifyTracks(tuples_d, tracks_d, this->params_.qualityCuts_, quality_d); + if (this->params_.lateFishbone_) { // apply fishbone cleaning to good tracks - kernel_fishboneCleaner(device_theCells_.get(), device_nCells_, quality_d); + kernel_fishboneCleaner(this->device_theCells_.get(), this->device_nCells_, quality_d); } // remove duplicates (tracks that share a doublet) - kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d, params_.dupPassThrough_); + kernel_fastDuplicateRemover( + this->device_theCells_.get(), this->device_nCells_, tracks_d, this->params_.dupPassThrough_); // fill hit->track "map" - if (params_.doSharedHitCut_ || params_.doStats_) { - kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); - cms::cuda::launchFinalize(hitToTupleView_, cudaStream); - kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); + if (this->params_.doSharedHitCut_ || this->params_.doStats_) { + kernel_countHitInTracks(tuples_d, quality_d, this->device_hitToTuple_.get()); + cms::cuda::launchFinalize(this->hitToTupleView_, cudaStream); + kernel_fillHitInTracks(tuples_d, quality_d, this->device_hitToTuple_.get()); } // remove duplicates (tracks that share at least one hit) - if (params_.doSharedHitCut_) { - kernel_rejectDuplicate( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); - - kernel_sharedHitCleaner(hh.view(), - tracks_d, - quality_d, - params_.minHitsForSharingCut_, - params_.dupPassThrough_, - device_hitToTuple_.get()); - if (params_.useSimpleTripletCleaner_) { - kernel_simpleTripletCleaner( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + if (this->params_.doSharedHitCut_) { + kernel_rejectDuplicate(tracks_d, + quality_d, + this->params_.minHitsForSharingCut_, + this->params_.dupPassThrough_, + this->device_hitToTuple_.get()); + + kernel_sharedHitCleaner(hh.view(), + tracks_d, + quality_d, + this->params_.minHitsForSharingCut_, + this->params_.dupPassThrough_, + this->device_hitToTuple_.get()); + if (this->params_.useSimpleTripletCleaner_) { + kernel_simpleTripletCleaner(tracks_d, + quality_d, + this->params_.minHitsForSharingCut_, + this->params_.dupPassThrough_, + this->device_hitToTuple_.get()); } else { - kernel_tripletCleaner( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + kernel_tripletCleaner(tracks_d, + quality_d, + this->params_.minHitsForSharingCut_, + this->params_.dupPassThrough_, + this->device_hitToTuple_.get()); } } - if (params_.doStats_) { + if (this->params_.doStats_) { std::lock_guard guard(lock_stat); - kernel_checkOverflows(tuples_d, - device_tupleMultiplicity_.get(), - device_hitToTuple_.get(), - device_hitTuple_apc_, - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_.get(), - device_theCellTracks_.get(), 
- isOuterHitOfCell_, - nhits, - params_.maxNumberOfDoublets_, - counters_); + kernel_checkOverflows(tuples_d, + this->device_tupleMultiplicity_.get(), + this->device_hitToTuple_.get(), + this->device_hitTuple_apc_, + this->device_theCells_.get(), + this->device_nCells_, + this->device_theCellNeighbors_.get(), + this->device_theCellTracks_.get(), + this->isOuterHitOfCell_, + nhits, + this->params_.cellCuts_.maxNumberOfDoublets_, + this->counters_); } - if (params_.doStats_) { + if (this->params_.doStats_) { // counters (add flag???) std::lock_guard guard(lock_stat); - kernel_doStatsForHitInTracks(device_hitToTuple_.get(), counters_); - kernel_doStatsForTracks(tuples_d, quality_d, counters_); + kernel_doStatsForHitInTracks(this->device_hitToTuple_.get(), this->counters_); + kernel_doStatsForTracks(tuples_d, quality_d, this->counters_); } #ifdef DUMP_GPU_TK_TUPLES @@ -217,7 +232,11 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA { std::lock_guard guard(lock); ++iev; - kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 0, 1000000, iev); + kernel_print_found_ntuplets( + hh.view(), tuples_d, tracks_d, quality_d, this->device_hitToTuple_.get(), 0, 1000000, iev); } #endif } + +template class CAHitNtupletGeneratorKernelsCPU; +template class CAHitNtupletGeneratorKernelsCPU; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 913b6d5a32d28..59ae2041b44aa 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -1,8 +1,15 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" #include -template <> -void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { +// #define NTUPLE_DEBUG +// #define GPU_DEBUG + +template +void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, + TkSoA *tracks_d, + cudaStream_t cudaStream) { + using namespace gpuPixelDoublets; + using namespace caHitNtupletGeneratorKernels; // these are pointers on GPU! 
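One mechanical change dominates the kernel-wrapper hunks above: every member access gained a this->. That is required, not stylistic — once CAHitNtupletGeneratorKernelsCPU (and its GPU sibling) inherits those members from a base class that depends on a template parameter, unqualified lookup no longer finds them at template parse time. A minimal sketch of the rule, with hypothetical names:

```cpp
#include <cstdint>

template <typename TrackerTraits>
struct GeneratorKernelsBaseSketch {
  uint32_t* device_nCells_ = nullptr;  // member shared by the CPU and GPU variants
};

template <typename TrackerTraits>
struct GeneratorKernelsCPUSketch : public GeneratorKernelsBaseSketch<TrackerTraits> {
  void launch() {
    // device_nCells_ = nullptr;     // error: the name lives in a dependent base,
    //                               // so unqualified lookup does not find it
    this->device_nCells_ = nullptr;  // OK: lookup is deferred to instantiation
  }
};
```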
auto *tuples_d = &tracks_d->hitIndices; auto *detId_d = &tracks_d->detIndices; @@ -26,58 +33,57 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * auto nthTot = 64; auto stride = 4; auto blockSize = nthTot / stride; - auto numberOfBlocks = nDoubletBlocks(blockSize); + auto numberOfBlocks = this->nDoubletBlocks(blockSize); auto rescale = numberOfBlocks / 65536; blockSize *= (rescale + 1); - numberOfBlocks = nDoubletBlocks(blockSize); + numberOfBlocks = this->nDoubletBlocks(blockSize); assert(numberOfBlocks < 65536); assert(blockSize > 0 && 0 == blockSize % 16); dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); - kernel_connect<<>>( - device_hitTuple_apc_, - device_hitToTuple_apc_, // needed only to be reset, ready for next kernel - hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_.get(), - isOuterHitOfCell_, - params_.hardCurvCut_, - params_.ptmin_, - params_.CAThetaCutBarrel_, - params_.CAThetaCutForward_, - params_.dcaCutInnerTriplet_, - params_.dcaCutOuterTriplet_); + kernel_connect + <<>>(this->device_hitTuple_apc_, + this->device_hitToTuple_apc_, // needed only to be reset, ready for next kernel + hh.view(), + this->device_theCells_.get(), + this->device_nCells_, + this->device_theCellNeighbors_.get(), + this->isOuterHitOfCell_, + this->params_.caParams_); + cudaCheck(cudaGetLastError()); // do not run the fishbone if there are hits only in BPIX1 - if (nhits > isOuterHitOfCell_.offset && params_.earlyFishbone_) { + if (nhits > this->isOuterHitOfCell_.offset && this->params_.earlyFishbone_) { auto nthTot = 128; auto stride = 16; auto blockSize = nthTot / stride; - auto numberOfBlocks = (nhits - isOuterHitOfCell_.offset + blockSize - 1) / blockSize; + auto numberOfBlocks = (nhits - this->isOuterHitOfCell_.offset + blockSize - 1) / blockSize; dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); - gpuPixelDoublets::fishbone<<>>( - hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, false); + fishbone<<>>( + hh.view(), this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, false); cudaCheck(cudaGetLastError()); } blockSize = 64; - numberOfBlocks = (3 * params_.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - kernel_find_ntuplets<<>>(hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellTracks_.get(), - tuples_d, - device_hitTuple_apc_, - quality_d, - params_.minHitsPerNtuplet_); + numberOfBlocks = (3 * this->params_.cellCuts_.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + kernel_find_ntuplets<<>>(hh.view(), + this->device_theCells_.get(), + this->device_nCells_, + this->device_theCellTracks_.get(), + tuples_d, + this->device_hitTuple_apc_, + quality_d, + this->params_.caParams_); +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); - - if (params_.doStats_) - kernel_mark_used<<>>(device_theCells_.get(), device_nCells_); +#endif + if (this->params_.doStats_) + kernel_mark_used + <<>>(this->device_theCells_.get(), this->device_nCells_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG @@ -87,38 +93,63 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * blockSize = 128; numberOfBlocks = (HitContainer::ctNOnes() + blockSize - 1) / blockSize; - cms::cuda::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); - kernel_fillHitDetIndices<<>>(tuples_d, hh.view(), detId_d); + cms::cuda::finalizeBulk<<>>(this->device_hitTuple_apc_, tuples_d); + +#ifdef GPU_DEBUG + 
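+  // synchronizing after each launch makes asynchronous kernel failures surface at the
+  // cudaCheck(cudaGetLastError()) that follows, instead of at some unrelated later CUDA call;
+  // it also serializes the stream, which is why it is compiled in only under GPU_DEBUG.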
cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); - kernel_fillNLayers<<>>(tracks_d, device_hitTuple_apc_); +#endif + + kernel_fillHitDetIndices<<>>(tuples_d, hh.view(), detId_d); cudaCheck(cudaGetLastError()); +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + kernel_fillNLayers<<>>(tracks_d, this->device_hitTuple_apc_); + cudaCheck(cudaGetLastError()); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + // remove duplicates (tracks that share a doublet) - numberOfBlocks = nDoubletBlocks(blockSize); - kernel_earlyDuplicateRemover<<>>( - device_theCells_.get(), device_nCells_, tracks_d, quality_d, params_.dupPassThrough_); + numberOfBlocks = this->nDoubletBlocks(blockSize); + + kernel_earlyDuplicateRemover<<>>( + this->device_theCells_.get(), this->device_nCells_, tracks_d, quality_d, this->params_.dupPassThrough_); + cudaCheck(cudaGetLastError()); +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); +#endif blockSize = 128; - numberOfBlocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize; - kernel_countMultiplicity<<>>( - tuples_d, quality_d, device_tupleMultiplicity_.get()); - cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); - kernel_fillMultiplicity<<>>( - tuples_d, quality_d, device_tupleMultiplicity_.get()); + numberOfBlocks = (3 * TrackerTraits::maxNumberOfTuples / 4 + blockSize - 1) / blockSize; + kernel_countMultiplicity + <<>>(tuples_d, quality_d, this->device_tupleMultiplicity_.get()); + cms::cuda::launchFinalize(this->device_tupleMultiplicity_.get(), cudaStream); + kernel_fillMultiplicity + <<>>(tuples_d, quality_d, this->device_tupleMultiplicity_.get()); cudaCheck(cudaGetLastError()); +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif // do not run the fishbone if there are hits only in BPIX1 - if (nhits > isOuterHitOfCell_.offset && params_.lateFishbone_) { + if (nhits > this->isOuterHitOfCell_.offset && this->params_.lateFishbone_) { auto nthTot = 128; auto stride = 16; auto blockSize = nthTot / stride; - auto numberOfBlocks = (nhits - isOuterHitOfCell_.offset + blockSize - 1) / blockSize; + auto numberOfBlocks = (nhits - this->isOuterHitOfCell_.offset + blockSize - 1) / blockSize; dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); - gpuPixelDoublets::fishbone<<>>( - hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, true); + fishbone<<>>( + hh.view(), this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, true); cudaCheck(cudaGetLastError()); } @@ -128,14 +159,22 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * #endif // free space asap - // device_isOuterHitOfCell_.reset(); + // this->device_isOuterHitOfCell_.reset(); } -template <> -void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { +template +void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { int32_t nhits = hh.nHits(); - isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; + using namespace gpuPixelDoublets; + + using GPUCACell = GPUCACellT; + using OuterHitOfCell = typename GPUCACell::OuterHitOfCell; + using CellNeighbors = typename GPUCACell::CellNeighbors; + using CellTracks = typename GPUCACell::CellTracks; + using OuterHitOfCellContainer = typename GPUCACell::OuterHitOfCellContainer; + + 
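+  // isOuterHitOfCell_ is a view with an index offset: hits below hh.offsetBPIX2() (the BPIX1
+  // hits) can never be the outer hit of a doublet, so the backing array is sized
+  // nhits - offsetBPIX2 and entry i refers to hit i + offset (see the allocation below).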
this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; #ifdef NTUPLE_DEBUG std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; @@ -147,34 +186,35 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr #endif // in principle we can use "nhits" to heuristically dimension the workspace... - device_isOuterHitOfCell_ = cms::cuda::make_device_unique( - std::max(1, nhits - hh.offsetBPIX2()), stream); - assert(device_isOuterHitOfCell_.get()); + this->device_isOuterHitOfCell_ = + cms::cuda::make_device_unique(std::max(1, nhits - hh.offsetBPIX2()), stream); + assert(this->device_isOuterHitOfCell_.get()); - isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; + this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; - cellStorage_ = cms::cuda::make_device_unique( - caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) + - caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks), - stream); - device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); - device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets * - sizeof(GPUCACell::CellNeighbors)); + this->cellStorage_ = + cms::cuda::make_device_unique(TrackerTraits::maxNumOfActiveDoublets * sizeof(CellNeighbors) + + TrackerTraits::maxNumOfActiveDoublets * sizeof(CellTracks), + stream); + this->device_theCellNeighborsContainer_ = (CellNeighbors *)this->cellStorage_.get(); + this->device_theCellTracksContainer_ = + (CellTracks *)(this->cellStorage_.get() + TrackerTraits::maxNumOfActiveDoublets * sizeof(CellNeighbors)); { int threadsPerBlock = 128; // at least one block! 
 int blocks = (std::max(1, nhits - hh.offsetBPIX2()) + threadsPerBlock - 1) / threadsPerBlock;
-    gpuPixelDoublets::initDoublets<<<blocks, threadsPerBlock, 0, stream>>>(isOuterHitOfCell_,
-                                                                           nhits,
-                                                                           device_theCellNeighbors_.get(),
-                                                                           device_theCellNeighborsContainer_,
-                                                                           device_theCellTracks_.get(),
-                                                                           device_theCellTracksContainer_);
+    initDoublets<TrackerTraits><<<blocks, threadsPerBlock, 0, stream>>>(this->isOuterHitOfCell_,
+                                                                        nhits,
+                                                                        this->device_theCellNeighbors_.get(),
+                                                                        this->device_theCellNeighborsContainer_,
+                                                                        this->device_theCellTracks_.get(),
+                                                                        this->device_theCellTracksContainer_);
     cudaCheck(cudaGetLastError());
   }

-  device_theCells_ = cms::cuda::make_device_unique<GPUCACell[]>(params_.maxNumberOfDoublets_, stream);
+  this->device_theCells_ =
+      cms::cuda::make_device_unique<GPUCACell[]>(this->params_.cellCuts_.maxNumberOfDoublets_, stream);

 #ifdef GPU_DEBUG
   cudaDeviceSynchronize();
@@ -185,34 +225,21 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr
     return;  // protect against empty events

   // take all layer pairs into account
-  auto nActualPairs = gpuPixelDoublets::nPairs;
-  if (not params_.includeJumpingForwardDoublets_) {
-    // exclude forward "jumping" layer pairs
-    nActualPairs = gpuPixelDoublets::nPairsForTriplets;
-  }
-  if (params_.minHitsPerNtuplet_ > 3) {
-    // for quadruplets, exclude all "jumping" layer pairs
-    nActualPairs = gpuPixelDoublets::nPairsForQuadruplets;
-  }
+  auto nActualPairs = this->params_.nPairs();

-  assert(nActualPairs <= gpuPixelDoublets::nPairs);
   int stride = 4;
-  int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize / stride;
+  int threadsPerBlock = TrackerTraits::getDoubletsFromHistoMaxBlockSize / stride;
   int blocks = (4 * nhits + threadsPerBlock - 1) / threadsPerBlock;
   dim3 blks(1, blocks, 1);
   dim3 thrs(stride, threadsPerBlock, 1);
-  gpuPixelDoublets::getDoubletsFromHisto<<<blks, thrs, 0, stream>>>(device_theCells_.get(),
-                                                                    device_nCells_,
-                                                                    device_theCellNeighbors_.get(),
-                                                                    device_theCellTracks_.get(),
-                                                                    hh.view(),
-                                                                    isOuterHitOfCell_,
-                                                                    nActualPairs,
-                                                                    params_.idealConditions_,
-                                                                    params_.doClusterCut_,
-                                                                    params_.doZ0Cut_,
-                                                                    params_.doPtCut_,
-                                                                    params_.maxNumberOfDoublets_);
+  getDoubletsFromHisto<TrackerTraits><<<blks, thrs, 0, stream>>>(this->device_theCells_.get(),
+                                                                 this->device_nCells_,
+                                                                 this->device_theCellNeighbors_.get(),
+                                                                 this->device_theCellTracks_.get(),
+                                                                 hh.view(),
+                                                                 this->isOuterHitOfCell_,
+                                                                 nActualPairs,
+                                                                 this->params_.cellCuts_);
   cudaCheck(cudaGetLastError());

 #ifdef GPU_DEBUG
@@ -221,8 +248,12 @@
 #endif
 }

-template <>
-void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
+template <typename TrackerTraits>
+void CAHitNtupletGeneratorKernelsGPU<TrackerTraits>::classifyTuples(HitsOnCPU const &hh,
+                                                                    TkSoA *tracks_d,
+                                                                    cudaStream_t cudaStream) {
+  using namespace caHitNtupletGeneratorKernels;
+
+  // these are pointers on GPU!
auto const *tuples_d = &tracks_d->hitIndices; auto *quality_d = tracks_d->qualityData(); @@ -232,65 +263,80 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA auto blockSize = 64; // classify tracks based on kinematics - auto numberOfBlocks = nQuadrupletBlocks(blockSize); - kernel_classifyTracks<<>>(tuples_d, tracks_d, params_.cuts_, quality_d); - cudaCheck(cudaGetLastError()); + auto numberOfBlocks = this->nQuadrupletBlocks(blockSize); + kernel_classifyTracks + <<>>(tuples_d, tracks_d, this->params_.qualityCuts_, quality_d); - if (params_.lateFishbone_) { + if (this->params_.lateFishbone_) { // apply fishbone cleaning to good tracks - numberOfBlocks = nDoubletBlocks(blockSize); - kernel_fishboneCleaner<<>>( - device_theCells_.get(), device_nCells_, quality_d); + numberOfBlocks = this->nDoubletBlocks(blockSize); + kernel_fishboneCleaner + <<>>(this->device_theCells_.get(), this->device_nCells_, quality_d); cudaCheck(cudaGetLastError()); } // mark duplicates (tracks that share a doublet) - numberOfBlocks = nDoubletBlocks(blockSize); - kernel_fastDuplicateRemover<<>>( - device_theCells_.get(), device_nCells_, tracks_d, params_.dupPassThrough_); + numberOfBlocks = this->nDoubletBlocks(blockSize); + kernel_fastDuplicateRemover<<>>( + this->device_theCells_.get(), this->device_nCells_, tracks_d, this->params_.dupPassThrough_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); #endif - if (params_.doSharedHitCut_ || params_.doStats_) { + if (this->params_.doSharedHitCut_ || this->params_.doStats_) { // fill hit->track "map" - assert(hitToTupleView_.offSize > nhits); - numberOfBlocks = nQuadrupletBlocks(blockSize); - kernel_countHitInTracks<<>>( - tuples_d, quality_d, device_hitToTuple_.get()); + assert(this->hitToTupleView_.offSize > nhits); + numberOfBlocks = this->nQuadrupletBlocks(blockSize); + kernel_countHitInTracks + <<>>(tuples_d, quality_d, this->device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); - assert((hitToTupleView_.assoc == device_hitToTuple_.get()) && - (hitToTupleView_.offStorage == device_hitToTupleStorage_.get()) && (hitToTupleView_.offSize > 0)); - cms::cuda::launchFinalize(hitToTupleView_, cudaStream); + assert((this->hitToTupleView_.assoc == this->device_hitToTuple_.get()) && + (this->hitToTupleView_.offStorage == this->device_hitToTupleStorage_.get()) && + (this->hitToTupleView_.offSize > 0)); + cms::cuda::launchFinalize(this->hitToTupleView_, cudaStream); cudaCheck(cudaGetLastError()); - kernel_fillHitInTracks<<>>(tuples_d, quality_d, device_hitToTuple_.get()); + kernel_fillHitInTracks + <<>>(tuples_d, quality_d, this->device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); #endif } - if (params_.doSharedHitCut_) { + if (this->params_.doSharedHitCut_) { // mark duplicates (tracks that share at least one hit) - numberOfBlocks = (hitToTupleView_.offSize + blockSize - 1) / blockSize; - - kernel_rejectDuplicate<<>>( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); - - kernel_sharedHitCleaner<<>>(hh.view(), - tracks_d, - quality_d, - params_.minHitsForSharingCut_, - params_.dupPassThrough_, - device_hitToTuple_.get()); - - if (params_.useSimpleTripletCleaner_) { - kernel_simpleTripletCleaner<<>>( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + numberOfBlocks = (this->hitToTupleView_.offSize + blockSize - 1) / blockSize; + + 
kernel_rejectDuplicate + <<>>(tracks_d, + quality_d, + this->params_.minHitsForSharingCut_, + this->params_.dupPassThrough_, + this->device_hitToTuple_.get()); + + kernel_sharedHitCleaner + <<>>(hh.view(), + tracks_d, + quality_d, + this->params_.minHitsForSharingCut_, + this->params_.dupPassThrough_, + this->device_hitToTuple_.get()); + + if (this->params_.useSimpleTripletCleaner_) { + kernel_simpleTripletCleaner + <<>>(tracks_d, + quality_d, + this->params_.minHitsForSharingCut_, + this->params_.dupPassThrough_, + this->device_hitToTuple_.get()); } else { - kernel_tripletCleaner<<>>( - tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); + kernel_tripletCleaner + <<>>(tracks_d, + quality_d, + this->params_.minHitsForSharingCut_, + this->params_.dupPassThrough_, + this->device_hitToTuple_.get()); } cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG @@ -298,30 +344,33 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA #endif } - if (params_.doStats_) { - numberOfBlocks = (std::max(nhits, int(params_.maxNumberOfDoublets_)) + blockSize - 1) / blockSize; - kernel_checkOverflows<<>>(tuples_d, - device_tupleMultiplicity_.get(), - device_hitToTuple_.get(), - device_hitTuple_apc_, - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_.get(), - device_theCellTracks_.get(), - isOuterHitOfCell_, - nhits, - params_.maxNumberOfDoublets_, - counters_); + if (this->params_.doStats_) { + numberOfBlocks = (std::max(nhits, int(this->params_.cellCuts_.maxNumberOfDoublets_)) + blockSize - 1) / blockSize; + kernel_checkOverflows + <<>>(tuples_d, + this->device_tupleMultiplicity_.get(), + this->device_hitToTuple_.get(), + this->device_hitTuple_apc_, + this->device_theCells_.get(), + this->device_nCells_, + this->device_theCellNeighbors_.get(), + this->device_theCellTracks_.get(), + this->isOuterHitOfCell_, + nhits, + this->params_.cellCuts_.maxNumberOfDoublets_, + this->counters_); cudaCheck(cudaGetLastError()); } - if (params_.doStats_) { + if (this->params_.doStats_) { // counters (add flag???) 
- numberOfBlocks = (hitToTupleView_.offSize + blockSize - 1) / blockSize; - kernel_doStatsForHitInTracks<<>>(device_hitToTuple_.get(), counters_); + numberOfBlocks = (this->hitToTupleView_.offSize + blockSize - 1) / blockSize; + kernel_doStatsForHitInTracks + <<>>(this->device_hitToTuple_.get(), this->counters_); cudaCheck(cudaGetLastError()); - numberOfBlocks = (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; - kernel_doStatsForTracks<<>>(tuples_d, quality_d, counters_); + numberOfBlocks = (3 * TrackerTraits::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; + kernel_doStatsForTracks + <<>>(tuples_d, quality_d, this->counters_); cudaCheck(cudaGetLastError()); } #ifdef GPU_DEBUG @@ -336,19 +385,22 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA std::lock_guard guard(lock); ++iev; for (int k = 0; k < 20000; k += 500) { - kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( - hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), k, k + 500, iev); + kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( + hh.view(), tuples_d, tracks_d, quality_d, this->device_hitToTuple_.get(), k, k + 500, iev); cudaDeviceSynchronize(); } - kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( - hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 20000, 1000000, iev); + kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( + hh.view(), tuples_d, tracks_d, quality_d, this->device_hitToTuple_.get(), 20000, 1000000, iev); cudaDeviceSynchronize(); // cudaStreamSynchronize(cudaStream); } #endif } -template <> -void CAHitNtupletGeneratorKernelsGPU::printCounters(Counters const *counters) { - kernel_printCounters<<<1, 1>>>(counters); +template +void CAHitNtupletGeneratorKernelsGPU::printCounters(Counters const *counters) { + caHitNtupletGeneratorKernels::kernel_printCounters<<<1, 1>>>(counters); } + +template class CAHitNtupletGeneratorKernelsGPU; +template class CAHitNtupletGeneratorKernelsGPU; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index 8af1176fe92c6..b595106299d71 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -5,10 +5,167 @@ #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "GPUCACell.h" - +#include "gpuPixelDoublets.h" // #define DUMP_GPU_TK_TUPLES -namespace cAHitNtupletGenerator { +namespace caHitNtupletGenerator { + + //Configuration params common to all topologies, for the algorithms + struct AlgoParams { + const bool onGPU_; + const uint32_t minHitsForSharingCut_; + const bool useRiemannFit_; + const bool fitNas4_; + const bool includeJumpingForwardDoublets_; + const bool earlyFishbone_; + const bool lateFishbone_; + const bool doStats_; + const bool doSharedHitCut_; + const bool dupPassThrough_; + const bool useSimpleTripletCleaner_; + }; + + //CAParams + struct CACommon { + const uint32_t minHitsPerNtuplet_; + const float ptmin_; + const float CAThetaCutBarrel_; + const float CAThetaCutForward_; + const float hardCurvCut_; + const float dcaCutInnerTriplet_; + const float dcaCutOuterTriplet_; + }; + + template + struct CAParamsT : public CACommon { + __device__ __forceinline__ bool startingLayerPair(int16_t pid) const { return false; }; + __device__ __forceinline__ bool startAt0(int16_t pid) const { return false; }; + }; + + template + struct 
CAParamsT<TrackerTraits, pixelTopology::isPhase1Topology<TrackerTraits>> : public CACommon {
+    /// Is this a starting layer pair?
+    __device__ __forceinline__ bool startingLayerPair(int16_t pid) const {
+      return minHitsPerNtuplet_ > 3 ? pid < 3 : pid < 8 || pid > 12;
+    }
+
+    /// Is this a pair with inner == 0?
+    __device__ __forceinline__ bool startAt0(int16_t pid) const {
+      assert((pixelTopology::Phase1::layerPairs[pid * 2] == 0) ==
+             (pid < 3 || pid == 13 || pid == 15 || pid == 16));  // to be 100% sure it's working, may be removed
+      return pixelTopology::Phase1::layerPairs[pid * 2] == 0;
+    }
+  };
+
+  template <typename TrackerTraits>
+  struct CAParamsT<TrackerTraits, pixelTopology::isPhase2Topology<TrackerTraits>> : public CACommon {
+    const bool includeFarForwards_;
+    /// Is this a starting layer pair?
+    __device__ __forceinline__ bool startingLayerPair(int16_t pid) const {
+      return pid < 33;  // in principle one could remove 5,6,7 23, 28 and 29
+    }
+
+    /// Is this a pair with inner == 0?
+    __device__ __forceinline__ bool startAt0(int16_t pid) const {
+      assert((pixelTopology::Phase2::layerPairs[pid * 2] == 0) == ((pid < 3) | (pid >= 23 && pid < 28)));
+      return pixelTopology::Phase2::layerPairs[pid * 2] == 0;
+    }
+  };
+
+  //Full list of params = algo params + ca params + cell params + quality cuts
+  //Generic template
+  template <typename TrackerTraits, typename Enable = void>
+  struct ParamsT : public AlgoParams {
+    // each pixelTopology is expected to define its own specialization;
+    // nothing is defined here
+    inline uint32_t nPairs() const { return 0; }
+  };
+
+  template <typename TrackerTraits>
+  struct ParamsT<TrackerTraits, pixelTopology::isPhase1Topology<TrackerTraits>> : public AlgoParams {
+    using TT = TrackerTraits;
+    using QualityCuts = pixelTrack::QualityCutsT<TT>;  //track quality cuts
+    using CellCuts = gpuPixelDoublets::CellCutsT<TT>;  //cell building cuts
+    using CAParams = CAParamsT<TT>;                    //params to be used on device
+
+    ParamsT(AlgoParams const& commonCuts,
+            CellCuts const& cellCuts,
+            QualityCuts const& qualityCuts,
+            CAParams const& caParams)
+        : AlgoParams(commonCuts), cellCuts_(cellCuts), qualityCuts_(qualityCuts), caParams_(caParams) {}
+
+    const CellCuts cellCuts_;
+    const QualityCuts qualityCuts_{// polynomial coefficients for the pT-dependent chi2 cut
+                                   {0.68177776, 0.74609577, -0.08035491, 0.00315399},
+                                   // max pT used to determine the chi2 cut
+                                   10.,
+                                   // chi2 scale factor: 30 for broken line fit, 45 for Riemann fit
+                                   30.,
+                                   // regional cuts for triplets
+                                   {
+                                       0.3,  // |Tip| < 0.3 cm
+                                       0.5,  // pT > 0.5 GeV
+                                       12.0  // |Zip| < 12.0 cm
+                                   },
+                                   // regional cuts for quadruplets
+                                   {
+                                       0.5,  // |Tip| < 0.5 cm
+                                       0.3,  // pT > 0.3 GeV
+                                       12.0  // |Zip| < 12.0 cm
+                                   }};
+    const CAParams caParams_;
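+    // e.g. with the quality-cut defaults above, a track with pT = 1 GeV must satisfy
+    //   chi2 < 30. * (0.68177776 + 0.74609577 - 0.08035491 + 0.00315399) ~= 40.5,
+    // with the polynomial evaluated at min(pT, chi2MaxPt) = min(pT, 10 GeV).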
+    /// Compute the number of pairs
+    inline uint32_t nPairs() const {
+      // take all layer pairs into account
+      uint32_t nActualPairs = TT::nPairs;
+      if (not includeJumpingForwardDoublets_) {
+        // exclude forward "jumping" layer pairs
+        nActualPairs = TT::nPairsForTriplets;
+      }
+      if (caParams_.minHitsPerNtuplet_ > 3) {
+        // for quadruplets, exclude all "jumping" layer pairs
+        nActualPairs = TT::nPairsForQuadruplets;
+      }
+
+      return nActualPairs;
+    }
+
+  };  // Params Phase1
+
+  template <typename TrackerTraits>
+  struct ParamsT<TrackerTraits, pixelTopology::isPhase2Topology<TrackerTraits>> : public AlgoParams {
+    using TT = TrackerTraits;
+    using QualityCuts = pixelTrack::QualityCutsT<TT>;
+    using CellCuts = gpuPixelDoublets::CellCutsT<TT>;
+    using CAParams = CAParamsT<TT>;
+
+    ParamsT(AlgoParams const& commonCuts,
+            CellCuts const& cellCuts,
+            QualityCuts const& qualityCuts,
+            CAParams const& caParams)
+        : AlgoParams(commonCuts), cellCuts_(cellCuts), qualityCuts_(qualityCuts), caParams_(caParams) {}
+
+    // quality cuts
+    const CellCuts cellCuts_;
+    const QualityCuts qualityCuts_{5.0f, /*chi2*/ 0.9f, /* pT in GeV*/ 0.4f, /*zip in cm*/ 12.0f /*tip in cm*/};
+    const CAParams caParams_;
+
+    inline uint32_t nPairs() const {
+      // take all layer pairs into account
+      uint32_t nActualPairs = TT::nPairsMinimal;
+      if (caParams_.includeFarForwards_) {
+        // consider far forwards (> 11 & > 23)
+        nActualPairs = TT::nPairsFarForwards;
+      }
+      if (includeJumpingForwardDoublets_) {
+        // include jumping forwards
+        nActualPairs = TT::nPairs;
+      }
+
+      return nActualPairs;
+    }
+
+  };  // Params Phase2

   // counters
   struct Counters {
@@ -27,157 +184,44 @@ namespace cAHitNtupletGenerator {
     unsigned long long nZeroTrackCells;
   };

-  using HitsView = TrackingRecHit2DSOAView;
-  using HitsOnGPU = TrackingRecHit2DSOAView;
-
-  using HitToTuple = caConstants::HitToTuple;
-  using TupleMultiplicity = caConstants::TupleMultiplicity;
-  using Quality = pixelTrack::Quality;
-  using TkSoA = pixelTrack::TrackSoA;
-  using HitContainer = pixelTrack::HitContainer;
-
-  struct QualityCuts {
-    // chi2 cut = chi2Scale * (chi2Coeff[0] + pT/GeV * (chi2Coeff[1] + pT/GeV * (chi2Coeff[2] + pT/GeV * chi2Coeff[3])))
-    float chi2Coeff[4];
-    float chi2MaxPt;  // GeV
-    float chi2Scale;
-
-    struct Region {
-      float maxTip;  // cm
-      float minPt;   // GeV
-      float maxZip;  // cm
-    };
-
-    Region triplet;
-    Region quadruplet;
-  };

-  // params (FIXME: thi si a POD: so no constructor no traling _ and no const as params_ is already const)
-  struct Params {
-    Params(bool onGPU,
-           uint32_t minHitsPerNtuplet,
-           uint32_t maxNumberOfDoublets,
-           uint16_t minHitsForSharingCuts,
-           bool useRiemannFit,
-           bool fitNas4,
-           bool includeJumpingForwardDoublets,
-           bool earlyFishbone,
-           bool lateFishbone,
-           bool idealConditions,
-           bool doStats,
-           bool doClusterCut,
-           bool doZ0Cut,
-           bool doPtCut,
-           bool doSharedHitCut,
-           bool dupPassThrough,
-           bool useSimpleTripletCleaner,
-           float ptmin,
-           float CAThetaCutBarrel,
-           float CAThetaCutForward,
-           float hardCurvCut,
-           float dcaCutInnerTriplet,
-           float dcaCutOuterTriplet,
-
-           QualityCuts const& cuts)
-        : onGPU_(onGPU),
-          minHitsPerNtuplet_(minHitsPerNtuplet),
-          maxNumberOfDoublets_(maxNumberOfDoublets),
-          minHitsForSharingCut_(minHitsForSharingCuts),
-          useRiemannFit_(useRiemannFit),
-          fitNas4_(fitNas4),
-          includeJumpingForwardDoublets_(includeJumpingForwardDoublets),
-          earlyFishbone_(earlyFishbone),
-          lateFishbone_(lateFishbone),
-          idealConditions_(idealConditions),
-          doStats_(doStats),
-          doClusterCut_(doClusterCut),
-          doZ0Cut_(doZ0Cut),
-          doPtCut_(doPtCut),
-          doSharedHitCut_(doSharedHitCut),
-          dupPassThrough_(dupPassThrough),
-          useSimpleTripletCleaner_(useSimpleTripletCleaner),
-          ptmin_(ptmin),
-          CAThetaCutBarrel_(CAThetaCutBarrel),
-          CAThetaCutForward_(CAThetaCutForward),
-          hardCurvCut_(hardCurvCut),
-          dcaCutInnerTriplet_(dcaCutInnerTriplet),
-          dcaCutOuterTriplet_(dcaCutOuterTriplet),
-          cuts_(cuts) {}
+}  // namespace caHitNtupletGenerator

-    const bool onGPU_;
-    const uint32_t minHitsPerNtuplet_;
-    const uint32_t maxNumberOfDoublets_;
-    const uint16_t minHitsForSharingCut_;
-    const bool useRiemannFit_;
-    const bool fitNas4_;
-    const bool includeJumpingForwardDoublets_;
-    const bool earlyFishbone_;
-    const bool lateFishbone_;
-    const bool idealConditions_;
-    const bool doStats_;
-    const bool doClusterCut_;
-    const bool doZ0Cut_;
-    const bool doPtCut_;
-    const bool doSharedHitCut_;
-    const bool dupPassThrough_;
-    const bool useSimpleTripletCleaner_;
-    const float ptmin_;
-    const float CAThetaCutBarrel_;
-    const float CAThetaCutForward_;
-    const float hardCurvCut_;
-    const float dcaCutInnerTriplet_;
-    const float dcaCutOuterTriplet_;
-
-    // quality cuts
-    QualityCuts cuts_{// polynomial coefficients for the pT-dependent chi2 cut
{0.68177776, 0.74609577, -0.08035491, 0.00315399}, - // max pT used to determine the chi2 cut - 10., - // chi2 scale factor: 30 for broken line fit, 45 for Riemann fit - 30., - // regional cuts for triplets - { - 0.3, // |Tip| < 0.3 cm - 0.5, // pT > 0.5 GeV - 12.0 // |Zip| < 12.0 cm - }, - // regional cuts for quadruplets - { - 0.5, // |Tip| < 0.5 cm - 0.3, // pT > 0.3 GeV - 12.0 // |Zip| < 12.0 cm - }}; - - }; // Params - -} // namespace cAHitNtupletGenerator - -template +template class CAHitNtupletGeneratorKernels { public: using Traits = TTraits; - - using QualityCuts = cAHitNtupletGenerator::QualityCuts; - using Params = cAHitNtupletGenerator::Params; - using Counters = cAHitNtupletGenerator::Counters; + using TrackerTraits = TTTraits; + using QualityCuts = pixelTrack::QualityCutsT; + using Params = caHitNtupletGenerator::ParamsT; + using CAParams = caHitNtupletGenerator::CAParamsT; + using Counters = caHitNtupletGenerator::Counters; template using unique_ptr = typename Traits::template unique_ptr; - using HitsView = TrackingRecHit2DSOAView; - using HitsOnGPU = TrackingRecHit2DSOAView; - using HitsOnCPU = TrackingRecHit2DHeterogeneous; + using HitsView = TrackingRecHit2DSOAViewT; + using HitsOnCPU = TrackingRecHit2DHeterogeneousT; - using HitToTuple = caConstants::HitToTuple; - using TupleMultiplicity = caConstants::TupleMultiplicity; + using HitToTuple = caStructures::HitToTupleT; + using TupleMultiplicity = caStructures::TupleMultiplicityT; + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + using CellNeighbors = caStructures::CellNeighborsT; + using CellTracksVector = caStructures::CellTracksVectorT; + using CellTracks = caStructures::CellTracksT; + using OuterHitOfCellContainer = caStructures::OuterHitOfCellContainerT; + using OuterHitOfCell = caStructures::OuterHitOfCellT; + + using CACell = GPUCACellT; using Quality = pixelTrack::Quality; - using TkSoA = pixelTrack::TrackSoA; - using HitContainer = pixelTrack::HitContainer; + using TkSoA = pixelTrack::TrackSoAT; + using HitContainer = pixelTrack::HitContainerT; CAHitNtupletGeneratorKernels(Params const& params) - : params_(params), paramsMaxDoubletes3Quarters_(3 * params.maxNumberOfDoublets_ / 4) {} + : params_(params), paramsMaxDoubletes3Quarters_(3 * params.cellCuts_.maxNumberOfDoublets_ / 4) {} + ~CAHitNtupletGeneratorKernels() = default; TupleMultiplicity const* tupleMultiplicity() const { return device_tupleMultiplicity_.get(); } @@ -193,24 +237,23 @@ class CAHitNtupletGeneratorKernels { static void printCounters(Counters const* counters); void setCounters(Counters* counters) { counters_ = counters; } -private: +protected: Counters* counters_ = nullptr; - // workspace unique_ptr cellStorage_; - unique_ptr device_theCellNeighbors_; - caConstants::CellNeighbors* device_theCellNeighborsContainer_; - unique_ptr device_theCellTracks_; - caConstants::CellTracks* device_theCellTracksContainer_; - - unique_ptr device_theCells_; - unique_ptr device_isOuterHitOfCell_; - GPUCACell::OuterHitOfCell isOuterHitOfCell_; + unique_ptr device_theCellNeighbors_; + CellNeighbors* device_theCellNeighborsContainer_; + unique_ptr device_theCellTracks_; + CellTracks* device_theCellTracksContainer_; + + unique_ptr device_theCells_; + unique_ptr device_isOuterHitOfCell_; + OuterHitOfCell isOuterHitOfCell_; uint32_t* device_nCells_ = nullptr; unique_ptr device_hitToTuple_; - unique_ptr device_hitToTupleStorage_; - HitToTuple::View hitToTupleView_; + unique_ptr device_hitToTupleStorage_; + typename HitToTuple::View hitToTupleView_; 
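+  // hitToTupleView_ packages the assoc together with its externally allocated offset storage
+  // (sized nHits + 1 in allocateOnGPU), so cms::cuda::launchFinalize can fill the offsets on
+  // either backend.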
cms::cuda::AtomicPairCounter* device_hitToTuple_apc_ = nullptr; @@ -219,8 +262,9 @@ class CAHitNtupletGeneratorKernels { unique_ptr device_tupleMultiplicity_; unique_ptr device_storage_; + // params - Params const& params_; + Params params_; /// Intermediate result avoiding repeated computations. const uint32_t paramsMaxDoubletes3Quarters_; /// Compute the number of doublet blocks for block size @@ -231,12 +275,50 @@ class CAHitNtupletGeneratorKernels { /// Compute the number of quadruplet blocks for block size inline uint32_t nQuadrupletBlocks(uint32_t blockSize) { - // caConstants::maxNumberOfQuadruplets is a constexpr, so the compiler will pre compute the 3*max/4 - return (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; + // pixelTopology::maxNumberOfQuadruplets is a constexpr, so the compiler will pre compute the 3*max/4 + return (3 * TrackerTraits::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; } }; -using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels; -using CAHitNtupletGeneratorKernelsCPU = CAHitNtupletGeneratorKernels; +template +class CAHitNtupletGeneratorKernelsGPU : public CAHitNtupletGeneratorKernels { + using CAHitNtupletGeneratorKernels::CAHitNtupletGeneratorKernels; + using HitsOnCPU = TrackingRecHit2DHeterogeneousT; + using TkSoA = pixelTrack::TrackSoAT; + using Counters = caHitNtupletGenerator::Counters; + using HitContainer = pixelTrack::HitContainerT; + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + using HitToTuple = caStructures::HitToTupleT; + using CellTracksVector = caStructures::CellTracksVectorT; + using TupleMultiplicity = caStructures::TupleMultiplicityT; + using CAParams = caHitNtupletGenerator::CAParamsT; + +public: + void launchKernels(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); + void classifyTuples(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); + void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); + void allocateOnGPU(int32_t nHits, cudaStream_t stream); + static void printCounters(Counters const* counters); +}; + +template +class CAHitNtupletGeneratorKernelsCPU : public CAHitNtupletGeneratorKernels { + using CAHitNtupletGeneratorKernels::CAHitNtupletGeneratorKernels; + using HitsOnCPU = TrackingRecHit2DHeterogeneousT; + using TkSoA = pixelTrack::TrackSoAT; + using Counters = caHitNtupletGenerator::Counters; + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + using HitToTuple = caStructures::HitToTupleT; + using CellTracksVector = caStructures::CellTracksVectorT; + using TupleMultiplicity = caStructures::TupleMultiplicityT; + using CAParams = caHitNtupletGenerator::CAParamsT; + +public: + void launchKernels(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); + void classifyTuples(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); + void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); + void allocateOnGPU(int32_t nHits, cudaStream_t stream); + static void printCounters(Counters const* counters); +}; #endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc index 5978ef8851c73..af085bb12eddd 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc @@ -2,18 +2,21 @@ #include 
"CAHitNtupletGeneratorKernels.h" -template <> +//#define GPU_DEBUG +template #ifdef __CUDACC__ -void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(int32_t nHits, cudaStream_t stream) { +void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(int32_t nHits, cudaStream_t stream) { + using Traits = cms::cudacompat::GPUTraits; #else -void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(int32_t nHits, cudaStream_t stream) { +void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(int32_t nHits, cudaStream_t stream) { + using Traits = cms::cudacompat::CPUTraits; #endif ////////////////////////////////////////////////////////// // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) ////////////////////////////////////////////////////////// - device_theCellNeighbors_ = Traits::template make_unique(stream); - device_theCellTracks_ = Traits::template make_unique(stream); + this->device_theCellNeighbors_ = Traits::template make_unique(stream); + this->device_theCellTracks_ = Traits::template make_unique(stream); #ifdef GPU_DEBUG std::cout << "Allocation for tuple building. N hits " << nHits << std::endl; @@ -21,30 +24,36 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(int32_t nHits, cudaStream_t nHits++; // storage requires one more counter; assert(nHits > 0); - device_hitToTuple_ = Traits::template make_unique(stream); - device_hitToTupleStorage_ = Traits::template make_unique(nHits, stream); - hitToTupleView_.assoc = device_hitToTuple_.get(); - hitToTupleView_.offStorage = device_hitToTupleStorage_.get(); - hitToTupleView_.offSize = nHits; + this->device_hitToTuple_ = Traits::template make_unique(stream); + this->device_hitToTupleStorage_ = Traits::template make_unique(nHits, stream); + this->hitToTupleView_.assoc = this->device_hitToTuple_.get(); + this->hitToTupleView_.offStorage = this->device_hitToTupleStorage_.get(); + this->hitToTupleView_.offSize = nHits; - device_tupleMultiplicity_ = Traits::template make_unique(stream); + this->device_tupleMultiplicity_ = Traits::template make_unique(stream); - device_storage_ = Traits::template make_unique(3, stream); + this->device_storage_ = Traits::template make_unique(3, stream); - device_hitTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get(); - device_hitToTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get() + 1; - device_nCells_ = (uint32_t*)(device_storage_.get() + 2); + this->device_hitTuple_apc_ = (cms::cuda::AtomicPairCounter*)this->device_storage_.get(); + this->device_hitToTuple_apc_ = (cms::cuda::AtomicPairCounter*)this->device_storage_.get() + 1; + this->device_nCells_ = (uint32_t*)(this->device_storage_.get() + 2); // FIXME: consider collapsing these 3 in one adhoc kernel if constexpr (std::is_same::value) { - cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream)); + cudaCheck(cudaMemsetAsync(this->device_nCells_, 0, sizeof(uint32_t), stream)); } else { - *device_nCells_ = 0; + *(this->device_nCells_) = 0; } - cms::cuda::launchZero(device_tupleMultiplicity_.get(), stream); - cms::cuda::launchZero(hitToTupleView_, stream); // we may wish to keep it in the edm + cms::cuda::launchZero(this->device_tupleMultiplicity_.get(), stream); + cms::cuda::launchZero(this->hitToTupleView_, stream); // we may wish to keep it in the edm #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); #endif } + +template class CAHitNtupletGeneratorKernelsGPU; +template class CAHitNtupletGeneratorKernelsGPU; + +template class CAHitNtupletGeneratorKernelsCPU; +template class 
CAHitNtupletGeneratorKernelsCPU; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index bbe5df891a735..03112e0f3fc48 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -15,923 +15,964 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" -#include "CAConstants.h" +#include "CAStructures.h" #include "CAHitNtupletGeneratorKernels.h" #include "GPUCACell.h" #include "gpuFishbone.h" #include "gpuPixelDoublets.h" -using HitsOnGPU = TrackingRecHit2DSOAView; -using HitsOnCPU = TrackingRecHit2DGPU; +namespace caHitNtupletGeneratorKernels { -using HitToTuple = caConstants::HitToTuple; -using TupleMultiplicity = caConstants::TupleMultiplicity; + constexpr uint32_t tkNotFound = std::numeric_limits::max(); + constexpr float maxScore = std::numeric_limits::max(); + constexpr float nSigma2 = 25.f; -using Quality = pixelTrack::Quality; -using TkSoA = pixelTrack::TrackSoA; -using HitContainer = pixelTrack::HitContainer; + //all of these below are mostly to avoid brining around the relative namespace + template + using HitsView = TrackingRecHit2DSOAViewT; -namespace { + template + using HitToTuple = caStructures::HitToTupleT; - constexpr uint16_t tkNotFound = std::numeric_limits::max(); - constexpr float maxScore = std::numeric_limits::max(); - constexpr float nSigma2 = 25.f; + template + using TupleMultiplicity = caStructures::TupleMultiplicityT; -} // namespace - -__global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, - caConstants::TupleMultiplicity const *tupleMultiplicity, - CAHitNtupletGeneratorKernelsGPU::HitToTuple const *hitToTuple, - cms::cuda::AtomicPairCounter *apc, - GPUCACell const *__restrict__ cells, - uint32_t const *__restrict__ nCells, - gpuPixelDoublets::CellNeighborsVector const *cellNeighbors, - gpuPixelDoublets::CellTracksVector const *cellTracks, - GPUCACell::OuterHitOfCell const isOuterHitOfCell, - int32_t nHits, - uint32_t maxNumberOfDoublets, - CAHitNtupletGeneratorKernelsGPU::Counters *counters) { - auto first = threadIdx.x + blockIdx.x * blockDim.x; - - auto &c = *counters; - // counters once per event - if (0 == first) { - atomicAdd(&c.nEvents, 1); - atomicAdd(&c.nHits, nHits); - atomicAdd(&c.nCells, *nCells); - atomicAdd(&c.nTuples, apc->get().m); - atomicAdd(&c.nFitTracks, tupleMultiplicity->size()); - } + template + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + + template + using CellTracksVector = caStructures::CellTracksVectorT; + + template + using OuterHitOfCell = caStructures::OuterHitOfCellT; + + using Quality = pixelTrack::Quality; + + template + using TkSoA = pixelTrack::TrackSoAT; + + template + using HitContainer = pixelTrack::HitContainerT; + + template + using Hits = typename GPUCACellT::Hits; + + template + using QualityCuts = pixelTrack::QualityCutsT; + + template + using CAParams = caHitNtupletGenerator::CAParamsT; + + using Counters = caHitNtupletGenerator::Counters; + + template + __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, + TupleMultiplicity const *tupleMultiplicity, + HitToTuple const *hitToTuple, + cms::cuda::AtomicPairCounter *apc, + GPUCACellT const *__restrict__ cells, + uint32_t const *__restrict__ nCells, + CellNeighborsVector const *cellNeighbors, + 
CellTracksVector const *cellTracks, + OuterHitOfCell const isOuterHitOfCell, + int32_t nHits, + uint32_t maxNumberOfDoublets, + Counters *counters) { + auto first = threadIdx.x + blockIdx.x * blockDim.x; + + auto &c = *counters; + // counters once per event + if (0 == first) { + atomicAdd(&c.nEvents, 1); + atomicAdd(&c.nHits, nHits); + atomicAdd(&c.nCells, *nCells); + atomicAdd(&c.nTuples, apc->get().m); + atomicAdd(&c.nFitTracks, tupleMultiplicity->size()); + } #ifdef NTUPLE_DEBUG - if (0 == first) { - printf("number of found cells %d, found tuples %d with total hits %d out of %d %d\n", - *nCells, - apc->get().m, - apc->get().n, - nHits, - hitToTuple->totOnes()); - if (apc->get().m < caConstants::maxNumberOfQuadruplets) { - assert(foundNtuplets->size(apc->get().m) == 0); - assert(foundNtuplets->size() == apc->get().n); + if (0 == first) { + printf("number of found cells %d \n found tuples %d with total hits %d out of %d %d\n", + *nCells, + apc->get().m, + apc->get().n, + nHits, + hitToTuple->totOnes()); + if (apc->get().m < TrackerTraits::maxNumberOfQuadruplets) { + assert(foundNtuplets->size(apc->get().m) == 0); + assert(foundNtuplets->size() == apc->get().n); + } } - } - for (int idx = first, nt = foundNtuplets->nOnes(); idx < nt; idx += gridDim.x * blockDim.x) { - if (foundNtuplets->size(idx) > 7) // current real limit - printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); - assert(foundNtuplets->size(idx) <= caConstants::maxHitsOnTrack); - for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) - assert(int(*ih) < nHits); - } + for (int idx = first, nt = foundNtuplets->nOnes(); idx < nt; idx += gridDim.x * blockDim.x) { + if (foundNtuplets->size(idx) > TrackerTraits::maxHitsOnTrack) // current real limit + printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); + assert(foundNtuplets->size(idx) <= TrackerTraits::maxHitsOnTrack); + for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) + assert(int(*ih) < nHits); + } #endif - if (0 == first) { - if (apc->get().m >= caConstants::maxNumberOfQuadruplets) - printf("Tuples overflow\n"); - if (*nCells >= maxNumberOfDoublets) - printf("Cells overflow\n"); - if (cellNeighbors && cellNeighbors->full()) - printf("cellNeighbors overflow\n"); - if (cellTracks && cellTracks->full()) - printf("cellTracks overflow\n"); - if (int(hitToTuple->nOnes()) < nHits) - printf("ERROR hitToTuple overflow %d %d\n", hitToTuple->nOnes(), nHits); - } + if (0 == first) { + if (apc->get().m >= TrackerTraits::maxNumberOfQuadruplets) + printf("Tuples overflow\n"); + if (*nCells >= maxNumberOfDoublets) + printf("Cells overflow\n"); + if (cellNeighbors && cellNeighbors->full()) + printf("cellNeighbors overflow %d %d \n", cellNeighbors->capacity(), cellNeighbors->size()); + if (cellTracks && cellTracks->full()) + printf("cellTracks overflow\n"); + if (int(hitToTuple->nOnes()) < nHits) + printf("ERROR hitToTuple overflow %d %d\n", hitToTuple->nOnes(), nHits); +#ifdef GPU_DEBUG + printf("size of cellNeighbors %d \n cellTracks %d \n hitToTuple %d \n", + cellNeighbors->size(), + cellTracks->size(), + hitToTuple->size()); +// printf("cellTracksSizes;"); +// for (int i = 0; i < cellTracks->size(); i++) { +// printf("%d;",cellTracks[i].size()); +// } +// +// printf("\n"); +// printf("cellNeighborsSizes;"); +// for (int i = 0; i < cellNeighbors->size(); i++) { +// printf("%d;",cellNeighbors[i].size()); +// } +// printf("\n"); +#endif + } - for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { - auto 
const &thisCell = cells[idx]; - if (thisCell.hasFishbone() && !thisCell.isKilled()) - atomicAdd(&c.nFishCells, 1); - if (thisCell.outerNeighbors().full()) //++tooManyNeighbors[thisCell.theLayerPairId]; - printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.layerPairId()); - if (thisCell.tracks().full()) //++tooManyTracks[thisCell.theLayerPairId]; - printf("Tracks overflow %d in %d\n", idx, thisCell.layerPairId()); - if (thisCell.isKilled()) - atomicAdd(&c.nKilledCells, 1); - if (!thisCell.unused()) - atomicAdd(&c.nEmptyCells, 1); - if ((0 == hitToTuple->size(thisCell.inner_hit_id())) && (0 == hitToTuple->size(thisCell.outer_hit_id()))) - atomicAdd(&c.nZeroTrackCells, 1); - } + for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { + auto const &thisCell = cells[idx]; + if (thisCell.hasFishbone() && !thisCell.isKilled()) + atomicAdd(&c.nFishCells, 1); + if (thisCell.outerNeighbors().full()) //++tooManyNeighbors[thisCell.theLayerPairId]; + printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.layerPairId()); + if (thisCell.tracks().full()) //++tooManyTracks[thisCell.theLayerPairId]; + printf("Tracks overflow %d in %d\n", idx, thisCell.layerPairId()); + if (thisCell.isKilled()) + atomicAdd(&c.nKilledCells, 1); + if (!thisCell.unused()) + atomicAdd(&c.nEmptyCells, 1); + if ((0 == hitToTuple->size(thisCell.inner_hit_id())) && (0 == hitToTuple->size(thisCell.outer_hit_id()))) + atomicAdd(&c.nZeroTrackCells, 1); + } - for (int idx = first, nt = nHits - isOuterHitOfCell.offset; idx < nt; idx += gridDim.x * blockDim.x) { - if (isOuterHitOfCell.container[idx].full()) // ++tooManyOuterHitOfCell; - printf("OuterHitOfCell overflow %d\n", idx); + for (int idx = first, nt = nHits - isOuterHitOfCell.offset; idx < nt; idx += gridDim.x * blockDim.x) { + if (isOuterHitOfCell.container[idx].full()) // ++tooManyOuterHitOfCell; + printf("OuterHitOfCell overflow %d\n", idx); + } } -} -__global__ void kernel_fishboneCleaner(GPUCACell const *cells, uint32_t const *__restrict__ nCells, Quality *quality) { - constexpr auto reject = pixelTrack::Quality::dup; + template + __global__ void kernel_fishboneCleaner(GPUCACellT const *cells, + uint32_t const *__restrict__ nCells, + Quality *quality) { + constexpr auto reject = pixelTrack::Quality::dup; - auto first = threadIdx.x + blockIdx.x * blockDim.x; - for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { - auto const &thisCell = cells[idx]; - if (!thisCell.isKilled()) - continue; + auto first = threadIdx.x + blockIdx.x * blockDim.x; + for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { + auto const &thisCell = cells[idx]; + if (!thisCell.isKilled()) + continue; - for (auto it : thisCell.tracks()) - quality[it] = reject; - } -} - -// remove shorter tracks if sharing a cell -// It does not seem to affect efficiency in any way! 
-__global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells,
-                                             uint32_t const *__restrict__ nCells,
-                                             TkSoA const *__restrict__ ptracks,
-                                             Quality *quality,
-                                             bool dupPassThrough) {
-  // quality to mark rejected
-  constexpr auto reject = pixelTrack::Quality::edup;  /// cannot be loose
-
-  auto const &tracks = *ptracks;
-
-  assert(nCells);
-  auto first = threadIdx.x + blockIdx.x * blockDim.x;
-  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
-    auto const &thisCell = cells[idx];
-
-    if (thisCell.tracks().size() < 2)
-      continue;
-
-    int8_t maxNl = 0;
-
-    // find maxNl
-    for (auto it : thisCell.tracks()) {
-      auto nl = tracks.nLayers(it);
-      maxNl = std::max(nl, maxNl);
+      for (auto it : thisCell.tracks())
+        quality[it] = reject;
     }
+  }

-    // if (maxNl<4) continue;
-    // quad pass through (leave it her for tests)
-    // maxNl = std::min(4, maxNl);
+  // remove shorter tracks if sharing a cell
+  // It does not seem to affect efficiency in any way!
+  template <typename TrackerTraits>
+  __global__ void kernel_earlyDuplicateRemover(GPUCACellT<TrackerTraits> const *cells,
+                                               uint32_t const *__restrict__ nCells,
+                                               TkSoA<TrackerTraits> const *__restrict__ ptracks,
+                                               Quality *quality,
+                                               bool dupPassThrough) {
+    // quality to mark rejected
+    constexpr auto reject = pixelTrack::Quality::edup;  /// cannot be loose
+
+    auto const &tracks = *ptracks;
+
+    assert(nCells);
+    auto first = threadIdx.x + blockIdx.x * blockDim.x;
+    for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
+      auto const &thisCell = cells[idx];
+
+      if (thisCell.tracks().size() < 2)
+        continue;

-    for (auto it : thisCell.tracks()) {
-      if (tracks.nLayers(it) < maxNl)
-        quality[it] = reject;  //no race: simple assignment of the same constant
-    }
-  }
-}
+      int8_t maxNl = 0;

-// assume the above (so, short tracks already removed)
-__global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells,
-                                            uint32_t const *__restrict__ nCells,
-                                            TkSoA *__restrict__ tracks,
-                                            bool dupPassThrough) {
-  // quality to mark rejected
-  auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
-  constexpr auto loose = pixelTrack::Quality::loose;
+      // find maxNl
+      for (auto it : thisCell.tracks()) {
+        auto nl = tracks.nLayers(it);
+        maxNl = std::max(nl, maxNl);
+      }

-  assert(nCells);
+      // if (maxNl<4) continue;
+      // quad pass through (leave it here for tests)
+      // maxNl = std::min(4, maxNl);

-  auto first = threadIdx.x + blockIdx.x * blockDim.x;
-  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
-    auto const &thisCell = cells[idx];
-    if (thisCell.tracks().size() < 2)
-      continue;
+      for (auto it : thisCell.tracks()) {
+        if (tracks.nLayers(it) < maxNl)
+          quality[it] = reject;  //no race: simple assignment of the same constant
+      }
+    }
+  }

-    float mc = maxScore;
-    uint16_t im = tkNotFound;
+  // assume the above (so, short tracks already removed)
+  template <typename TrackerTraits>
+  __global__ void kernel_fastDuplicateRemover(GPUCACellT<TrackerTraits> const *__restrict__ cells,
+                                              uint32_t const *__restrict__ nCells,
+                                              TkSoA<TrackerTraits> *__restrict__ tracks,
+                                              bool dupPassThrough) {
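+    // among the tracks sharing this cell, pairs are compared only when their 1/pt and
+    // cot(theta) (state components 2 and 3) agree within 5 sigma (nSigma2 = 25 = 5^2);
+    // the lower-quality track, or at equal quality the one with the larger |tip| score,
+    // is then marked as a duplicate.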
+    // quality to mark rejected
+    auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
+    constexpr auto loose = pixelTrack::Quality::loose;
+
+    assert(nCells);
+
+    auto first = threadIdx.x + blockIdx.x * blockDim.x;
+    for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
+      auto const &thisCell = cells[idx];
+      if (thisCell.tracks().size() < 2)
+        continue;

-    float mc = maxScore;
-    uint16_t im = tkNotFound;
+      float mc = maxScore;
+      uint16_t im = tkNotFound;
+
+      /* chi2 penalize higher-pt tracks (try rescale it?)
       auto score = [&](auto it) {
-        return tracks->nLayers(it) < 4 ?
+        return tracks->nLayers(it) < 4 ?
                std::abs(tracks->tip(it)) :  // tip for triplets
                tracks->chi2(it);            //chi2 for quads
       };
       */
-    auto score = [&](auto it) { return std::abs(tracks->tip(it)); };
+      auto score = [&](auto it) { return std::abs(tracks->tip(it)); };

-    // full crazy combinatorics
-    int ntr = thisCell.tracks().size();
-    for (int i = 0; i < ntr - 1; ++i) {
-      auto it = thisCell.tracks()[i];
-      auto qi = tracks->quality(it);
-      if (qi <= reject)
-        continue;
-      auto opi = tracks->stateAtBS.state(it)(2);
-      auto e2opi = tracks->stateAtBS.covariance(it)(9);
-      auto cti = tracks->stateAtBS.state(it)(3);
-      auto e2cti = tracks->stateAtBS.covariance(it)(12);
-      for (auto j = i + 1; j < ntr; ++j) {
-        auto jt = thisCell.tracks()[j];
-        auto qj = tracks->quality(jt);
-        if (qj <= reject)
+      // full crazy combinatorics
+      int ntr = thisCell.tracks().size();
+      for (int i = 0; i < ntr - 1; ++i) {
+        auto it = thisCell.tracks()[i];
+        auto qi = tracks->quality(it);
+        if (qi <= reject)
           continue;
-        auto opj = tracks->stateAtBS.state(jt)(2);
-        auto ctj = tracks->stateAtBS.state(jt)(3);
-        auto dct = nSigma2 * (tracks->stateAtBS.covariance(jt)(12) + e2cti);
-        if ((cti - ctj) * (cti - ctj) > dct)
+        auto opi = tracks->stateAtBS.state(it)(2);
+        auto e2opi = tracks->stateAtBS.covariance(it)(9);
+        auto cti = tracks->stateAtBS.state(it)(3);
+        auto e2cti = tracks->stateAtBS.covariance(it)(12);
+        for (auto j = i + 1; j < ntr; ++j) {
+          auto jt = thisCell.tracks()[j];
+          auto qj = tracks->quality(jt);
+          if (qj <= reject)
+            continue;
+          auto opj = tracks->stateAtBS.state(jt)(2);
+          auto ctj = tracks->stateAtBS.state(jt)(3);
+          auto dct = nSigma2 * (tracks->stateAtBS.covariance(jt)(12) + e2cti);
+          if ((cti - ctj) * (cti - ctj) > dct)
+            continue;
+          auto dop = nSigma2 * (tracks->stateAtBS.covariance(jt)(9) + e2opi);
+          if ((opi - opj) * (opi - opj) > dop)
+            continue;
+          if ((qj < qi) || (qj == qi && score(it) < score(jt)))
+            tracks->quality(jt) = reject;
+          else {
+            tracks->quality(it) = reject;
+            break;
+          }
        }
      }
-    }

      // find maxQual
      auto maxQual = reject;  // no duplicate!
+ for (auto it : thisCell.tracks()) { + if (tracks->quality(it) > maxQual) + maxQual = tracks->quality(it); + } - if (maxQual <= loose) - continue; + if (maxQual <= loose) + continue; - // find min score - for (auto it : thisCell.tracks()) { - if (tracks->quality(it) == maxQual && score(it) < mc) { - mc = score(it); - im = it; + // find min score + for (auto it : thisCell.tracks()) { + if (tracks->quality(it) == maxQual && score(it) < mc) { + mc = score(it); + im = it; + } } - } - if (tkNotFound == im) - continue; + if (tkNotFound == im) + continue; - // mark all other duplicates (not yet, keep it loose) - for (auto it : thisCell.tracks()) { - if (tracks->quality(it) > loose && it != im) - tracks->quality(it) = loose; //no race: simple assignment of the same constant + // mark all other duplicates (not yet, keep it loose) + for (auto it : thisCell.tracks()) { + if (tracks->quality(it) > loose && it != im) + tracks->quality(it) = loose; //no race: simple assignment of the same constant + } } } -} - -__global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1, - cms::cuda::AtomicPairCounter *apc2, // just to zero them, - GPUCACell::Hits const *__restrict__ hhp, - GPUCACell *cells, - uint32_t const *__restrict__ nCells, - gpuPixelDoublets::CellNeighborsVector *cellNeighbors, - GPUCACell::OuterHitOfCell const isOuterHitOfCell, - float hardCurvCut, - float ptmin, - float CAThetaCutBarrel, - float CAThetaCutForward, - float dcaCutInnerTriplet, - float dcaCutOuterTriplet) { - auto const &hh = *hhp; - - auto firstCellIndex = threadIdx.y + blockIdx.y * blockDim.y; - auto first = threadIdx.x; - auto stride = blockDim.x; - - if (0 == (firstCellIndex + first)) { - (*apc1) = 0; - (*apc2) = 0; - } // ready for next kernel - - for (int idx = firstCellIndex, nt = (*nCells); idx < nt; idx += gridDim.y * blockDim.y) { - auto cellIndex = idx; - auto &thisCell = cells[idx]; - auto innerHitId = thisCell.inner_hit_id(); - if (int(innerHitId) < isOuterHitOfCell.offset) - continue; - int numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size(); - auto vi = isOuterHitOfCell[innerHitId].data(); - - auto ri = thisCell.inner_r(hh); - auto zi = thisCell.inner_z(hh); - - auto ro = thisCell.outer_r(hh); - auto zo = thisCell.outer_z(hh); - auto isBarrel = thisCell.inner_detIndex(hh) < caConstants::last_barrel_detIndex; - - for (int j = first; j < numberOfPossibleNeighbors; j += stride) { - auto otherCell = __ldg(vi + j); - auto &oc = cells[otherCell]; - auto r1 = oc.inner_r(hh); - auto z1 = oc.inner_z(hh); - bool aligned = GPUCACell::areAlignedRZ( - r1, - z1, - ri, - zi, - ro, - zo, - ptmin, - isBarrel ? CAThetaCutBarrel : CAThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts - if (aligned && thisCell.dcaCut(hh, - oc, - oc.inner_detIndex(hh) < caConstants::last_bpix1_detIndex ? 
dcaCutInnerTriplet - : dcaCutOuterTriplet, - hardCurvCut)) { // FIXME tune cuts - oc.addOuterNeighbor(cellIndex, *cellNeighbors); - thisCell.setStatusBits(GPUCACell::StatusBit::kUsed); - oc.setStatusBits(GPUCACell::StatusBit::kUsed); + + template + __global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1, + cms::cuda::AtomicPairCounter *apc2, // just to zero them, + Hits const *__restrict__ hhp, + GPUCACellT *cells, + uint32_t const *__restrict__ nCells, + CellNeighborsVector *cellNeighbors, + OuterHitOfCell const isOuterHitOfCell, + CAParams params) { + using Cell = GPUCACellT; + auto const &hh = *hhp; + + auto firstCellIndex = threadIdx.y + blockIdx.y * blockDim.y; + auto first = threadIdx.x; + auto stride = blockDim.x; + + if (0 == (firstCellIndex + first)) { + (*apc1) = 0; + (*apc2) = 0; + } // ready for next kernel + + constexpr uint32_t last_bpix1_detIndex = TrackerTraits::last_bpix1_detIndex; + constexpr uint32_t last_barrel_detIndex = TrackerTraits::last_barrel_detIndex; + for (int idx = firstCellIndex, nt = (*nCells); idx < nt; idx += gridDim.y * blockDim.y) { + auto cellIndex = idx; + auto &thisCell = cells[idx]; + auto innerHitId = thisCell.inner_hit_id(); + if (int(innerHitId) < isOuterHitOfCell.offset) + continue; + int numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size(); + auto vi = isOuterHitOfCell[innerHitId].data(); + + auto ri = thisCell.inner_r(hh); + auto zi = thisCell.inner_z(hh); + + auto ro = thisCell.outer_r(hh); + auto zo = thisCell.outer_z(hh); + auto isBarrel = thisCell.inner_detIndex(hh) < last_barrel_detIndex; + + for (int j = first; j < numberOfPossibleNeighbors; j += stride) { + auto otherCell = __ldg(vi + j); + auto &oc = cells[otherCell]; + auto r1 = oc.inner_r(hh); + auto z1 = oc.inner_z(hh); + bool aligned = Cell::areAlignedRZ( + r1, + z1, + ri, + zi, + ro, + zo, + params.ptmin_, + isBarrel ? params.CAThetaCutBarrel_ : params.CAThetaCutForward_); // 2.f*thetaCut); // FIXME tune cuts + if (aligned && thisCell.dcaCut(hh, + oc, + oc.inner_detIndex(hh) < last_bpix1_detIndex ? params.dcaCutInnerTriplet_ + : params.dcaCutOuterTriplet_, + params.hardCurvCut_)) { // FIXME tune cuts + oc.addOuterNeighbor(cellIndex, *cellNeighbors); + thisCell.setStatusBits(Cell::StatusBit::kUsed); + oc.setStatusBits(Cell::StatusBit::kUsed); + } + } // loop on inner cells + } // loop on outer cells + } + + template + __global__ void kernel_find_ntuplets(Hits const *__restrict__ hhp, + GPUCACellT *__restrict__ cells, + uint32_t const *nCells, + CellTracksVector *cellTracks, + HitContainer *foundNtuplets, + cms::cuda::AtomicPairCounter *apc, + Quality *__restrict__ quality, + CAParams params) { + // recursive: not obvious to widen + auto const &hh = *hhp; + + using Cell = GPUCACellT; + + auto first = threadIdx.x + blockIdx.x * blockDim.x; + +#ifdef GPU_DEBUG + if (first == 0) + printf("starting producing ntuplets from %d cells \n", *nCells); +#endif + for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { + auto const &thisCell = cells[idx]; + + if (thisCell.isKilled()) + continue; // cut by earlyFishbone + + // we require at least three hits... 
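[Editor's note] Two remarks on the lines that follow. First, a cell whose outer-neighbor list is empty can belong to at most a doublet, so it cannot seed an ntuplet and is skipped. Second, params.startingLayerPair(pid) generalizes the hard-coded layer-pair selection visible in the removed code further below (`auto doit = minHitsPerNtuplet > 3 ? pid < 3 : pid < 8 || pid > 12;`). A sketch of what the Phase1 predicate presumably encodes; this is an assumption, since CAParams itself is not shown in this excerpt:

    // Hypothetical reconstruction for Phase1, mirroring the removed hard-coded cut:
    // quadruplet seeding starts only from the three BPix1 pairs (pid < 3), while
    // triplet seeding also allows the remaining pairs except pids 8..12.
    constexpr bool startingLayerPairPhase1(unsigned int minHitsPerNtuplet, int pid) {
      return minHitsPerNtuplet > 3 ? pid < 3 : (pid < 8 || pid > 12);
    }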
+ if (thisCell.outerNeighbors().empty()) + continue; + + auto pid = thisCell.layerPairId(); + bool doit = params.startingLayerPair(pid); + + constexpr uint32_t maxDepth = TrackerTraits::maxDepth; + if (doit) { + typename Cell::TmpTuple stack; + stack.reset(); + + bool bpix1Start = params.startAt0(pid); + + thisCell.template find_ntuplets( + hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, params.minHitsPerNtuplet_, bpix1Start); + + assert(stack.empty()); } - } // loop on inner cells - } // loop on outer cells -} - -__global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, - GPUCACell *__restrict__ cells, - uint32_t const *nCells, - gpuPixelDoublets::CellTracksVector *cellTracks, - HitContainer *foundNtuplets, - cms::cuda::AtomicPairCounter *apc, - Quality *__restrict__ quality, - unsigned int minHitsPerNtuplet) { - // recursive: not obvious to widen - auto const &hh = *hhp; - - auto first = threadIdx.x + blockIdx.x * blockDim.x; - for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { - auto const &thisCell = cells[idx]; - if (thisCell.isKilled()) - continue; // cut by earlyFishbone - // we require at least three hits... - if (thisCell.outerNeighbors().empty()) - continue; - auto pid = thisCell.layerPairId(); - auto doit = minHitsPerNtuplet > 3 ? pid < 3 : pid < 8 || pid > 12; - if (doit) { - GPUCACell::TmpTuple stack; - stack.reset(); - thisCell.find_ntuplets<6>( - hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3); - assert(stack.empty()); - // printf("in %d found quadruplets: %d\n", cellIndex, apc->get()); } } -} - -__global__ void kernel_mark_used(GPUCACell *__restrict__ cells, uint32_t const *nCells) { - auto first = threadIdx.x + blockIdx.x * blockDim.x; - for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { - auto &thisCell = cells[idx]; - if (!thisCell.tracks().empty()) - thisCell.setStatusBits(GPUCACell::StatusBit::kInTrack); + template + __global__ void kernel_mark_used(GPUCACellT *__restrict__ cells, uint32_t const *nCells) { + auto first = threadIdx.x + blockIdx.x * blockDim.x; + using Cell = GPUCACellT; + for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { + auto &thisCell = cells[idx]; + if (!thisCell.tracks().empty()) + thisCell.setStatusBits(Cell::StatusBit::kInTrack); + } } -} -__global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets, - Quality const *__restrict__ quality, - caConstants::TupleMultiplicity *tupleMultiplicity) { - auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = foundNtuplets->size(it); - if (nhits < 3) - continue; - if (quality[it] == pixelTrack::Quality::edup) - continue; - assert(quality[it] == pixelTrack::Quality::bad); - if (nhits > 7) // current limit - printf("wrong mult %d %d\n", it, nhits); - assert(nhits <= caConstants::maxHitsOnTrack); - tupleMultiplicity->count(nhits); + template + __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets, + Quality const *__restrict__ quality, + TupleMultiplicity *tupleMultiplicity) { + auto first = blockIdx.x * blockDim.x + threadIdx.x; + for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = foundNtuplets->size(it); + if (nhits < 3) + continue; + if (quality[it] == pixelTrack::Quality::edup) + continue; + assert(quality[it] == 
pixelTrack::Quality::bad); + if (nhits > TrackerTraits::maxHitsOnTrack) // current limit + printf("wrong mult %d %d\n", it, nhits); + assert(nhits <= TrackerTraits::maxHitsOnTrack); + tupleMultiplicity->count(nhits); + } } -} -__global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, - Quality const *__restrict__ quality, - caConstants::TupleMultiplicity *tupleMultiplicity) { - auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = foundNtuplets->size(it); - if (nhits < 3) - continue; - if (quality[it] == pixelTrack::Quality::edup) - continue; - assert(quality[it] == pixelTrack::Quality::bad); - if (nhits > 7) - printf("wrong mult %d %d\n", it, nhits); - assert(nhits <= caConstants::maxHitsOnTrack); - tupleMultiplicity->fill(nhits, it); - } -} - -__global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, - TkSoA const *__restrict__ tracks, - CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts, - Quality *__restrict__ quality) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int it = first, nt = tuples->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = tuples->size(it); - if (nhits == 0) - break; // guard - - // if duplicate: not even fit - if (quality[it] == pixelTrack::Quality::edup) - continue; - - assert(quality[it] == pixelTrack::Quality::bad); - - // mark doublets as bad - if (nhits < 3) - continue; - - // if the fit has any invalid parameters, mark it as bad - bool isNaN = false; - for (int i = 0; i < 5; ++i) { - isNaN |= std::isnan(tracks->stateAtBS.state(it)(i)); - } - if (isNaN) { -#ifdef NTUPLE_DEBUG - printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks->chi2(it)); -#endif - continue; + template + __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, + Quality const *__restrict__ quality, + TupleMultiplicity *tupleMultiplicity) { + auto first = blockIdx.x * blockDim.x + threadIdx.x; + for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = foundNtuplets->size(it); + if (nhits < 3) + continue; + if (quality[it] == pixelTrack::Quality::edup) + continue; + assert(quality[it] == pixelTrack::Quality::bad); + if (nhits > TrackerTraits::maxHitsOnTrack) + printf("wrong mult %d %d\n", it, nhits); + assert(nhits <= TrackerTraits::maxHitsOnTrack); + tupleMultiplicity->fill(nhits, it); } + } - quality[it] = pixelTrack::Quality::strict; - - // compute a pT-dependent chi2 cut - - auto roughLog = [](float x) { - // max diff [0.5,12] at 1.25 0.16143 - // average diff 0.0662998 - union IF { - uint32_t i; - float f; - }; - IF z; - z.f = x; - uint32_t lsb = 1 < 21; - z.i += lsb; - z.i >>= 21; - auto f = z.i & 3; - int ex = int(z.i >> 2) - 127; - - // log2(1+0.25*f) - // averaged over bins - const float frac[4] = {0.160497f, 0.452172f, 0.694562f, 0.901964f}; - return float(ex) + frac[f]; - }; + template + __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, + TkSoA const *__restrict__ tracks, + QualityCuts cuts, + Quality *__restrict__ quality) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int it = first, nt = tuples->nOnes(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = tuples->size(it); + if (nhits == 0) + break; // guard + + // if duplicate: not even fit + if (quality[it] == pixelTrack::Quality::edup) + continue; + + assert(quality[it] == pixelTrack::Quality::bad); 
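[Editor's note] The cuts.strictCut(tracks, it) call a few lines below replaces the inline pT-dependent chi2 cut of the removed code above: chi2Cut = chi2Scale * (chi2Coeff[0] + roughLog(pt) * chi2Coeff[1]), with pt clamped to chi2MaxPt. Because makeQualityCuts (later in this patch) remaps coeff[1] to (coeff[1] - coeff[0]) / log2(ptMax), the cut runs from about chi2Scale * coeff[0] at 1 GeV (log2 = 0) up to chi2Scale times the configured coeff[1] at ptMax. roughLog itself is a fast log2 built on the float bit pattern; note that the removed `uint32_t lsb = 1 < 21;` evaluates to 1 (a comparison, not a shift), so the subsequent `z.i += lsb` is effectively a no-op, which is consistent with the frac[] table, whose entries are averages of log2 over truncated-mantissa bins. A standalone sketch, mirroring the original (including its union type-pun) minus that no-op line:

    // Illustrative reconstruction, not part of the patch; valid for positive x.
    #include <cstdint>

    inline float roughLog2(float x) {
      union IF {
        uint32_t i;
        float f;
      } z;
      z.f = x;
      z.i >>= 21;                    // keep the exponent plus the top 2 mantissa bits
      int ex = int(z.i >> 2) - 127;  // unbiased exponent, i.e. floor(log2(x))
      // average of log2(1 + m/4) over each of the four truncated-mantissa bins
      const float frac[4] = {0.160497f, 0.452172f, 0.694562f, 0.901964f};
      return float(ex) + frac[z.i & 3];
    }

Per the original comments, the approximation is accurate to about 0.16 at worst (near 1.25) and 0.066 on average over [0.5, 12].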
+ + // mark doublets as bad + if (nhits < 3) + continue; - // (see CAHitNtupletGeneratorGPU.cc) - float pt = std::min(tracks->pt(it), cuts.chi2MaxPt); - float chi2Cut = cuts.chi2Scale * (cuts.chi2Coeff[0] + roughLog(pt) * cuts.chi2Coeff[1]); - if (tracks->chi2(it) >= chi2Cut) { -#ifdef NTUPLE_FIT_DEBUG - printf("Bad chi2 %d size %d pt %f eta %f chi2 %f\n", - it, - tuples->size(it), - tracks->pt(it), - tracks->eta(it), - tracks->chi2(it)); + // if the fit has any invalid parameters, mark it as bad + bool isNaN = false; + for (int i = 0; i < 5; ++i) { + isNaN |= std::isnan(tracks->stateAtBS.state(it)(i)); + } + if (isNaN) { +#ifdef NTUPLE_DEBUG + printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks->chi2(it)); #endif - continue; - } + continue; + } - quality[it] = pixelTrack::Quality::tight; + quality[it] = pixelTrack::Quality::strict; - // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) - // default cuts: - // - for triplets: |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm - // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm - // (see CAHitNtupletGeneratorGPU.cc) - auto const ®ion = (nhits > 3) ? cuts.quadruplet : cuts.triplet; - bool isOk = (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and - (std::abs(tracks->zip(it)) < region.maxZip); + if (cuts.strictCut(tracks, it)) + continue; - if (isOk) - quality[it] = pixelTrack::Quality::highPurity; - } -} + quality[it] = pixelTrack::Quality::tight; -__global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, - CAHitNtupletGeneratorKernelsGPU::Counters *counters) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) - break; //guard - if (quality[idx] < pixelTrack::Quality::loose) - continue; - atomicAdd(&(counters->nLooseTracks), 1); - if (quality[idx] < pixelTrack::Quality::strict) - continue; - atomicAdd(&(counters->nGoodTracks), 1); + if (cuts.isHP(tracks, nhits, it)) + quality[it] = pixelTrack::Quality::highPurity; + } } -} -__global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, - CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) - break; // guard - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) - hitToTuple->count(*h); - } -} - -__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, - CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) - break; // guard - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) - hitToTuple->fill(*h, idx); - } -} - -__global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples, - TrackingRecHit2DSOAView const *__restrict__ hhp, - HitContainer *__restrict__ hitDetIndices) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - // copy offsets - for (int idx = first, ntot = tuples->totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - hitDetIndices->off[idx] = tuples->off[idx]; + template + __global__ 
void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + Counters *counters) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tuples->size(idx) == 0) + break; //guard + if (quality[idx] < pixelTrack::Quality::loose) + continue; + atomicAdd(&(counters->nLooseTracks), 1); + if (quality[idx] < pixelTrack::Quality::strict) + continue; + atomicAdd(&(counters->nGoodTracks), 1); + } } - // fill hit indices - auto const &hh = *hhp; - auto nhits = hh.nHits(); - for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) { - assert(tuples->content[idx] < nhits); - hitDetIndices->content[idx] = hh.detectorIndex(tuples->content[idx]); + + template + __global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + HitToTuple *hitToTuple) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tuples->size(idx) == 0) + break; // guard + for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + hitToTuple->count(*h); + } } -} - -__global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks, cms::cuda::AtomicPairCounter *apc) { - auto &tracks = *ptracks; - auto first = blockIdx.x * blockDim.x + threadIdx.x; - // clamp the number of tracks to the capacity of the SoA - auto ntracks = std::min(apc->get().m, tracks.stride() - 1); - if (0 == first) - tracks.setNTracks(ntracks); - for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) { - auto nHits = tracks.nHits(idx); - assert(nHits >= 3); - tracks.nLayers(idx) = tracks.computeNumberOfLayers(idx); + + template + __global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + HitToTuple *hitToTuple) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tuples->size(idx) == 0) + break; // guard + for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + hitToTuple->fill(*h, idx); + } } -} - -__global__ void kernel_doStatsForHitInTracks(CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ hitToTuple, - CAHitNtupletGeneratorKernelsGPU::Counters *counters) { - auto &c = *counters; - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = hitToTuple->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (hitToTuple->size(idx) == 0) - continue; // SHALL NOT BE break - atomicAdd(&c.nUsedHits, 1); - if (hitToTuple->size(idx) > 1) - atomicAdd(&c.nDupHits, 1); + + template + __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples, + HitsView const *__restrict__ hhp, + HitContainer *__restrict__ hitDetIndices) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + // copy offsets + for (int idx = first, ntot = tuples->totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + hitDetIndices->off[idx] = tuples->off[idx]; + } + // fill hit indices + auto const &hh = *hhp; + auto nhits = hh.nHits(); + + for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) { + assert(tuples->content[idx] < nhits); + hitDetIndices->content[idx] = hh.detectorIndex(tuples->content[idx]); + } } -} -__global__ void kernel_countSharedHit(int *__restrict__ nshared, - 
HitContainer const *__restrict__ ptuples, - Quality const *__restrict__ quality, - CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { - constexpr auto loose = pixelTrack::Quality::loose; + template + __global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks, cms::cuda::AtomicPairCounter *apc) { + auto &tracks = *ptracks; + auto first = blockIdx.x * blockDim.x + threadIdx.x; + // clamp the number of tracks to the capacity of the SoA + auto ntracks = std::min(apc->get().m, tracks.stride() - 1); + if (0 == first) + tracks.setNTracks(ntracks); + for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) { + auto nHits = tracks.nHits(idx); + assert(nHits >= 3); + tracks.nLayers(idx) = tracks.computeNumberOfLayers(idx); + } + } - auto &hitToTuple = *phitToTuple; - auto const &foundNtuplets = *ptuples; + template + __global__ void kernel_doStatsForHitInTracks(HitToTuple const *__restrict__ hitToTuple, + Counters *counters) { + auto &c = *counters; + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = hitToTuple->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (hitToTuple->size(idx) == 0) + continue; // SHALL NOT BE break + atomicAdd(&c.nUsedHits, 1); + if (hitToTuple->size(idx) > 1) + atomicAdd(&c.nDupHits, 1); + } + } - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (hitToTuple.size(idx) < 2) - continue; + template + __global__ void kernel_countSharedHit(int *__restrict__ nshared, + HitContainer const *__restrict__ ptuples, + Quality const *__restrict__ quality, + HitToTuple const *__restrict__ phitToTuple) { + constexpr auto loose = pixelTrack::Quality::loose; - int nt = 0; + auto &hitToTuple = *phitToTuple; + auto const &foundNtuplets = *ptuples; - // count "good" tracks - for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - if (quality[*it] < loose) + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (hitToTuple.size(idx) < 2) continue; - ++nt; - } - if (nt < 2) - continue; + int nt = 0; + + // count "good" tracks + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + if (quality[*it] < loose) + continue; + ++nt; + } - // now mark each track triplet as sharing a hit - for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - if (foundNtuplets.size(*it) > 3) + if (nt < 2) continue; - atomicAdd(&nshared[*it], 1); - } - } // hit loop -} - -__global__ void kernel_markSharedHit(int const *__restrict__ nshared, - HitContainer const *__restrict__ tuples, - Quality *__restrict__ quality, - bool dupPassThrough) { - // constexpr auto bad = pixelTrack::Quality::bad; - constexpr auto dup = pixelTrack::Quality::dup; - constexpr auto loose = pixelTrack::Quality::loose; - // constexpr auto strict = pixelTrack::Quality::strict; - - // quality to mark rejected - auto const reject = dupPassThrough ? 
loose : dup; - - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) - break; //guard - if (quality[idx] <= reject) - continue; - if (nshared[idx] > 2) - quality[idx] = reject; + // now mark each track triplet as sharing a hit + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + if (foundNtuplets.size(*it) > 3) + continue; + atomicAdd(&nshared[*it], 1); + } + + } // hit loop } -} -// mostly for very forward triplets..... -__global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks, + template + __global__ void kernel_markSharedHit(int const *__restrict__ nshared, + HitContainer const *__restrict__ tuples, Quality *__restrict__ quality, - uint16_t nmin, - bool dupPassThrough, - CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { - // quality to mark rejected - auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup; - - auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; + bool dupPassThrough) { + // constexpr auto bad = pixelTrack::Quality::bad; + constexpr auto dup = pixelTrack::Quality::dup; + constexpr auto loose = pixelTrack::Quality::loose; + // constexpr auto strict = pixelTrack::Quality::strict; + + // quality to mark rejected + auto const reject = dupPassThrough ? loose : dup; + + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tuples->size(idx) == 0) + break; //guard + if (quality[idx] <= reject) + continue; + if (nshared[idx] > 2) + quality[idx] = reject; + } + } - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (hitToTuple.size(idx) < 2) - continue; + // mostly for very forward triplets..... + template + __global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks, + Quality *__restrict__ quality, + uint16_t nmin, + bool dupPassThrough, + HitToTuple const *__restrict__ phitToTuple) { + // quality to mark rejected + auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup; + + auto &hitToTuple = *phitToTuple; + auto const &tracks = *ptracks; + + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (hitToTuple.size(idx) < 2) + continue; - /* chi2 is bad for large pt + /* chi2 is bad for large pt auto score = [&](auto it, auto nl) { return nl < 4 ? 
std::abs(tracks.tip(it)) : // tip for triplets tracks.chi2(it); //chi2 }; */ - auto score = [&](auto it, auto nl) { return std::abs(tracks.tip(it)); }; + auto score = [&](auto it, auto nl) { return std::abs(tracks.tip(it)); }; - // full combinatorics - for (auto ip = hitToTuple.begin(idx); ip < hitToTuple.end(idx) - 1; ++ip) { - auto const it = *ip; - auto qi = quality[it]; - if (qi <= reject) - continue; - auto opi = tracks.stateAtBS.state(it)(2); - auto e2opi = tracks.stateAtBS.covariance(it)(9); - auto cti = tracks.stateAtBS.state(it)(3); - auto e2cti = tracks.stateAtBS.covariance(it)(12); - auto nli = tracks.nLayers(it); - for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) { - auto const jt = *jp; - auto qj = quality[jt]; - if (qj <= reject) - continue; - auto opj = tracks.stateAtBS.state(jt)(2); - auto ctj = tracks.stateAtBS.state(jt)(3); - auto dct = nSigma2 * (tracks.stateAtBS.covariance(jt)(12) + e2cti); - if ((cti - ctj) * (cti - ctj) > dct) + // full combinatorics + for (auto ip = hitToTuple.begin(idx); ip < hitToTuple.end(idx) - 1; ++ip) { + auto const it = *ip; + auto qi = quality[it]; + if (qi <= reject) continue; - auto dop = nSigma2 * (tracks.stateAtBS.covariance(jt)(9) + e2opi); - if ((opi - opj) * (opi - opj) > dop) - continue; - auto nlj = tracks.nLayers(jt); - if (nlj < nli || (nlj == nli && (qj < qi || (qj == qi && score(it, nli) < score(jt, nlj))))) - quality[jt] = reject; - else { - quality[it] = reject; - break; + auto opi = tracks.stateAtBS.state(it)(2); + auto e2opi = tracks.stateAtBS.covariance(it)(9); + auto cti = tracks.stateAtBS.state(it)(3); + auto e2cti = tracks.stateAtBS.covariance(it)(12); + auto nli = tracks.nLayers(it); + for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) { + auto const jt = *jp; + auto qj = quality[jt]; + if (qj <= reject) + continue; + auto opj = tracks.stateAtBS.state(jt)(2); + auto ctj = tracks.stateAtBS.state(jt)(3); + auto dct = nSigma2 * (tracks.stateAtBS.covariance(jt)(12) + e2cti); + if ((cti - ctj) * (cti - ctj) > dct) + continue; + auto dop = nSigma2 * (tracks.stateAtBS.covariance(jt)(9) + e2opi); + if ((opi - opj) * (opi - opj) > dop) + continue; + auto nlj = tracks.nLayers(jt); + if (nlj < nli || (nlj == nli && (qj < qi || (qj == qi && score(it, nli) < score(jt, nlj))))) + quality[jt] = reject; + else { + quality[it] = reject; + break; + } } } } } -} -__global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp, - TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, - int nmin, - bool dupPassThrough, - CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { - // quality to mark rejected - auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup; - // quality of longest track - auto const longTqual = pixelTrack::Quality::highPurity; + template + __global__ void kernel_sharedHitCleaner(HitsView const *__restrict__ hhp, + TkSoA const *__restrict__ ptracks, + Quality *__restrict__ quality, + int nmin, + bool dupPassThrough, + HitToTuple const *__restrict__ phitToTuple) { + // quality to mark rejected + auto const reject = dupPassThrough ? 
pixelTrack::Quality::loose : pixelTrack::Quality::dup; + // quality of longest track + auto const longTqual = pixelTrack::Quality::highPurity; + + auto &hitToTuple = *phitToTuple; + auto const &tracks = *ptracks; + + auto const &hh = *hhp; + int l1end = hh.hitsLayerStart()[1]; + + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (hitToTuple.size(idx) < 2) + continue; - auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; + int8_t maxNl = 0; - auto const &hh = *hhp; - int l1end = hh.hitsLayerStart()[1]; + // find maxNl + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + if (quality[*it] < longTqual) + continue; + // if (tracks.nHits(*it)==3) continue; + auto nl = tracks.nLayers(*it); + maxNl = std::max(nl, maxNl); + } - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (hitToTuple.size(idx) < 2) - continue; + if (maxNl < 4) + continue; - int8_t maxNl = 0; + // quad pass through (leave for tests) + // maxNl = std::min(4, maxNl); - // find maxNl - for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - if (quality[*it] < longTqual) - continue; - // if (tracks.nHits(*it)==3) continue; - auto nl = tracks.nLayers(*it); - maxNl = std::max(nl, maxNl); + // kill all tracks shorter than maxNl (only triplets???) + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + auto nl = tracks.nLayers(*it); + + // skip the cut when the shared hit is on bpix1 and the tuple is long enough + if (idx < l1end and nl > nmin) + continue; + + if (nl < maxNl && quality[*it] > reject) + quality[*it] = reject; + } } + } - if (maxNl < 4) - continue; + template + __global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks, + Quality *__restrict__ quality, + uint16_t nmin, + bool dupPassThrough, + HitToTuple const *__restrict__ phitToTuple) { + // quality to mark rejected + auto const reject = pixelTrack::Quality::loose; + /// min quality of good + auto const good = pixelTrack::Quality::strict; + + auto &hitToTuple = *phitToTuple; + auto const &tracks = *ptracks; + + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (hitToTuple.size(idx) < 2) + continue; - // quad pass through (leave for tests) - // maxNl = std::min(4, maxNl); + float mc = maxScore; + uint16_t im = tkNotFound; + bool onlyTriplets = true; - // kill all tracks shorter than maxHl (only triplets??? - for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - auto nl = tracks.nLayers(*it); + // check if only triplets + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + if (quality[*it] <= good) + continue; + onlyTriplets &= tracks.isTriplet(*it); + if (!onlyTriplets) + break; + } - //checking if shared hit is on bpix1 and if the tuple is short enough - if (idx < l1end and nl > nmin) + // only triplets + if (!onlyTriplets) continue; - if (nl < maxNl && quality[*it] > reject) - quality[*it] = reject; - } + // for triplets choose best tip! (should we first find best quality???)
+ for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) { + mc = std::abs(tracks.tip(it)); + im = it; + } + } + + if (tkNotFound == im) + continue; + + // mark worse ambiguities + for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (quality[it] > reject && it != im) + quality[it] = reject; //no race: simple assignment of the same constant + } + + } // loop over hits } -} - -__global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, - uint16_t nmin, - bool dupPassThrough, - CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { - // quality to mark rejected - auto const reject = pixelTrack::Quality::loose; - /// min quality of good - auto const good = pixelTrack::Quality::strict; - - auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; - - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (hitToTuple.size(idx) < 2) - continue; - - float mc = maxScore; - uint16_t im = tkNotFound; - bool onlyTriplets = true; - - // check if only triplets - for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - if (quality[*it] <= good) + + template + __global__ void kernel_simpleTripletCleaner(TkSoA const *__restrict__ ptracks, + Quality *__restrict__ quality, + uint16_t nmin, + bool dupPassThrough, + HitToTuple const *__restrict__ phitToTuple) { + // quality to mark rejected + auto const reject = pixelTrack::Quality::loose; + /// min quality of good + auto const good = pixelTrack::Quality::loose; + + auto &hitToTuple = *phitToTuple; + auto const &tracks = *ptracks; + + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (hitToTuple.size(idx) < 2) continue; - onlyTriplets &= tracks.isTriplet(*it); - if (!onlyTriplets) - break; - } - // only triplets - if (!onlyTriplets) - continue; + float mc = maxScore; + uint16_t im = tkNotFound; - // for triplets choose best tip! (should we first find best quality???) - for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { - auto const it = *ip; - if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) { - mc = std::abs(tracks.tip(it)); - im = it; + // choose best tip! (should we first find best quality???) 
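[Editor's note] The selection loop below and the demotion loop after it follow the same two-pass pattern as kernel_tripletCleaner above: first pick the single candidate to keep (the smallest |tip| among tracks at or above the good threshold), then demote every other candidate still above reject. The two kernels differ only in the good threshold (strict vs loose) and in whether the demotion is restricted to triplets. A host-side sketch of the shared pattern, with illustrative names:

    // Illustrative sketch, not part of the patch.
    #include <cmath>
    #include <cstdint>
    #include <limits>
    #include <vector>

    constexpr uint16_t tkNotFound = std::numeric_limits<uint16_t>::max();  // assumed sentinel
    constexpr float maxScore = std::numeric_limits<float>::max();          // assumed sentinel

    template <typename TipF, typename QualV>
    void keepBestDemoteRest(std::vector<uint16_t> const& sharing,  // tracks sharing one hit
                            TipF tip, QualV& quality, uint8_t good, uint8_t reject) {
      float mc = maxScore;
      uint16_t im = tkNotFound;
      for (auto it : sharing)  // pass 1: best |tip| among good candidates
        if (quality[it] >= good && std::abs(tip(it)) < mc) {
          mc = std::abs(tip(it));
          im = it;
        }
      if (im == tkNotFound)
        return;
      for (auto it : sharing)  // pass 2: demote everything else
        if (quality[it] > reject && it != im)
          quality[it] = reject;  // no race: same constant everywhere
    }

In the kernels the same two loops run once per shared hit, with quality values drawn from the pixelTrack::Quality enum.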
+ for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) { + mc = std::abs(tracks.tip(it)); + im = it; + } } - } - - if (tkNotFound == im) - continue; - // mark worse ambiguities - for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { - auto const it = *ip; - if (quality[it] > reject && it != im) - quality[it] = reject; //no race: simple assignment of the same constant - } + if (tkNotFound == im) + continue; - } // loop over hits -} - -__global__ void kernel_simpleTripletCleaner( - TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, - uint16_t nmin, - bool dupPassThrough, - CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { - // quality to mark rejected - auto const reject = pixelTrack::Quality::loose; - /// min quality of good - auto const good = pixelTrack::Quality::loose; - - auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; - - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (hitToTuple.size(idx) < 2) - continue; - - float mc = maxScore; - uint16_t im = tkNotFound; - - // choose best tip! (should we first find best quality???) - for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { - auto const it = *ip; - if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) { - mc = std::abs(tracks.tip(it)); - im = it; + // mark worse ambiguities + for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (quality[it] > reject && tracks.isTriplet(it) && it != im) + quality[it] = reject; //no race: simple assignment of the same constant } - } - if (tkNotFound == im) - continue; + } // loop over hits + } - // mark worse ambiguities - for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { - auto const it = *ip; - if (quality[it] > reject && tracks.isTriplet(it) && it != im) - quality[it] = reject; //no race: simple assignment of the same constant + template + __global__ void kernel_print_found_ntuplets(HitsView const *__restrict__ hhp, + HitContainer const *__restrict__ ptuples, + TkSoA const *__restrict__ ptracks, + Quality const *__restrict__ quality, + HitToTuple const *__restrict__ phitToTuple, + int32_t firstPrint, + int32_t lastPrint, + int iev) { + constexpr auto loose = pixelTrack::Quality::loose; + auto const &hh = *hhp; + auto const &foundNtuplets = *ptuples; + auto const &tracks = *ptracks; + int first = firstPrint + blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first, np = std::min(lastPrint, foundNtuplets.nOnes()); i < np; i += blockDim.x * gridDim.x) { + auto nh = foundNtuplets.size(i); + if (nh < 3) + continue; + if (quality[i] < loose) + continue; + printf("TK: %d %d %d %d %f %f %f %f %f %f %f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n", + 10000 * iev + i, + int(quality[i]), + nh, + tracks.nLayers(i), + tracks.charge(i), + tracks.pt(i), + tracks.eta(i), + tracks.phi(i), + tracks.tip(i), + tracks.zip(i), + // asinhf(fit_results[i].par(3)), + tracks.chi2(i), + hh.zGlobal(*foundNtuplets.begin(i)), + hh.zGlobal(*(foundNtuplets.begin(i) + 1)), + hh.zGlobal(*(foundNtuplets.begin(i) + 2)), + nh > 3 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 3))) : 0, + nh > 4 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 4))) : 0, + nh > 5 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 5))) : 0, + nh > 6 ? 
hh.zGlobal(int(*(foundNtuplets.begin(i) + nh - 1))) : 0); } + } - } // loop over hits -} - -__global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp, - HitContainer const *__restrict__ ptuples, - TkSoA const *__restrict__ ptracks, - Quality const *__restrict__ quality, - CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple, - int32_t firstPrint, - int32_t lastPrint, - int iev) { - constexpr auto loose = pixelTrack::Quality::loose; - auto const &hh = *hhp; - auto const &foundNtuplets = *ptuples; - auto const &tracks = *ptracks; - int first = firstPrint + blockDim.x * blockIdx.x + threadIdx.x; - for (int i = first, np = std::min(lastPrint, foundNtuplets.nOnes()); i < np; i += blockDim.x * gridDim.x) { - auto nh = foundNtuplets.size(i); - if (nh < 3) - continue; - if (quality[i] < loose) - continue; - printf("TK: %d %d %d %d %f %f %f %f %f %f %f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n", - 10000 * iev + i, - int(quality[i]), - nh, - tracks.nLayers(i), - tracks.charge(i), - tracks.pt(i), - tracks.eta(i), - tracks.phi(i), - tracks.tip(i), - tracks.zip(i), - // asinhf(fit_results[i].par(3)), - tracks.chi2(i), - hh.zGlobal(*foundNtuplets.begin(i)), - hh.zGlobal(*(foundNtuplets.begin(i) + 1)), - hh.zGlobal(*(foundNtuplets.begin(i) + 2)), - nh > 3 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 3))) : 0, - nh > 4 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 4))) : 0, - nh > 5 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 5))) : 0, - nh > 6 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + nh - 1))) : 0); + __global__ void kernel_printCounters(Counters const *counters) { + auto const &c = *counters; + printf( + "||Counters | nEvents | nHits | nCells | nTuples | nFitTracks | nLooseTracks | nGoodTracks | nUsedHits | " + "nDupHits | " + "nFishCells | " + "nKilledCells | " + "nEmptyCells | nZeroTrackCells ||\n"); + printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n", + c.nEvents, + c.nHits, + c.nCells, + c.nTuples, + c.nFitTracks, + c.nLooseTracks, + c.nGoodTracks, + c.nUsedHits, + c.nDupHits, + c.nFishCells, + c.nKilledCells, + c.nEmptyCells, + c.nZeroTrackCells); + printf( + "Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f| %.3f| %.3f||\n", + c.nEvents, + c.nHits / double(c.nEvents), + c.nCells / double(c.nEvents), + c.nTuples / double(c.nEvents), + c.nFitTracks / double(c.nEvents), + c.nLooseTracks / double(c.nEvents), + c.nGoodTracks / double(c.nEvents), + c.nUsedHits / double(c.nEvents), + c.nDupHits / double(c.nEvents), + c.nFishCells / double(c.nCells), + c.nKilledCells / double(c.nCells), + c.nEmptyCells / double(c.nCells), + c.nZeroTrackCells / double(c.nCells)); } -} - -__global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *counters) { - auto const &c = *counters; - printf( - "||Counters | nEvents | nHits | nCells | nTuples | nFitTacks | nLooseTracks | nGoodTracks | nUsedHits | " - "nDupHits | " - "nFishCells | " - "nKilledCells | " - "nUsedCells | nZeroTrackCells ||\n"); - printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n", - c.nEvents, - c.nHits, - c.nCells, - c.nTuples, - c.nFitTracks, - c.nLooseTracks, - c.nGoodTracks, - c.nUsedHits, - c.nDupHits, - c.nFishCells, - c.nKilledCells, - c.nEmptyCells, - c.nZeroTrackCells); - printf("Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f| %.3f| %.3f||\n", - c.nEvents, - c.nHits / double(c.nEvents), - c.nCells / double(c.nEvents), - c.nTuples /
double(c.nEvents), - c.nFitTracks / double(c.nEvents), - c.nLooseTracks / double(c.nEvents), - c.nGoodTracks / double(c.nEvents), - c.nUsedHits / double(c.nEvents), - c.nDupHits / double(c.nEvents), - c.nFishCells / double(c.nCells), - c.nKilledCells / double(c.nCells), - c.nEmptyCells / double(c.nCells), - c.nZeroTrackCells / double(c.nCells)); -} + +} // namespace caHitNtupletGeneratorKernels diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index f650ca8ab2a08..6d9ac785155d2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -3,6 +3,7 @@ // // #define GPU_DEBUG +// #define DUMP_GPU_TK_TUPLES #include #include @@ -24,64 +25,115 @@ namespace { + using namespace caHitNtupletGenerator; + using namespace gpuPixelDoublets; + using namespace pixelTopology; + using namespace pixelTrack; + template T sqr(T x) { return x * x; } - cAHitNtupletGenerator::QualityCuts makeQualityCuts(edm::ParameterSet const& pset) { - auto coeff = pset.getParameter>("chi2Coeff"); - auto ptMax = pset.getParameter("chi2MaxPt"); - if (coeff.size() != 2) { - throw edm::Exception(edm::errors::Configuration, - "CAHitNtupletGeneratorOnGPU.trackQualityCuts.chi2Coeff must have 2 elements"); + //Common Params + AlgoParams makeCommonParams(edm::ParameterSet const& cfg) { + return AlgoParams({cfg.getParameter("onGPU"), + cfg.getParameter("minHitsForSharingCut"), + cfg.getParameter("useRiemannFit"), + cfg.getParameter("fitNas4"), + cfg.getParameter("includeJumpingForwardDoublets"), + cfg.getParameter("earlyFishbone"), + cfg.getParameter("lateFishbone"), + cfg.getParameter("fillStatistics"), + cfg.getParameter("doSharedHitCut"), + cfg.getParameter("dupPassThrough"), + cfg.getParameter("useSimpleTripletCleaner")}); + } + + //This is needed to have the partial specialization for isPhase1Topology/isPhase2Topology + template + struct topologyCuts {}; + + template + struct topologyCuts> { + static constexpr CAParamsT makeCACuts(edm::ParameterSet const& cfg) { + return CAParamsT{{cfg.getParameter("minHitsPerNtuplet"), + (float)cfg.getParameter("ptmin"), + (float)cfg.getParameter("CAThetaCutBarrel"), + (float)cfg.getParameter("CAThetaCutForward"), + (float)cfg.getParameter("hardCurvCut"), + (float)cfg.getParameter("dcaCutInnerTriplet"), + (float)cfg.getParameter("dcaCutOuterTriplet")}}; + }; + + static constexpr QualityCutsT makeQualityCuts(edm::ParameterSet const& pset) { + auto coeff = pset.getParameter>("chi2Coeff"); + auto ptMax = pset.getParameter("chi2MaxPt"); + + coeff[1] = (coeff[1] - coeff[0]) / log2(ptMax); + return QualityCutsT{// polynomial coefficients for the pT-dependent chi2 cut + {(float)coeff[0], (float)coeff[1], 0.f, 0.f}, + // max pT used to determine the chi2 cut + (float)ptMax, + // chi2 scale factor: 8 for broken line fit, ?? 
for Riemann fit + (float)pset.getParameter("chi2Scale"), + // regional cuts for triplets + {(float)pset.getParameter("tripletMaxTip"), + (float)pset.getParameter("tripletMinPt"), + (float)pset.getParameter("tripletMaxZip")}, + // regional cuts for quadruplets + {(float)pset.getParameter("quadrupletMaxTip"), + (float)pset.getParameter("quadrupletMinPt"), + (float)pset.getParameter("quadrupletMaxZip")}}; } - coeff[1] = (coeff[1] - coeff[0]) / log2(ptMax); - return cAHitNtupletGenerator::QualityCuts{// polynomial coefficients for the pT-dependent chi2 cut - {(float)coeff[0], (float)coeff[1], 0.f, 0.f}, - // max pT used to determine the chi2 cut - (float)ptMax, - // chi2 scale factor: 8 for broken line fit, ?? for Riemann fit - (float)pset.getParameter("chi2Scale"), - // regional cuts for triplets - {(float)pset.getParameter("tripletMaxTip"), - (float)pset.getParameter("tripletMinPt"), - (float)pset.getParameter("tripletMaxZip")}, - // regional cuts for quadruplets - {(float)pset.getParameter("quadrupletMaxTip"), - (float)pset.getParameter("quadrupletMinPt"), - (float)pset.getParameter("quadrupletMaxZip")}}; + }; + + template + struct topologyCuts> { + static constexpr CAParamsT makeCACuts(edm::ParameterSet const& cfg) { + return CAParamsT{{cfg.getParameter("minHitsPerNtuplet"), + (float)cfg.getParameter("ptmin"), + (float)cfg.getParameter("CAThetaCutBarrel"), + (float)cfg.getParameter("CAThetaCutForward"), + (float)cfg.getParameter("hardCurvCut"), + (float)cfg.getParameter("dcaCutInnerTriplet"), + (float)cfg.getParameter("dcaCutOuterTriplet")}, + {(bool)cfg.getParameter("includeFarForwards")}}; + } + + static constexpr QualityCutsT makeQualityCuts(edm::ParameterSet const& pset) { + return QualityCutsT{ + (float)pset.getParameter("maxChi2"), + (float)pset.getParameter("minPt"), + (float)pset.getParameter("maxTip"), + (float)pset.getParameter("maxZip"), + }; + } + }; + + // Cell cuts: as they are, they have the same logic for Phase1 and Phase2; + // keeping them separate would allow further differentiation in the future, + // e.g. moving them into topologyCuts and using the same syntax + template + CellCutsT makeCellCuts(edm::ParameterSet const& cfg) { + return CellCutsT{cfg.getParameter("maxNumberOfDoublets"), + cfg.getParameter("doClusterCut"), + cfg.getParameter("doZ0Cut"), + cfg.getParameter("doPtCut"), + cfg.getParameter("idealConditions")}; + } } // namespace using namespace std; -CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& cfg, edm::ConsumesCollector& iC) - : m_params(cfg.getParameter("onGPU"), - cfg.getParameter("minHitsPerNtuplet"), - cfg.getParameter("maxNumberOfDoublets"), - cfg.getParameter("minHitsForSharingCut"), - cfg.getParameter("useRiemannFit"), - cfg.getParameter("fitNas4"), - cfg.getParameter("includeJumpingForwardDoublets"), - cfg.getParameter("earlyFishbone"), - cfg.getParameter("lateFishbone"), - cfg.getParameter("idealConditions"), - cfg.getParameter("fillStatistics"), - cfg.getParameter("doClusterCut"), - cfg.getParameter("doZ0Cut"), - cfg.getParameter("doPtCut"), - cfg.getParameter("doSharedHitCut"), - cfg.getParameter("dupPassThrough"), - cfg.getParameter("useSimpleTripletCleaner"), - cfg.getParameter("ptmin"), - cfg.getParameter("CAThetaCutBarrel"), - cfg.getParameter("CAThetaCutForward"), - cfg.getParameter("hardCurvCut"), - cfg.getParameter("dcaCutInnerTriplet"), - cfg.getParameter("dcaCutOuterTriplet"), - makeQualityCuts(cfg.getParameterSet("trackQualityCuts"))) { +template +CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(const
edm::ParameterSet& cfg, + edm::ConsumesCollector& iC) + : m_params(makeCommonParams(cfg), + makeCellCuts(cfg), + topologyCuts::makeQualityCuts(cfg.getParameterSet("trackQualityCuts")), + topologyCuts::makeCACuts(cfg)) { #ifdef DUMP_GPU_TK_TUPLES printf("TK: %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n", "tid", @@ -104,7 +156,61 @@ CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& #endif } -void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& desc) { +template +void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& desc) { + fillDescriptionsCommon(desc); + edm::LogWarning("CAHitNtupletGeneratorOnGPU::fillDescriptions") + << "Note: this fillDescriptions is a placeholder; most probably some parameters are missing. \n" + "Please implement the descriptions for your TrackerTraits in CAHitNtupletGeneratorOnGPU. \n"; +} + +template <> +void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& desc) { + fillDescriptionsCommon(desc); + + desc.add("idealConditions", true); + desc.add("includeJumpingForwardDoublets", false); + + edm::ParameterSetDescription trackQualityCuts; + trackQualityCuts.add("chi2MaxPt", 10.)->setComment("max pT used to determine the pT-dependent chi2 cut"); + trackQualityCuts.add>("chi2Coeff", {0.9, 1.8})->setComment("chi2 at 1GeV and at ptMax above"); + trackQualityCuts.add("chi2Scale", 8.) + ->setComment( + "Factor to multiply the pT-dependent chi2 cut (currently: 8 for the broken line fit, ?? for the Riemann " + "fit)"); + trackQualityCuts.add("tripletMinPt", 0.5)->setComment("Min pT for triplets, in GeV"); + trackQualityCuts.add("tripletMaxTip", 0.3)->setComment("Max |Tip| for triplets, in cm"); + trackQualityCuts.add("tripletMaxZip", 12.)->setComment("Max |Zip| for triplets, in cm"); + trackQualityCuts.add("quadrupletMinPt", 0.3)->setComment("Min pT for quadruplets, in GeV"); + trackQualityCuts.add("quadrupletMaxTip", 0.5)->setComment("Max |Tip| for quadruplets, in cm"); + trackQualityCuts.add("quadrupletMaxZip", 12.)->setComment("Max |Zip| for quadruplets, in cm"); + desc.add("trackQualityCuts", trackQualityCuts) + ->setComment( + "Quality cuts based on the results of the track fit:\n - apply a pT-dependent chi2 cut;\n - apply \"region " + "cuts\" based on the fit results (pT, Tip, Zip)."); +} + +template <> +void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& desc) { + fillDescriptionsCommon(desc); + + desc.add("idealConditions", false); + desc.add("includeFarForwards", true); + desc.add("includeJumpingForwardDoublets", true); + + edm::ParameterSetDescription trackQualityCuts; + trackQualityCuts.add("maxChi2", 5.)->setComment("Max normalized chi2"); + trackQualityCuts.add("minPt", 0.5)->setComment("Min pT in GeV"); + trackQualityCuts.add("maxTip", 0.3)->setComment("Max |Tip| in cm"); + trackQualityCuts.add("maxZip", 12.)->setComment("Max |Zip|, in cm"); + desc.add("trackQualityCuts", trackQualityCuts) + ->setComment( + "Quality cuts based on the results of the track fit:\n - apply cuts based on the fit results (pT, Tip, " + "Zip)."); +} + +template +void CAHitNtupletGeneratorOnGPU::fillDescriptionsCommon(edm::ParameterSetDescription& desc) { // 87 cm/GeV = 1/(3.8T * 0.3) // take less than radius given by the hardPtCut and reject everything below // auto hardCurvCut = 1.f/(0.35 * 87.f); @@ -116,13 +222,12 @@ void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& desc.add("dcaCutOuterTriplet", 0.25f)->setComment("Cut
on origin radius when the outer hit is on BPix1"); desc.add("earlyFishbone", true); desc.add("lateFishbone", false); - desc.add("idealConditions", true); desc.add("fillStatistics", false); desc.add("minHitsPerNtuplet", 4); - desc.add("maxNumberOfDoublets", caConstants::maxNumberOfDoublets); + desc.add("maxNumberOfDoublets", TrackerTraits::maxNumberOfDoublets); desc.add("minHitsForSharingCut", 10) ->setComment("Maximum number of hits in a tuple to clean also if the shared hit is on bpx1"); - desc.add("includeJumpingForwardDoublets", false); + desc.add("fitNas4", false)->setComment("fit only 4 hits out of N"); desc.add("doClusterCut", true); desc.add("doZ0Cut", true); @@ -131,27 +236,10 @@ void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& desc.add("doSharedHitCut", true)->setComment("Sharing hit nTuples cleaning"); desc.add("dupPassThrough", false)->setComment("Do not reject duplicate"); desc.add("useSimpleTripletCleaner", true)->setComment("use alternate implementation"); - - edm::ParameterSetDescription trackQualityCuts; - trackQualityCuts.add("chi2MaxPt", 10.)->setComment("max pT used to determine the pT-dependent chi2 cut"); - trackQualityCuts.add>("chi2Coeff", {0.9, 1.8})->setComment("chi2 at 1GeV and at ptMax above"); - trackQualityCuts.add("chi2Scale", 8.) - ->setComment( - "Factor to multiply the pT-dependent chi2 cut (currently: 8 for the broken line fit, ?? for the Riemann " - "fit)"); - trackQualityCuts.add("tripletMinPt", 0.5)->setComment("Min pT for triplets, in GeV"); - trackQualityCuts.add("tripletMaxTip", 0.3)->setComment("Max |Tip| for triplets, in cm"); - trackQualityCuts.add("tripletMaxZip", 12.)->setComment("Max |Zip| for triplets, in cm"); - trackQualityCuts.add("quadrupletMinPt", 0.3)->setComment("Min pT for quadruplets, in GeV"); - trackQualityCuts.add("quadrupletMaxTip", 0.5)->setComment("Max |Tip| for quadruplets, in cm"); - trackQualityCuts.add("quadrupletMaxZip", 12.)->setComment("Max |Zip| for quadruplets, in cm"); - desc.add("trackQualityCuts", trackQualityCuts) - ->setComment( - "Quality cuts based on the results of the track fit:\n - apply a pT-dependent chi2 cut;\n - apply \"region " - "cuts\" based on the fit results (pT, Tip, Zip)."); } -void CAHitNtupletGeneratorOnGPU::beginJob() { +template +void CAHitNtupletGeneratorOnGPU::beginJob() { if (m_params.onGPU_) { // allocate pinned host memory only if CUDA is available edm::Service cs; @@ -165,49 +253,58 @@ void CAHitNtupletGeneratorOnGPU::beginJob() { } } -void CAHitNtupletGeneratorOnGPU::endJob() { +template +void CAHitNtupletGeneratorOnGPU::endJob() { if (m_params.onGPU_) { // print the gpu statistics and free pinned host memory only if CUDA is available edm::Service cs; if (cs and cs->enabled()) { if (m_params.doStats_) { // crash on multi-gpu processes - CAHitNtupletGeneratorKernelsGPU::printCounters(m_counters); + CAHitNtupletGeneratorKernelsGPU::printCounters(m_counters); } cudaFree(m_counters); } } else { if (m_params.doStats_) { - CAHitNtupletGeneratorKernelsCPU::printCounters(m_counters); + CAHitNtupletGeneratorKernelsCPU::printCounters(m_counters); } delete m_counters; } } -PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, - float bfield, - cudaStream_t stream) const { - PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream)); +template +PixelTrackHeterogeneousT CAHitNtupletGeneratorOnGPU::makeTuplesAsync( + HitsOnGPU const& hits_d, float bfield, cudaStream_t stream) const { + using HelixFitOnGPU = 
HelixFitOnGPU; + using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using GPUKernels = CAHitNtupletGeneratorKernelsGPU; + + PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream)); auto* soa = tracks.get(); assert(soa); + cudaCheck(cudaGetLastError()); - CAHitNtupletGeneratorKernelsGPU kernels(m_params); + GPUKernels kernels(m_params); kernels.setCounters(m_counters); kernels.allocateOnGPU(hits_d.nHits(), stream); + cudaCheck(cudaGetLastError()); kernels.buildDoublets(hits_d, stream); + cudaCheck(cudaGetLastError()); + kernels.launchKernels(hits_d, soa, stream); + cudaCheck(cudaGetLastError()); HelixFitOnGPU fitter(bfield, m_params.fitNas4_); fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); if (m_params.useRiemannFit_) { - fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); + fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), TrackerTraits::maxNumberOfQuadruplets, stream); } else { - fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); + fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), TrackerTraits::maxNumberOfQuadruplets, stream); } kernels.classifyTuples(hits_d, soa, stream); - #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); @@ -217,13 +314,19 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH return tracks; } -PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { - PixelTrackHeterogeneous tracks(std::make_unique()); +template +PixelTrackHeterogeneousT CAHitNtupletGeneratorOnGPU::makeTuples(HitsOnCPU const& hits_d, + float bfield) const { + using HelixFitOnGPU = HelixFitOnGPU; + using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using CPUKernels = CAHitNtupletGeneratorKernelsCPU; + + PixelTrackHeterogeneous tracks(std::make_unique()); auto* soa = tracks.get(); assert(soa); - CAHitNtupletGeneratorKernelsCPU kernels(m_params); + CPUKernels kernels(m_params); kernels.setCounters(m_counters); kernels.allocateOnGPU(hits_d.nHits(), nullptr); @@ -238,9 +341,9 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); if (m_params.useRiemannFit_) { - fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); + fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), TrackerTraits::maxNumberOfQuadruplets); } else { - fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); + fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), TrackerTraits::maxNumberOfQuadruplets); } kernels.classifyTuples(hits_d, soa, nullptr); @@ -261,3 +364,6 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC return tracks; } + +template class CAHitNtupletGeneratorOnGPU; +template class CAHitNtupletGeneratorOnGPU; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index ae4576d883530..745579b960b76 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -21,20 +21,33 @@ namespace edm { class ParameterSetDescription; } // namespace edm +template class 
CAHitNtupletGeneratorOnGPU { public: - using HitsOnGPU = TrackingRecHit2DSOAView; - using HitsOnCPU = TrackingRecHit2DGPU; - using hindex_type = TrackingRecHit2DSOAView::hindex_type; + using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; - using Quality = pixelTrack::Quality; - using OutputSoA = pixelTrack::TrackSoA; - using HitContainer = pixelTrack::HitContainer; + using HitsView = TrackingRecHit2DSOAViewT; + using HitsOnGPU = TrackingRecHit2DGPUT; + using HitsOnCPU = TrackingRecHit2DCPUT; + using hindex_type = typename HitsView::hindex_type; + + using HitToTuple = caStructures::HitToTupleT; + using TupleMultiplicity = caStructures::TupleMultiplicityT; + using OuterHitOfCell = caStructures::OuterHitOfCellT; + + using GPUCACell = GPUCACellT; + using OutputSoA = pixelTrack::TrackSoAT; + using HitContainer = typename OutputSoA::HitContainer; using Tuple = HitContainer; - using QualityCuts = cAHitNtupletGenerator::QualityCuts; - using Params = cAHitNtupletGenerator::Params; - using Counters = cAHitNtupletGenerator::Counters; + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + using CellTracksVector = caStructures::CellTracksVectorT; + + using Quality = pixelTrack::Quality; + + using QualityCuts = pixelTrack::QualityCutsT; + using Params = caHitNtupletGenerator::ParamsT; + using Counters = caHitNtupletGenerator::Counters; public: CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& cfg, edm::ConsumesCollector&& iC) @@ -42,21 +55,22 @@ class CAHitNtupletGeneratorOnGPU { CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& cfg, edm::ConsumesCollector& iC); static void fillDescriptions(edm::ParameterSetDescription& desc); - static const char* fillDescriptionsLabel() { return "caHitNtupletOnGPU"; } + static void fillDescriptionsCommon(edm::ParameterSetDescription& desc); + //static const char* fillDescriptionsLabel() { return "caHitNtupletOnGPU"; } void beginJob(); void endJob(); - PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const; + PixelTrackHeterogeneous makeTuplesAsync(HitsOnGPU const& hits_d, float bfield, cudaStream_t stream) const; - PixelTrackHeterogeneous makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const; + PixelTrackHeterogeneous makeTuples(HitsOnCPU const& hits_d, float bfield) const; private: - void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream) const; + void buildDoublets(HitsOnGPU const& hh, cudaStream_t stream) const; - void hitNtuplets(HitsOnCPU const& hh, const edm::EventSetup& es, bool useRiemannFit, cudaStream_t cudaStream); + void hitNtuplets(HitsOnGPU const& hh, const edm::EventSetup& es, bool useRiemannFit, cudaStream_t cudaStream); - void launchKernels(HitsOnCPU const& hh, bool useRiemannFit, cudaStream_t cudaStream) const; + void launchKernels(HitsOnGPU const& hh, bool useRiemannFit, cudaStream_t cudaStream) const; Params m_params; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAStructures.h b/RecoPixelVertexing/PixelTriplets/plugins/CAStructures.h new file mode 100644 index 0000000000000..21f9d474c683c --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAStructures.h @@ -0,0 +1,55 @@ +#ifndef RecoPixelVertexing_PixelTriplets_plugins_CAStructures_h +#define RecoPixelVertexing_PixelTriplets_plugins_CAStructures_h + +#include "HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h" +#include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" + +namespace caStructures { + + // types + //
using typename TrackerTraits::hindex_type = uint32_t; // FIXME from siPixelRecHitsHeterogeneousProduct + // using typename TrackerTraits::tindex_type = uint32_t; // for tuples + // using typename TrackerTraits::cindex_type = uint32_t; // for cells + + template + using CellNeighborsT = cms::cuda::VecArray; + + template + using CellTracksT = cms::cuda::VecArray; + + template + using CellNeighborsVectorT = cms::cuda::SimpleVector>; + + template + using CellTracksVectorT = cms::cuda::SimpleVector>; + + template + using OuterHitOfCellContainerT = cms::cuda::VecArray; + + template + using TupleMultiplicityT = cms::cuda::OneToManyAssoc; + + template + using HitToTupleT = cms::cuda::OneToManyAssoc; // 3.5 should be enough + + template + using TuplesContainerT = cms::cuda::OneToManyAssoc; + + template + struct OuterHitOfCellT { + OuterHitOfCellContainerT* container; + int32_t offset; + constexpr auto& operator[](int i) { return container[i - offset]; } + constexpr auto const& operator[](int i) const { return container[i - offset]; } + }; + +} // namespace caStructures + +#endif diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 2e3747a2b6841..965889abcb268 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -1,5 +1,5 @@ -#ifndef RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h -#define RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h +#ifndef RecoPixelVertexing_PixelTriplets_plugins_GPUCACellT_h +#define RecoPixelVertexing_PixelTriplets_plugins_GPUCACellT_h // // Author: Felice Pantaleo, CERN @@ -15,33 +15,36 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h" #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" -#include "CAConstants.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "CAStructures.h" -class GPUCACell { +template +class GPUCACellT { public: using PtrAsInt = unsigned long long; - static constexpr auto maxCellsPerHit = caConstants::maxCellsPerHit; - static constexpr auto invalidHitId = std::numeric_limits::max(); - using OuterHitOfCellContainer = caConstants::OuterHitOfCellContainer; - using OuterHitOfCell = caConstants::OuterHitOfCell; - using CellNeighbors = caConstants::CellNeighbors; - using CellTracks = caConstants::CellTracks; - using CellNeighborsVector = caConstants::CellNeighborsVector; - using CellTracksVector = caConstants::CellTracksVector; + static constexpr auto maxCellsPerHit = TrackerTraits::maxCellsPerHit; + using OuterHitOfCellContainer = caStructures::OuterHitOfCellContainerT; + using OuterHitOfCell = caStructures::OuterHitOfCellT; + using CellNeighbors = caStructures::CellNeighborsT; + using CellTracks = caStructures::CellTracksT; + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + using CellTracksVector = caStructures::CellTracksVectorT; - using Hits = TrackingRecHit2DSOAView; - using hindex_type = Hits::hindex_type; + using Hits = TrackingRecHit2DSOAViewT; + using hindex_type = typename TrackerTraits::hindex_type; + using tindex_type = typename TrackerTraits::tindex_type; + static constexpr auto invalidHitId = std::numeric_limits::max(); - using TmpTuple = cms::cuda::VecArray; + using TmpTuple = cms::cuda::VecArray; - using HitContainer = pixelTrack::HitContainer; + using HitContainer = pixelTrack::HitContainerT; using Quality = pixelTrack::Quality; static 
constexpr auto bad = pixelTrack::Quality::bad; enum class StatusBit : uint16_t { kUsed = 1, kInTrack = 2, kKilled = 1 << 15 }; - GPUCACell() = default; + GPUCACellT() = default; __device__ __forceinline__ void init(CellNeighborsVector& cellNeighbors, CellTracksVector& cellTracks, @@ -66,7 +69,8 @@ class GPUCACell { assert(tracks().empty()); } - __device__ __forceinline__ int addOuterNeighbor(CellNeighbors::value_t t, CellNeighborsVector& cellNeighbors) { + __device__ __forceinline__ int addOuterNeighbor(typename TrackerTraits::cindex_type t, + CellNeighborsVector& cellNeighbors) { // use smart cache if (outerNeighbors().empty()) { auto i = cellNeighbors.extend(); // maybe wasted.... @@ -88,7 +92,7 @@ class GPUCACell { return outerNeighbors().push_back(t); } - __device__ __forceinline__ int addTrack(CellTracks::value_t t, CellTracksVector& cellTracks) { + __device__ __forceinline__ int addTrack(tindex_type t, CellTracksVector& cellTracks) { if (tracks().empty()) { auto i = cellTracks.extend(); // maybe wasted.... if (i > 0) { @@ -139,7 +143,7 @@ class GPUCACell { } __device__ bool check_alignment(Hits const& hh, - GPUCACell const& otherCell, + GPUCACellT const& otherCell, const float ptmin, const float hardCurvCut, const float caThetaCutBarrel, @@ -157,7 +161,7 @@ class GPUCACell { auto r1 = otherCell.inner_r(hh); auto z1 = otherCell.inner_z(hh); - auto isBarrel = otherCell.outer_detIndex(hh) < caConstants::last_barrel_detIndex; + auto isBarrel = otherCell.outer_detIndex(hh) < TrackerTraits::last_barrel_detIndex; bool aligned = areAlignedRZ(r1, z1, ri, @@ -168,8 +172,8 @@ class GPUCACell { isBarrel ? caThetaCutBarrel : caThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts return (aligned && dcaCut(hh, otherCell, - otherCell.inner_detIndex(hh) < caConstants::last_bpix1_detIndex ? dcaCutInnerTriplet - : dcaCutOuterTriplet, + otherCell.inner_detIndex(hh) < TrackerTraits::last_bpix1_detIndex ? dcaCutInnerTriplet + : dcaCutOuterTriplet, hardCurvCut)); // FIXME tune cuts } @@ -186,7 +190,7 @@ class GPUCACell { } __device__ inline bool dcaCut(Hits const& hh, - GPUCACell const& otherCell, + GPUCACellT const& otherCell, const float region_origin_radius_plus_tolerance, const float maxCurv) const { auto x1 = otherCell.inner_x(hh); @@ -222,11 +226,9 @@ class GPUCACell { return std::abs(eq.dca0()) < region_origin_radius_plus_tolerance * std::abs(eq.curvature()); } - __device__ inline bool hole0(Hits const& hh, GPUCACell const& innerCell) const { - using caConstants::first_ladder_bpx0; - using caConstants::max_ladder_bpx0; - using caConstants::module_length_bpx0; - using caConstants::module_tolerance_bpx0; + __device__ inline bool hole0(Hits const& hh, GPUCACellT const& innerCell) const { + using namespace phase1PixelTopology; + int p = innerCell.inner_iphi(hh); if (p < 0) p += std::numeric_limits::max(); @@ -245,11 +247,9 @@ class GPUCACell { return gap; } - __device__ inline bool hole4(Hits const& hh, GPUCACell const& innerCell) const { - using caConstants::first_ladder_bpx4; - using caConstants::max_ladder_bpx4; - using caConstants::module_length_bpx4; - using caConstants::module_tolerance_bpx4; + __device__ inline bool hole4(Hits const& hh, GPUCACellT const& innerCell) const { + using namespace phase1PixelTopology; + int p = outer_iphi(hh); if (p < 0) p += std::numeric_limits::max(); @@ -272,9 +272,10 @@ class GPUCACell { // trying to free the track building process from hardcoded layers, leaving // the visit of the graph based on the neighborhood connections between cells. 
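// [Editor's note -- illustration, not part of the patch] find_ntuplets below replaces
// the old explicit specialization find_ntuplets<0> (removed near the end of this
// file's diff) with a single recursive template guarded by `if constexpr`, so the
// recursion is bounded at compile time by the DEPTH non-type parameter and the
// depth-exhausted error path needs no separate definition. A minimal standalone
// sketch of the idiom, with hypothetical names (Node, children) used only for
// illustration:
//
//   template <int DEPTH>
//   void visit(Node const& n) {
//     if constexpr (DEPTH <= 0) {
//       std::abort();  // full depth reached: fail instead of instantiating further
//     } else {
//       for (Node const& c : n.children)
//         visit<DEPTH - 1>(c);  // each level instantiates a strictly shallower visit
//     }
//   }
//
// In the member-function version below the recursive call needs the `template`
// keyword, as in cells[otherCell].template find_ntuplets<DEPTH - 1>(...), because
// the callee depends on the TrackerTraits template parameter.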
+ template __device__ inline void find_ntuplets(Hits const& hh, - GPUCACell* __restrict__ cells, + GPUCACellT* __restrict__ cells, CellTracksVector& cellTracks, HitContainer& foundNtuplets, cms::cuda::AtomicPairCounter& apc, @@ -288,50 +289,62 @@ class GPUCACell { // the ntuplets is then saved if the number of hits it contains is greater // than a threshold - auto doubletId = this - cells; - tmpNtuplet.push_back_unsafe(doubletId); - assert(tmpNtuplet.size() <= 5); - - bool last = true; - for (unsigned int otherCell : outerNeighbors()) { - if (cells[otherCell].isKilled()) - continue; // killed by earlyFishbone - last = false; - cells[otherCell].find_ntuplets( - hh, cells, cellTracks, foundNtuplets, apc, quality, tmpNtuplet, minHitsPerNtuplet, startAt0); - } - if (last) { // if long enough save... - if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet - 1) { + if constexpr (DEPTH <= 0) { + printf("ERROR: GPUCACellT::find_ntuplets reached full depth!\n"); +#ifdef __CUDA_ARCH__ + __trap(); +#else + abort(); +#endif + } else { + auto doubletId = this - cells; + tmpNtuplet.push_back_unsafe(doubletId); + assert(tmpNtuplet.size() <= + int(TrackerTraits::maxHitsOnTrack - + 3)); //1 for the container, 1 because these are doublets, 1 because we may push another + + bool last = true; + for (unsigned int otherCell : outerNeighbors()) { + if (cells[otherCell].isKilled()) + continue; // killed by earlyFishbone + last = false; + cells[otherCell].template find_ntuplets( + hh, cells, cellTracks, foundNtuplets, apc, quality, tmpNtuplet, minHitsPerNtuplet, startAt0); + } + + if (last) { // if long enough save... + if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet - 1) { #ifdef ONLY_TRIPLETS_IN_HOLE - // triplets accepted only pointing to the hole - if (tmpNtuplet.size() >= 3 || (startAt0 && hole4(hh, cells[tmpNtuplet[0]])) || - ((!startAt0) && hole0(hh, cells[tmpNtuplet[0]]))) + // triplets accepted only pointing to the hole + if (tmpNtuplet.size() >= 3 || (startAt0 && hole4(hh, cells[tmpNtuplet[0]])) || + ((!startAt0) && hole0(hh, cells[tmpNtuplet[0]]))) #endif - { - hindex_type hits[8]; - auto nh = 0U; - constexpr int maxFB = 2; // for the time being let's limit this - int nfb = 0; - for (auto c : tmpNtuplet) { - hits[nh++] = cells[c].theInnerHitId; - if (nfb < maxFB && cells[c].hasFishbone()) { - ++nfb; - hits[nh++] = cells[c].theFishboneId; // fishbone hit is always outer than inner hit + { + hindex_type hits[TrackerTraits::maxDepth + 2]; + auto nh = 0U; + constexpr int maxFB = 2; // for the time being let's limit this + int nfb = 0; + for (auto c : tmpNtuplet) { + hits[nh++] = cells[c].theInnerHitId; + if (nfb < maxFB && cells[c].hasFishbone()) { + ++nfb; + hits[nh++] = cells[c].theFishboneId; // fishbone hit is always outer than inner hit + } + } + assert(nh < TrackerTraits::maxHitsOnTrack); + hits[nh] = theOuterHitId; + auto it = foundNtuplets.bulkFill(apc, hits, nh + 1); + if (it >= 0) { // if negative is overflow.... + for (auto c : tmpNtuplet) + cells[c].addTrack(it, cellTracks); + quality[it] = bad; // initialize to bad } - } - assert(nh < caConstants::maxHitsOnTrack); - hits[nh] = theOuterHitId; - auto it = foundNtuplets.bulkFill(apc, hits, nh + 1); - if (it >= 0) { // if negative is overflow.... 
- for (auto c : tmpNtuplet) - cells[c].addTrack(it, cellTracks); - quality[it] = bad; // initialize to bad } } } + tmpNtuplet.pop_back(); + assert(tmpNtuplet.size() < int(TrackerTraits::maxHitsOnTrack - 1)); } - tmpNtuplet.pop_back(); - assert(tmpNtuplet.size() < 5); } // Cell status management @@ -370,22 +383,4 @@ class GPUCACell { hindex_type theFishboneId; }; -template <> -__device__ inline void GPUCACell::find_ntuplets<0>(Hits const& hh, - GPUCACell* __restrict__ cells, - CellTracksVector& cellTracks, - HitContainer& foundNtuplets, - cms::cuda::AtomicPairCounter& apc, - Quality* __restrict__ quality, - TmpTuple& tmpNtuplet, - const unsigned int minHitsPerNtuplet, - bool startAt0) const { - printf("ERROR: GPUCACell::find_ntuplets reached full depth!\n"); -#ifdef __CUDA_ARCH__ - __trap(); -#else - abort(); -#endif -} - -#endif // RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h +#endif // RecoPixelVertexing_PixelTriplets_plugins_GPUCACellT_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc index 880bdb47dfb5c..c300329a82208 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc @@ -1,9 +1,11 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HelixFitOnGPU.h" -void HelixFitOnGPU::allocateOnGPU(Tuples const *tuples, - TupleMultiplicity const *tupleMultiplicity, - OutputSoA *helix_fit_results) { +template +void HelixFitOnGPU::allocateOnGPU( + Tuples const *tuples, + caStructures::TupleMultiplicityT const *tupleMultiplicity, + pixelTrack::TrackSoAT *helix_fit_results) { tuples_ = tuples; tupleMultiplicity_ = tupleMultiplicity; outputSoa_ = helix_fit_results; @@ -13,4 +15,8 @@ void HelixFitOnGPU::allocateOnGPU(Tuples const *tuples, assert(outputSoa_); } -void HelixFitOnGPU::deallocateOnGPU() {} +template +void HelixFitOnGPU::deallocateOnGPU() {} + +template class HelixFitOnGPU; +template class HelixFitOnGPU; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 9a9c85970af33..78bec6f5e2a87 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -4,12 +4,13 @@ #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" #include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" -#include "CAConstants.h" +#include "CAStructures.h" namespace riemannFit { // in case of memory issue can be made smaller - constexpr uint32_t maxNumberOfConcurrentFits = caConstants::maxNumberOfTuples; + constexpr uint32_t maxNumberOfConcurrentFits = 32 * 1024; constexpr uint32_t stride = maxNumberOfConcurrentFits; using Matrix3x4d = Eigen::Matrix; using Map3x4d = Eigen::Map >; @@ -29,16 +30,25 @@ namespace riemannFit { // fast fit using Map4d = Eigen::Map >; + template //a compile-time bounded for loop + constexpr void rolling_fits(F &&f) { + if constexpr (Start < End) { + f(std::integral_constant()); + rolling_fits(f); + } + } + } // namespace riemannFit +template class HelixFitOnGPU { public: - using HitsView = TrackingRecHit2DSOAView; + using HitsView = TrackingRecHit2DSOAViewT; - using Tuples = pixelTrack::HitContainer; - using OutputSoA = pixelTrack::TrackSoA; + using Tuples = 
pixelTrack::HitContainerT; + using OutputSoA = pixelTrack::TrackSoAT; - using TupleMultiplicity = caConstants::TupleMultiplicity; + using TupleMultiplicity = caStructures::TupleMultiplicityT; explicit HelixFitOnGPU(float bf, bool fitNas4) : bField_(bf), fitNas4_(fitNas4) {} ~HelixFitOnGPU() { deallocateOnGPU(); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc index 13d665b597b13..e4a7de6adaf4c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc @@ -1,6 +1,9 @@ #include "RiemannFitOnGPU.h" -void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples) { +template +void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitsView const *hv, + uint32_t nhits, + uint32_t maxNumberOfTuples) { assert(tuples_); // Fit internals @@ -16,98 +19,101 @@ void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // triplets - kernel_FastFit<3>( + kernel_FastFit<3, TrackerTraits>( tuples_, tupleMultiplicity_, 3, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); - kernel_CircleFit<3>(tupleMultiplicity_, - 3, - bField_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU, - offset); + kernel_CircleFit<3, TrackerTraits>(tupleMultiplicity_, + 3, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); - kernel_LineFit<3>(tupleMultiplicity_, - 3, - bField_, - outputSoa_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU, - offset); + kernel_LineFit<3, TrackerTraits>(tupleMultiplicity_, + 3, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); // quads - kernel_FastFit<4>( + kernel_FastFit<4, TrackerTraits>( tuples_, tupleMultiplicity_, 4, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); - kernel_CircleFit<4>(tupleMultiplicity_, - 4, - bField_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU, - offset); + kernel_CircleFit<4, TrackerTraits>(tupleMultiplicity_, + 4, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); - kernel_LineFit<4>(tupleMultiplicity_, - 4, - bField_, - outputSoa_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU, - offset); + kernel_LineFit<4, TrackerTraits>(tupleMultiplicity_, + 4, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); if (fitNas4_) { // penta - kernel_FastFit<4>( + kernel_FastFit<4, TrackerTraits>( tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); - kernel_CircleFit<4>(tupleMultiplicity_, - 5, - bField_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU, - offset); + kernel_CircleFit<4, TrackerTraits>(tupleMultiplicity_, + 5, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); - kernel_LineFit<4>(tupleMultiplicity_, - 5, - bField_, - outputSoa_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU, 
- offset); + kernel_LineFit<4, TrackerTraits>(tupleMultiplicity_, + 5, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); } else { // penta all 5 - kernel_FastFit<5>( + kernel_FastFit<5, TrackerTraits>( tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); - kernel_CircleFit<5>(tupleMultiplicity_, - 5, - bField_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU, - offset); + kernel_CircleFit<5, TrackerTraits>(tupleMultiplicity_, + 5, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); - kernel_LineFit<5>(tupleMultiplicity_, - 5, - bField_, - outputSoa_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU, - offset); + kernel_LineFit<5, TrackerTraits>(tupleMultiplicity_, + 5, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); } } } + +template class HelixFitOnGPU; +template class HelixFitOnGPU; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu index d8530bac964c1..3d6b2d570077e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ -1,10 +1,11 @@ #include "RiemannFitOnGPU.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, - uint32_t nhits, - uint32_t maxNumberOfTuples, - cudaStream_t stream) { +template +void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, + uint32_t nhits, + uint32_t maxNumberOfTuples, + cudaStream_t stream) { assert(tuples_); auto blockSize = 64; @@ -23,109 +24,112 @@ void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // triplets - kernel_FastFit<3><<>>( + kernel_FastFit<3, TrackerTraits><<>>( tuples_, tupleMultiplicity_, 3, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); cudaCheck(cudaGetLastError()); - kernel_CircleFit<3><<>>(tupleMultiplicity_, - 3, - bField_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU_, - offset); + kernel_CircleFit<3, TrackerTraits><<>>(tupleMultiplicity_, + 3, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - kernel_LineFit<3><<>>(tupleMultiplicity_, - 3, - bField_, - outputSoa_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<3, TrackerTraits><<>>(tupleMultiplicity_, + 3, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); // quads - kernel_FastFit<4><<>>( + kernel_FastFit<4, TrackerTraits><<>>( tuples_, tupleMultiplicity_, 4, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); cudaCheck(cudaGetLastError()); - kernel_CircleFit<4><<>>(tupleMultiplicity_, - 4, - bField_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU_, - offset); + kernel_CircleFit<4, TrackerTraits><<>>(tupleMultiplicity_, + 4, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + 
fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - kernel_LineFit<4><<>>(tupleMultiplicity_, - 4, - bField_, - outputSoa_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<4, TrackerTraits><<>>(tupleMultiplicity_, + 4, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); if (fitNas4_) { // penta - kernel_FastFit<4><<>>( + kernel_FastFit<4, TrackerTraits><<>>( tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); cudaCheck(cudaGetLastError()); - kernel_CircleFit<4><<>>(tupleMultiplicity_, - 5, - bField_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU_, - offset); + kernel_CircleFit<4, TrackerTraits><<>>(tupleMultiplicity_, + 5, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - kernel_LineFit<4><<>>(tupleMultiplicity_, - 5, - bField_, - outputSoa_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<4, TrackerTraits><<>>(tupleMultiplicity_, + 5, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); } else { // penta all 5 - kernel_FastFit<5><<>>( + kernel_FastFit<5, TrackerTraits><<>>( tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); cudaCheck(cudaGetLastError()); - kernel_CircleFit<5><<>>(tupleMultiplicity_, - 5, - bField_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU_, - offset); + kernel_CircleFit<5, TrackerTraits><<>>(tupleMultiplicity_, + 5, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - kernel_LineFit<5><<>>(tupleMultiplicity_, - 5, - bField_, - outputSoa_, - hitsGPU.get(), - hits_geGPU.get(), - fast_fit_resultsGPU.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<5, TrackerTraits><<>>(tupleMultiplicity_, + 5, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); } } } + +template class HelixFitOnGPU; +template class HelixFitOnGPU; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h index 926002d674b83..18dd205cd13c3 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h @@ -14,15 +14,20 @@ #include "HelixFitOnGPU.h" -using HitsOnGPU = TrackingRecHit2DSOAView; -using Tuples = pixelTrack::HitContainer; -using OutputSoA = pixelTrack::TrackSoA; - -template -__global__ void kernel_FastFit(Tuples const *__restrict__ foundNtuplets, - caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, +template +using HitsOnGPU = TrackingRecHit2DSOAViewT; +template +using Tuples = pixelTrack::HitContainerT; +template +using OutputSoA = pixelTrack::TrackSoAT; +template +using TupleMultiplicity = caStructures::TupleMultiplicityT; + +template +__global__ void kernel_FastFit(Tuples const *__restrict__ foundNtuplets, + TupleMultiplicity const 
*__restrict__ tupleMultiplicity, uint32_t nHits, - HitsOnGPU const *__restrict__ hhp, + HitsOnGPU const *__restrict__ hhp, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit, @@ -51,7 +56,7 @@ __global__ void kernel_FastFit(Tuples const *__restrict__ foundNtuplets, // get it from the ntuple container (one to one to helix) auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); - assert(tkid < foundNtuplets->nOnes()); + assert(int(tkid) < foundNtuplets->nOnes()); assert(foundNtuplets->size(tkid) == nHits); @@ -83,8 +88,8 @@ __global__ void kernel_FastFit(Tuples const *__restrict__ foundNtuplets, } } -template -__global__ void kernel_CircleFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, +template +__global__ void kernel_CircleFit(TupleMultiplicity const *__restrict__ tupleMultiplicity, uint32_t nHits, double bField, double *__restrict__ phits, @@ -124,11 +129,11 @@ __global__ void kernel_CircleFit(caConstants::TupleMultiplicity const *__restric } } -template -__global__ void kernel_LineFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, +template +__global__ void kernel_LineFit(TupleMultiplicity const *__restrict__ tupleMultiplicity, uint32_t nHits, double bField, - OutputSoA *results, + OutputSoA *results, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit_input, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h index 5f3866af0b3d3..d4b3282574ec3 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h @@ -8,21 +8,35 @@ #include #include "DataFormats/Math/interface/approx_atan2.h" -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "GPUCACell.h" +#include "CAStructures.h" namespace gpuPixelDoublets { - __global__ void fishbone(GPUCACell::Hits const* __restrict__ hhp, - GPUCACell* cells, + template + using CellNeighbors = caStructures::CellNeighborsT; + template + using CellTracks = caStructures::CellTracksT; + template + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + template + using CellTracksVector = caStructures::CellTracksVectorT; + template + using OuterHitOfCell = caStructures::OuterHitOfCellT; + template + using Hits = typename GPUCACellT::Hits; + + template + __global__ void fishbone(Hits const* __restrict__ hhp, + GPUCACellT* cells, uint32_t const* __restrict__ nCells, - GPUCACell::OuterHitOfCell const isOuterHitOfCellWrap, + OuterHitOfCell const isOuterHitOfCellWrap, int32_t nHits, bool checkTrack) { - constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit; + constexpr auto maxCellsPerHit = GPUCACellT::maxCellsPerHit; auto const& hh = *hhp; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index 57dbf822c88d3..deed54ca02b5b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -7,78 +7,37 @@ namespace gpuPixelDoublets { - constexpr int nPairsForQuadruplets = 13; // quadruplets require hits in all layers - constexpr int nPairsForTriplets = nPairsForQuadruplets + 2; // include barrel "jumping" layer pairs - constexpr int nPairs = nPairsForTriplets + 4; // include 
forward "jumping" layer pairs - static_assert(nPairs <= caConstants::maxNumberOfLayerPairs); - - // start constants - // clang-format off - - CONSTANT_VAR const uint8_t layerPairs[2 * nPairs] = { - 0, 1, 0, 4, 0, 7, // BPIX1 (3) - 1, 2, 1, 4, 1, 7, // BPIX2 (6) - 4, 5, 7, 8, // FPIX1 (8) - 2, 3, 2, 4, 2, 7, 5, 6, 8, 9, // BPIX3 & FPIX2 (13) - 0, 2, 1, 3, // Jumping Barrel (15) - 0, 5, 0, 8, // Jumping Forward (BPIX1,FPIX2) - 4, 6, 7, 9 // Jumping Forward (19) - }; - - constexpr int16_t phi0p05 = 522; // round(521.52189...) = phi2short(0.05); - constexpr int16_t phi0p06 = 626; // round(625.82270...) = phi2short(0.06); - constexpr int16_t phi0p07 = 730; // round(730.12648...) = phi2short(0.07); - - CONSTANT_VAR const int16_t phicuts[nPairs]{phi0p05, - phi0p07, - phi0p07, - phi0p05, - phi0p06, - phi0p06, - phi0p05, - phi0p05, - phi0p06, - phi0p06, - phi0p06, - phi0p05, - phi0p05, - phi0p05, - phi0p05, - phi0p05, - phi0p05, - phi0p05, - phi0p05}; - // phi0p07, phi0p07, phi0p06,phi0p06, phi0p06,phi0p06}; // relaxed cuts - - CONSTANT_VAR float const minz[nPairs] = { - -20., 0., -30., -22., 10., -30., -70., -70., -22., 15., -30, -70., -70., -20., -22., 0, -30., -70., -70.}; - CONSTANT_VAR float const maxz[nPairs] = { - 20., 30., 0., 22., 30., -10., 70., 70., 22., 30., -15., 70., 70., 20., 22., 30., 0., 70., 70.}; - CONSTANT_VAR float const maxr[nPairs] = { - 20., 9., 9., 20., 7., 7., 5., 5., 20., 6., 6., 5., 5., 20., 20., 9., 9., 9., 9.}; + template + using CellNeighbors = caStructures::CellNeighborsT; + template + using CellTracks = caStructures::CellTracksT; + template + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + template + using CellTracksVector = caStructures::CellTracksVectorT; + template + using OuterHitOfCell = caStructures::OuterHitOfCellT; + template + using Hits = typename GPUCACellT::Hits; // end constants // clang-format on - using CellNeighbors = caConstants::CellNeighbors; - using CellTracks = caConstants::CellTracks; - using CellNeighborsVector = caConstants::CellNeighborsVector; - using CellTracksVector = caConstants::CellTracksVector; - - __global__ void initDoublets(GPUCACell::OuterHitOfCell isOuterHitOfCell, + template + __global__ void initDoublets(OuterHitOfCell isOuterHitOfCell, int nHits, - CellNeighborsVector* cellNeighbors, - CellNeighbors* cellNeighborsContainer, - CellTracksVector* cellTracks, - CellTracks* cellTracksContainer) { + CellNeighborsVector* cellNeighbors, + CellNeighbors* cellNeighborsContainer, + CellTracksVector* cellTracks, + CellTracks* cellTracksContainer) { assert(isOuterHitOfCell.container); int first = blockIdx.x * blockDim.x + threadIdx.x; for (int i = first; i < nHits - isOuterHitOfCell.offset; i += gridDim.x * blockDim.x) isOuterHitOfCell.container[i].reset(); if (0 == first) { - cellNeighbors->construct(caConstants::maxNumOfActiveDoublets, cellNeighborsContainer); - cellTracks->construct(caConstants::maxNumOfActiveDoublets, cellTracksContainer); + cellNeighbors->construct(TrackerTraits::maxNumOfActiveDoublets, cellNeighborsContainer); + cellTracks->construct(TrackerTraits::maxNumOfActiveDoublets, cellTracksContainer); auto i = cellNeighbors->extend(); assert(0 == i); (*cellNeighbors)[0].reset(); @@ -91,40 +50,23 @@ namespace gpuPixelDoublets { constexpr auto getDoubletsFromHistoMaxBlockSize = 64; // for both x and y constexpr auto getDoubletsFromHistoMinBlocksPerMP = 16; + template __global__ #ifdef __CUDACC__ __launch_bounds__(getDoubletsFromHistoMaxBlockSize, getDoubletsFromHistoMinBlocksPerMP) #endif - void 
getDoubletsFromHisto(GPUCACell* cells, + void getDoubletsFromHisto(GPUCACellT* cells, uint32_t* nCells, - CellNeighborsVector* cellNeighbors, - CellTracksVector* cellTracks, - TrackingRecHit2DSOAView const* __restrict__ hhp, - GPUCACell::OuterHitOfCell isOuterHitOfCell, + CellNeighborsVector* cellNeighbors, + CellTracksVector* cellTracks, + TrackingRecHit2DSOAViewT const* __restrict__ hhp, + OuterHitOfCell isOuterHitOfCell, int nActualPairs, - bool ideal_cond, - bool doClusterCut, - bool doZ0Cut, - bool doPtCut, - uint32_t maxNumOfDoublets) { + CellCutsT cuts) { auto const& __restrict__ hh = *hhp; - doubletsFromHisto(layerPairs, - nActualPairs, - cells, - nCells, - cellNeighbors, - cellTracks, - hh, - isOuterHitOfCell, - phicuts, - minz, - maxz, - maxr, - ideal_cond, - doClusterCut, - doZ0Cut, - doPtCut, - maxNumOfDoublets); + + doubletsFromHisto( + nActualPairs, cells, nCells, cellNeighbors, cellTracks, hh, isOuterHitOfCell, cuts); } } // namespace gpuPixelDoublets diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h index 80316d24c748b..0f3d786a8e476 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h @@ -12,45 +12,119 @@ #include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" -#include "CAConstants.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "CAStructures.h" #include "GPUCACell.h" +//#define GPU_DEBUG +//#define NTUPLE_DEBUG + namespace gpuPixelDoublets { - using CellNeighbors = caConstants::CellNeighbors; - using CellTracks = caConstants::CellTracks; - using CellNeighborsVector = caConstants::CellNeighborsVector; - using CellTracksVector = caConstants::CellTracksVector; + template + using CellNeighbors = caStructures::CellNeighborsT; + template + using CellTracks = caStructures::CellTracksT; + template + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + template + using CellTracksVector = caStructures::CellTracksVectorT; + template + using OuterHitOfCell = caStructures::OuterHitOfCellT; + template + using Hits = typename GPUCACellT::Hits; + + template + struct CellCutsT { + using H = Hits; + using T = TrackerTraits; + + const uint32_t maxNumberOfDoublets_; + const bool doClusterCut_; + const bool doZ0Cut_; + const bool doPtCut_; + const bool idealConditions_; //this is actually not used by phase2 + + __device__ __forceinline__ bool zSizeCut(H const& hh, int i, int o) const { + auto mi = hh.detectorIndex(i); + + bool innerB1 = mi < T::last_bpix1_detIndex; + bool isOuterLadder = idealConditions_ ? true : 0 == (mi / 8) % 2; + auto mes = (!innerB1) || isOuterLadder ? hh.clusterSizeY(i) : -1; + + if (mes < 0) + return false; + + auto mo = hh.detectorIndex(o); + auto so = hh.clusterSizeY(o); + + auto dz = hh.zGlobal(i) - hh.zGlobal(o); + auto dr = hh.rGlobal(i) - hh.rGlobal(o); + + auto innerBarrel = mi < T::last_barrel_detIndex; + auto onlyBarrel = mo < T::last_barrel_detIndex; + + if (not innerBarrel and not onlyBarrel) + return false; + auto dy = innerB1 ? T::maxDYsize12 : T::maxDYsize; + + return onlyBarrel ? 
so > 0 && std::abs(so - mes) > dy + : innerBarrel && std::abs(mes - int(std::abs(dz / dr) * T::dzdrFact + 0.5f)) > T::maxDYPred; + } + + __device__ __forceinline__ bool clusterCut(H const& hh, int i, int o) const { + auto mo = hh.detectorIndex(o); + bool outerFwd = (mo >= T::last_barrel_detIndex); + + if (!outerFwd) + return false; + + auto mi = hh.detectorIndex(i); + bool innerB1orB2 = mi < T::last_bpix2_detIndex; - __device__ __forceinline__ void doubletsFromHisto(uint8_t const* __restrict__ layerPairs, - uint32_t nPairs, - GPUCACell* cells, + if (!innerB1orB2) + return false; + + bool innerB1 = mi < T::last_bpix1_detIndex; + bool isOuterLadder = idealConditions_ ? true : 0 == (mi / 8) % 2; + auto mes = (!innerB1) || isOuterLadder ? hh.clusterSizeY(i) : -1; + + if (innerB1 && outerFwd) // B1 and F1 + if (mes > 0 && mes < T::minYsizeB1) + return true; // only long cluster (5*8) + bool innerB2 = (mi >= T::last_bpix1_detIndex) && (mi < T::last_bpix2_detIndex); //FIXME number + if (innerB2 && outerFwd) // B2 and F1 + if (mes > 0 && mes < T::minYsizeB2) + return true; + + return false; + } + }; + + // template + // struct CellCutsT : public CellCutsCommon {}; + // + // template <> + // struct CellCutsT : public CellCutsCommon {}; + + template + __device__ __forceinline__ void doubletsFromHisto(uint32_t nPairs, + GPUCACellT* cells, uint32_t* nCells, - CellNeighborsVector* cellNeighbors, - CellTracksVector* cellTracks, - TrackingRecHit2DSOAView const& __restrict__ hh, - GPUCACell::OuterHitOfCell isOuterHitOfCell, - int16_t const* __restrict__ phicuts, - float const* __restrict__ minz, - float const* __restrict__ maxz, - float const* __restrict__ maxr, - bool ideal_cond, - bool doClusterCut, - bool doZ0Cut, - bool doPtCut, - uint32_t maxNumOfDoublets) { + CellNeighborsVector* cellNeighbors, + CellTracksVector* cellTracks, + TrackingRecHit2DSOAViewT const& __restrict__ hh, + OuterHitOfCell isOuterHitOfCell, + CellCutsT const& cuts) { // ysize cuts (z in the barrel) times 8 // these are used if doClusterCut is true - constexpr int minYsizeB1 = 36; - constexpr int minYsizeB2 = 28; - constexpr int maxDYsize12 = 28; - constexpr int maxDYsize = 20; - constexpr int maxDYPred = 20; - constexpr float dzdrFact = 8 * 0.0285 / 0.015; // from dz/dr to "DY" - bool isOuterLadder = ideal_cond; + const bool doClusterCut = cuts.doClusterCut_; + const bool doZ0Cut = cuts.doZ0Cut_; + const bool doPtCut = cuts.doPtCut_; + const uint32_t maxNumOfDoublets = cuts.maxNumberOfDoublets_; - using PhiBinner = TrackingRecHit2DSOAView::PhiBinner; + using PhiBinner = typename TrackingRecHit2DSOAViewT::PhiBinner; auto const& __restrict__ phiBinner = hh.phiBinner(); uint32_t const* __restrict__ offsets = hh.hitsLayerStart(); @@ -61,14 +135,13 @@ namespace gpuPixelDoublets { // nPairsMax to be optimized later (originally was 64). // If it should be much bigger, consider using a block-wide parallel prefix scan, // e.g. 
see https://nvlabs.github.io/cub/classcub_1_1_warp_scan.html - const int nPairsMax = caConstants::maxNumberOfLayerPairs; - assert(nPairs <= nPairsMax); - __shared__ uint32_t innerLayerCumulativeSize[nPairsMax]; + + __shared__ uint32_t innerLayerCumulativeSize[TrackerTraits::nPairs]; __shared__ uint32_t ntot; if (threadIdx.y == 0 && threadIdx.x == 0) { - innerLayerCumulativeSize[0] = layerSize(layerPairs[0]); + innerLayerCumulativeSize[0] = layerSize(TrackerTraits::layerPairs[0]); for (uint32_t i = 1; i < nPairs; ++i) { - innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i - 1] + layerSize(layerPairs[2 * i]); + innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i - 1] + layerSize(TrackerTraits::layerPairs[2 * i]); } ntot = innerLayerCumulativeSize[nPairs - 1]; } @@ -80,6 +153,7 @@ namespace gpuPixelDoublets { auto stride = blockDim.x; uint32_t pairLayerId = 0; // cannot go backward + for (auto j = idy; j < ntot; j += blockDim.y * gridDim.y) { while (j >= innerLayerCumulativeSize[pairLayerId++]) ; @@ -89,12 +163,12 @@ namespace gpuPixelDoublets { assert(j < innerLayerCumulativeSize[pairLayerId]); assert(0 == pairLayerId || j >= innerLayerCumulativeSize[pairLayerId - 1]); - uint8_t inner = layerPairs[2 * pairLayerId]; - uint8_t outer = layerPairs[2 * pairLayerId + 1]; + uint8_t inner = TrackerTraits::layerPairs[2 * pairLayerId]; + uint8_t outer = TrackerTraits::layerPairs[2 * pairLayerId + 1]; assert(outer > inner); auto hoff = PhiBinner::histOff(outer); - + auto fo = __ldg(phiBinner.begin(hoff)); //first hit on outer for the cluster cut auto i = (0 == pairLayerId) ? j : j - innerLayerCumulativeSize[pairLayerId - 1]; i += offsets[inner]; @@ -104,8 +178,8 @@ namespace gpuPixelDoublets { assert(i < offsets[inner + 1]); // found hit corresponding to our cuda thread, now do the job - auto mi = hh.detectorIndex(i); - if (mi > gpuClustering::maxNumModules) + + if (hh.detectorIndex(i) > gpuClustering::maxNumModules) continue; // invalid /* maybe clever, not effective when zoCut is on @@ -116,32 +190,18 @@ namespace gpuPixelDoublets { auto mez = hh.zGlobal(i); - if (mez < minz[pairLayerId] || mez > maxz[pairLayerId]) + if (mez < TrackerTraits::minz[pairLayerId] || mez > TrackerTraits::maxz[pairLayerId]) + continue; + + if (doClusterCut && cuts.clusterCut(hh, i, fo)) continue; - int16_t mes = -1; // make compiler happy - if (doClusterCut) { - // if ideal treat inner ladder as outer - if (inner == 0) - assert(mi < 96); - isOuterLadder = ideal_cond ? true : 0 == (mi / 8) % 2; // only for B1/B2/B3 B4 is opposite, FPIX:noclue... - - // in any case we always test mes>0 ... - mes = inner > 0 || isOuterLadder ? 
hh.clusterSizeY(i) : -1; - - if (inner == 0 && outer > 3) // B1 and F1 - if (mes > 0 && mes < minYsizeB1) - continue; // only long cluster (5*8) - if (inner == 1 && outer > 3) // B2 and F1 - if (mes > 0 && mes < minYsizeB2) - continue; - } auto mep = hh.iphi(i); auto mer = hh.rGlobal(i); // all cuts: true if fails - constexpr float z0cut = 12.f; // cm - constexpr float hardPtCut = 0.5f; // GeV + constexpr float z0cut = TrackerTraits::z0Cut; // cm + constexpr float hardPtCut = TrackerTraits::doubletHardPt; // GeV // cm (1 GeV track has 1 GeV/c / (e * 3.8T) ~ 87 cm radius in a 3.8T field) constexpr float minRadius = hardPtCut * 87.78f; constexpr float minRadius2T4 = 4.f * minRadius * minRadius; @@ -156,24 +216,10 @@ namespace gpuPixelDoublets { auto zo = hh.zGlobal(j); auto ro = hh.rGlobal(j); auto dr = ro - mer; - return dr > maxr[pairLayerId] || dr < 0 || std::abs((mez * ro - mer * zo)) > z0cut * dr; - }; - - auto zsizeCut = [&](int j) { - auto onlyBarrel = outer < 4; - auto so = hh.clusterSizeY(j); - auto dy = inner == 0 ? maxDYsize12 : maxDYsize; - // in the barrel cut on difference in size - // in the endcap on the prediction on the first layer (actually in the barrel only: happen to be safe for endcap as well) - // FIXME move pred cut to z0cutoff to optmize loading of and computaiton ... - auto zo = hh.zGlobal(j); - auto ro = hh.rGlobal(j); - return onlyBarrel ? mes > 0 && so > 0 && std::abs(so - mes) > dy - : (inner < 4) && mes > 0 && - std::abs(mes - int(std::abs((mez - zo) / (mer - ro)) * dzdrFact + 0.5f)) > maxDYPred; + return dr > TrackerTraits::maxr[pairLayerId] || dr < 0 || std::abs((mez * ro - mer * zo)) > z0cut * dr; }; - auto iphicut = phicuts[pairLayerId]; + auto iphicut = TrackerTraits::phicuts[pairLayerId]; auto kl = PhiBinner::bin(int16_t(mep - iphicut)); auto kh = PhiBinner::bin(int16_t(mep + iphicut)); @@ -200,18 +246,18 @@ namespace gpuPixelDoublets { assert(oi >= offsets[outer]); assert(oi < offsets[outer + 1]); auto mo = hh.detectorIndex(oi); + if (mo > gpuClustering::maxNumModules) continue; // invalid if (doZ0Cut && z0cutoff(oi)) continue; - auto mop = hh.iphi(oi); uint16_t idphi = std::min(std::abs(int16_t(mop - mep)), std::abs(int16_t(mep - mop))); if (idphi > iphicut) continue; - if (doClusterCut && zsizeCut(oi)) + if (doClusterCut && cuts.zSizeCut(hh, i, oi)) continue; if (doPtCut && ptcut(oi, idphi)) continue; @@ -231,9 +277,19 @@ namespace gpuPixelDoublets { #endif } } +// #endif #ifdef GPU_DEBUG if (tooMany > 0) - printf("OuterHitOfCell full for %d in layer %d/%d, %d,%d %d\n", i, inner, outer, nmin, tot, tooMany); + printf("OuterHitOfCell full for %d in layer %d/%d, %d,%d %d, %d %.3f %.3f\n", + i, + inner, + outer, + nmin, + tot, + tooMany, + iphicut, + TrackerTraits::minz[pairLayerId], + TrackerTraits::maxz[pairLayerId]); #endif } // loop in block... 
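// [Editor's note -- illustration, not part of the patch] With this refactoring the
// cut configuration that doubletsFromHisto used to receive as four booleans plus
// hard-coded constants and CONSTANT_VAR arrays now arrives as a single
// CellCutsT<TrackerTraits> value, while the per-layer-pair tables (layerPairs,
// phicuts, minz, maxz, maxr) are read from the topology traits. A hypothetical
// host-side construction, with the fields in the order declared above and
// illustrative values only:
//
//   CellCutsT<pixelTopology::Phase1> cuts{/*maxNumberOfDoublets_=*/512 * 1024,
//                                         /*doClusterCut_=*/true,
//                                         /*doZ0Cut_=*/true,
//                                         /*doPtCut_=*/true,
//                                         /*idealConditions_=*/false};
//
// which is then passed by value to the getDoubletsFromHisto kernel declared in
// gpuPixelDoublets.h.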
} diff --git a/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp b/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp index 70544a2647ee7..3c6be161a346f 100644 --- a/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp +++ b/RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp @@ -1,5 +1,5 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h" - +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include #include @@ -9,17 +9,30 @@ void print() { } int main() { - using namespace caConstants; + using namespace pixelTopology; + using namespace caStructures; + //for Phase-I + print>(); + print>(); + print>(); + print>(); + print>(); + print>(); + print>(); + + print>(); + + //for Phase-II - print(); - print(); - print(); - print(); - print(); - print(); - print(); + print>(); + print>(); + print>(); + print>(); + print>(); + print>(); + print>(); - print(); + print>(); return 0; } diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc index 34b0ed9e29fc1..024c95398b988 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc @@ -1,6 +1,7 @@ #include #include "CUDADataFormats/Common/interface/Product.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/ESHandle.h" #include "FWCore/Framework/interface/Event.h" @@ -21,10 +22,14 @@ #undef PIXVERTEX_DEBUG_PRODUCE -class PixelVertexProducerCUDA : public edm::global::EDProducer<> { +template +class PixelVertexProducerCUDAT : public edm::global::EDProducer<> { + using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using GPUAlgo = gpuVertexFinder::Producer; + public: - explicit PixelVertexProducerCUDA(const edm::ParameterSet& iConfig); - ~PixelVertexProducerCUDA() override = default; + explicit PixelVertexProducerCUDAT(const edm::ParameterSet& iConfig); + ~PixelVertexProducerCUDAT() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); @@ -40,14 +45,15 @@ class PixelVertexProducerCUDA : public edm::global::EDProducer<> { edm::EDGetTokenT tokenCPUTrack_; edm::EDPutTokenT tokenCPUVertex_; - const gpuVertexFinder::Producer gpuAlgo_; + const GPUAlgo gpuAlgo_; // Tracking cuts before sending tracks to vertex algo const float ptMin_; const float ptMax_; }; -PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf) +template +PixelVertexProducerCUDAT::PixelVertexProducerCUDAT(const edm::ParameterSet& conf) : onGPU_(conf.getParameter("onGPU")), gpuAlgo_(conf.getParameter("oneKernel"), conf.getParameter("useDensity"), @@ -65,12 +71,13 @@ PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf) consumes>(conf.getParameter("pixelTrackSrc")); tokenGPUVertex_ = produces(); } else { - tokenCPUTrack_ = consumes(conf.getParameter("pixelTrackSrc")); + tokenCPUTrack_ = consumes(conf.getParameter("pixelTrackSrc")); tokenCPUVertex_ = produces(); } } -void PixelVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { +template +void PixelVertexProducerCUDAT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; // Only one of these three algos can be used at once. 
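[Editor's note] In the constructor hunk above, the angle-bracketed template arguments of the token calls were lost in this rendering of the patch. A hedged reconstruction of the intended wiring, with the product types inferred from patterns used elsewhere in this diff (the exact GPU vertex product typedef is an assumption):

  // sketch only -- token types inferred, not copied verbatim from the patch
  if (onGPU_) {
    tokenGPUTrack_ =
        consumes<cms::cuda::Product<PixelTrackHeterogeneous>>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
    tokenGPUVertex_ = produces<cms::cuda::Product<ZVertexHeterogeneous>>();
  } else {
    tokenCPUTrack_ = consumes<PixelTrackHeterogeneous>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
    tokenCPUVertex_ = produces<ZVertexHeterogeneous>();
  }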
@@ -90,13 +97,13 @@ void PixelVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& d desc.add<double>("PtMax", 75.); desc.add<edm::InputTag>("pixelTrackSrc", edm::InputTag("pixelTracksCUDA")); - auto label = "pixelVerticesCUDA"; - descriptions.add(label, desc); + descriptions.addWithDefaultLabel(desc); } -void PixelVertexProducerCUDA::produceOnGPU(edm::StreamID streamID, - edm::Event& iEvent, - const edm::EventSetup& iSetup) const { +template <typename TrackerTraits> +void PixelVertexProducerCUDAT<TrackerTraits>::produceOnGPU(edm::StreamID streamID, + edm::Event& iEvent, + const edm::EventSetup& iSetup) const { edm::Handle<cms::cuda::Product<PixelTrackHeterogeneous>> hTracks; iEvent.getByToken(tokenGPUTrack_, hTracks); @@ -108,9 +115,10 @@ void PixelVertexProducerCUDA::produceOnGPU(edm::StreamID streamID, ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks, ptMin_, ptMax_)); } -void PixelVertexProducerCUDA::produceOnCPU(edm::StreamID streamID, - edm::Event& iEvent, - const edm::EventSetup& iSetup) const { +template <typename TrackerTraits> +void PixelVertexProducerCUDAT<TrackerTraits>::produceOnCPU(edm::StreamID streamID, + edm::Event& iEvent, + const edm::EventSetup& iSetup) const { auto const* tracks = iEvent.get(tokenCPUTrack_).get(); assert(tracks); @@ -133,7 +141,10 @@ void PixelVertexProducerCUDA::produceOnCPU(edm::StreamID streamID, iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks, ptMin_, ptMax_)); } -void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { +template <typename TrackerTraits> +void PixelVertexProducerCUDAT<TrackerTraits>::produce(edm::StreamID streamID, + edm::Event& iEvent, + const edm::EventSetup& iSetup) const { if (onGPU_) { produceOnGPU(streamID, iEvent, iSetup); } else { @@ -141,4 +152,11 @@ void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent } } +using PixelVertexProducerCUDA = PixelVertexProducerCUDAT<pixelTopology::Phase1>; DEFINE_FWK_MODULE(PixelVertexProducerCUDA); + +using PixelVertexProducerCUDAPhase1 = PixelVertexProducerCUDAT<pixelTopology::Phase1>; +DEFINE_FWK_MODULE(PixelVertexProducerCUDAPhase1); + +using PixelVertexProducerCUDAPhase2 = PixelVertexProducerCUDAT<pixelTopology::Phase2>; +DEFINE_FWK_MODULE(PixelVertexProducerCUDAPhase2); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc index 029c619b42e58..8cceeaa42cc10 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc @@ -25,7 +25,7 @@ class PixelVertexProducerFromSoA : public edm::global::EDProducer<> { public: - using IndToEdm = std::vector<uint16_t>; + using IndToEdm = std::vector<uint32_t>; explicit PixelVertexProducerFromSoA(const edm::ParameterSet &iConfig); ~PixelVertexProducerFromSoA() override = default; @@ -90,7 +90,7 @@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv << " from " << indToEdm.size() << " tracks" << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - std::set<uint16_t> uind; // for verifing index consistency + std::set<uint32_t> uind; // for verifying index consistency for (int j = nv - 1; j >= 0; --j) { auto i = soa.sortInd[j]; // on gpu sorted in ascending order....
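// [Editor's note] IndToEdm and the consistency-check set above were widened from
// uint16_t to uint32_t (and WorkSpace::itrk in gpuVertexFinder.h below follows suit),
// presumably because a track index may no longer fit in 16 bits for the larger
// Phase-2 topology. The loop that follows walks sortInd backwards: on the GPU the
// vertices are sorted in ascending order (apparently of summed pT^2), so iterating
// from the back visits the hardest vertex first.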
assert(i < nv); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc index 20b007d2d029f..74bcd26f8a79c 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc @@ -18,7 +18,9 @@ namespace gpuVertexFinder { // split vertices with a chi2/NDoF greater than this constexpr float maxChi2ForSplit = 9.f; - __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin, float ptMax) { + template + __global__ void loadTracks( + pixelTrack::TrackSoAT const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin, float ptMax) { assert(ptracks); assert(soa); auto const& tracks = *ptracks; @@ -26,6 +28,7 @@ namespace gpuVertexFinder { auto const* quality = tracks.qualityData(); auto first = blockIdx.x * blockDim.x + threadIdx.x; + for (int idx = first, nt = tracks.nTracks(); idx < nt; idx += gridDim.x * blockDim.x) { auto nHits = tracks.nHits(idx); assert(nHits >= 3); @@ -94,14 +97,22 @@ namespace gpuVertexFinder { } #endif + template #ifdef __CUDACC__ - ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin, float ptMax) const { + ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, + pixelTrack::TrackSoAT const* tksoa, + float ptMin, + float ptMax) const { #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "producing Vertices on GPU" << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE ZVertexHeterogeneous vertices(cms::cuda::make_device_unique(stream)); #else - ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin, float ptMax) const { + + ZVertexHeterogeneous Producer::make(pixelTrack::TrackSoAT const* tksoa, + float ptMin, + float ptMax) const { + #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "producing Vertices on CPU" << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE @@ -120,12 +131,12 @@ namespace gpuVertexFinder { #ifdef __CUDACC__ init<<<1, 1, 0, stream>>>(soa, ws_d.get()); auto blockSize = 128; - auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize; - loadTracks<<>>(tksoa, soa, ws_d.get(), ptMin, ptMax); + auto numberOfBlocks = (pixelTrack::TrackSoAT::stride() + blockSize - 1) / blockSize; + loadTracks<<>>(tksoa, soa, ws_d.get(), ptMin, ptMax); cudaCheck(cudaGetLastError()); #else init(soa, ws_d.get()); - loadTracks(tksoa, soa, ws_d.get(), ptMin, ptMax); + loadTracks(tksoa, soa, ws_d.get(), ptMin, ptMax); #endif #ifdef __CUDACC__ @@ -186,4 +197,7 @@ namespace gpuVertexFinder { return vertices; } + template class Producer; + template class Producer; + } // namespace gpuVertexFinder diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h index 2b6a8107d927f..6128939f6eb87 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h @@ -10,15 +10,13 @@ namespace gpuVertexFinder { using ZVertices = ZVertexSoA; - using TkSoA = pixelTrack::TrackSoA; - // workspace used in the vertex reco algos struct WorkSpace { static constexpr uint32_t MAXTRACKS = ZVertexSoA::MAXTRACKS; static constexpr uint32_t MAXVTX = ZVertexSoA::MAXVTX; uint32_t ntrks; // number of "selected tracks" - uint16_t itrk[MAXTRACKS]; // index of original track + uint32_t itrk[MAXTRACKS]; // index of original track float zt[MAXTRACKS]; // input track z at bs float ezt2[MAXTRACKS]; // input 
error^2 on the above float ptt2[MAXTRACKS]; // input pt^2 on the above @@ -38,11 +36,12 @@ namespace gpuVertexFinder { pws->init(); } + template class Producer { public: using ZVertices = ZVertexSoA; using WorkSpace = gpuVertexFinder::WorkSpace; - using TkSoA = pixelTrack::TrackSoA; + using TkSoA = pixelTrack::TrackSoAT; Producer(bool oneKernel, bool useDensity, diff --git a/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc b/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc index 34e04b0f7aedb..c11b53538c5b0 100644 --- a/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc +++ b/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc @@ -145,6 +145,9 @@ struct L2TauNNProducerCacheData { }; class L2TauNNProducer : public edm::stream::EDProducer> { + using TrackSoA = pixelTrack::TrackSoAT; + using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + public: struct caloRecHitCollections { const HBHERecHitCollection* hbhe; @@ -179,16 +182,16 @@ class L2TauNNProducer : public edm::stream::EDProducer& allTaus, - const pixelTrack::TrackSoA& patatracks_tsoa, + const TrackSoA& patatracks_tsoa, const ZVertexSoA& patavtx_soa, const reco::BeamSpot& beamspot, const MagneticField* magfi); void selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, - const pixelTrack::TrackSoA& patatracks_tsoa, + const TrackSoA& patatracks_tsoa, std::vector& trkGood, std::vector& vtxGood); std::pair impactParameter(int it, - const pixelTrack::TrackSoA& patatracks_tsoa, + const TrackSoA& patatracks_tsoa, float patatrackPhi, const reco::BeamSpot& beamspot, const MagneticField* magfi); @@ -293,7 +296,7 @@ L2TauNNProducer::L2TauNNProducer(const edm::ParameterSet& cfg, const L2TauNNProd geometryToken_(esConsumes()), bFieldToken_(esConsumes()), pataVerticesToken_(consumes(cfg.getParameter("pataVertices"))), - pataTracksToken_(consumes(cfg.getParameter("pataTracks"))), + pataTracksToken_(consumes(cfg.getParameter("pataTracks"))), beamSpotToken_(consumes(cfg.getParameter("BeamSpot"))), maxVtx_(cfg.getParameter("maxVtx")), fractionSumPt2_(cfg.getParameter("fractionSumPt2")), @@ -570,7 +573,7 @@ void L2TauNNProducer::fillCaloRecHits(tensorflow::Tensor& cellGridMatrix, } void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, - const pixelTrack::TrackSoA& patatracks_tsoa, + const TrackSoA& patatracks_tsoa, std::vector& trkGood, std::vector& vtxGood) { const auto maxTracks = patatracks_tsoa.stride(); @@ -617,7 +620,7 @@ void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, } std::pair L2TauNNProducer::impactParameter(int it, - const pixelTrack::TrackSoA& patatracks_tsoa, + const TrackSoA& patatracks_tsoa, float patatrackPhi, const reco::BeamSpot& beamspot, const MagneticField* magfi) { @@ -650,7 +653,7 @@ std::pair L2TauNNProducer::impactParameter(int it, void L2TauNNProducer::fillPatatracks(tensorflow::Tensor& cellGridMatrix, const std::vector& allTaus, - const pixelTrack::TrackSoA& patatracks_tsoa, + const TrackSoA& patatracks_tsoa, const ZVertexSoA& patavtx_soa, const reco::BeamSpot& beamspot, const MagneticField* magfi) { diff --git a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc index 0e5823fc46c46..9023640f62d5a 100644 --- a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc +++ b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc @@ -33,10 +33,11 @@ /* produces seeds directly from cuda produced tuples */ -class SeedProducerFromSoA : public edm::global::EDProducer<> { +template +class 
SeedProducerFromSoAT : public edm::global::EDProducer<> { public: - explicit SeedProducerFromSoA(const edm::ParameterSet& iConfig); - ~SeedProducerFromSoA() override = default; + explicit SeedProducerFromSoAT(const edm::ParameterSet& iConfig); + ~SeedProducerFromSoAT() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); @@ -45,7 +46,7 @@ class SeedProducerFromSoA : public edm::global::EDProducer<> { // Event data tokens const edm::EDGetTokenT<reco::BeamSpot> tBeamSpot_; - const edm::EDGetTokenT<PixelTrackHeterogeneous> tokenTrack_; + const edm::EDGetTokenT<PixelTrackHeterogeneousT<TrackerTraits>> tokenTrack_; // Event setup tokens const edm::ESGetToken<MagneticField, IdealMagneticFieldRecord> idealMagneticFieldToken_; const edm::ESGetToken<TrackerGeometry, TrackerDigiGeometryRecord> trackerDigiGeometryToken_; @@ -53,9 +54,10 @@ class SeedProducerFromSoA : public edm::global::EDProducer<> { int32_t minNumberOfHits_; }; -SeedProducerFromSoA::SeedProducerFromSoA(const edm::ParameterSet& iConfig) +template <typename TrackerTraits> +SeedProducerFromSoAT<TrackerTraits>::SeedProducerFromSoAT(const edm::ParameterSet& iConfig) : tBeamSpot_(consumes<reco::BeamSpot>(iConfig.getParameter<edm::InputTag>("beamSpot"))), - tokenTrack_(consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("src"))), + tokenTrack_(consumes<PixelTrackHeterogeneousT<TrackerTraits>>(iConfig.getParameter<edm::InputTag>("src"))), idealMagneticFieldToken_(esConsumes()), trackerDigiGeometryToken_(esConsumes()), trackerPropagatorToken_(esConsumes(edm::ESInputTag("PropagatorWithMaterial"))), @@ -65,7 +67,8 @@ SeedProducerFromSoA::SeedProducerFromSoA(const edm::ParameterSet& iConfig) produces<TrajectorySeedCollection>(); } -void SeedProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { +template <typename TrackerTraits> +void SeedProducerFromSoAT<TrackerTraits>::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; desc.add<edm::InputTag>("beamSpot", edm::InputTag("offlineBeamSpot")); desc.add<edm::InputTag>("src", edm::InputTag("pixelTrackSoA")); @@ -74,7 +77,10 @@ void SeedProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions& descr descriptions.addWithDefaultLabel(desc); } -void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { +template <typename TrackerTraits> +void SeedProducerFromSoAT<TrackerTraits>::produce(edm::StreamID streamID, + edm::Event& iEvent, + const edm::EventSetup& iSetup) const { // std::cout << "Converting gpu helix to trajectory seed" << std::endl; auto result = std::make_unique<TrajectorySeedCollection>(); @@ -167,4 +173,11 @@ void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, co iEvent.put(std::move(result)); } +using SeedProducerFromSoA = SeedProducerFromSoAT<pixelTopology::Phase1>; DEFINE_FWK_MODULE(SeedProducerFromSoA); + +using SeedProducerFromSoAPhase1 = SeedProducerFromSoAT<pixelTopology::Phase1>; +DEFINE_FWK_MODULE(SeedProducerFromSoAPhase1); + +using SeedProducerFromSoAPhase2 = SeedProducerFromSoAT<pixelTopology::Phase2>; +DEFINE_FWK_MODULE(SeedProducerFromSoAPhase2);
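[Editor's note] Taken together, the patch routes every hard-coded capacity and geometry constant of the former caConstants and phase1PixelTopology headers through a single compile-time customization point, the TrackerTraits class (pixelTopology::Phase1 or pixelTopology::Phase2 from Geometry/CommonTopologies/interface/SimplePixelTopology.h). A minimal sketch of the shape such a traits class must expose, assembled from the members this diff actually reads; the names follow the diff, but the values are illustrative placeholders rather than the real Phase-1 numbers:

  // Illustrative sketch only -- see SimplePixelTopology.h for the real definitions.
  #include <cstdint>

  namespace pixelTopology {
    struct ExampleTraits {
      // index types consumed by caStructures and GPUCACellT
      using hindex_type = uint32_t;  // hit index
      using cindex_type = uint32_t;  // cell index
      using tindex_type = uint32_t;  // tuple (track) index
      // capacities read by TrackSoAHeterogeneousT, GPUCACellT, HelixFitOnGPU, ...
      static constexpr int32_t maxNumberOfTuples = 32 * 1024;
      static constexpr int32_t maxHitsOnTrack = 10;
      static constexpr uint32_t maxNumberOfQuadruplets = maxNumberOfTuples;
      static constexpr uint32_t maxCellsPerHit = 256;
      static constexpr uint32_t maxNumOfActiveDoublets = 512 * 1024;
      static constexpr int maxDepth = 6;  // bounds find_ntuplets' compile-time recursion
      // detector-unit boundaries used by the cell and doublet cuts
      static constexpr uint32_t last_bpix1_detIndex = 96;
      static constexpr uint32_t last_bpix2_detIndex = 320;
      static constexpr uint32_t last_barrel_detIndex = 1184;
      // per-layer-pair tables, all of length nPairs (layerPairs has 2 * nPairs entries):
      static constexpr int nPairs = 19;
      // static constexpr uint8_t layerPairs[2 * nPairs] = {...};
      // static constexpr int16_t phicuts[nPairs] = {...};
      // static constexpr float minz[nPairs] = {...}, maxz[nPairs] = {...}, maxr[nPairs] = {...};
      // doublet selection cuts
      static constexpr float z0Cut = 12.f;          // cm
      static constexpr float doubletHardPt = 0.5f;  // GeV
      // cluster-size cut parameters: minYsizeB1, minYsizeB2, maxDYsize12, maxDYsize,
      // maxDYPred, dzdrFact (consumed by CellCutsT in gpuPixelDoubletsAlgos.h)
    };
  }  // namespace pixelTopology

With this shape in place, any class in the patch can be reused for a new detector layout by supplying one more traits struct and adding the matching explicit template instantiations, as done here for pixelTopology::Phase1 and pixelTopology::Phase2.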