diff --git a/README.md b/README.md index 1b53b8ba1..38e7c2e33 100644 --- a/README.md +++ b/README.md @@ -209,7 +209,8 @@ make cuda ... USER_CXXFLAGS="-DCUDA_DISABLE_CACHING_ALLOCATOR -DCUDA_DISABLE_ASY #### `cudadev` -This program is currently equivalent to `cuda`. +This program corresponds to the updated version of the pixel tracking software integrated in +[CMSSW_12_0_0_pre3](https://github.com/cms-sw/cmssw/tree/CMSSW_12_0_0_pre3). The use of caching allocator can be disabled at compile time setting the `CUDADEV_DISABLE_CACHING_ALLOCATOR` preprocessor symbol: diff --git a/src/cudadev/CUDACore/FlexiStorage.h b/src/cudadev/CUDACore/FlexiStorage.h new file mode 100644 index 000000000..f794fb53a --- /dev/null +++ b/src/cudadev/CUDACore/FlexiStorage.h @@ -0,0 +1,49 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_FlexiStorage_h +#define HeterogeneousCore_CUDAUtilities_interface_FlexiStorage_h + +#include + +namespace cms { + namespace cuda { + + template + class FlexiStorage { + public: + constexpr int capacity() const { return S; } + + constexpr I& operator[](int i) { return m_v[i]; } + constexpr const I& operator[](int i) const { return m_v[i]; } + + constexpr I* data() { return m_v; } + constexpr I const* data() const { return m_v; } + + private: + I m_v[S]; + }; + + template + class FlexiStorage { + public: + constexpr void init(I* v, int s) { + m_v = v; + m_capacity = s; + } + + constexpr int capacity() const { return m_capacity; } + + constexpr I& operator[](int i) { return m_v[i]; } + constexpr const I& operator[](int i) const { return m_v[i]; } + + constexpr I* data() { return m_v; } + constexpr I const* data() const { return m_v; } + + private: + I* m_v; + int m_capacity; + }; + + } // namespace cuda + +} // namespace cms + +#endif diff --git a/src/cudadev/CUDACore/HistoContainer.h b/src/cudadev/CUDACore/HistoContainer.h index c2ac3308d..68799939e 100644 --- a/src/cudadev/CUDACore/HistoContainer.h +++ b/src/cudadev/CUDACore/HistoContainer.h @@ -1,19 +1,7 @@ #ifndef HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h #define HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h -#include -#ifndef __CUDA_ARCH__ -#include -#endif // __CUDA_ARCH__ -#include -#include -#include - -#include "CUDACore/AtomicPairCounter.h" -#include "CUDACore/cudaCheck.h" -#include "CUDACore/cuda_assert.h" -#include "CUDACore/cudastdAlgorithm.h" -#include "CUDACore/prefixScan.h" +#include "CUDACore/OneToManyAssoc.h" namespace cms { namespace cuda { @@ -50,61 +38,27 @@ namespace cms { } } - template - inline __attribute__((always_inline)) void launchZero(Histo *__restrict__ h, - cudaStream_t stream -#ifndef __CUDACC__ - = cudaStreamDefault -#endif - ) { - uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); - int32_t size = offsetof(Histo, bins) - offsetof(Histo, off); - assert(size >= int(sizeof(uint32_t) * Histo::totbins())); -#ifdef __CUDACC__ - cudaCheck(cudaMemsetAsync(poff, 0, size, stream)); -#else - ::memset(poff, 0, size); -#endif - } - - template - inline __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h, - cudaStream_t stream -#ifndef __CUDACC__ - = cudaStreamDefault -#endif - ) { -#ifdef __CUDACC__ - uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); - int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Histo, psws)); - auto nthreads = 1024; - auto nblocks = (Histo::totbins() + nthreads - 1) / nthreads; - multiBlockPrefixScan<<>>( - poff, poff, Histo::totbins(), ppsws); - cudaCheck(cudaGetLastError()); -#else - h->finalize(); 
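The zeroing and finalizing launchers now operate on an `Assoc::View` rather than on the bare histogram pointer, so containers whose content capacity is fixed only at run time (the `FlexiStorage<I, -1>` specialization above) can have their storage injected by the caller. A minimal host-side usage sketch; the template arguments assume the upstream `HistoContainer<T, NBINS, SIZE, S, I, NHISTS>` signature, which this excerpt elides, and all variable names are illustrative:

  // SIZE == -1: the index content lives in an external buffer of totSize elements
  using Hist = cms::cuda::HistoContainer<uint16_t, 128, -1, 16, uint32_t>;

  auto hist_d = cms::cuda::make_device_unique<Hist>(stream);
  auto mem_d = cms::cuda::make_device_unique<Hist::index_type[]>(totSize, stream);

  // nh sub-histograms over totSize values, 256 threads per block
  cms::cuda::fillManyFromVector(hist_d.get(), nh, values_d, offsets_d, totSize, 256, mem_d.get(), stream);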
-#endif - } - template inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, uint32_t nh, T const *__restrict__ v, uint32_t const *__restrict__ offsets, - uint32_t totSize, + int32_t totSize, int nthreads, + typename Histo::index_type *mem, cudaStream_t stream #ifndef __CUDACC__ = cudaStreamDefault #endif ) { - launchZero(h, stream); + typename Histo::View view = {h, nullptr, mem, -1, totSize}; + launchZero(view, stream); #ifdef __CUDACC__ auto nblocks = (totSize + nthreads - 1) / nthreads; + assert(nblocks > 0); countFromVector<<>>(h, nh, v, offsets); cudaCheck(cudaGetLastError()); - launchFinalize(h, stream); + launchFinalize(view, stream); fillFromVector<<>>(h, nh, v, offsets); cudaCheck(cudaGetLastError()); #else @@ -114,11 +68,6 @@ namespace cms { #endif } - template - __global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) { - assoc->bulkFinalizeFill(*apc); - } - // iteratate over N bins left and right of the one containing "v" template __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { @@ -142,20 +91,19 @@ namespace cms { } } - template - class HistoContainer { + class HistoContainer : public OneToManyAssoc { public: - using Counter = uint32_t; - - using CountersOnly = HistoContainer; - - using index_type = I; + using Base = OneToManyAssoc; + using View = typename Base::View; + using Counter = typename Base::Counter; + using index_type = typename Base::index_type; using UT = typename std::make_unsigned::type; static constexpr uint32_t ilog2(uint32_t v) { @@ -176,7 +124,8 @@ namespace cms { static constexpr uint32_t nhists() { return NHISTS; } static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; } static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; } - static constexpr uint32_t capacity() { return SIZE; } + + // static_assert(int32_t(totbins())==Base::ctNOnes()); static constexpr auto histOff(uint32_t nh) { return NBINS * nh; } @@ -186,91 +135,18 @@ namespace cms { return (t >> shift) & mask; } - __host__ __device__ void zero() { - for (auto &i : off) - i = 0; - } - - __host__ __device__ __forceinline__ void add(CountersOnly const &co) { - for (uint32_t i = 0; i < totbins(); ++i) { -#ifdef __CUDA_ARCH__ - atomicAdd(off + i, co.off[i]); -#else - auto &a = (std::atomic &)(off[i]); - a += co.off[i]; -#endif - } - } - - static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) { -#ifdef __CUDA_ARCH__ - return atomicAdd(&x, 1); -#else - auto &a = (std::atomic &)(x); - return a++; -#endif - } - - static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) { -#ifdef __CUDA_ARCH__ - return atomicSub(&x, 1); -#else - auto &a = (std::atomic &)(x); - return a--; -#endif - } - - __host__ __device__ __forceinline__ void countDirect(T b) { - assert(b < nbins()); - atomicIncrement(off[b]); - } - - __host__ __device__ __forceinline__ void fillDirect(T b, index_type j) { - assert(b < nbins()); - auto w = atomicDecrement(off[b]); - assert(w > 0); - bins[w - 1] = j; - } - - __host__ __device__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) { - auto c = apc.add(n); - if (c.m >= nbins()) - return -int32_t(c.m); - off[c.m] = c.n; - for (uint32_t j = 0; j < n; ++j) - bins[c.n + j] = v[j]; - return c.m; - } - - __host__ __device__ __forceinline__ void bulkFinalize(AtomicPairCounter const &apc) { - off[apc.get().m] = apc.get().n; - } - - __host__ __device__ __forceinline__ void 
bulkFinalizeFill(AtomicPairCounter const &apc) { - auto m = apc.get().m; - auto n = apc.get().n; - if (m >= nbins()) { // overflow! - off[nbins()] = uint32_t(off[nbins() - 1]); - return; - } - auto first = m + blockDim.x * blockIdx.x + threadIdx.x; - for (auto i = first; i < totbins(); i += gridDim.x * blockDim.x) { - off[i] = n; - } - } - __host__ __device__ __forceinline__ void count(T t) { uint32_t b = bin(t); assert(b < nbins()); - atomicIncrement(off[b]); + Base::atomicIncrement(this->off[b]); } __host__ __device__ __forceinline__ void fill(T t, index_type j) { uint32_t b = bin(t); assert(b < nbins()); - auto w = atomicDecrement(off[b]); + auto w = Base::atomicDecrement(this->off[b]); assert(w > 0); - bins[w - 1] = j; + this->content[w - 1] = j; } __host__ __device__ __forceinline__ void count(T t, uint32_t nh) { @@ -278,7 +154,7 @@ namespace cms { assert(b < nbins()); b += histOff(nh); assert(b < totbins()); - atomicIncrement(off[b]); + Base::atomicIncrement(this->off[b]); } __host__ __device__ __forceinline__ void fill(T t, index_type j, uint32_t nh) { @@ -286,37 +162,12 @@ namespace cms { assert(b < nbins()); b += histOff(nh); assert(b < totbins()); - auto w = atomicDecrement(off[b]); + auto w = Base::atomicDecrement(this->off[b]); assert(w > 0); - bins[w - 1] = j; - } - - __host__ __device__ __forceinline__ void finalize(Counter *ws = nullptr) { - assert(off[totbins() - 1] == 0); - blockPrefixScan(off, totbins(), ws); - assert(off[totbins() - 1] == off[totbins() - 2]); + this->content[w - 1] = j; } - - constexpr auto size() const { return uint32_t(off[totbins() - 1]); } - constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } - - constexpr index_type const *begin() const { return bins; } - constexpr index_type const *end() const { return begin() + size(); } - - constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; } - constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; } - - Counter off[totbins()]; - int32_t psws; // prefix-scan working space - index_type bins[capacity()]; }; - template - using OneToManyAssoc = HistoContainer; - } // namespace cuda } // namespace cms diff --git a/src/cudadev/CUDACore/HostAllocator.h b/src/cudadev/CUDACore/HostAllocator.h index 19c86e31f..291c40833 100644 --- a/src/cudadev/CUDACore/HostAllocator.h +++ b/src/cudadev/CUDACore/HostAllocator.h @@ -3,6 +3,7 @@ #include #include + #include namespace cms { diff --git a/src/cudadev/CUDACore/OneToManyAssoc.h b/src/cudadev/CUDACore/OneToManyAssoc.h new file mode 100644 index 000000000..345389e00 --- /dev/null +++ b/src/cudadev/CUDACore/OneToManyAssoc.h @@ -0,0 +1,282 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_OneToManyAssoc_h +#define HeterogeneousCore_CUDAUtilities_interface_OneToManyAssoc_h + +#include +#ifndef __CUDA_ARCH__ +#include +#endif // __CUDA_ARCH__ +#include +#include +#include + +#include "CUDACore/AtomicPairCounter.h" +#include "CUDACore/cudaCheck.h" +#include "CUDACore/cuda_assert.h" +#include "CUDACore/cudastdAlgorithm.h" +#include "CUDACore/prefixScan.h" +#include "CUDACore/FlexiStorage.h" + +namespace cms { + namespace cuda { + + template + struct OneToManyAssocView { + using Counter = typename Assoc::Counter; + using index_type = typename Assoc::index_type; + + Assoc *assoc = nullptr; + Counter *offStorage = nullptr; + index_type *contentStorage = nullptr; + int32_t offSize = -1; + int32_t contentSize = -1; + }; + + // this MUST BE DONE in a single block (or in two kernels!) 
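The single-block constraint exists because thread 0 sets the `FlexiStorage` pointers via `initStorage()`, and only the `__syncthreads()` below makes them visible to the rest of the block before `off` is zeroed; a second block would have no such ordering and could zero through a stale pointer. The two-kernel alternative hinted at in the comment would use the kernel boundary as a grid-wide barrier instead; a sketch under that assumption, not part of this patch:

  template <typename Assoc>
  __global__ void initAssoc(cms::cuda::OneToManyAssocView<Assoc> view) {
    view.assoc->psws = 0;
    view.assoc->initStorage(view);  // one thread is enough
  }

  template <typename Assoc>
  __global__ void zeroAssoc(cms::cuda::OneToManyAssocView<Assoc> view) {
    auto h = view.assoc;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < h->totOnes(); i += gridDim.x * blockDim.x)
      h->off[i] = 0;  // safe: initAssoc completed grid-wide before this launch
  }

  // initAssoc<<<1, 1, 0, stream>>>(view);
  // zeroAssoc<<<nblocks, nthreads, 0, stream>>>(view);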
+ template + __global__ void zeroAndInit(OneToManyAssocView view) { + auto h = view.assoc; + assert(1 == gridDim.x); + assert(0 == blockIdx.x); + + int first = threadIdx.x; + + if (0 == first) { + h->psws = 0; + h->initStorage(view); + } + __syncthreads(); + for (int i = first, nt = h->totOnes(); i < nt; i += blockDim.x) { + h->off[i] = 0; + } + } + + template + inline __attribute__((always_inline)) void launchZero(Assoc *h, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + typename Assoc::View view = {h, nullptr, nullptr, -1, -1}; + launchZero(view, stream); + } + template + inline __attribute__((always_inline)) void launchZero(OneToManyAssocView view, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + + if constexpr (Assoc::ctCapacity() < 0) { + assert(view.contentStorage); + assert(view.contentSize > 0); + } + if constexpr (Assoc::ctNOnes() < 0) { + assert(view.offStorage); + assert(view.offSize > 0); + } +#ifdef __CUDACC__ + auto nthreads = 1024; + auto nblocks = 1; // MUST BE ONE as memory is initialize in thread 0 (alternative is two kernels); + zeroAndInit<<>>(view); + cudaCheck(cudaGetLastError()); +#else + auto h = view.assoc; + assert(h); + h->initStorage(view); + h->zero(); + h->psws = 0; +#endif + } + + template + inline __attribute__((always_inline)) void launchFinalize(Assoc *h, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + typename Assoc::View view = {h, nullptr, nullptr, -1, -1}; + launchFinalize(view, stream); + } + + template + inline __attribute__((always_inline)) void launchFinalize(OneToManyAssocView view, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + auto h = view.assoc; + assert(h); +#ifdef __CUDACC__ + using Counter = typename Assoc::Counter; + Counter *poff = (Counter *)((char *)(h) + offsetof(Assoc, off)); + auto nOnes = Assoc::ctNOnes(); + if constexpr (Assoc::ctNOnes() < 0) { + assert(view.offStorage); + assert(view.offSize > 0); + nOnes = view.offSize; + poff = view.offStorage; + } + assert(nOnes > 0); + int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Assoc, psws)); + auto nthreads = 1024; + auto nblocks = (nOnes + nthreads - 1) / nthreads; + multiBlockPrefixScan<<>>(poff, poff, nOnes, ppsws); + cudaCheck(cudaGetLastError()); +#else + h->finalize(); +#endif + } + + template + __global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) { + assoc->bulkFinalizeFill(*apc); + } + + template + class OneToManyAssoc { + public: + using View = OneToManyAssocView>; + using Counter = uint32_t; + + using CountersOnly = OneToManyAssoc; + + using index_type = I; + + static constexpr uint32_t ilog2(uint32_t v) { + constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; + constexpr uint32_t s[] = {1, 2, 4, 8, 16}; + + uint32_t r = 0; // result of log2(v) will go here + for (auto i = 4; i >= 0; i--) + if (v & b[i]) { + v >>= s[i]; + r |= s[i]; + } + return r; + } + + static constexpr int32_t ctNOnes() { return ONES; } + constexpr auto totOnes() const { return off.capacity(); } + constexpr auto nOnes() const { return totOnes() - 1; } + static constexpr int32_t ctCapacity() { return SIZE; } + constexpr auto capacity() const { return content.capacity(); } + + __host__ __device__ void initStorage(View view) { + assert(view.assoc == this); + if constexpr (ctCapacity() < 0) { + assert(view.contentStorage); + assert(view.contentSize > 0); + content.init(view.contentStorage, view.contentSize); + } + if constexpr (ctNOnes() < 
0) { + assert(view.offStorage); + assert(view.offSize > 0); + off.init(view.offStorage, view.offSize); + } + } + + __host__ __device__ void zero() { + for (int32_t i = 0; i < totOnes(); ++i) { + off[i] = 0; + } + } + + __host__ __device__ __forceinline__ void add(CountersOnly const &co) { + for (int32_t i = 0; i < totOnes(); ++i) { +#ifdef __CUDA_ARCH__ + atomicAdd(off.data() + i, co.off[i]); +#else + auto &a = (std::atomic &)(off[i]); + a += co.off[i]; +#endif + } + } + + static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) { +#ifdef __CUDA_ARCH__ + return atomicAdd(&x, 1); +#else + auto &a = (std::atomic &)(x); + return a++; +#endif + } + + static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) { +#ifdef __CUDA_ARCH__ + return atomicSub(&x, 1); +#else + auto &a = (std::atomic &)(x); + return a--; +#endif + } + + __host__ __device__ __forceinline__ void count(int32_t b) { + assert(b < nOnes()); + atomicIncrement(off[b]); + } + + __host__ __device__ __forceinline__ void fill(int32_t b, index_type j) { + assert(b < nOnes()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + content[w - 1] = j; + } + + __host__ __device__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) { + auto c = apc.add(n); + if (int(c.m) >= nOnes()) + return -int32_t(c.m); + off[c.m] = c.n; + for (uint32_t j = 0; j < n; ++j) + content[c.n + j] = v[j]; + return c.m; + } + + __host__ __device__ __forceinline__ void bulkFinalize(AtomicPairCounter const &apc) { + off[apc.get().m] = apc.get().n; + } + + __host__ __device__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) { + int m = apc.get().m; + auto n = apc.get().n; + if (m >= nOnes()) { // overflow! + off[nOnes()] = uint32_t(off[nOnes() - 1]); + return; + } + auto first = m + blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < totOnes(); i += gridDim.x * blockDim.x) { + off[i] = n; + } + } + + __host__ __device__ __forceinline__ void finalize(Counter *ws = nullptr) { + assert(off[totOnes() - 1] == 0); + blockPrefixScan(off.data(), totOnes(), ws); + assert(off[totOnes() - 1] == off[totOnes() - 2]); + } + + constexpr auto size() const { return uint32_t(off[totOnes() - 1]); } + constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } + + constexpr index_type const *begin() const { return content.data(); } + constexpr index_type const *end() const { return begin() + size(); } + + constexpr index_type const *begin(uint32_t b) const { return content.data() + off[b]; } + constexpr index_type const *end(uint32_t b) const { return content.data() + off[b + 1]; } + + FlexiStorage off; + int32_t psws; // prefix-scan working space + FlexiStorage content; + }; + + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h diff --git a/src/cudadev/CUDACore/SimpleVector.h b/src/cudadev/CUDACore/SimpleVector.h index f21f51cf8..b9a4b2629 100644 --- a/src/cudadev/CUDACore/SimpleVector.h +++ b/src/cudadev/CUDACore/SimpleVector.h @@ -35,7 +35,7 @@ namespace cms { } template - constexpr int emplace_back_unsafe(Ts &&... args) { + constexpr int emplace_back_unsafe(Ts &&...args) { auto previousSize = m_size; m_size++; if (previousSize < m_capacity) { @@ -69,7 +69,7 @@ namespace cms { } template - __device__ int emplace_back(Ts &&... 
args) { + __device__ int emplace_back(Ts &&...args) { auto previousSize = atomicAdd(&m_size, 1); if (previousSize < m_capacity) { (new (&m_data[previousSize]) T(std::forward(args)...)); diff --git a/src/cudadev/CUDACore/VecArray.h b/src/cudadev/CUDACore/VecArray.h index 595238ecd..b43a8ae16 100644 --- a/src/cudadev/CUDACore/VecArray.h +++ b/src/cudadev/CUDACore/VecArray.h @@ -29,7 +29,7 @@ namespace cms { } template - constexpr int emplace_back_unsafe(Ts &&... args) { + constexpr int emplace_back_unsafe(Ts &&...args) { auto previousSize = m_size; m_size++; if (previousSize < maxSize) { @@ -61,7 +61,7 @@ namespace cms { } template - __device__ int emplace_back(Ts &&... args) { + __device__ int emplace_back(Ts &&...args) { auto previousSize = atomicAdd(&m_size, 1); if (previousSize < maxSize) { (new (&m_data[previousSize]) T(std::forward(args)...)); diff --git a/src/cudadev/CUDACore/copyAsync.h b/src/cudadev/CUDACore/copyAsync.h index 47e55c74a..1288373b9 100644 --- a/src/cudadev/CUDACore/copyAsync.h +++ b/src/cudadev/CUDACore/copyAsync.h @@ -1,12 +1,15 @@ -#ifndef HeterogeneousCore_CUDAUtilities_copyAsync_h -#define HeterogeneousCore_CUDAUtilities_copyAsync_h +#ifndef HeterogeneousCore_CUDAUtilities_interface_copyAsync_h +#define HeterogeneousCore_CUDAUtilities_interface_copyAsync_h +#include +#include + +#include "CUDACore/HostAllocator.h" #include "CUDACore/cudaCheck.h" #include "CUDACore/device_unique_ptr.h" #include "CUDACore/host_noncached_unique_ptr.h" #include "CUDACore/host_unique_ptr.h" - -#include +#include "Framework/propagate_const_array.h" namespace cms { namespace cuda { @@ -63,7 +66,24 @@ namespace cms { cudaStream_t stream) { cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyDeviceToHost, stream)); } + + // copy from a host vector using pinned memory + template + inline void copyAsync(cms::cuda::device::unique_ptr& dst, + const std::vector>& src, + cudaStream_t stream) { + cudaCheck(cudaMemcpyAsync(dst.get(), src.data(), src.size() * sizeof(T), cudaMemcpyHostToDevice, stream)); + } + + // special case used to transfer conditions data + template + inline void copyAsync(edm::propagate_const_array>& dst, + const std::vector>& src, + cudaStream_t stream) { + cudaCheck(cudaMemcpyAsync( + get_underlying(dst).get(), src.data(), src.size() * sizeof(T), cudaMemcpyHostToDevice, stream)); + } } // namespace cuda } // namespace cms -#endif +#endif // HeterogeneousCore_CUDAUtilities_interface_copyAsync_h diff --git a/src/cudadev/CUDACore/cudaCompat.cc b/src/cudadev/CUDACore/cudaCompat.cc deleted file mode 100644 index e6bb8069d..000000000 --- a/src/cudadev/CUDACore/cudaCompat.cc +++ /dev/null @@ -1,17 +0,0 @@ -#include "CUDACore/cudaCompat.h" - -namespace cms { - namespace cudacompat { - thread_local dim3 blockIdx; - thread_local dim3 gridDim; - } // namespace cudacompat -} // namespace cms - -namespace { - struct InitGrid { - InitGrid() { cms::cudacompat::resetGrid(); } - }; - - const InitGrid initGrid; - -} // namespace diff --git a/src/cudadev/CUDACore/cudaCompat.h b/src/cudadev/CUDACore/cudaCompat.h index f9b4b2f8a..8bd51d3fa 100644 --- a/src/cudadev/CUDACore/cudaCompat.h +++ b/src/cudadev/CUDACore/cudaCompat.h @@ -11,21 +11,26 @@ #include #include +// include the CUDA runtime header to define some of the attributes, types and sybols also on the CPU #include +// make sure function are inlined to avoid multiple definition +#undef __global__ +#define __global__ inline __attribute__((always_inline)) + +#undef __forceinline__ +#define __forceinline__ inline 
__attribute__((always_inline)) + namespace cms { namespace cudacompat { -#ifndef __CUDA_RUNTIME_H__ - struct dim3 { - uint32_t x, y, z; - }; -#endif + // run serially with a single thread + // 1-dimensional block const dim3 threadIdx = {0, 0, 0}; const dim3 blockDim = {1, 1, 1}; - - extern thread_local dim3 blockIdx; - extern thread_local dim3 gridDim; + // 1-dimensional grid + const dim3 blockIdx = {0, 0, 0}; + const dim3 gridDim = {1, 1, 1}; template T1 atomicCAS(T1* address, T1 compare, T2 val) { @@ -78,35 +83,12 @@ namespace cms { return *x; } - inline void resetGrid() { - blockIdx = {0, 0, 0}; - gridDim = {1, 1, 1}; - } - } // namespace cudacompat } // namespace cms -// some not needed as done by cuda runtime... -#ifndef __CUDA_RUNTIME_H__ -#define __host__ -#define __device__ -#define __global__ -#define __shared__ -#define __forceinline__ -#endif - -// make sure function are inlined to avoid multiple definition -#ifndef __CUDA_ARCH__ -#undef __global__ -#define __global__ inline __attribute__((always_inline)) -#undef __forceinline__ -#define __forceinline__ inline __attribute__((always_inline)) -#endif - -#ifndef __CUDA_ARCH__ +// make the cudacompat implementation available in the global namespace using namespace cms::cudacompat; -#endif -#endif +#endif // __CUDACC__ #endif // HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h diff --git a/src/cudadev/CUDACore/cuda_cxx17.h b/src/cudadev/CUDACore/cuda_cxx17.h deleted file mode 100644 index 89f131edd..000000000 --- a/src/cudadev/CUDACore/cuda_cxx17.h +++ /dev/null @@ -1,63 +0,0 @@ -#ifndef HeterogeneousCore_CUDAUtilities_cuda_cxx17_h -#define HeterogeneousCore_CUDAUtilities_cuda_cxx17_h - -#include - -// CUDA does not support C++17 yet, so we define here some of the missing library functions -#if __cplusplus <= 201402L - -namespace std { - - // from https://en.cppreference.com/w/cpp/iterator/size - template - constexpr auto size(const C& c) -> decltype(c.size()) { - return c.size(); - } - - template - constexpr std::size_t size(const T (&array)[N]) noexcept { - return N; - } - - // from https://en.cppreference.com/w/cpp/iterator/empty - template - constexpr auto empty(const C& c) -> decltype(c.empty()) { - return c.empty(); - } - - template - constexpr bool empty(const T (&array)[N]) noexcept { - return false; - } - - template - constexpr bool empty(std::initializer_list il) noexcept { - return il.size() == 0; - } - - // from https://en.cppreference.com/w/cpp/iterator/data - template - constexpr auto data(C& c) -> decltype(c.data()) { - return c.data(); - } - - template - constexpr auto data(const C& c) -> decltype(c.data()) { - return c.data(); - } - - template - constexpr T* data(T (&array)[N]) noexcept { - return array; - } - - template - constexpr const E* data(std::initializer_list il) noexcept { - return il.begin(); - } - -} // namespace std - -#endif - -#endif // HeterogeneousCore_CUDAUtilities_cuda_cxx17_h diff --git a/src/cudadev/CUDACore/prefixScan.h b/src/cudadev/CUDACore/prefixScan.h index 5624af03f..bdfc591e5 100644 --- a/src/cudadev/CUDACore/prefixScan.h +++ b/src/cudadev/CUDACore/prefixScan.h @@ -5,6 +5,7 @@ #include "CUDACore/cudaCompat.h" #include "CUDACore/cuda_assert.h" +#include "Framework/CMSUnrollLoop.h" #ifdef __CUDA_ARCH__ @@ -13,7 +14,7 @@ __device__ void __forceinline__ warpPrefixScan(T const* __restrict__ ci, T* __re // ci and co may be the same auto x = ci[i]; auto laneId = threadIdx.x & 0x1f; -#pragma unroll + CMS_UNROLL_LOOP for (int offset = 1; offset < 32; offset <<= 1) { auto y = 
__shfl_up_sync(mask, x, offset); if (laneId >= offset) @@ -26,7 +27,7 @@ template __device__ void __forceinline__ warpPrefixScan(T* c, uint32_t i, uint32_t mask) { auto x = c[i]; auto laneId = threadIdx.x & 0x1f; -#pragma unroll + CMS_UNROLL_LOOP for (int offset = 1; offset < 32; offset <<= 1) { auto y = __shfl_up_sync(mask, x, offset); if (laneId >= offset) diff --git a/src/cudadev/CUDACore/radixSort.h b/src/cudadev/CUDACore/radixSort.h index ff1da2d46..769eab81a 100644 --- a/src/cudadev/CUDACore/radixSort.h +++ b/src/cudadev/CUDACore/radixSort.h @@ -7,6 +7,7 @@ #include #include "CUDACore/cuda_assert.h" +#include "Framework/CMSUnrollLoop.h" template __device__ inline void dummyReorder(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {} @@ -124,7 +125,7 @@ __device__ __forceinline__ void radixSortImpl( if (threadIdx.x < sb) { auto x = c[threadIdx.x]; auto laneId = threadIdx.x & 0x1f; -#pragma unroll + CMS_UNROLL_LOOP for (int offset = 1; offset < 32; offset <<= 1) { auto y = __shfl_up_sync(0xffffffff, x, offset); if (laneId >= offset) diff --git a/src/cudadev/CUDADataFormats/PixelTrackHeterogeneous.h b/src/cudadev/CUDADataFormats/PixelTrackHeterogeneous.h index 579c67092..14d7c6e04 100644 --- a/src/cudadev/CUDADataFormats/PixelTrackHeterogeneous.h +++ b/src/cudadev/CUDADataFormats/PixelTrackHeterogeneous.h @@ -1,74 +1,9 @@ -#ifndef CUDADataFormatsTrackTrackHeterogeneous_H -#define CUDADataFormatsTrackTrackHeterogeneous_H - -#include "CUDADataFormats/TrajectoryStateSoA.h" -#include "CUDACore/HistoContainer.h" +#ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h +#define CUDADataFormats_Track_PixelTrackHeterogeneous_h #include "CUDADataFormats/HeterogeneousSoA.h" - -namespace trackQuality { - enum Quality : uint8_t { bad = 0, dup, loose, strict, tight, highPurity }; -} - -template -class TrackSoAT { -public: - static constexpr int32_t stride() { return S; } - - using Quality = trackQuality::Quality; - using hindex_type = uint16_t; - using HitContainer = cms::cuda::OneToManyAssoc; - - // Always check quality is at least loose! - // CUDA does not support enums in __lgc ... - eigenSoA::ScalarSoA m_quality; - constexpr Quality quality(int32_t i) const { return (Quality)(m_quality(i)); } - constexpr Quality &quality(int32_t i) { return (Quality &)(m_quality(i)); } - constexpr Quality const *qualityData() const { return (Quality const *)(m_quality.data()); } - constexpr Quality *qualityData() { return (Quality *)(m_quality.data()); } - - // this is chi2/ndof as not necessarely all hits are used in the fit - eigenSoA::ScalarSoA chi2; - - constexpr int nHits(int i) const { return detIndices.size(i); } - - // State at the Beam spot - // phi,tip,1/pt,cotan(theta),zip - TrajectoryStateSoA stateAtBS; - eigenSoA::ScalarSoA eta; - eigenSoA::ScalarSoA pt; - constexpr float charge(int32_t i) const { return std::copysign(1.f, stateAtBS.state(i)(2)); } - constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); } - constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); } - constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); } - - // state at the detector of the outermost hit - // representation to be decided... 
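Returning to the `prefixScan.h` hunk above: the loop now decorated with `CMS_UNROLL_LOOP` is a 32-lane Kogge-Stone inclusive scan built on `__shfl_up_sync`. A plain host reference of the same recurrence, for illustration only:

  #include <cstdint>

  // after the round with shift `offset`, x[i] holds the sum of the
  // 2 * offset inputs ending at position i (clipped at the start)
  void warpPrefixScanRef(uint32_t x[32]) {
    for (int offset = 1; offset < 32; offset <<= 1)
      for (int lane = 31; lane >= offset; --lane)
        x[lane] += x[lane - offset];  // descending order reads last round's values
  }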
- // not yet filled on GPU - // TrajectoryStateSoA stateAtOuterDet; - - HitContainer hitIndices; - HitContainer detIndices; - - // total number of tracks (including those not fitted) - uint32_t m_nTracks; -}; - -namespace pixelTrack { - -#ifdef GPU_SMALL_EVENTS - constexpr uint32_t maxNumber() { return 2 * 1024; } -#else - constexpr uint32_t maxNumber() { return 32 * 1024; } -#endif - - using TrackSoA = TrackSoAT; - using TrajectoryState = TrajectoryStateSoA; - using HitContainer = TrackSoA::HitContainer; - using Quality = trackQuality::Quality; - -} // namespace pixelTrack +#include "CUDADataFormats/TrackSoAHeterogeneousT.h" using PixelTrackHeterogeneous = HeterogeneousSoA; -#endif // CUDADataFormatsTrackTrackSoA_H +#endif // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h diff --git a/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.cc b/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.cc index fd46a81bf..56c7ec47c 100644 --- a/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.cc +++ b/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.cc @@ -1,15 +1,14 @@ -#include "CUDADataFormats/SiPixelClustersCUDA.h" - +#include "CUDACore/copyAsync.h" #include "CUDACore/device_unique_ptr.h" #include "CUDACore/host_unique_ptr.h" -#include "CUDACore/copyAsync.h" - -SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxClusters, cudaStream_t stream) { - moduleStart_d = cms::cuda::make_device_unique(maxClusters + 1, stream); - clusInModule_d = cms::cuda::make_device_unique(maxClusters, stream); - moduleId_d = cms::cuda::make_device_unique(maxClusters, stream); - clusModuleStart_d = cms::cuda::make_device_unique(maxClusters + 1, stream); +#include "CUDADataFormats/SiPixelClustersCUDA.h" +SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream) + : moduleStart_d(cms::cuda::make_device_unique(maxModules + 1, stream)), + clusInModule_d(cms::cuda::make_device_unique(maxModules, stream)), + moduleId_d(cms::cuda::make_device_unique(maxModules, stream)), + clusModuleStart_d(cms::cuda::make_device_unique(maxModules + 1, stream)) +{ auto view = cms::cuda::make_host_unique(stream); view->moduleStart_ = moduleStart_d.get(); view->clusInModule_ = clusInModule_d.get(); diff --git a/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.h b/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.h index e41b8ea5c..e93b742cf 100644 --- a/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.h +++ b/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.h @@ -10,7 +10,7 @@ class SiPixelClustersCUDA { public: SiPixelClustersCUDA() = default; - explicit SiPixelClustersCUDA(size_t maxClusters, cudaStream_t stream); + explicit SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream); ~SiPixelClustersCUDA() = default; SiPixelClustersCUDA(const SiPixelClustersCUDA &) = delete; @@ -32,23 +32,13 @@ class SiPixelClustersCUDA { uint32_t const *moduleId() const { return moduleId_d.get(); } uint32_t const *clusModuleStart() const { return clusModuleStart_d.get(); } - uint32_t const *c_moduleStart() const { return moduleStart_d.get(); } - uint32_t const *c_clusInModule() const { return clusInModule_d.get(); } - uint32_t const *c_moduleId() const { return moduleId_d.get(); } - uint32_t const *c_clusModuleStart() const { return clusModuleStart_d.get(); } - class DeviceConstView { public: - // DeviceConstView() = default; - __device__ __forceinline__ uint32_t moduleStart(int i) const { return __ldg(moduleStart_ + i); } __device__ __forceinline__ uint32_t clusInModule(int i) const { return __ldg(clusInModule_ + i); } __device__ 
__forceinline__ uint32_t moduleId(int i) const { return __ldg(moduleId_ + i); } __device__ __forceinline__ uint32_t clusModuleStart(int i) const { return __ldg(clusModuleStart_ + i); } - friend SiPixelClustersCUDA; - - // private: uint32_t const *moduleStart_; uint32_t const *clusInModule_; uint32_t const *moduleId_; @@ -67,7 +57,7 @@ class SiPixelClustersCUDA { cms::cuda::device::unique_ptr view_d; // "me" pointer - uint32_t nClusters_h; + uint32_t nClusters_h = 0; }; -#endif +#endif // CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h diff --git a/src/cudadev/CUDADataFormats/SiPixelDigiErrorsCUDA.cc b/src/cudadev/CUDADataFormats/SiPixelDigiErrorsCUDA.cc index b19664874..d79942608 100644 --- a/src/cudadev/CUDADataFormats/SiPixelDigiErrorsCUDA.cc +++ b/src/cudadev/CUDADataFormats/SiPixelDigiErrorsCUDA.cc @@ -1,20 +1,18 @@ -#include "CUDADataFormats/SiPixelDigiErrorsCUDA.h" +#include +#include "CUDACore/copyAsync.h" #include "CUDACore/device_unique_ptr.h" #include "CUDACore/host_unique_ptr.h" -#include "CUDACore/copyAsync.h" #include "CUDACore/memsetAsync.h" +#include "CUDADataFormats/SiPixelDigiErrorsCUDA.h" -#include - -SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cudaStream_t stream) - : formatterErrors_h(std::move(errors)) { - error_d = cms::cuda::make_device_unique>(stream); - data_d = cms::cuda::make_device_unique(maxFedWords, stream); - +SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, SiPixelFormatterErrors errors, cudaStream_t stream) + : data_d(cms::cuda::make_device_unique(maxFedWords, stream)), + error_d(cms::cuda::make_device_unique(stream)), + error_h(cms::cuda::make_host_unique(stream)), + formatterErrors_h(std::move(errors)) { cms::cuda::memsetAsync(data_d, 0x00, maxFedWords, stream); - error_h = cms::cuda::make_host_unique>(stream); cms::cuda::make_SimpleVector(error_h.get(), maxFedWords, data_d.get()); assert(error_h->empty()); assert(error_h->capacity() == static_cast(maxFedWords)); @@ -30,7 +28,7 @@ SiPixelDigiErrorsCUDA::HostDataError SiPixelDigiErrorsCUDA::dataErrorToHostAsync // On one hand size() could be sufficient. On the other hand, if // someone copies the SimpleVector<>, (s)he might expect the data // buffer to actually have space for capacity() elements. 
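The construction above is the `SimpleVector` idiom: the vector object itself lives in pinned host memory (`error_h`) while its data pointer targets the device buffer (`data_d`), so the object can be copied to the device verbatim and appended to from kernels. A hypothetical device-side sketch, assuming `push_back()` mirrors the `emplace_back()` shown earlier (atomic size bump, rolled back on overflow):

  __global__ void recordError(cms::cuda::SimpleVector<SiPixelErrorCompact>* err, SiPixelErrorCompact e) {
    if (err->push_back(e) < 0) {
      // capacity exceeded: the size increment was undone and the element dropped
    }
  }

The host side then sizes its buffer to `capacity()` but bounds the actual transfer by `size()`, exactly as the comment above explains.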
- auto data = cms::cuda::make_host_unique(error_h->capacity(), stream); + auto data = cms::cuda::make_host_unique(error_h->capacity(), stream); // but transfer only the required amount if (not error_h->empty()) { diff --git a/src/cudadev/CUDADataFormats/SiPixelDigiErrorsCUDA.h b/src/cudadev/CUDADataFormats/SiPixelDigiErrorsCUDA.h index 9c7c874ee..442a66f92 100644 --- a/src/cudadev/CUDADataFormats/SiPixelDigiErrorsCUDA.h +++ b/src/cudadev/CUDADataFormats/SiPixelDigiErrorsCUDA.h @@ -6,12 +6,15 @@ #include "CUDACore/SimpleVector.h" #include "CUDACore/device_unique_ptr.h" #include "CUDACore/host_unique_ptr.h" -#include "DataFormats/PixelErrors.h" +#include "DataFormats/SiPixelErrorCompact.h" +#include "DataFormats/SiPixelFormatterErrors.h" class SiPixelDigiErrorsCUDA { public: + using SiPixelErrorCompactVector = cms::cuda::SimpleVector; + SiPixelDigiErrorsCUDA() = default; - explicit SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cudaStream_t stream); + explicit SiPixelDigiErrorsCUDA(size_t maxFedWords, SiPixelFormatterErrors errors, cudaStream_t stream); ~SiPixelDigiErrorsCUDA() = default; SiPixelDigiErrorsCUDA(const SiPixelDigiErrorsCUDA&) = delete; @@ -19,23 +22,21 @@ class SiPixelDigiErrorsCUDA { SiPixelDigiErrorsCUDA(SiPixelDigiErrorsCUDA&&) = default; SiPixelDigiErrorsCUDA& operator=(SiPixelDigiErrorsCUDA&&) = default; - const PixelFormatterErrors& formatterErrors() const { return formatterErrors_h; } + const SiPixelFormatterErrors& formatterErrors() const { return formatterErrors_h; } - cms::cuda::SimpleVector* error() { return error_d.get(); } - cms::cuda::SimpleVector const* error() const { return error_d.get(); } - cms::cuda::SimpleVector const* c_error() const { return error_d.get(); } + SiPixelErrorCompactVector* error() { return error_d.get(); } + SiPixelErrorCompactVector const* error() const { return error_d.get(); } - using HostDataError = - std::pair, cms::cuda::host::unique_ptr>; + using HostDataError = std::pair>; HostDataError dataErrorToHostAsync(cudaStream_t stream) const; void copyErrorToHostAsync(cudaStream_t stream); private: - cms::cuda::device::unique_ptr data_d; - cms::cuda::device::unique_ptr> error_d; - cms::cuda::host::unique_ptr> error_h; - PixelFormatterErrors formatterErrors_h; + cms::cuda::device::unique_ptr data_d; + cms::cuda::device::unique_ptr error_d; + cms::cuda::host::unique_ptr error_h; + SiPixelFormatterErrors formatterErrors_h; }; -#endif +#endif // CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsCUDA_h diff --git a/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc b/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc index 5f096ab18..453d22531 100644 --- a/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc +++ b/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc @@ -1,19 +1,19 @@ #include "CUDADataFormats/SiPixelDigisCUDA.h" +#include "CUDACore/copyAsync.h" #include "CUDACore/device_unique_ptr.h" #include "CUDACore/host_unique_ptr.h" -#include "CUDACore/copyAsync.h" - -SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream) { - xx_d = cms::cuda::make_device_unique(maxFedWords, stream); - yy_d = cms::cuda::make_device_unique(maxFedWords, stream); - adc_d = cms::cuda::make_device_unique(maxFedWords, stream); - moduleInd_d = cms::cuda::make_device_unique(maxFedWords, stream); - clus_d = cms::cuda::make_device_unique(maxFedWords, stream); - - pdigi_d = cms::cuda::make_device_unique(maxFedWords, stream); - rawIdArr_d = cms::cuda::make_device_unique(maxFedWords, stream); +SiPixelDigisCUDA::SiPixelDigisCUDA(size_t 
maxFedWords, cudaStream_t stream) + : xx_d(cms::cuda::make_device_unique(maxFedWords, stream)), + yy_d(cms::cuda::make_device_unique(maxFedWords, stream)), + adc_d(cms::cuda::make_device_unique(maxFedWords, stream)), + moduleInd_d(cms::cuda::make_device_unique(maxFedWords, stream)), + clus_d(cms::cuda::make_device_unique(maxFedWords, stream)), + view_d(cms::cuda::make_device_unique(stream)), + pdigi_d(cms::cuda::make_device_unique(maxFedWords, stream)), + rawIdArr_d(cms::cuda::make_device_unique(maxFedWords, stream)) +{ auto view = cms::cuda::make_host_unique(stream); view->xx_ = xx_d.get(); view->yy_ = yy_d.get(); @@ -21,7 +21,6 @@ SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream) { view->moduleInd_ = moduleInd_d.get(); view->clus_ = clus_d.get(); - view_d = cms::cuda::make_device_unique(stream); cms::cuda::copyAsync(view_d, view, stream); } diff --git a/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.h b/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.h index 647f5b42e..03ae6639a 100644 --- a/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.h +++ b/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.h @@ -1,11 +1,11 @@ #ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h #define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h +#include + +#include "CUDACore/cudaCompat.h" #include "CUDACore/device_unique_ptr.h" #include "CUDACore/host_unique_ptr.h" -#include "CUDACore/cudaCompat.h" - -#include class SiPixelDigisCUDA { public: @@ -42,14 +42,6 @@ class SiPixelDigisCUDA { uint32_t const *pdigi() const { return pdigi_d.get(); } uint32_t const *rawIdArr() const { return rawIdArr_d.get(); } - uint16_t const *c_xx() const { return xx_d.get(); } - uint16_t const *c_yy() const { return yy_d.get(); } - uint16_t const *c_adc() const { return adc_d.get(); } - uint16_t const *c_moduleInd() const { return moduleInd_d.get(); } - int32_t const *c_clus() const { return clus_d.get(); } - uint32_t const *c_pdigi() const { return pdigi_d.get(); } - uint32_t const *c_rawIdArr() const { return rawIdArr_d.get(); } - cms::cuda::host::unique_ptr adcToHostAsync(cudaStream_t stream) const; cms::cuda::host::unique_ptr clusToHostAsync(cudaStream_t stream) const; cms::cuda::host::unique_ptr pdigiToHostAsync(cudaStream_t stream) const; @@ -57,17 +49,12 @@ class SiPixelDigisCUDA { class DeviceConstView { public: - // DeviceConstView() = default; - __device__ __forceinline__ uint16_t xx(int i) const { return __ldg(xx_ + i); } __device__ __forceinline__ uint16_t yy(int i) const { return __ldg(yy_ + i); } __device__ __forceinline__ uint16_t adc(int i) const { return __ldg(adc_ + i); } __device__ __forceinline__ uint16_t moduleInd(int i) const { return __ldg(moduleInd_ + i); } __device__ __forceinline__ int32_t clus(int i) const { return __ldg(clus_ + i); } - friend class SiPixelDigisCUDA; - - // private: uint16_t const *xx_; uint16_t const *yy_; uint16_t const *adc_; @@ -88,11 +75,11 @@ class SiPixelDigisCUDA { // These are for CPU output; should we (eventually) place them to a // separate product? 
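With the `c_*` duplicates and `friend` declarations gone, `DeviceConstView` is a plain aggregate whose accessors read through `__ldg()`, i.e. via the GPU's read-only data cache, which is the point of funneling all device-side reads through the view. A minimal consumer, with a hypothetical kernel name:

  __global__ void sumAdc(SiPixelDigisCUDA::DeviceConstView const* view, int nDigis, uint32_t* sum) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < nDigis)
      atomicAdd(sum, uint32_t(view->adc(i)));
  }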
- cms::cuda::device::unique_ptr pdigi_d; - cms::cuda::device::unique_ptr rawIdArr_d; + cms::cuda::device::unique_ptr pdigi_d; // packed digi (row, col, adc) of each pixel + cms::cuda::device::unique_ptr rawIdArr_d; // DetId of each pixel uint32_t nModules_h = 0; uint32_t nDigis_h = 0; }; -#endif +#endif // CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h diff --git a/src/cudadev/CUDADataFormats/TrackSoAHeterogeneousT.h b/src/cudadev/CUDADataFormats/TrackSoAHeterogeneousT.h new file mode 100644 index 000000000..4ec805701 --- /dev/null +++ b/src/cudadev/CUDADataFormats/TrackSoAHeterogeneousT.h @@ -0,0 +1,72 @@ +#ifndef CUDADataFormats_Track_TrackHeterogeneousT_H +#define CUDADataFormats_Track_TrackHeterogeneousT_H + +#include "CUDACore/HistoContainer.h" +#include "CUDADataFormats/HeterogeneousSoA.h" +#include "CUDADataFormats/TrajectoryStateSoAT.h" + +namespace pixelTrack { + enum class Quality : uint8_t { bad = 0, dup, loose, strict, tight, highPurity }; +} + +template +class TrackSoAHeterogeneousT { +public: + static constexpr int32_t stride() { return S; } + + using Quality = pixelTrack::Quality; + using hindex_type = uint32_t; + using HitContainer = cms::cuda::OneToManyAssoc; + + // Always check quality is at least loose! + // CUDA does not support enums in __lgc ... +private: + eigenSoA::ScalarSoA quality_; + +public: + constexpr Quality quality(int32_t i) const { return (Quality)(quality_(i)); } + constexpr Quality &quality(int32_t i) { return (Quality &)(quality_(i)); } + constexpr Quality const *qualityData() const { return (Quality const *)(quality_.data()); } + constexpr Quality *qualityData() { return (Quality *)(quality_.data()); } + + // this is chi2/ndof as not necessarely all hits are used in the fit + eigenSoA::ScalarSoA chi2; + + constexpr int nHits(int i) const { return detIndices.size(i); } + + // State at the Beam spot + // phi,tip,1/pt,cotan(theta),zip + TrajectoryStateSoAT stateAtBS; + eigenSoA::ScalarSoA eta; + eigenSoA::ScalarSoA pt; + constexpr float charge(int32_t i) const { return std::copysign(1.f, stateAtBS.state(i)(2)); } + constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); } + constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); } + constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); } + + // state at the detector of the outermost hit + // representation to be decided... 
+ // not yet filled on GPU + // TrajectoryStateSoA stateAtOuterDet; + + HitContainer hitIndices; + HitContainer detIndices; +}; + +namespace pixelTrack { + +#ifdef GPU_SMALL_EVENTS + // kept for testing and debugging + constexpr uint32_t maxNumber() { return 2 * 1024; } +#else + // tested on MC events with 55-75 pileup events + constexpr uint32_t maxNumber() { return 32 * 1024; } +#endif + + using TrackSoA = TrackSoAHeterogeneousT; + using TrajectoryState = TrajectoryStateSoAT; + using HitContainer = TrackSoA::HitContainer; + +} // namespace pixelTrack + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/src/cudadev/CUDADataFormats/TrackingRecHit2DCUDA.h b/src/cudadev/CUDADataFormats/TrackingRecHit2DCUDA.h deleted file mode 100644 index 54b74d97b..000000000 --- a/src/cudadev/CUDADataFormats/TrackingRecHit2DCUDA.h +++ /dev/null @@ -1 +0,0 @@ -#include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" diff --git a/src/cudadev/CUDADataFormats/TrackingRecHit2DCUDA.cc b/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.cc similarity index 83% rename from src/cudadev/CUDADataFormats/TrackingRecHit2DCUDA.cc rename to src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.cc index 81b5e5571..5c1aacaf4 100644 --- a/src/cudadev/CUDADataFormats/TrackingRecHit2DCUDA.cc +++ b/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.cc @@ -1,8 +1,8 @@ -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" #include "CUDACore/copyAsync.h" #include "CUDACore/cudaCheck.h" #include "CUDACore/device_unique_ptr.h" #include "CUDACore/host_unique_ptr.h" +#include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" template <> cms::cuda::host::unique_ptr TrackingRecHit2DCUDA::localCoordToHostAsync(cudaStream_t stream) const { @@ -13,8 +13,9 @@ cms::cuda::host::unique_ptr TrackingRecHit2DCUDA::localCoordToHostAsync template <> cms::cuda::host::unique_ptr TrackingRecHit2DCUDA::hitsModuleStartToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(2001, stream); - cudaCheck(cudaMemcpyAsync(ret.get(), m_hitsModuleStart, 4 * 2001, cudaMemcpyDefault, stream)); + auto ret = cms::cuda::make_host_unique(gpuClustering::maxNumModules + 1, stream); + cudaCheck(cudaMemcpyAsync( + ret.get(), m_hitsModuleStart, sizeof(uint32_t) * (gpuClustering::maxNumModules + 1), cudaMemcpyDefault, stream)); return ret; } diff --git a/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.h b/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.h index 2320fa6d6..7a19299a9 100644 --- a/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.h +++ b/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.h @@ -10,7 +10,7 @@ class TrackingRecHit2DHeterogeneous { template using unique_ptr = typename Traits::template unique_ptr; - using Hist = TrackingRecHit2DSOAView::Hist; + using PhiBinner = TrackingRecHit2DSOAView::PhiBinner; TrackingRecHit2DHeterogeneous() = default; @@ -33,12 +33,12 @@ class TrackingRecHit2DHeterogeneous { auto hitsModuleStart() const { return m_hitsModuleStart; } auto hitsLayerStart() { return m_hitsLayerStart; } - auto phiBinner() { return m_hist; } + auto phiBinner() { return m_phiBinner; } + auto phiBinnerStorage() { return m_phiBinnerStorage; } auto iphi() { return m_iphi; } // only the local coord and detector index cms::cuda::host::unique_ptr localCoordToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr detIndexToHostAsync(cudaStream_t stream) const; cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const; // for 
validation @@ -47,14 +47,14 @@ class TrackingRecHit2DHeterogeneous { cms::cuda::host::unique_ptr sizeToHostAsync(cudaStream_t stream) const; private: - static constexpr uint32_t n16 = 4; - static constexpr uint32_t n32 = 9; + static constexpr uint32_t n16 = 4; // number of elements in m_store16 + static constexpr uint32_t n32 = 10; // number of elements in m_store32 static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious unique_ptr m_store16; //! unique_ptr m_store32; //! - unique_ptr m_HistStore; //! + unique_ptr m_PhiBinnerStore; //! unique_ptr m_AverageGeometryStore; //! unique_ptr m_view; //! @@ -64,7 +64,8 @@ class TrackingRecHit2DHeterogeneous { uint32_t const* m_hitsModuleStart; // needed for legacy, this is on GPU! // needed as kernel params... - Hist* m_hist; + PhiBinner* m_phiBinner; + PhiBinner::index_type* m_phiBinnerStorage; uint32_t* m_hitsLayerStart; int16_t* m_iphi; }; @@ -89,11 +90,7 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(uint32_t nH // if empy do not bother if (0 == nHits) { - if -#ifndef __CUDACC__ - constexpr -#endif - (std::is_same::value) { + if constexpr (std::is_same::value) { cms::cuda::copyAsync(m_view, view, stream); } else { m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version @@ -107,14 +104,20 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(uint32_t nH // this will break 1to1 correspondence with cluster and module locality // so unless proven VERY inefficient we keep it ordered as generated m_store16 = Traits::template make_device_unique(nHits * n16, stream); - m_store32 = Traits::template make_device_unique(nHits * n32 + 11, stream); - m_HistStore = Traits::template make_device_unique(stream); + m_store32 = + Traits::template make_device_unique(nHits * n32 + phase1PixelTopology::numberOfLayers + 1, stream); + m_PhiBinnerStore = Traits::template make_device_unique(stream); + + static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float)); + static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::PhiBinner::index_type)); auto get16 = [&](int i) { return m_store16.get() + i * nHits; }; auto get32 = [&](int i) { return m_store32.get() + i * nHits; }; // copy all the pointers - m_hist = view->m_hist = m_HistStore.get(); + m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); + m_phiBinnerStorage = view->m_phiBinnerStorage = + reinterpret_cast(get32(9)); view->m_xl = get32(0); view->m_yl = get32(1); @@ -136,11 +139,7 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(uint32_t nH m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast(get32(n32)); // transfer view - if -#ifndef __CUDACC__ - constexpr -#endif - (std::is_same::value) { + if constexpr (std::is_same::value) { cms::cuda::copyAsync(m_view, view, stream); } else { m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version diff --git a/src/cudadev/CUDADataFormats/TrackingRecHit2DSOAView.h b/src/cudadev/CUDADataFormats/TrackingRecHit2DSOAView.h index faaa4378c..5638fcd59 100644 --- a/src/cudadev/CUDADataFormats/TrackingRecHit2DSOAView.h +++ b/src/cudadev/CUDADataFormats/TrackingRecHit2DSOAView.h @@ -14,11 +14,9 @@ namespace pixelCPEforGPU { class TrackingRecHit2DSOAView { public: - static constexpr uint32_t maxHits() { return gpuClustering::MaxNumClusters; } - using hindex_type = uint16_t; // if above is <=2^16 + using hindex_type = uint32_t; // if above is <=2^32 - using Hist = - cms::cuda::HistoContainer; + using PhiBinner = 
cms::cuda::HistoContainer; using AverageGeometry = phase1PixelTopology::AverageGeometry; @@ -65,8 +63,8 @@ class TrackingRecHit2DSOAView { __device__ __forceinline__ uint32_t* hitsLayerStart() { return m_hitsLayerStart; } __device__ __forceinline__ uint32_t const* hitsLayerStart() const { return m_hitsLayerStart; } - __device__ __forceinline__ Hist& phiBinner() { return *m_hist; } - __device__ __forceinline__ Hist const& phiBinner() const { return *m_hist; } + __device__ __forceinline__ PhiBinner& phiBinner() { return *m_phiBinner; } + __device__ __forceinline__ PhiBinner const& phiBinner() const { return *m_phiBinner; } __device__ __forceinline__ AverageGeometry& averageGeometry() { return *m_averageGeometry; } __device__ __forceinline__ AverageGeometry const& averageGeometry() const { return *m_averageGeometry; } @@ -87,15 +85,17 @@ class TrackingRecHit2DSOAView { uint16_t* m_detInd; // supporting objects - AverageGeometry* m_averageGeometry; // owned (corrected for beam spot: not sure where to host it otherwise) + // m_averageGeometry is corrected for beam spot, not sure where to host it otherwise + AverageGeometry* m_averageGeometry; // owned by TrackingRecHit2DHeterogeneous pixelCPEforGPU::ParamsOnGPU const* m_cpeParams; // forwarded from setup, NOT owned uint32_t const* m_hitsModuleStart; // forwarded from clusters uint32_t* m_hitsLayerStart; - Hist* m_hist; + PhiBinner* m_phiBinner; + PhiBinner::index_type* m_phiBinnerStorage; uint32_t m_nHits; }; -#endif +#endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DSOAView_h diff --git a/src/cudadev/CUDADataFormats/TrajectoryStateSoA.h b/src/cudadev/CUDADataFormats/TrajectoryStateSoAT.h similarity index 90% rename from src/cudadev/CUDADataFormats/TrajectoryStateSoA.h rename to src/cudadev/CUDADataFormats/TrajectoryStateSoAT.h index 49ca2b525..91fa245a3 100644 --- a/src/cudadev/CUDADataFormats/TrajectoryStateSoA.h +++ b/src/cudadev/CUDADataFormats/TrajectoryStateSoAT.h @@ -1,11 +1,11 @@ -#ifndef CUDADataFormatsTrackTrajectoryStateSOA_H -#define CUDADataFormatsTrackTrajectoryStateSOA_H +#ifndef CUDADataFormats_Track_TrajectoryStateSOAT_H +#define CUDADataFormats_Track_TrajectoryStateSOAT_H #include #include "CUDACore/eigenSoA.h" template -struct TrajectoryStateSoA { +struct TrajectoryStateSoAT { using Vector5f = Eigen::Matrix; using Vector15f = Eigen::Matrix; @@ -56,4 +56,4 @@ struct TrajectoryStateSoA { } }; -#endif // CUDADataFormatsTrackTrajectoryStateSOA_H +#endif // CUDADataFormats_Track_TrajectoryStateSOAT_H diff --git a/src/cudadev/CUDADataFormats/ZVertexSoA.h b/src/cudadev/CUDADataFormats/ZVertexSoA.h index ecdf76d8e..273d0f054 100644 --- a/src/cudadev/CUDADataFormats/ZVertexSoA.h +++ b/src/cudadev/CUDADataFormats/ZVertexSoA.h @@ -1,5 +1,5 @@ -#ifndef CUDADataFormatsVertexZVertexSoA_H -#define CUDADataFormatsVertexZVertexSoA_H +#ifndef CUDADataFormats_Vertex_ZVertexSoA_h +#define CUDADataFormats_Vertex_ZVertexSoA_h #include #include "CUDACore/cudaCompat.h" @@ -23,4 +23,4 @@ struct ZVertexSoA { __host__ __device__ void init() { nvFinal = 0; } }; -#endif // CUDADataFormatsVertexZVertexSoA.H +#endif // CUDADataFormats_Vertex_ZVertexSoA_h diff --git a/src/cudadev/CUDADataFormats/gpuClusteringConstants.h b/src/cudadev/CUDADataFormats/gpuClusteringConstants.h index 1430606ab..77cf567dc 100644 --- a/src/cudadev/CUDADataFormats/gpuClusteringConstants.h +++ b/src/cudadev/CUDADataFormats/gpuClusteringConstants.h @@ -2,30 +2,23 @@ #define CUDADataFormats_SiPixelCluster_interface_gpuClusteringConstants_h #include - -namespace 
pixelGPUConstants { -#ifdef GPU_SMALL_EVENTS - constexpr uint32_t maxNumberOfHits = 24 * 1024; -#else - constexpr uint32_t maxNumberOfHits = - 48 * 1024; // data at pileup 50 has 18300 +/- 3500 hits; 40000 is around 6 sigma away -#endif -} // namespace pixelGPUConstants +#include namespace gpuClustering { #ifdef GPU_SMALL_EVENTS + // kept for testing and debugging constexpr uint32_t maxHitsInIter() { return 64; } #else // optimized for real data PU 50 + // tested on MC events with 55-75 pileup events constexpr uint32_t maxHitsInIter() { return 160; } #endif constexpr uint32_t maxHitsInModule() { return 1024; } - constexpr uint32_t MaxNumModules = 2000; - constexpr int32_t MaxNumClustersPerModules = maxHitsInModule(); - constexpr uint32_t MaxHitsInModule = maxHitsInModule(); // as above - constexpr uint32_t MaxNumClusters = pixelGPUConstants::maxNumberOfHits; - constexpr uint16_t InvId = 9999; // must be > MaxNumModules + constexpr uint16_t maxNumModules = 2000; + constexpr int32_t maxNumClustersPerModules = maxHitsInModule(); + constexpr uint16_t invalidModuleId = std::numeric_limits::max() - 1; + static_assert(invalidModuleId > maxNumModules); // invalidModuleId must be > maxNumModules } // namespace gpuClustering diff --git a/src/cudadev/CondFormats/PixelCPEFast.cc b/src/cudadev/CondFormats/PixelCPEFast.cc index dd79bd389..08941425d 100644 --- a/src/cudadev/CondFormats/PixelCPEFast.cc +++ b/src/cudadev/CondFormats/PixelCPEFast.cc @@ -22,64 +22,65 @@ PixelCPEFast::PixelCPEFast(std::string const &path) { { std::ifstream in(path, std::ios::binary); in.exceptions(std::ifstream::badbit | std::ifstream::failbit | std::ifstream::eofbit); - in.read(reinterpret_cast(&m_commonParamsGPU), sizeof(pixelCPEforGPU::CommonParams)); + in.read(reinterpret_cast(&commonParamsGPU_), sizeof(pixelCPEforGPU::CommonParams)); unsigned int ndetParams; in.read(reinterpret_cast(&ndetParams), sizeof(unsigned int)); - m_detParamsGPU.resize(ndetParams); - in.read(reinterpret_cast(m_detParamsGPU.data()), ndetParams * sizeof(pixelCPEforGPU::DetParams)); - in.read(reinterpret_cast(&m_averageGeometry), sizeof(pixelCPEforGPU::AverageGeometry)); - in.read(reinterpret_cast(&m_layerGeometry), sizeof(pixelCPEforGPU::LayerGeometry)); + detParamsGPU_.resize(ndetParams); + in.read(reinterpret_cast(detParamsGPU_.data()), ndetParams * sizeof(pixelCPEforGPU::DetParams)); + in.read(reinterpret_cast(&averageGeometry_), sizeof(pixelCPEforGPU::AverageGeometry)); + in.read(reinterpret_cast(&layerGeometry_), sizeof(pixelCPEforGPU::LayerGeometry)); } cpuData_ = { - &m_commonParamsGPU, - m_detParamsGPU.data(), - &m_layerGeometry, - &m_averageGeometry, + &commonParamsGPU_, + detParamsGPU_.data(), + &layerGeometry_, + &averageGeometry_, }; } const pixelCPEforGPU::ParamsOnGPU *PixelCPEFast::getGPUProductAsync(cudaStream_t cudaStream) const { const auto &data = gpuData_.dataForCurrentDeviceAsync(cudaStream, [this](GPUData &data, cudaStream_t stream) { // and now copy to device... 
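`gpuData_` is a `cms::cuda::ESProduct<GPUData>`, so the lambda below executes at most once per device; later calls on the same device simply wait for the transfer and return the cached `paramsOnGPU_d`. A typical call site, sketched with an invented kernel name:

  // first call on this device triggers the allocations and async copies below
  pixelCPEforGPU::ParamsOnGPU const* params = cpeFast.getGPUProductAsync(stream);
  // work submitted to the same stream is ordered after the transfers
  getHitsKernel<<<blocks, threads, 0, stream>>>(params, clusters_d, nClusters);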
- cudaCheck(cudaMalloc((void **)&data.h_paramsOnGPU.m_commonParams, sizeof(pixelCPEforGPU::CommonParams))); - cudaCheck(cudaMalloc((void **)&data.h_paramsOnGPU.m_detParams, - this->m_detParamsGPU.size() * sizeof(pixelCPEforGPU::DetParams))); - cudaCheck(cudaMalloc((void **)&data.h_paramsOnGPU.m_averageGeometry, sizeof(pixelCPEforGPU::AverageGeometry))); - cudaCheck(cudaMalloc((void **)&data.h_paramsOnGPU.m_layerGeometry, sizeof(pixelCPEforGPU::LayerGeometry))); - cudaCheck(cudaMalloc((void **)&data.d_paramsOnGPU, sizeof(pixelCPEforGPU::ParamsOnGPU))); + cudaCheck(cudaMalloc((void **)&data.paramsOnGPU_h.m_commonParams, sizeof(pixelCPEforGPU::CommonParams))); + cudaCheck(cudaMalloc((void **)&data.paramsOnGPU_h.m_detParams, + this->detParamsGPU_.size() * sizeof(pixelCPEforGPU::DetParams))); + cudaCheck(cudaMalloc((void **)&data.paramsOnGPU_h.m_averageGeometry, sizeof(pixelCPEforGPU::AverageGeometry))); + cudaCheck(cudaMalloc((void **)&data.paramsOnGPU_h.m_layerGeometry, sizeof(pixelCPEforGPU::LayerGeometry))); + cudaCheck(cudaMalloc((void **)&data.paramsOnGPU_d, sizeof(pixelCPEforGPU::ParamsOnGPU))); cudaCheck(cudaMemcpyAsync( - data.d_paramsOnGPU, &data.h_paramsOnGPU, sizeof(pixelCPEforGPU::ParamsOnGPU), cudaMemcpyDefault, stream)); - cudaCheck(cudaMemcpyAsync((void *)data.h_paramsOnGPU.m_commonParams, - &this->m_commonParamsGPU, + data.paramsOnGPU_d, &data.paramsOnGPU_h, sizeof(pixelCPEforGPU::ParamsOnGPU), cudaMemcpyDefault, stream)); + cudaCheck(cudaMemcpyAsync((void *)data.paramsOnGPU_h.m_commonParams, + &this->commonParamsGPU_, sizeof(pixelCPEforGPU::CommonParams), cudaMemcpyDefault, stream)); - cudaCheck(cudaMemcpyAsync((void *)data.h_paramsOnGPU.m_averageGeometry, - &this->m_averageGeometry, + cudaCheck(cudaMemcpyAsync((void *)data.paramsOnGPU_h.m_averageGeometry, + &this->averageGeometry_, sizeof(pixelCPEforGPU::AverageGeometry), cudaMemcpyDefault, stream)); - cudaCheck(cudaMemcpyAsync((void *)data.h_paramsOnGPU.m_layerGeometry, - &this->m_layerGeometry, + cudaCheck(cudaMemcpyAsync((void *)data.paramsOnGPU_h.m_layerGeometry, + &this->layerGeometry_, sizeof(pixelCPEforGPU::LayerGeometry), cudaMemcpyDefault, stream)); - cudaCheck(cudaMemcpyAsync((void *)data.h_paramsOnGPU.m_detParams, - this->m_detParamsGPU.data(), - this->m_detParamsGPU.size() * sizeof(pixelCPEforGPU::DetParams), + cudaCheck(cudaMemcpyAsync((void *)data.paramsOnGPU_h.m_detParams, + this->detParamsGPU_.data(), + this->detParamsGPU_.size() * sizeof(pixelCPEforGPU::DetParams), cudaMemcpyDefault, stream)); }); - return data.d_paramsOnGPU; + return data.paramsOnGPU_d; } PixelCPEFast::GPUData::~GPUData() { - if (d_paramsOnGPU != nullptr) { - cudaFree((void *)h_paramsOnGPU.m_commonParams); - cudaFree((void *)h_paramsOnGPU.m_detParams); - cudaFree((void *)h_paramsOnGPU.m_averageGeometry); - cudaFree(d_paramsOnGPU); + if (paramsOnGPU_d != nullptr) { + cudaFree((void *)paramsOnGPU_h.m_commonParams); + cudaFree((void *)paramsOnGPU_h.m_detParams); + cudaFree((void *)paramsOnGPU_h.m_averageGeometry); + cudaFree((void *)paramsOnGPU_h.m_layerGeometry); + cudaFree(paramsOnGPU_d); } } diff --git a/src/cudadev/CondFormats/PixelCPEFast.h b/src/cudadev/CondFormats/PixelCPEFast.h index eb0f21c28..80b2476f5 100644 --- a/src/cudadev/CondFormats/PixelCPEFast.h +++ b/src/cudadev/CondFormats/PixelCPEFast.h @@ -20,20 +20,18 @@ class PixelCPEFast { pixelCPEforGPU::ParamsOnGPU const &getCPUProduct() const { return cpuData_; } private: - // allocate it with posix malloc to be ocmpatible with cpu wf - std::vector m_detParamsGPU; - // std::vector> 
m_detParamsGPU;
-  pixelCPEforGPU::CommonParams m_commonParamsGPU;
-  pixelCPEforGPU::LayerGeometry m_layerGeometry;
-  pixelCPEforGPU::AverageGeometry m_averageGeometry;
-
+  // allocate this with posix malloc to be compatible with the cpu workflow
+  std::vector<pixelCPEforGPU::DetParams> detParamsGPU_;
+  pixelCPEforGPU::CommonParams commonParamsGPU_;
+  pixelCPEforGPU::LayerGeometry layerGeometry_;
+  pixelCPEforGPU::AverageGeometry averageGeometry_;
   pixelCPEforGPU::ParamsOnGPU cpuData_;
 
   struct GPUData {
     ~GPUData();
     // not needed if not used on CPU...
-    pixelCPEforGPU::ParamsOnGPU h_paramsOnGPU;
-    pixelCPEforGPU::ParamsOnGPU *d_paramsOnGPU = nullptr;  // copy of the above on the Device
+    pixelCPEforGPU::ParamsOnGPU paramsOnGPU_h;
+    pixelCPEforGPU::ParamsOnGPU *paramsOnGPU_d = nullptr;  // copy of the above on the Device
   };
   cms::cuda::ESProduct<GPUData> gpuData_;
diff --git a/src/cudadev/CondFormats/SiPixelGainCalibrationForHLTGPU.cc b/src/cudadev/CondFormats/SiPixelGainCalibrationForHLTGPU.cc
index 76e64e8f3..43885d5a9 100644
--- a/src/cudadev/CondFormats/SiPixelGainCalibrationForHLTGPU.cc
+++ b/src/cudadev/CondFormats/SiPixelGainCalibrationForHLTGPU.cc
@@ -28,7 +28,7 @@ const SiPixelGainForHLTonGPU* SiPixelGainCalibrationForHLTGPU::getGPUProductAsyn
     cudaCheck(cudaMemcpyAsync(
         data.gainForHLTonGPU, this->gainForHLTonHost_, sizeof(SiPixelGainForHLTonGPU), cudaMemcpyDefault, stream));
-    cudaCheck(cudaMemcpyAsync(&(data.gainForHLTonGPU->v_pedestals),
+    cudaCheck(cudaMemcpyAsync(&(data.gainForHLTonGPU->v_pedestals_),
                               &(data.gainDataOnGPU),
                               sizeof(SiPixelGainForHLTonGPU_DecodingStructure*),
                               cudaMemcpyDefault,
diff --git a/src/cudadev/CondFormats/SiPixelGainCalibrationForHLTGPU.h b/src/cudadev/CondFormats/SiPixelGainCalibrationForHLTGPU.h
index e5920a08c..12b52f426 100644
--- a/src/cudadev/CondFormats/SiPixelGainCalibrationForHLTGPU.h
+++ b/src/cudadev/CondFormats/SiPixelGainCalibrationForHLTGPU.h
@@ -12,7 +12,7 @@ class SiPixelGainCalibrationForHLTGPU {
   ~SiPixelGainCalibrationForHLTGPU();
 
   const SiPixelGainForHLTonGPU *getGPUProductAsync(cudaStream_t cudaStream) const;
-  const SiPixelGainForHLTonGPU *getCPUProduct() const { return gainForHLTonHost_; }
+  const SiPixelGainForHLTonGPU *cpuProduct() const { return gainForHLTonHost_; }
 
 private:
   SiPixelGainForHLTonGPU *gainForHLTonHost_ = nullptr;
diff --git a/src/cudadev/CondFormats/SiPixelGainForHLTonGPU.h b/src/cudadev/CondFormats/SiPixelGainForHLTonGPU.h
index 5bcdc7a66..5b168f00c 100644
--- a/src/cudadev/CondFormats/SiPixelGainForHLTonGPU.h
+++ b/src/cudadev/CondFormats/SiPixelGainForHLTonGPU.h
@@ -1,5 +1,5 @@
-#ifndef CondFormats_SiPixelObjects_SiPixelGainForHLTonGPU_h
-#define CondFormats_SiPixelObjects_SiPixelGainForHLTonGPU_h
+#ifndef CondFormats_SiPixelObjects_interface_SiPixelGainForHLTonGPU_h
+#define CondFormats_SiPixelObjects_interface_SiPixelGainForHLTonGPU_h
 
 #include <cstdint>
 #include <utility>
@@ -17,6 +17,7 @@
 #endif  // __CUDACC__
 
 #include "CUDACore/cuda_assert.h"
+#include "CUDADataFormats/gpuClusteringConstants.h"
 
 struct SiPixelGainForHLTonGPU_DecodingStructure {
   uint8_t gain;
   uint8_t ped;
 };
 
@@ -32,8 +33,8 @@ class SiPixelGainForHLTonGPU {
   inline __host__ __device__ std::pair<float, float> getPedAndGain(
       uint32_t moduleInd, int col, int row, bool& isDeadColumn, bool& isNoisyColumn) const {
-    auto range = rangeAndCols[moduleInd].first;
-    auto nCols = rangeAndCols[moduleInd].second;
+    auto range = rangeAndCols_[moduleInd].first;
+    auto nCols = rangeAndCols_[moduleInd].second;
 
     // determine what averaged data block we are in (there should be 1 or 2 of these depending on if plaquette is 1 by X or 2 by X
     unsigned int
lengthOfColumnData = (range.second - range.first) / nCols;
@@ -46,7 +47,7 @@ class SiPixelGainForHLTonGPU {
     assert(offset < 3088384);
     assert(0 == offset % 2);
 
-    DecodingStructure const* __restrict__ lp = v_pedestals;
+    DecodingStructure const* __restrict__ lp = v_pedestals_;
     auto s = lp[offset / 2];
 
     isDeadColumn = (s.ped & 0xFF) == deadFlag_;
@@ -55,15 +56,14 @@ class SiPixelGainForHLTonGPU {
     return std::make_pair(decodePed(s.ped & 0xFF), decodeGain(s.gain & 0xFF));
   }
 
-  constexpr float decodeGain(unsigned int gain) const { return gain * gainPrecision + minGain_; }
-  constexpr float decodePed(unsigned int ped) const { return ped * pedPrecision + minPed_; }
+  constexpr float decodeGain(unsigned int gain) const { return gain * gainPrecision_ + minGain_; }
+  constexpr float decodePed(unsigned int ped) const { return ped * pedPrecision_ + minPed_; }
 
-  DecodingStructure* v_pedestals;
-  std::pair<Range, int> rangeAndCols[2000];
+  DecodingStructure* v_pedestals_;
+  std::pair<Range, int> rangeAndCols_[gpuClustering::maxNumModules];
 
   float minPed_, maxPed_, minGain_, maxGain_;
-
-  float pedPrecision, gainPrecision;
+  float pedPrecision_, gainPrecision_;
 
   unsigned int numberOfRowsAveragedOver_;  // this is 80!!!!
   unsigned int nBinsToUseForEncoding_;
@@ -71,4 +71,4 @@
   unsigned int noisyFlag_;
 };
 
-#endif  // CondFormats_SiPixelObjects_SiPixelGainForHLTonGPU_h
+#endif  // CondFormats_SiPixelObjects_interface_SiPixelGainForHLTonGPU_h
diff --git a/src/cudadev/CondFormats/SiPixelFedCablingMapGPU.h b/src/cudadev/CondFormats/SiPixelROCsStatusAndMapping.h
similarity index 72%
rename from src/cudadev/CondFormats/SiPixelFedCablingMapGPU.h
rename to src/cudadev/CondFormats/SiPixelROCsStatusAndMapping.h
index 900307ae0..f7cd8dedc 100644
--- a/src/cudadev/CondFormats/SiPixelFedCablingMapGPU.h
+++ b/src/cudadev/CondFormats/SiPixelROCsStatusAndMapping.h
@@ -1,5 +1,5 @@
-#ifndef RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPU_h
-#define RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPU_h
+#ifndef CondFormats_SiPixelObjects_interface_SiPixelROCsStatusAndMapping_h
+#define CondFormats_SiPixelObjects_interface_SiPixelROCsStatusAndMapping_h
 
 namespace pixelgpudetails {
   // Maximum fed for phase1 is 150 but not all of them are filled
@@ -11,16 +11,15 @@ namespace pixelgpudetails {
   constexpr unsigned int MAX_SIZE_BYTE_BOOL = MAX_SIZE * sizeof(unsigned char);
 }  // namespace pixelgpudetails
 
-// TODO: since this has more information than just cabling map, maybe we should invent a better name?
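The 8-bit pedestal and gain codes stored in `DecodingStructure` are only meaningful together with the payload's min/max values. Here is a quick standalone illustration of the linear decoding performed by `decodePed`/`decodeGain`; the numeric ranges and the 255-step assumption are made up for the example and are not taken from the patch:

```cpp
#include <cstdint>
#include <cstdio>

// Illustrative ranges; the real values ship with the gain calibration payload.
constexpr float minPed = 0.f, maxPed = 50.f, minGain = 0.f, maxGain = 10.f;
// one step per 8-bit code, assuming the encoder uses the full 0-255 range
constexpr float pedPrecision = (maxPed - minPed) / 255.f;
constexpr float gainPrecision = (maxGain - minGain) / 255.f;

// same linear decode as SiPixelGainForHLTonGPU::decodePed / decodeGain above
constexpr float decodePed(unsigned int ped) { return ped * pedPrecision + minPed; }
constexpr float decodeGain(unsigned int gain) { return gain * gainPrecision + minGain; }

int main() {
  uint8_t ped = 128, gain = 64;  // one (ped, gain) pair, as in DecodingStructure
  std::printf("ped = %.2f ADC counts, gain = %.2f\n", decodePed(ped), decodeGain(gain));
  return 0;
}
```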
-struct SiPixelFedCablingMapGPU { +struct SiPixelROCsStatusAndMapping { alignas(128) unsigned int fed[pixelgpudetails::MAX_SIZE]; alignas(128) unsigned int link[pixelgpudetails::MAX_SIZE]; alignas(128) unsigned int roc[pixelgpudetails::MAX_SIZE]; - alignas(128) unsigned int RawId[pixelgpudetails::MAX_SIZE]; + alignas(128) unsigned int rawId[pixelgpudetails::MAX_SIZE]; alignas(128) unsigned int rocInDet[pixelgpudetails::MAX_SIZE]; alignas(128) unsigned int moduleId[pixelgpudetails::MAX_SIZE]; alignas(128) unsigned char badRocs[pixelgpudetails::MAX_SIZE]; alignas(128) unsigned int size = 0; }; -#endif +#endif // CondFormats_SiPixelObjects_interface_SiPixelROCsStatusAndMapping_h diff --git a/src/cudadev/CondFormats/SiPixelFedCablingMapGPUWrapper.cc b/src/cudadev/CondFormats/SiPixelROCsStatusAndMappingWrapper.cc similarity index 55% rename from src/cudadev/CondFormats/SiPixelFedCablingMapGPUWrapper.cc rename to src/cudadev/CondFormats/SiPixelROCsStatusAndMappingWrapper.cc index 56a3dc7ea..8a1d9618f 100644 --- a/src/cudadev/CondFormats/SiPixelFedCablingMapGPUWrapper.cc +++ b/src/cudadev/CondFormats/SiPixelROCsStatusAndMappingWrapper.cc @@ -11,32 +11,33 @@ #include "CUDACore/cudaCheck.h" #include "CUDACore/device_unique_ptr.h" #include "CUDACore/host_unique_ptr.h" -#include "CondFormats/SiPixelFedCablingMapGPUWrapper.h" +#include "CUDADataFormats/gpuClusteringConstants.h" +#include "CondFormats/SiPixelROCsStatusAndMappingWrapper.h" -SiPixelFedCablingMapGPUWrapper::SiPixelFedCablingMapGPUWrapper(SiPixelFedCablingMapGPU const& cablingMap, +SiPixelROCsStatusAndMappingWrapper::SiPixelROCsStatusAndMappingWrapper(SiPixelROCsStatusAndMapping const& cablingMap, std::vector modToUnp) : modToUnpDefault(modToUnp.size()), hasQuality_(true) { - cudaCheck(cudaMallocHost(&cablingMapHost, sizeof(SiPixelFedCablingMapGPU))); - std::memcpy(cablingMapHost, &cablingMap, sizeof(SiPixelFedCablingMapGPU)); + cudaCheck(cudaMallocHost(&cablingMapHost, sizeof(SiPixelROCsStatusAndMapping))); + std::memcpy(cablingMapHost, &cablingMap, sizeof(SiPixelROCsStatusAndMapping)); std::copy(modToUnp.begin(), modToUnp.end(), modToUnpDefault.begin()); } -SiPixelFedCablingMapGPUWrapper::~SiPixelFedCablingMapGPUWrapper() { cudaCheck(cudaFreeHost(cablingMapHost)); } +SiPixelROCsStatusAndMappingWrapper::~SiPixelROCsStatusAndMappingWrapper() { cudaCheck(cudaFreeHost(cablingMapHost)); } -const SiPixelFedCablingMapGPU* SiPixelFedCablingMapGPUWrapper::getGPUProductAsync(cudaStream_t cudaStream) const { +const SiPixelROCsStatusAndMapping* SiPixelROCsStatusAndMappingWrapper::getGPUProductAsync(cudaStream_t cudaStream) const { const auto& data = gpuData_.dataForCurrentDeviceAsync(cudaStream, [this](GPUData& data, cudaStream_t stream) { // allocate - cudaCheck(cudaMalloc(&data.cablingMapDevice, sizeof(SiPixelFedCablingMapGPU))); + cudaCheck(cudaMalloc(&data.cablingMapDevice, sizeof(SiPixelROCsStatusAndMapping))); // transfer cudaCheck(cudaMemcpyAsync( - data.cablingMapDevice, this->cablingMapHost, sizeof(SiPixelFedCablingMapGPU), cudaMemcpyDefault, stream)); + data.cablingMapDevice, this->cablingMapHost, sizeof(SiPixelROCsStatusAndMapping), cudaMemcpyDefault, stream)); }); return data.cablingMapDevice; } -const unsigned char* SiPixelFedCablingMapGPUWrapper::getModToUnpAllAsync(cudaStream_t cudaStream) const { +const unsigned char* SiPixelROCsStatusAndMappingWrapper::getModToUnpAllAsync(cudaStream_t cudaStream) const { const auto& data = modToUnp_.dataForCurrentDeviceAsync(cudaStream, [this](ModulesToUnpack& data, cudaStream_t stream) { 
cudaCheck(cudaMalloc((void**)&data.modToUnpDefault, pixelgpudetails::MAX_SIZE_BYTE_BOOL)); @@ -49,6 +50,6 @@ const unsigned char* SiPixelFedCablingMapGPUWrapper::getModToUnpAllAsync(cudaStr return data.modToUnpDefault; } -SiPixelFedCablingMapGPUWrapper::GPUData::~GPUData() { cudaCheck(cudaFree(cablingMapDevice)); } +SiPixelROCsStatusAndMappingWrapper::GPUData::~GPUData() { cudaCheck(cudaFree(cablingMapDevice)); } -SiPixelFedCablingMapGPUWrapper::ModulesToUnpack::~ModulesToUnpack() { cudaCheck(cudaFree(modToUnpDefault)); } +SiPixelROCsStatusAndMappingWrapper::ModulesToUnpack::~ModulesToUnpack() { cudaCheck(cudaFree(modToUnpDefault)); } diff --git a/src/cudadev/CondFormats/SiPixelFedCablingMapGPUWrapper.h b/src/cudadev/CondFormats/SiPixelROCsStatusAndMappingWrapper.h similarity index 55% rename from src/cudadev/CondFormats/SiPixelFedCablingMapGPUWrapper.h rename to src/cudadev/CondFormats/SiPixelROCsStatusAndMappingWrapper.h index 027e7d25c..1997d2377 100644 --- a/src/cudadev/CondFormats/SiPixelFedCablingMapGPUWrapper.h +++ b/src/cudadev/CondFormats/SiPixelROCsStatusAndMappingWrapper.h @@ -1,25 +1,25 @@ -#ifndef RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPUWrapper_h -#define RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPUWrapper_h +#ifndef RecoLocalTracker_SiPixelClusterizer_SiPixelROCsStatusAndMappingWrapper_h +#define RecoLocalTracker_SiPixelClusterizer_SiPixelROCsStatusAndMappingWrapper_h #include "CUDACore/ESProduct.h" #include "CUDACore/HostAllocator.h" #include "CUDACore/device_unique_ptr.h" -#include "CondFormats/SiPixelFedCablingMapGPU.h" +#include "CondFormats/SiPixelROCsStatusAndMapping.h" #include #include -class SiPixelFedCablingMapGPUWrapper { +class SiPixelROCsStatusAndMappingWrapper { public: - explicit SiPixelFedCablingMapGPUWrapper(SiPixelFedCablingMapGPU const &cablingMap, + explicit SiPixelROCsStatusAndMappingWrapper(SiPixelROCsStatusAndMapping const &cablingMap, std::vector modToUnp); - ~SiPixelFedCablingMapGPUWrapper(); + ~SiPixelROCsStatusAndMappingWrapper(); bool hasQuality() const { return hasQuality_; } // returns pointer to GPU memory - const SiPixelFedCablingMapGPU *getGPUProductAsync(cudaStream_t cudaStream) const; + const SiPixelROCsStatusAndMapping *getGPUProductAsync(cudaStream_t cudaStream) const; // returns pointer to GPU memory const unsigned char *getModToUnpAllAsync(cudaStream_t cudaStream) const; @@ -28,11 +28,11 @@ class SiPixelFedCablingMapGPUWrapper { std::vector> modToUnpDefault; bool hasQuality_; - SiPixelFedCablingMapGPU *cablingMapHost = nullptr; // pointer to struct in CPU + SiPixelROCsStatusAndMapping *cablingMapHost = nullptr; // pointer to struct in CPU struct GPUData { ~GPUData(); - SiPixelFedCablingMapGPU *cablingMapDevice = nullptr; // pointer to struct in GPU + SiPixelROCsStatusAndMapping *cablingMapDevice = nullptr; // pointer to struct in GPU }; cms::cuda::ESProduct gpuData_; diff --git a/src/cudadev/CondFormats/pixelCPEforGPU.h b/src/cudadev/CondFormats/pixelCPEforGPU.h index f1eca60fd..6eefa7fda 100644 --- a/src/cudadev/CondFormats/pixelCPEforGPU.h +++ b/src/cudadev/CondFormats/pixelCPEforGPU.h @@ -83,10 +83,10 @@ namespace pixelCPEforGPU { uint32_t minCol[N]; uint32_t maxCol[N]; - int32_t Q_f_X[N]; - int32_t Q_l_X[N]; - int32_t Q_f_Y[N]; - int32_t Q_l_Y[N]; + int32_t q_f_X[N]; + int32_t q_l_X[N]; + int32_t q_f_Y[N]; + int32_t q_l_Y[N]; int32_t charge[N]; @@ -116,8 +116,8 @@ namespace pixelCPEforGPU { } constexpr inline float correction(int sizeM1, - int Q_f, //!< Charge in the first pixel. 
- int Q_l, //!< Charge in the last pixel. + int q_f, //!< Charge in the first pixel. + int q_l, //!< Charge in the last pixel. uint16_t upper_edge_first_pix, //!< As the name says. uint16_t lower_edge_last_pix, //!< As the name says. float lorentz_shift, //!< L-shift at half thickness @@ -130,26 +130,27 @@ namespace pixelCPEforGPU { if (0 == sizeM1) // size 1 return 0; - float W_eff = 0; + float w_eff = 0; bool simple = true; if (1 == sizeM1) { // size 2 //--- Width of the clusters minus the edge (first and last) pixels. //--- In the note, they are denoted x_F and x_L (and y_F and y_L) // assert(lower_edge_last_pix >= upper_edge_first_pix); - auto W_inner = pitch * float(lower_edge_last_pix - upper_edge_first_pix); // in cm + auto w_inner = pitch * float(lower_edge_last_pix - upper_edge_first_pix); // in cm //--- Predicted charge width from geometry - auto W_pred = theThickness * cot_angle // geometric correction (in cm) + auto w_pred = theThickness * cot_angle // geometric correction (in cm) - lorentz_shift; // (in cm) &&& check fpix! - W_eff = std::abs(W_pred) - W_inner; + w_eff = std::abs(w_pred) - w_inner; //--- If the observed charge width is inconsistent with the expectations - //--- based on the track, do *not* use W_pred-W_inner. Instead, replace + //--- based on the track, do *not* use w_pred-w_inner. Instead, replace //--- it with an *average* effective charge width, which is the average //--- length of the edge pixels. - simple = - (W_eff < 0.0f) | (W_eff > pitch); // this produces "large" regressions for very small numeric differences... + + // this can produce "large" regressions for very small numeric differences + simple = (w_eff < 0.0f) | (w_eff > pitch); } if (simple) { @@ -159,18 +160,18 @@ namespace pixelCPEforGPU { sum_of_edge += 1.0f; if (last_is_big) sum_of_edge += 1.0f; - W_eff = pitch * 0.5f * sum_of_edge; // ave. length of edge pixels (first+last) (cm) + w_eff = pitch * 0.5f * sum_of_edge; // ave. length of edge pixels (first+last) (cm) } //--- Finally, compute the position in this projection - float Qdiff = Q_l - Q_f; - float Qsum = Q_l + Q_f; + float qdiff = q_l - q_f; + float qsum = q_l + q_f; //--- Temporary fix for clusters with both first and last pixel with charge = 0 - if (Qsum == 0) - Qsum = 1.0f; + if (qsum == 0) + qsum = 1.0f; - return 0.5f * (Qdiff / Qsum) * W_eff; + return 0.5f * (qdiff / qsum) * w_eff; } constexpr inline void position(CommonParams const& __restrict__ comParams, @@ -207,8 +208,8 @@ namespace pixelCPEforGPU { if (phase1PixelTopology::isBigPixY(cp.maxCol[ic])) ++ysize; - int unbalanceX = 8. * std::abs(float(cp.Q_f_X[ic] - cp.Q_l_X[ic])) / float(cp.Q_f_X[ic] + cp.Q_l_X[ic]); - int unbalanceY = 8. * std::abs(float(cp.Q_f_Y[ic] - cp.Q_l_Y[ic])) / float(cp.Q_f_Y[ic] + cp.Q_l_Y[ic]); + int unbalanceX = 8. * std::abs(float(cp.q_f_X[ic] - cp.q_l_X[ic])) / float(cp.q_f_X[ic] + cp.q_l_X[ic]); + int unbalanceY = 8. * std::abs(float(cp.q_f_Y[ic] - cp.q_l_Y[ic])) / float(cp.q_f_Y[ic] + cp.q_l_Y[ic]); xsize = 8 * xsize - unbalanceX; ysize = 8 * ysize - unbalanceY; @@ -231,8 +232,8 @@ namespace pixelCPEforGPU { auto thickness = detParams.isBarrel ? 
comParams.theThicknessB : comParams.theThicknessE; auto xcorr = correction(cp.maxRow[ic] - cp.minRow[ic], - cp.Q_f_X[ic], - cp.Q_l_X[ic], + cp.q_f_X[ic], + cp.q_l_X[ic], llxl, urxl, detParams.chargeWidthX, // lorentz shift in cm @@ -243,8 +244,8 @@ namespace pixelCPEforGPU { phase1PixelTopology::isBigPixX(cp.maxRow[ic])); auto ycorr = correction(cp.maxCol[ic] - cp.minCol[ic], - cp.Q_f_Y[ic], - cp.Q_l_Y[ic], + cp.q_f_Y[ic], + cp.q_l_Y[ic], llyl, uryl, detParams.chargeWidthY, // lorentz shift in cm diff --git a/src/cudadev/DataFormats/DetId.h b/src/cudadev/DataFormats/DetId.h new file mode 100644 index 000000000..290483762 --- /dev/null +++ b/src/cudadev/DataFormats/DetId.h @@ -0,0 +1,93 @@ +#ifndef DATAFORMATS_DETID_H +#define DATAFORMATS_DETID_H + +//FIXME shall be removed and implemented where the operator is defined +#include + +#include +/** \class DetId + +Parent class for all detector ids in CMS. The DetId is a 32-bit +unsigned integer. The four most significant bits ([31:28]) identify +the large-scale detector (e.g. Tracker or Ecal) while the next three +bits ([27:25]) identify a part of the detector (such as HcalBarrel +(HB) for Hcal). + +*/ +class DetId { +public: + static const int kDetMask = 0xF; + static const int kSubdetMask = 0x7; + static const int kDetOffset = 28; + static const int kSubdetOffset = 25; + + enum Detector { + Tracker = 1, + Muon = 2, + Ecal = 3, + Hcal = 4, + Calo = 5, + Forward = 6, + VeryForward = 7, + HGCalEE = 8, + HGCalHSi = 9, + HGCalHSc = 10, + HGCalTrigger = 11 + }; + /// Create an empty or null id (also for persistence) + constexpr DetId() : id_(0) {} + /// Create an id from a raw number + constexpr DetId(uint32_t id) : id_(id) {} + /// Create an id, filling the detector and subdetector fields as specified + constexpr DetId(Detector det, int subdet) + : id_(((det & kDetMask) << kDetOffset) | ((subdet & kSubdetMask) << kSubdetOffset)) {} + + /// get the detector field from this detid + constexpr Detector det() const { return Detector((id_ >> kDetOffset) & kDetMask); } + /// get the contents of the subdetector field (not cast into any detector's numbering enum) + constexpr int subdetId() const { + return ((HGCalEE == det()) || (HGCalHSi == det()) || (HGCalHSc == det()) ? 0 + : ((id_ >> kSubdetOffset) & kSubdetMask)); + } + + constexpr uint32_t operator()() const { return id_; } + constexpr operator uint32_t() const { return id_; } + + /// get the raw id + constexpr uint32_t rawId() const { return id_; } + /// is this a null id ? 
+ constexpr bool null() const { return id_ == 0; } + + /// equality + constexpr bool operator==(DetId id) const { return id_ == id.id_; } + /// inequality + constexpr bool operator!=(DetId id) const { return id_ != id.id_; } + /// comparison + constexpr bool operator<(DetId id) const { return id_ < id.id_; } + +protected: + uint32_t id_; +}; + +/// equality +constexpr inline bool operator==(uint32_t i, DetId id) { return i == id(); } +constexpr inline bool operator==(DetId id, uint32_t i) { return i == id(); } +/// inequality +constexpr inline bool operator!=(uint32_t i, DetId id) { return i != id(); } +constexpr inline bool operator!=(DetId id, uint32_t i) { return i != id(); } +/// comparison +constexpr inline bool operator<(uint32_t i, DetId id) { return i < id(); } +constexpr inline bool operator<(DetId id, uint32_t i) { return id() < i; } + +//std::ostream& operator<<(std::ostream& s, const DetId& id); + +namespace std { + template <> + struct hash { + typedef DetId argument_type; + typedef std::size_t result_type; + result_type operator()(argument_type const& id) const noexcept { return std::hash()(id.rawId()); } + }; +} // namespace std + +#endif diff --git a/src/cudadev/DataFormats/FEDNumbering.h b/src/cudadev/DataFormats/FEDNumbering.h index d819b780f..acfcb08d1 100644 --- a/src/cudadev/DataFormats/FEDNumbering.h +++ b/src/cudadev/DataFormats/FEDNumbering.h @@ -121,9 +121,9 @@ class FEDNumbering { MINCTPPSPixelsFEDID = 1462, MAXCTPPSPixelsFEDID = 1466, MINGEMFEDID = 1467, - MAXGEMFEDID = 1472, - MINME0FEDID = 1473, - MAXME0FEDID = 1478, + MINGE0FEDID = 1473, + MINGE21FEDID = 1469, + MAXGEMFEDID = 1478, MINDAQvFEDFEDID = 2815, MAXDAQvFEDFEDID = 4095 }; diff --git a/src/cudadev/DataFormats/PixelErrors.h b/src/cudadev/DataFormats/PixelErrors.h deleted file mode 100644 index 797fec768..000000000 --- a/src/cudadev/DataFormats/PixelErrors.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef DataFormats_SiPixelDigi_interface_PixelErrors_h -#define DataFormats_SiPixelDigi_interface_PixelErrors_h - -#include -#include -#include - -#include "DataFormats/SiPixelRawDataError.h" - -// Better ideas for the placement of these? 
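To make the bit layout documented in the DetId class above concrete, a small compile-time check (the values are mine, chosen only for illustration; `1` is used as a pixel-barrel subdetector code):

```cpp
#include <cstdint>

#include "DataFormats/DetId.h"

// det = Tracker (1) occupies bits [31:28], subdet = 1 occupies bits [27:25]
constexpr DetId id(DetId::Tracker, 1);
static_assert(id.rawId() == ((1u << 28) | (1u << 25)), "packing as documented");
static_assert(id.det() == DetId::Tracker, "det field round-trips");
static_assert(id.subdetId() == 1, "subdet field round-trips");
```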
- -struct PixelErrorCompact { - uint32_t rawId; - uint32_t word; - uint8_t errorType; - uint8_t fedId; -}; - -using PixelFormatterErrors = std::map>; - -#endif // DataFormats_SiPixelDigi_interface_PixelErrors_h diff --git a/src/cudadev/DataFormats/PixelSubdetector.h b/src/cudadev/DataFormats/PixelSubdetector.h new file mode 100644 index 000000000..6f33b240a --- /dev/null +++ b/src/cudadev/DataFormats/PixelSubdetector.h @@ -0,0 +1,14 @@ +#ifndef DataFormats_SiPixelDetId_PixelSubdetector_H +#define DataFormats_SiPixelDetId_PixelSubdetector_H + +/** + * Enumeration for Pixel Tracker Subdetectors + * + */ + +class PixelSubdetector { +public: + enum SubDetector { PixelBarrel = 1, PixelEndcap = 2 }; +}; + +#endif diff --git a/src/cudadev/DataFormats/SiPixelDigiConstants.h b/src/cudadev/DataFormats/SiPixelDigiConstants.h new file mode 100644 index 000000000..f37a7eddc --- /dev/null +++ b/src/cudadev/DataFormats/SiPixelDigiConstants.h @@ -0,0 +1,60 @@ +#ifndef DataFormats_SiPixelDigi_interface_SiPixelDigiConstants +#define DataFormats_SiPixelDigi_interface_SiPixelDigiConstants + +#include + +using Word64 = uint64_t; +using Word32 = uint32_t; + +namespace sipixelconstants { + constexpr uint32_t dummyDetId = 0xffffffff; + + constexpr uint32_t CRC_bits = 1; + constexpr uint32_t DCOL_bits = 5; // double column + constexpr uint32_t PXID_bits = 8; // pixel id + constexpr uint32_t ADC_bits = 8; + constexpr uint32_t OMIT_ERR_bits = 1; + // GO BACK TO OLD VALUES. THE 48-CHAN FED DOES NOT NEED A NEW FORMAT 28/9/16 d.k. + constexpr uint32_t LINK_bits = 6; // 7; + constexpr uint32_t ROC_bits = 5; // 4; + + constexpr uint32_t CRC_shift = 2; + constexpr uint32_t ADC_shift = 0; + constexpr uint32_t PXID_shift = ADC_shift + ADC_bits; + constexpr uint32_t DCOL_shift = PXID_shift + PXID_bits; + constexpr uint32_t ROC_shift = DCOL_shift + DCOL_bits; + constexpr uint32_t LINK_shift = ROC_shift + ROC_bits; + constexpr uint32_t OMIT_ERR_shift = 20; + + constexpr uint64_t CRC_mask = ~(~Word64(0) << CRC_bits); + constexpr uint32_t ERROR_mask = ~(~Word32(0) << ROC_bits); + constexpr uint32_t LINK_mask = ~(~Word32(0) << LINK_bits); + constexpr uint32_t ROC_mask = ~(~Word32(0) << ROC_bits); + constexpr uint32_t OMIT_ERR_mask = ~(~Word32(0) << OMIT_ERR_bits); + constexpr uint32_t DCOL_mask = ~(~Word32(0) << DCOL_bits); + constexpr uint32_t PXID_mask = ~(~Word32(0) << PXID_bits); + constexpr uint32_t ADC_mask = ~(~Word32(0) << ADC_bits); + + // Special for layer 1 bpix rocs 6/9/16 d.k. THIS STAYS. 
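As a quick sanity check of the packing implied by the shift and mask constants above (the `getLink`, `getROC`, etc. helpers that apply them follow a few lines below), here is a small self-contained example with made-up field values:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // pack LINK=5, ROC=3, DCOL=12, PXID=100, ADC=200 into one 32-bit word:
  // LINK sits in bits [31:26], ROC [25:21], DCOL [20:16], PXID [15:8], ADC [7:0]
  uint32_t ww = (5u << 26) | (3u << 21) | (12u << 16) | (100u << 8) | 200u;

  assert(((ww >> 26) & 0x3f) == 5);   // sipixelconstants::getLink(ww)
  assert(((ww >> 21) & 0x1f) == 3);   // sipixelconstants::getROC(ww)
  assert(((ww >> 16) & 0x1f) == 12);  // sipixelconstants::getDCol(ww)
  assert(((ww >> 8) & 0xff) == 100);  // sipixelconstants::getPxId(ww)
  assert((ww & 0xff) == 200);         // sipixelconstants::getADC(ww)
  return 0;
}
```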
+  inline namespace phase1layer1 {
+    constexpr uint32_t COL_bits1_l1 = 6;
+    constexpr uint32_t ROW_bits1_l1 = 7;
+    constexpr uint32_t ROW_shift = ADC_shift + ADC_bits;
+    constexpr uint32_t COL_shift = ROW_shift + ROW_bits1_l1;
+    constexpr uint32_t COL_mask = ~(~Word32(0) << COL_bits1_l1);
+    constexpr uint32_t ROW_mask = ~(~Word32(0) << ROW_bits1_l1);
+  }  // namespace phase1layer1
+
+  // constexpr functions are available in device code (GPU) as well
+  inline namespace functions {
+    inline constexpr uint32_t getLink(uint32_t ww) { return ((ww >> LINK_shift) & LINK_mask); }
+    inline constexpr uint32_t getROC(uint32_t ww) { return ((ww >> ROC_shift) & ROC_mask); }
+    inline constexpr uint32_t getADC(uint32_t ww) { return ((ww >> ADC_shift) & ADC_mask); }
+    inline constexpr uint32_t getCol(uint32_t ww) { return ((ww >> COL_shift) & COL_mask); }
+    inline constexpr uint32_t getRow(uint32_t ww) { return ((ww >> ROW_shift) & ROW_mask); }
+    inline constexpr uint32_t getDCol(uint32_t ww) { return ((ww >> DCOL_shift) & DCOL_mask); }
+    inline constexpr uint32_t getPxId(uint32_t ww) { return ((ww >> PXID_shift) & PXID_mask); }
+  }  // namespace functions
+}  // namespace sipixelconstants
+
+#endif  // DataFormats_SiPixelDigi_interface_SiPixelDigiConstants
diff --git a/src/cudadev/DataFormats/SiPixelDigisSoA.cc b/src/cudadev/DataFormats/SiPixelDigisSoA.cc
index 600d79b02..ba06ff9f6 100644
--- a/src/cudadev/DataFormats/SiPixelDigisSoA.cc
+++ b/src/cudadev/DataFormats/SiPixelDigisSoA.cc
@@ -7,6 +7,4 @@ SiPixelDigisSoA::SiPixelDigisSoA(
     : pdigi_(pdigi, pdigi + nDigis),
       rawIdArr_(rawIdArr, rawIdArr + nDigis),
       adc_(adc, adc + nDigis),
-      clus_(clus, clus + nDigis) {
-  assert(pdigi_.size() == nDigis);
-}
+      clus_(clus, clus + nDigis) {}
diff --git a/src/cudadev/DataFormats/SiPixelDigisSoA.h b/src/cudadev/DataFormats/SiPixelDigisSoA.h
index 50e863f03..f352754e3 100644
--- a/src/cudadev/DataFormats/SiPixelDigisSoA.h
+++ b/src/cudadev/DataFormats/SiPixelDigisSoA.h
@@ -1,9 +1,16 @@
 #ifndef DataFormats_SiPixelDigi_interface_SiPixelDigisSoA_h
 #define DataFormats_SiPixelDigi_interface_SiPixelDigisSoA_h
 
+#include <cstddef>
 #include <cstdint>
 #include <vector>
 
+// The main purpose of this class is to deliver digi and cluster data
+// from an EDProducer that transfers the data from GPU to host to an
+// EDProducer that converts the SoA to legacy data products. The class
+// is independent of any GPU technology, and in principle could be
+// produced by host code, and be used for other purposes than
+// conversion-to-legacy as well.
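Given that description, a minimal host-only sketch of how the SoA is filled (toy values; the constructor signature is inferred from the member initializers in SiPixelDigisSoA.cc):

```cpp
#include <cstdint>
#include <vector>

#include "DataFormats/SiPixelDigisSoA.h"

int main() {
  // four parallel arrays, one entry per pixel (toy values)
  std::vector<uint32_t> pdigi = {0x00010002, 0x00010003};
  std::vector<uint32_t> rawId = {303042564, 303042564};
  std::vector<uint16_t> adc = {120, 87};
  std::vector<int32_t> clus = {0, 0};

  // copies the arrays into the SoA's internal vectors
  SiPixelDigisSoA digis(pdigi.size(), pdigi.data(), rawId.data(), adc.data(), clus.data());
  return digis.clusVector().size() == 2 ? 0 : 1;
}
```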
 class SiPixelDigisSoA {
 public:
   SiPixelDigisSoA() = default;
@@ -24,10 +31,10 @@ class SiPixelDigisSoA {
   const std::vector<int32_t>& clusVector() const { return clus_; }
 
 private:
-  std::vector<uint32_t> pdigi_;
-  std::vector<uint32_t> rawIdArr_;
-  std::vector<uint16_t> adc_;
-  std::vector<int32_t> clus_;
+  std::vector<uint32_t> pdigi_;     // packed digi (row, col, adc) of each pixel
+  std::vector<uint32_t> rawIdArr_;  // DetId of each pixel
+  std::vector<uint16_t> adc_;       // ADC of each pixel
+  std::vector<int32_t> clus_;       // cluster id of each pixel
 };
 
 #endif
diff --git a/src/cudadev/DataFormats/SiPixelErrorCompact.h b/src/cudadev/DataFormats/SiPixelErrorCompact.h
new file mode 100644
index 000000000..0b1a80868
--- /dev/null
+++ b/src/cudadev/DataFormats/SiPixelErrorCompact.h
@@ -0,0 +1,13 @@
+#ifndef DataFormats_SiPixelRawData_interface_SiPixelErrorCompact_h
+#define DataFormats_SiPixelRawData_interface_SiPixelErrorCompact_h
+
+#include <cstdint>
+
+struct SiPixelErrorCompact {
+  uint32_t rawId;
+  uint32_t word;
+  uint8_t errorType;
+  uint8_t fedId;
+};
+
+#endif  // DataFormats_SiPixelRawData_interface_SiPixelErrorCompact_h
diff --git a/src/cudadev/DataFormats/SiPixelFormatterErrors.h b/src/cudadev/DataFormats/SiPixelFormatterErrors.h
new file mode 100644
index 000000000..871ac5a8a
--- /dev/null
+++ b/src/cudadev/DataFormats/SiPixelFormatterErrors.h
@@ -0,0 +1,11 @@
+#ifndef DataFormats_SiPixelRawData_interface_SiPixelFormatterErrors_h
+#define DataFormats_SiPixelRawData_interface_SiPixelFormatterErrors_h
+
+#include <map>
+#include <vector>
+
+#include "DataFormats/SiPixelRawDataError.h"
+
+using SiPixelFormatterErrors = std::map<uint32_t, std::vector<SiPixelRawDataError>>;
+
+#endif  // DataFormats_SiPixelRawData_interface_SiPixelFormatterErrors_h
diff --git a/src/cudadev/DataFormats/SiStripEnums.h b/src/cudadev/DataFormats/SiStripEnums.h
new file mode 100644
index 000000000..e2744b35d
--- /dev/null
+++ b/src/cudadev/DataFormats/SiStripEnums.h
@@ -0,0 +1,10 @@
+#ifndef SISTRIPENUMS_H
+#define SISTRIPENUMS_H
+
+namespace SiStripSubdetector {
+  enum Subdetector { UNKNOWN = 0, TIB = 3, TID = 4, TOB = 5, TEC = 6 };
+}
+
+enum class SiStripModuleGeometry { UNKNOWNGEOMETRY, IB1, IB2, OB1, OB2, W1A, W2A, W3A, W1B, W2B, W3B, W4, W5, W6, W7 };
+
+#endif
diff --git a/src/cudadev/Framework/CMSUnrollLoop.h b/src/cudadev/Framework/CMSUnrollLoop.h
new file mode 100644
index 000000000..a46df28a2
--- /dev/null
+++ b/src/cudadev/Framework/CMSUnrollLoop.h
@@ -0,0 +1,51 @@
+#ifndef FWCore_Utilities_interface_CMSUnrollLoop_h
+#define FWCore_Utilities_interface_CMSUnrollLoop_h
+
+// convert the macro argument to a null-terminated quoted string
+#define STRINGIFY_(ARG) #ARG
+#define STRINGIFY(ARG) STRINGIFY_(ARG)
+
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+// CUDA or HIP device compiler
+
+#define CMS_UNROLL_LOOP _Pragma(STRINGIFY(unroll))
+#define CMS_UNROLL_LOOP_COUNT(N) _Pragma(STRINGIFY(unroll N))
+#define CMS_UNROLL_LOOP_DISABLE _Pragma(STRINGIFY(unroll 1))
+
+#define CMS_DEVICE_UNROLL_LOOP _Pragma(STRINGIFY(unroll))
+#define CMS_DEVICE_UNROLL_LOOP_COUNT(N) _Pragma(STRINGIFY(unroll N))
+#define CMS_DEVICE_UNROLL_LOOP_DISABLE _Pragma(STRINGIFY(unroll 1))
+
+#else  // defined (__CUDA_ARCH__) || defined (__HIP_DEVICE_COMPILE__)
+
+// any host compiler
+#define CMS_DEVICE_UNROLL_LOOP
+#define CMS_DEVICE_UNROLL_LOOP_COUNT(N)
+#define CMS_DEVICE_UNROLL_LOOP_DISABLE
+
+#if defined(__clang__)
+// clang host compiler
+
+#define CMS_UNROLL_LOOP _Pragma(STRINGIFY(clang loop unroll(enable)))
+#define CMS_UNROLL_LOOP_COUNT(N) _Pragma(STRINGIFY(clang loop unroll_count(N)))
+#define CMS_UNROLL_LOOP_DISABLE _Pragma(STRINGIFY(clang loop unroll(disable)))
+
+#elif defined(__GNUC__) +// GCC host compiler + +#define CMS_UNROLL_LOOP _Pragma(STRINGIFY(GCC ivdep)) +#define CMS_UNROLL_LOOP_COUNT(N) _Pragma(STRINGIFY(GCC unroll N)) _Pragma(STRINGIFY(GCC ivdep)) +#define CMS_UNROLL_LOOP_DISABLE _Pragma(STRINGIFY(GCC unroll 1)) + +#else +// unsupported or unknown compiler + +#define CMS_UNROLL_LOOP +#define CMS_UNROLL_LOOP_COUNT(N) +#define CMS_UNROLL_LOOP_DISABLE + +#endif // defined(__clang__) || defined(__GNUC__) || ... + +#endif // defined (__CUDA_ARCH__) || defined (__HIP_DEVICE_COMPILE__) + +#endif // FWCore_Utilities_interface_CMSUnrollLoop_h diff --git a/src/cudadev/Framework/propagate_const_array.h b/src/cudadev/Framework/propagate_const_array.h new file mode 100644 index 000000000..79eb7024b --- /dev/null +++ b/src/cudadev/Framework/propagate_const_array.h @@ -0,0 +1,131 @@ +#ifndef FWCore_Utilities_interface_propagate_const_array_h +#define FWCore_Utilities_interface_propagate_const_array_h +// -*- C++ -*- +// +// Package: FWCore/Utilities +// Class : propagate_const_array +// Description: Propagate const to array-like objects. Based on C++ experimental std::propagate_const. +// If used with an array of incomplete type, edm::propagate_const_array can only be declared +// and assigned to nullptr, but not assigned to an actual object or dereferenced. + +// system include files +#include +#include +#include + +// user include files + +// forward declarations + +namespace edm { + + namespace impl { + + // check if a type T has a subscript operator T[N] + template + struct has_subscript_operator : std::false_type {}; + + template + struct has_subscript_operator()[0])>> : std::true_type {}; + + template + constexpr auto has_subscript_operator_v = has_subscript_operator::value; + + // for a type T, return the type of the return value of the subscript operator T[N] + template + struct subscript_type {}; + + // the specialisations for arrays allow supporting incomplete types + template + struct subscript_type { + using type = T; + }; + + template + struct subscript_type { + using type = T; + }; + + // for non-array types that implement the subscript operator[], a complete type is needed + template + struct subscript_type>, std::enable_if_t>> { + using type = typename std::remove_reference()[0])>::type; + }; + + template + using subscript_type_t = typename subscript_type::type; + + } // namespace impl + + template + class propagate_const_array; + + template + constexpr std::decay_t& get_underlying(propagate_const_array&); + template + constexpr std::decay_t const& get_underlying(propagate_const_array const&); + + template + class propagate_const_array { + public: + friend constexpr std::decay_t& get_underlying(propagate_const_array&); + friend constexpr std::decay_t const& get_underlying(propagate_const_array const&); + + template + friend class propagate_const_array; + + using element_type = typename impl::subscript_type_t; + + constexpr propagate_const_array() = default; + constexpr propagate_const_array(propagate_const_array&&) = default; + propagate_const_array(propagate_const_array const&) = delete; + template + constexpr propagate_const_array(U&& u) : m_value(std::forward(u)) {} + + constexpr propagate_const_array& operator=(propagate_const_array&&) = default; + propagate_const_array& operator=(propagate_const_array const&) = delete; + + template + constexpr propagate_const_array& operator=(propagate_const_array& other) { + static_assert(std::is_convertible_v, std::decay_t>, + "Cannot assign propagate_const_array<> of incompatible types"); + 
m_value = other.m_value; + return *this; + } + + template + constexpr propagate_const_array& operator=(U&& u) { + m_value = std::forward(u); + return *this; + } + + // ---------- const member functions --------------------- + constexpr element_type const* get() const { return &m_value[0]; } + constexpr element_type const& operator[](std::ptrdiff_t pos) const { return m_value[pos]; } + + constexpr operator element_type const *() const { return this->get(); } + + // ---------- member functions --------------------------- + constexpr element_type* get() { return &m_value[0]; } + constexpr element_type& operator[](std::ptrdiff_t pos) { return m_value[pos]; } + + constexpr operator element_type*() { return this->get(); } + + private: + // ---------- member data -------------------------------- + std::decay_t m_value; + }; + + template + constexpr std::decay_t& get_underlying(propagate_const_array& iP) { + return iP.m_value; + } + + template + constexpr std::decay_t const& get_underlying(propagate_const_array const& iP) { + return iP.m_value; + } + +} // namespace edm + +#endif // FWCore_Utilities_interface_propagate_const_array_h diff --git a/src/cudadev/Geometry/phase1PixelTopology.h b/src/cudadev/Geometry/phase1PixelTopology.h index 409ebec3c..c2b5bc9d9 100644 --- a/src/cudadev/Geometry/phase1PixelTopology.h +++ b/src/cudadev/Geometry/phase1PixelTopology.h @@ -65,7 +65,7 @@ namespace phase1PixelTopology { bool go = true; int n = 2; while (go) { - for (uint8_t i = 1; i < 11; ++i) { + for (uint8_t i = 1; i < std::size(layerStart); ++i) { if (layerStart[i] % n != 0) { go = false; break; @@ -81,18 +81,18 @@ namespace phase1PixelTopology { constexpr uint32_t maxModuleStride = findMaxModuleStride(); constexpr uint8_t findLayer(uint32_t detId) { - for (uint8_t i = 0; i < 11; ++i) + for (uint8_t i = 0; i < std::size(layerStart); ++i) if (detId < layerStart[i + 1]) return i; - return 11; + return std::size(layerStart); } constexpr uint8_t findLayerFromCompact(uint32_t detId) { detId *= maxModuleStride; - for (uint8_t i = 0; i < 11; ++i) + for (uint8_t i = 0; i < std::size(layerStart); ++i) if (detId < layerStart[i + 1]) return i; - return 11; + return std::size(layerStart); } constexpr uint32_t layerIndexSize = numberOfModules / maxModuleStride; diff --git a/src/cudadev/bin/main.cc b/src/cudadev/bin/main.cc index c8a76eee5..c8c0d0a0a 100644 --- a/src/cudadev/bin/main.cc +++ b/src/cudadev/bin/main.cc @@ -100,8 +100,8 @@ int main(int argc, char** argv) { edmodules = { "BeamSpotToCUDA", "SiPixelRawToClusterCUDA", "SiPixelRecHitCUDA", "CAHitNtupletCUDA", "PixelVertexProducerCUDA"}; esmodules = {"BeamSpotESProducer", - "SiPixelFedCablingMapGPUWrapperESProducer", "SiPixelGainCalibrationForHLTGPUESProducer", + "SiPixelROCsStatusAndMappingWrapperESProducer", "PixelCPEFastESProducer"}; if (transfer) { auto capos = std::find(edmodules.begin(), edmodules.end(), "CAHitNtupletCUDA"); diff --git a/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc b/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc index 66e93f818..408450ea8 100644 --- a/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc +++ b/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc @@ -9,6 +9,9 @@ #include "Framework/EDProducer.h" #include "CUDACore/ScopedContext.h" +// Switch on to enable checks and printout for found tracks +#undef PIXEL_DEBUG_PRODUCE + class PixelTrackSoAFromCUDA : public edm::EDProducerExternalWork { public: explicit PixelTrackSoAFromCUDA(edm::ProductRegistry& reg); @@ -23,7 +26,7 @@ 
class PixelTrackSoAFromCUDA : public edm::EDProducerExternalWork {
   edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenCUDA_;
   edm::EDPutTokenT<PixelTrackHeterogeneous> tokenSOA_;
 
-  cms::cuda::host::unique_ptr<pixelTrack::TrackSoA> m_soa;
+  cms::cuda::host::unique_ptr<pixelTrack::TrackSoA> soa_;
 };
 
 PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(edm::ProductRegistry& reg)
@@ -37,29 +40,30 @@ void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent,
   cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
   auto const& inputData = ctx.get(inputDataWrapped);
 
-  m_soa = inputData.toHostAsync(ctx.stream());
+  soa_ = inputData.toHostAsync(ctx.stream());
 }
 
 void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
-  /*
-  auto const & tsoa = *m_soa;
+#ifdef PIXEL_DEBUG_PRODUCE
+  auto const& tsoa = *soa_;
   auto maxTracks = tsoa.stride();
   std::cout << "size of SoA" << sizeof(tsoa) << " stride " << maxTracks << std::endl;
 
   int32_t nt = 0;
   for (int32_t it = 0; it < maxTracks; ++it) {
     auto nHits = tsoa.nHits(it);
-    assert(nHits==int(tsoa.hitIndices.size(it)));
-    if (nHits == 0) break;  // this is a guard: maybe we need to move to nTracks...
+    assert(nHits == int(tsoa.hitIndices.size(it)));
+    if (nHits == 0)
+      break;  // this is a guard: maybe we need to move to nTracks...
     nt++;
   }
   std::cout << "found " << nt << " tracks in cpu SoA at " << &tsoa << std::endl;
-  */
+#endif
 
   // DO NOT make a copy (actually TWO....)
-  iEvent.emplace(tokenSOA_, PixelTrackHeterogeneous(std::move(m_soa)));
+  iEvent.emplace(tokenSOA_, PixelTrackHeterogeneous(std::move(soa_)));
 
-  assert(!m_soa);
+  assert(!soa_);
 }
 
 DEFINE_FWK_MODULE(PixelTrackSoAFromCUDA);
diff --git a/src/cudadev/plugin-PixelTriplets/BrokenLine.h b/src/cudadev/plugin-PixelTriplets/BrokenLine.h
index 0a4b5f28f..a05bd6f72 100644
--- a/src/cudadev/plugin-PixelTriplets/BrokenLine.h
+++ b/src/cudadev/plugin-PixelTriplets/BrokenLine.h
@@ -5,58 +5,66 @@
 
 #include "FitUtils.h"
 
-namespace BrokenLine {
+namespace brokenline {
 
   //!< Karimäki's parameters: (phi, d, k=1/R)
   /*!< covariance matrix: \n
     |cov(phi,phi)|cov( d ,phi)|cov( k ,phi)| \n
    |cov(phi, d )|cov( d , d )|cov( k , d )| \n
-    |cov(phi, k )|cov( d , k )|cov( k , k )|
+    |cov(phi, k )|cov( d , k )|cov( k , k )| \n
+    as defined in Karimäki V., 1990, Effective circle fitting for particle trajectories,
+    Nucl. Instr. and Meth. A305 (1991) 187.
   */
-  using karimaki_circle_fit = Rfit::circle_fit;
+  using karimaki_circle_fit = riemannFit::CircleFit;
 
   /*!
    \brief data needed for the Broken Line fit procedure.
  */
-  template <int N>
+  template <int n>
   struct PreparedBrokenLineData {
-    int q;                       //!< particle charge
-    Rfit::Matrix2xNd<N> radii;   //!< xy data in the system in which the pre-fitted center is the origin
-    Rfit::VectorNd<N> s;         //!< total distance traveled in the transverse plane
-    // starting from the pre-fitted closest approach
-    Rfit::VectorNd<N> S;         //!< total distance traveled (three-dimensional)
-    Rfit::VectorNd<N> Z;         //!< orthogonal coordinate to the pre-fitted line in the sz plane
-    Rfit::VectorNd<N> VarBeta;   //!< kink angles in the SZ plane
+    int qCharge;                          //!< particle charge
+    riemannFit::Matrix2xNd<n> radii;      //!< xy data in the system in which the pre-fitted center is the origin
+    riemannFit::VectorNd<n> sTransverse;  //!< total distance traveled in the transverse plane
+    // starting from the pre-fitted closest approach
+    riemannFit::VectorNd<n> sTotal;       //!< total distance traveled (three-dimensional)
+    riemannFit::VectorNd<n> zInSZplane;   //!< orthogonal coordinate to the pre-fitted line in the sz plane
+    riemannFit::VectorNd<n> varBeta;      //!< kink angles in the SZ plane
   };
 
  /*!
\brief Computes the Coulomb multiple scattering variance of the planar angle. \param length length of the track in the material. - \param B magnetic field in Gev/cm/c. - \param R radius of curvature (needed to evaluate p). - \param Layer denotes which of the four layers of the detector is the endpoint of the multiple scattered track. For example, if Layer=3, then the particle has just gone through the material between the second and the third layer. + \param bField magnetic field in Gev/cm/c. + \param radius radius of curvature (needed to evaluate p). + \param layer denotes which of the four layers of the detector is the endpoint of the + * multiple scattered track. For example, if Layer=3, then the particle has + * just gone through the material between the second and the third layer. - \todo add another Layer variable to identify also the start point of the track, so if there are missing hits or multiple hits, the part of the detector that the particle has traversed can be exactly identified. + \todo add another Layer variable to identify also the start point of the track, + * so if there are missing hits or multiple hits, the part of the detector that + * the particle has traversed can be exactly identified. - \warning the formula used here assumes beta=1, and so neglects the dependence of theta_0 on the mass of the particle at fixed momentum. + \warning the formula used here assumes beta=1, and so neglects the dependence + * of theta_0 on the mass of the particle at fixed momentum. \return the variance of the planar angle ((theta_0)^2 /3). */ - __host__ __device__ inline double MultScatt( - const double& length, const double B, const double R, int Layer, double slope) { + __host__ __device__ inline double multScatt( + const double& length, const double bField, const double radius, int layer, double slope) { // limit R to 20GeV... - auto pt2 = std::min(20., B * R); + auto pt2 = std::min(20., bField * radius); pt2 *= pt2; - constexpr double XXI_0 = 0.06 / 16.; //!< inverse of radiation length of the material in cm + constexpr double inv_X0 = 0.06 / 16.; //!< inverse of radiation length of the material in cm //if(Layer==1) XXI_0=0.06/16.; // else XXI_0=0.06/16.; //XX_0*=1; - constexpr double geometry_factor = - 0.7; //!< number between 1/3 (uniform material) and 1 (thin scatterer) to be manually tuned - constexpr double fact = geometry_factor * Rfit::sqr(13.6 / 1000.); - return fact / (pt2 * (1. + Rfit::sqr(slope))) * (std::abs(length) * XXI_0) * - Rfit::sqr(1. + 0.038 * log(std::abs(length) * XXI_0)); + + //! number between 1/3 (uniform material) and 1 (thin scatterer) to be manually tuned + constexpr double geometry_factor = 0.7; + constexpr double fact = geometry_factor * riemannFit::sqr(13.6 / 1000.); + return fact / (pt2 * (1. + riemannFit::sqr(slope))) * (std::abs(length) * inv_X0) * + riemannFit::sqr(1. + 0.038 * log(std::abs(length) * inv_X0)); } /*! @@ -66,48 +74,67 @@ namespace BrokenLine { \return 2D rotation matrix. */ - __host__ __device__ inline Rfit::Matrix2d RotationMatrix(double slope) { - Rfit::Matrix2d Rot; - Rot(0, 0) = 1. / sqrt(1. + Rfit::sqr(slope)); - Rot(0, 1) = slope * Rot(0, 0); - Rot(1, 0) = -Rot(0, 1); - Rot(1, 1) = Rot(0, 0); - return Rot; + __host__ __device__ inline riemannFit::Matrix2d rotationMatrix(double slope) { + riemannFit::Matrix2d rot; + rot(0, 0) = 1. / sqrt(1. + riemannFit::sqr(slope)); + rot(0, 1) = slope * rot(0, 0); + rot(1, 0) = -rot(0, 1); + rot(1, 1) = rot(0, 0); + return rot; } /*! 
- \brief Changes the Karimäki parameters (and consequently their covariance matrix) under a translation of the coordinate system, such that the old origin has coordinates (x0,y0) in the new coordinate system. The formulas are taken from Karimäki V., 1990, Effective circle fitting for particle trajectories, Nucl. Instr. and Meth. A305 (1991) 187. + \brief Changes the Karimäki parameters (and consequently their covariance matrix) under a + * translation of the coordinate system, such that the old origin has coordinates (x0,y0) + * in the new coordinate system. The formulas are taken from Karimäki V., 1990, Effective + * circle fitting for particle trajectories, Nucl. Instr. and Meth. A305 (1991) 187. - \param circle circle fit in the old coordinate system. + \param circle circle fit in the old coordinate system. circle.par(0) is phi, circle.par(1) is d and circle.par(2) is rho. \param x0 x coordinate of the translation vector. \param y0 y coordinate of the translation vector. \param jacobian passed by reference in order to save stack. */ - __host__ __device__ inline void TranslateKarimaki(karimaki_circle_fit& circle, + __host__ __device__ inline void translateKarimaki(karimaki_circle_fit& circle, double x0, double y0, - Rfit::Matrix3d& jacobian) { - double A, U, BB, C, DO, DP, uu, xi, v, mu, lambda, zeta; - DP = x0 * cos(circle.par(0)) + y0 * sin(circle.par(0)); - DO = x0 * sin(circle.par(0)) - y0 * cos(circle.par(0)) + circle.par(1); - uu = 1 + circle.par(2) * circle.par(1); - C = -circle.par(2) * y0 + uu * cos(circle.par(0)); - BB = circle.par(2) * x0 + uu * sin(circle.par(0)); - A = 2. * DO + circle.par(2) * (Rfit::sqr(DO) + Rfit::sqr(DP)); - U = sqrt(1. + circle.par(2) * A); - xi = 1. / (Rfit::sqr(BB) + Rfit::sqr(C)); - v = 1. + circle.par(2) * DO; - lambda = (0.5 * A) / (U * Rfit::sqr(1. + U)); - mu = 1. / (U * (1. + U)) + circle.par(2) * lambda; - zeta = Rfit::sqr(DO) + Rfit::sqr(DP); - - jacobian << xi * uu * v, -xi * Rfit::sqr(circle.par(2)) * DP, xi * DP, 2. * mu * uu * DP, 2. * mu * v, - mu * zeta - lambda * A, 0, 0, 1.; - - circle.par(0) = atan2(BB, C); - circle.par(1) = A / (1 + U); - // circle.par(2)=circle.par(2); - + riemannFit::Matrix3d& jacobian) { + // Avoid multiple access to the circle.par vector. + using scalar = std::remove_reference::type; + scalar phi = circle.par(0); + scalar dee = circle.par(1); + scalar rho = circle.par(2); + + // Avoid repeated trig. computations + scalar sinPhi = sin(phi); + scalar cosPhi = cos(phi); + + // Intermediate computations for the circle parameters + scalar deltaPara = x0 * cosPhi + y0 * sinPhi; + scalar deltaOrth = x0 * sinPhi - y0 * cosPhi + dee; + scalar tempSmallU = 1 + rho * dee; + scalar tempC = -rho * y0 + tempSmallU * cosPhi; + scalar tempB = rho * x0 + tempSmallU * sinPhi; + scalar tempA = 2. * deltaOrth + rho * (riemannFit::sqr(deltaOrth) + riemannFit::sqr(deltaPara)); + scalar tempU = sqrt(1. + rho * tempA); + + // Intermediate computations for the error matrix transform + scalar xi = 1. / (riemannFit::sqr(tempB) + riemannFit::sqr(tempC)); + scalar tempV = 1. + rho * deltaOrth; + scalar lambda = (0.5 * tempA) / (riemannFit::sqr(1. + tempU) * tempU); + scalar mu = 1. / (tempU * (1. + tempU)) + rho * lambda; + scalar zeta = riemannFit::sqr(deltaOrth) + riemannFit::sqr(deltaPara); + jacobian << xi * tempSmallU * tempV, -xi * riemannFit::sqr(rho) * deltaOrth, xi * deltaPara, + 2. * mu * tempSmallU * deltaPara, 2. 
* mu * tempV, mu * zeta - lambda * tempA, 0, 0, 1.; + + // translated circle parameters + // phi + circle.par(0) = atan2(tempB, tempC); + // d + circle.par(1) = tempA / (1 + tempU); + // rho after translation. It is invariant, so noop + // circle.par(2)= rho; + + // translated error matrix circle.cov = jacobian * circle.cov * jacobian.transpose(); } @@ -115,95 +142,97 @@ namespace BrokenLine { \brief Computes the data needed for the Broken Line fit procedure that are mainly common for the circle and the line fit. \param hits hits coordinates. - \param hits_cov hits covariance matrix. \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)). - \param B magnetic field in Gev/cm/c. + \param bField magnetic field in Gev/cm/c. \param results PreparedBrokenLineData to be filled (see description of PreparedBrokenLineData). */ - template + template __host__ __device__ inline void prepareBrokenLineData(const M3xN& hits, const V4& fast_fit, - const double B, - PreparedBrokenLineData& results) { - constexpr auto n = N; - u_int i; - Rfit::Vector2d d; - Rfit::Vector2d e; + const double bField, + PreparedBrokenLineData& results) { + riemannFit::Vector2d dVec; + riemannFit::Vector2d eVec; - d = hits.block(0, 1, 2, 1) - hits.block(0, 0, 2, 1); - e = hits.block(0, n - 1, 2, 1) - hits.block(0, n - 2, 2, 1); - results.q = Rfit::cross2D(d, e) > 0 ? -1 : 1; + dVec = hits.block(0, 1, 2, 1) - hits.block(0, 0, 2, 1); + eVec = hits.block(0, n - 1, 2, 1) - hits.block(0, n - 2, 2, 1); + results.qCharge = riemannFit::cross2D(dVec, eVec) > 0 ? -1 : 1; - const double slope = -results.q / fast_fit(3); + const double slope = -results.qCharge / fast_fit(3); - Rfit::Matrix2d R = RotationMatrix(slope); + riemannFit::Matrix2d rotMat = rotationMatrix(slope); // calculate radii and s - results.radii = hits.block(0, 0, 2, n) - fast_fit.head(2) * Rfit::MatrixXd::Constant(1, n, 1); - e = -fast_fit(2) * fast_fit.head(2) / fast_fit.head(2).norm(); - for (i = 0; i < n; i++) { - d = results.radii.block(0, i, 2, 1); - results.s(i) = results.q * fast_fit(2) * atan2(Rfit::cross2D(d, e), d.dot(e)); // calculates the arc length + results.radii = hits.block(0, 0, 2, n) - fast_fit.head(2) * riemannFit::MatrixXd::Constant(1, n, 1); + eVec = -fast_fit(2) * fast_fit.head(2) / fast_fit.head(2).norm(); + for (u_int i = 0; i < n; i++) { + dVec = results.radii.block(0, i, 2, 1); + results.sTransverse(i) = results.qCharge * fast_fit(2) * + atan2(riemannFit::cross2D(dVec, eVec), dVec.dot(eVec)); // calculates the arc length } - Rfit::VectorNd z = hits.block(2, 0, 1, n).transpose(); - - //calculate S and Z - Rfit::Matrix2xNd pointsSZ = Rfit::Matrix2xNd::Zero(); - for (i = 0; i < n; i++) { - pointsSZ(0, i) = results.s(i); - pointsSZ(1, i) = z(i); - pointsSZ.block(0, i, 2, 1) = R * pointsSZ.block(0, i, 2, 1); + riemannFit::VectorNd zVec = hits.block(2, 0, 1, n).transpose(); + + //calculate sTotal and zVec + riemannFit::Matrix2xNd pointsSZ = riemannFit::Matrix2xNd::Zero(); + for (u_int i = 0; i < n; i++) { + pointsSZ(0, i) = results.sTransverse(i); + pointsSZ(1, i) = zVec(i); + pointsSZ.block(0, i, 2, 1) = rotMat * pointsSZ.block(0, i, 2, 1); } - results.S = pointsSZ.block(0, 0, 1, n).transpose(); - results.Z = pointsSZ.block(1, 0, 1, n).transpose(); - - //calculate VarBeta - results.VarBeta(0) = results.VarBeta(n - 1) = 0; - for (i = 1; i < n - 1; i++) { - results.VarBeta(i) = MultScatt(results.S(i + 1) - results.S(i), B, fast_fit(2), i + 2, slope) + - MultScatt(results.S(i) - results.S(i - 1), B, fast_fit(2), i + 1, slope); + results.sTotal = 
pointsSZ.block(0, 0, 1, n).transpose(); + results.zInSZplane = pointsSZ.block(1, 0, 1, n).transpose(); + + //calculate varBeta + results.varBeta(0) = results.varBeta(n - 1) = 0; + for (u_int i = 1; i < n - 1; i++) { + results.varBeta(i) = multScatt(results.sTotal(i + 1) - results.sTotal(i), bField, fast_fit(2), i + 2, slope) + + multScatt(results.sTotal(i) - results.sTotal(i - 1), bField, fast_fit(2), i + 1, slope); } } /*! - \brief Computes the n-by-n band matrix obtained minimizing the Broken Line's cost function w.r.t u. This is the whole matrix in the case of the line fit and the main n-by-n block in the case of the circle fit. + \brief Computes the n-by-n band matrix obtained minimizing the Broken Line's cost function w.r.t u. + * This is the whole matrix in the case of the line fit and the main n-by-n block in the case + * of the circle fit. - \param w weights of the first part of the cost function, the one with the measurements and not the angles (\sum_{i=1}^n w*(y_i-u_i)^2). - \param S total distance traveled by the particle from the pre-fitted closest approach. - \param VarBeta kink angles' variance. + \param weights weights of the first part of the cost function, the one with the measurements + * and not the angles (\sum_{i=1}^n w*(y_i-u_i)^2). + \param sTotal total distance traveled by the particle from the pre-fitted closest approach. + \param varBeta kink angles' variance. \return the n-by-n matrix of the linear system */ - template - __host__ __device__ inline Rfit::MatrixNd MatrixC_u(const Rfit::VectorNd& w, - const Rfit::VectorNd& S, - const Rfit::VectorNd& VarBeta) { - constexpr u_int n = N; - u_int i; - - Rfit::MatrixNd C_U = Rfit::MatrixNd::Zero(); - for (i = 0; i < n; i++) { - C_U(i, i) = w(i); + template + __host__ __device__ inline riemannFit::MatrixNd matrixC_u(const riemannFit::VectorNd& weights, + const riemannFit::VectorNd& sTotal, + const riemannFit::VectorNd& varBeta) { + riemannFit::MatrixNd c_uMat = riemannFit::MatrixNd::Zero(); + for (u_int i = 0; i < n; i++) { + c_uMat(i, i) = weights(i); if (i > 1) - C_U(i, i) += 1. / (VarBeta(i - 1) * Rfit::sqr(S(i) - S(i - 1))); + c_uMat(i, i) += 1. / (varBeta(i - 1) * riemannFit::sqr(sTotal(i) - sTotal(i - 1))); if (i > 0 && i < n - 1) - C_U(i, i) += (1. / VarBeta(i)) * Rfit::sqr((S(i + 1) - S(i - 1)) / ((S(i + 1) - S(i)) * (S(i) - S(i - 1)))); + c_uMat(i, i) += + (1. / varBeta(i)) * riemannFit::sqr((sTotal(i + 1) - sTotal(i - 1)) / + ((sTotal(i + 1) - sTotal(i)) * (sTotal(i) - sTotal(i - 1)))); if (i < n - 2) - C_U(i, i) += 1. / (VarBeta(i + 1) * Rfit::sqr(S(i + 1) - S(i))); + c_uMat(i, i) += 1. / (varBeta(i + 1) * riemannFit::sqr(sTotal(i + 1) - sTotal(i))); if (i > 0 && i < n - 1) - C_U(i, i + 1) = - 1. / (VarBeta(i) * (S(i + 1) - S(i))) * (-(S(i + 1) - S(i - 1)) / ((S(i + 1) - S(i)) * (S(i) - S(i - 1)))); + c_uMat(i, i + 1) = + 1. / (varBeta(i) * (sTotal(i + 1) - sTotal(i))) * + (-(sTotal(i + 1) - sTotal(i - 1)) / ((sTotal(i + 1) - sTotal(i)) * (sTotal(i) - sTotal(i - 1)))); if (i < n - 2) - C_U(i, i + 1) += 1. / (VarBeta(i + 1) * (S(i + 1) - S(i))) * - (-(S(i + 2) - S(i)) / ((S(i + 2) - S(i + 1)) * (S(i + 1) - S(i)))); + c_uMat(i, i + 1) += + 1. / (varBeta(i + 1) * (sTotal(i + 1) - sTotal(i))) * + (-(sTotal(i + 2) - sTotal(i)) / ((sTotal(i + 2) - sTotal(i + 1)) * (sTotal(i + 1) - sTotal(i)))); if (i < n - 2) - C_U(i, i + 2) = 1. / (VarBeta(i + 1) * (S(i + 2) - S(i + 1)) * (S(i + 1) - S(i))); + c_uMat(i, i + 2) = 1. 
/ (varBeta(i + 1) * (sTotal(i + 2) - sTotal(i + 1)) * (sTotal(i + 1) - sTotal(i))); - C_U(i, i) *= 0.5; + c_uMat(i, i) *= 0.5; } - return C_U + C_U.transpose(); + return c_uMat + c_uMat.transpose(); } /*! @@ -217,169 +246,179 @@ namespace BrokenLine { */ template - __host__ __device__ inline void BL_Fast_fit(const M3xN& hits, V4& result) { - constexpr uint32_t N = M3xN::ColsAtCompileTime; - constexpr auto n = N; // get the number of hits + __host__ __device__ inline void fastFit(const M3xN& hits, V4& result) { + constexpr uint32_t n = M3xN::ColsAtCompileTime; - const Rfit::Vector2d a = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); - const Rfit::Vector2d b = hits.block(0, n - 1, 2, 1) - hits.block(0, n / 2, 2, 1); - const Rfit::Vector2d c = hits.block(0, 0, 2, 1) - hits.block(0, n - 1, 2, 1); + const riemannFit::Vector2d a = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); + const riemannFit::Vector2d b = hits.block(0, n - 1, 2, 1) - hits.block(0, n / 2, 2, 1); + const riemannFit::Vector2d c = hits.block(0, 0, 2, 1) - hits.block(0, n - 1, 2, 1); - auto tmp = 0.5 / Rfit::cross2D(c, a); + auto tmp = 0.5 / riemannFit::cross2D(c, a); result(0) = hits(0, 0) - (a(1) * c.squaredNorm() + c(1) * a.squaredNorm()) * tmp; result(1) = hits(1, 0) + (a(0) * c.squaredNorm() + c(0) * a.squaredNorm()) * tmp; // check Wikipedia for these formulas - result(2) = sqrt(a.squaredNorm() * b.squaredNorm() * c.squaredNorm()) / (2. * std::abs(Rfit::cross2D(b, a))); + result(2) = sqrt(a.squaredNorm() * b.squaredNorm() * c.squaredNorm()) / (2. * std::abs(riemannFit::cross2D(b, a))); // Using Math Olympiad's formula R=abc/(4A) - const Rfit::Vector2d d = hits.block(0, 0, 2, 1) - result.head(2); - const Rfit::Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2); + const riemannFit::Vector2d d = hits.block(0, 0, 2, 1) - result.head(2); + const riemannFit::Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2); - result(3) = result(2) * atan2(Rfit::cross2D(d, e), d.dot(e)) / (hits(2, n - 1) - hits(2, 0)); + result(3) = result(2) * atan2(riemannFit::cross2D(d, e), d.dot(e)) / (hits(2, n - 1) - hits(2, 0)); // ds/dz slope between last and first point } /*! - \brief Performs the Broken Line fit in the curved track case (that is, the fit parameters are the interceptions u and the curvature correction \Delta\kappa). + \brief Performs the Broken Line fit in the curved track case (that is, the fit + * parameters are the interceptions u and the curvature correction \Delta\kappa). \param hits hits coordinates. \param hits_cov hits covariance matrix. \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)). - \param B magnetic field in Gev/cm/c. + \param bField magnetic field in Gev/cm/c. \param data PreparedBrokenLineData. \param circle_results struct to be filled with the results in this form: -par parameter of the line in this form: (phi, d, k); \n -cov covariance matrix of the fitted parameter; \n -chi2 value of the cost function in the minimum. - \details The function implements the steps 2 and 3 of the Broken Line fit with the curvature correction.\n - The step 2 is the least square fit, done by imposing the minimum constraint on the cost function and solving the consequent linear system. It determines the fitted parameters u and \Delta\kappa and their covariance matrix. - The step 3 is the correction of the fast pre-fitted parameters for the innermost part of the track. 
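Spelling out step 2 of that description as formulas (my own transcription of the doc-comment, with $w_i$ the orthogonal weights, $z_i$ the residuals with respect to the pre-fit, and $\beta_i$ the kink angles used by `matrixC_u`):

```latex
% Broken Line cost function: measurement term plus kink-angle (multiple scattering) term
\chi^2(u) \;=\; \sum_{i=1}^{n} w_i \left( z_i - u_i \right)^2
          \;+\; \sum_{i=2}^{n-1} \frac{\beta_i(u)^2}{\operatorname{Var}(\beta_i)} ,
\qquad
\left. \frac{\partial \chi^2}{\partial u} \right|_{\hat u} = 0
\;\Longrightarrow\;
C_u \, \hat u = r_u , \qquad r_u(i) = w_i \, z_i .
```

In the code below, `c_uMat` is this band matrix $C_u$ bordered with one extra row and column for the curvature correction $\Delta\kappa$, `r_uVec` is $r_u$, and the fitted parameters are obtained by inverting the system with a Cholesky decomposition (`math::cholesky::invert`).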
It is first done in a comfortable coordinate system (the one in which the first hit is the origin) and then the parameters and their covariance matrix are transformed to the original coordinate system. + \details The function implements the steps 2 and 3 of the Broken Line fit + * with the curvature correction.\n + * The step 2 is the least square fit, done by imposing the minimum constraint on + * the cost function and solving the consequent linear system. It determines the + * fitted parameters u and \Delta\kappa and their covariance matrix. + * The step 3 is the correction of the fast pre-fitted parameters for the innermost + * part of the track. It is first done in a comfortable coordinate system (the one + * in which the first hit is the origin) and then the parameters and their + * covariance matrix are transformed to the original coordinate system. */ - template - __host__ __device__ inline void BL_Circle_fit(const M3xN& hits, - const M6xN& hits_ge, - const V4& fast_fit, - const double B, - PreparedBrokenLineData& data, - karimaki_circle_fit& circle_results) { - constexpr u_int n = N; - u_int i; - - circle_results.q = data.q; + template + __host__ __device__ inline void circleFit(const M3xN& hits, + const M6xN& hits_ge, + const V4& fast_fit, + const double bField, + PreparedBrokenLineData& data, + karimaki_circle_fit& circle_results) { + circle_results.qCharge = data.qCharge; auto& radii = data.radii; - const auto& s = data.s; - const auto& S = data.S; - auto& Z = data.Z; - auto& VarBeta = data.VarBeta; - const double slope = -circle_results.q / fast_fit(3); - VarBeta *= 1. + Rfit::sqr(slope); // the kink angles are projected! - - for (i = 0; i < n; i++) { - Z(i) = radii.block(0, i, 2, 1).norm() - fast_fit(2); + const auto& sTransverse = data.sTransverse; + const auto& sTotal = data.sTotal; + auto& zInSZplane = data.zInSZplane; + auto& varBeta = data.varBeta; + const double slope = -circle_results.qCharge / fast_fit(3); + varBeta *= 1. + riemannFit::sqr(slope); // the kink angles are projected! + + for (u_int i = 0; i < n; i++) { + zInSZplane(i) = radii.block(0, i, 2, 1).norm() - fast_fit(2); } - Rfit::Matrix2d V; // covariance matrix - Rfit::VectorNd w; // weights - Rfit::Matrix2d RR; // rotation matrix point by point - //double Slope; // slope of the circle point by point - for (i = 0; i < n; i++) { - V(0, 0) = hits_ge.col(i)[0]; // x errors - V(0, 1) = V(1, 0) = hits_ge.col(i)[1]; // cov_xy - V(1, 1) = hits_ge.col(i)[2]; // y errors - //Slope=-radii(0,i)/radii(1,i); - RR = RotationMatrix(-radii(0, i) / radii(1, i)); - w(i) = 1. / ((RR * V * RR.transpose())(1, 1)); // compute the orthogonal weight point by point + riemannFit::Matrix2d vMat; // covariance matrix + riemannFit::VectorNd weightsVec; // weights + riemannFit::Matrix2d rotMat; // rotation matrix point by point + for (u_int i = 0; i < n; i++) { + vMat(0, 0) = hits_ge.col(i)[0]; // x errors + vMat(0, 1) = vMat(1, 0) = hits_ge.col(i)[1]; // cov_xy + vMat(1, 1) = hits_ge.col(i)[2]; // y errors + rotMat = rotationMatrix(-radii(0, i) / radii(1, i)); + weightsVec(i) = + 1. 
/ ((rotMat * vMat * rotMat.transpose())(1, 1)); // compute the orthogonal weight point by point } - Rfit::VectorNplusONEd r_u; - r_u(n) = 0; - for (i = 0; i < n; i++) { - r_u(i) = w(i) * Z(i); + riemannFit::VectorNplusONEd r_uVec; + r_uVec(n) = 0; + for (u_int i = 0; i < n; i++) { + r_uVec(i) = weightsVec(i) * zInSZplane(i); } - Rfit::MatrixNplusONEd C_U; - C_U.block(0, 0, n, n) = MatrixC_u(w, s, VarBeta); - C_U(n, n) = 0; - //add the border to the C_u matrix - for (i = 0; i < n; i++) { - C_U(i, n) = 0; + riemannFit::MatrixNplusONEd c_uMat; + c_uMat.block(0, 0, n, n) = matrixC_u(weightsVec, sTransverse, varBeta); + c_uMat(n, n) = 0; + //add the border to the c_uMat matrix + for (u_int i = 0; i < n; i++) { + c_uMat(i, n) = 0; if (i > 0 && i < n - 1) { - C_U(i, n) += - -(s(i + 1) - s(i - 1)) * (s(i + 1) - s(i - 1)) / (2. * VarBeta(i) * (s(i + 1) - s(i)) * (s(i) - s(i - 1))); + c_uMat(i, n) += + -(sTransverse(i + 1) - sTransverse(i - 1)) * (sTransverse(i + 1) - sTransverse(i - 1)) / + (2. * varBeta(i) * (sTransverse(i + 1) - sTransverse(i)) * (sTransverse(i) - sTransverse(i - 1))); } if (i > 1) { - C_U(i, n) += (s(i) - s(i - 2)) / (2. * VarBeta(i - 1) * (s(i) - s(i - 1))); + c_uMat(i, n) += + (sTransverse(i) - sTransverse(i - 2)) / (2. * varBeta(i - 1) * (sTransverse(i) - sTransverse(i - 1))); } if (i < n - 2) { - C_U(i, n) += (s(i + 2) - s(i)) / (2. * VarBeta(i + 1) * (s(i + 1) - s(i))); + c_uMat(i, n) += + (sTransverse(i + 2) - sTransverse(i)) / (2. * varBeta(i + 1) * (sTransverse(i + 1) - sTransverse(i))); } - C_U(n, i) = C_U(i, n); + c_uMat(n, i) = c_uMat(i, n); if (i > 0 && i < n - 1) - C_U(n, n) += Rfit::sqr(s(i + 1) - s(i - 1)) / (4. * VarBeta(i)); + c_uMat(n, n) += riemannFit::sqr(sTransverse(i + 1) - sTransverse(i - 1)) / (4. * varBeta(i)); } #ifdef CPP_DUMP - std::cout << "CU5\n" << C_U << std::endl; + std::cout << "CU5\n" << c_uMat << std::endl; #endif - Rfit::MatrixNplusONEd I; - math::cholesky::invert(C_U, I); - // Rfit::MatrixNplusONEd I = C_U.inverse(); + riemannFit::MatrixNplusONEd iMat; + math::cholesky::invert(c_uMat, iMat); #ifdef CPP_DUMP - std::cout << "I5\n" << I << std::endl; + std::cout << "I5\n" << iMat << std::endl; #endif - Rfit::VectorNplusONEd u = I * r_u; // obtain the fitted parameters by solving the linear system + riemannFit::VectorNplusONEd uVec = iMat * r_uVec; // obtain the fitted parameters by solving the linear system // compute (phi, d_ca, k) in the system in which the midpoint of the first two corrected hits is the origin... radii.block(0, 0, 2, 1) /= radii.block(0, 0, 2, 1).norm(); radii.block(0, 1, 2, 1) /= radii.block(0, 1, 2, 1).norm(); - Rfit::Vector2d d = hits.block(0, 0, 2, 1) + (-Z(0) + u(0)) * radii.block(0, 0, 2, 1); - Rfit::Vector2d e = hits.block(0, 1, 2, 1) + (-Z(1) + u(1)) * radii.block(0, 1, 2, 1); + riemannFit::Vector2d dVec = hits.block(0, 0, 2, 1) + (-zInSZplane(0) + uVec(0)) * radii.block(0, 0, 2, 1); + riemannFit::Vector2d eVec = hits.block(0, 1, 2, 1) + (-zInSZplane(1) + uVec(1)) * radii.block(0, 1, 2, 1); - circle_results.par << atan2((e - d)(1), (e - d)(0)), - -circle_results.q * (fast_fit(2) - sqrt(Rfit::sqr(fast_fit(2)) - 0.25 * (e - d).squaredNorm())), - circle_results.q * (1. / fast_fit(2) + u(n)); + circle_results.par << atan2((eVec - dVec)(1), (eVec - dVec)(0)), + -circle_results.qCharge * + (fast_fit(2) - sqrt(riemannFit::sqr(fast_fit(2)) - 0.25 * (eVec - dVec).squaredNorm())), + circle_results.qCharge * (1. 
/ fast_fit(2) + uVec(n)); - assert(circle_results.q * circle_results.par(1) <= 0); + assert(circle_results.qCharge * circle_results.par(1) <= 0); - Rfit::Vector2d eMinusd = e - d; + riemannFit::Vector2d eMinusd = eVec - dVec; double tmp1 = eMinusd.squaredNorm(); + double tmp2 = sqrt(riemannFit::sqr(2 * fast_fit(2)) - tmp1); - Rfit::Matrix3d jacobian; + riemannFit::Matrix3d jacobian; jacobian << (radii(1, 0) * eMinusd(0) - eMinusd(1) * radii(0, 0)) / tmp1, (radii(1, 1) * eMinusd(0) - eMinusd(1) * radii(0, 1)) / tmp1, 0, - (circle_results.q / 2) * (eMinusd(0) * radii(0, 0) + eMinusd(1) * radii(1, 0)) / - sqrt(Rfit::sqr(2 * fast_fit(2)) - tmp1), - (circle_results.q / 2) * (eMinusd(0) * radii(0, 1) + eMinusd(1) * radii(1, 1)) / - sqrt(Rfit::sqr(2 * fast_fit(2)) - tmp1), - 0, 0, 0, circle_results.q; + (circle_results.qCharge / 2) * (eMinusd(0) * radii(0, 0) + eMinusd(1) * radii(1, 0)) / tmp2, + (circle_results.qCharge / 2) * (eMinusd(0) * radii(0, 1) + eMinusd(1) * radii(1, 1)) / tmp2, 0, 0, 0, + circle_results.qCharge; - circle_results.cov << I(0, 0), I(0, 1), I(0, n), I(1, 0), I(1, 1), I(1, n), I(n, 0), I(n, 1), I(n, n); + circle_results.cov << iMat(0, 0), iMat(0, 1), iMat(0, n), iMat(1, 0), iMat(1, 1), iMat(1, n), iMat(n, 0), + iMat(n, 1), iMat(n, n); circle_results.cov = jacobian * circle_results.cov * jacobian.transpose(); //...Translate in the system in which the first corrected hit is the origin, adding the m.s. correction... - TranslateKarimaki(circle_results, 0.5 * (e - d)(0), 0.5 * (e - d)(1), jacobian); - circle_results.cov(0, 0) += (1 + Rfit::sqr(slope)) * MultScatt(S(1) - S(0), B, fast_fit(2), 2, slope); + auto eMinusDVec = eVec - dVec; + translateKarimaki(circle_results, 0.5 * eMinusDVec(0), 0.5 * eMinusDVec(1), jacobian); + circle_results.cov(0, 0) += + (1 + riemannFit::sqr(slope)) * multScatt(sTotal(1) - sTotal(0), bField, fast_fit(2), 2, slope); //...And translate back to the original system - TranslateKarimaki(circle_results, d(0), d(1), jacobian); + translateKarimaki(circle_results, dVec(0), dVec(1), jacobian); // compute chi2 circle_results.chi2 = 0; - for (i = 0; i < n; i++) { - circle_results.chi2 += w(i) * Rfit::sqr(Z(i) - u(i)); + for (u_int i = 0; i < n; i++) { + circle_results.chi2 += weightsVec(i) * riemannFit::sqr(zInSZplane(i) - uVec(i)); if (i > 0 && i < n - 1) - circle_results.chi2 += Rfit::sqr(u(i - 1) / (s(i) - s(i - 1)) - - u(i) * (s(i + 1) - s(i - 1)) / ((s(i + 1) - s(i)) * (s(i) - s(i - 1))) + - u(i + 1) / (s(i + 1) - s(i)) + (s(i + 1) - s(i - 1)) * u(n) / 2) / - VarBeta(i); + circle_results.chi2 += + riemannFit::sqr(uVec(i - 1) / (sTransverse(i) - sTransverse(i - 1)) - + uVec(i) * (sTransverse(i + 1) - sTransverse(i - 1)) / + ((sTransverse(i + 1) - sTransverse(i)) * (sTransverse(i) - sTransverse(i - 1))) + + uVec(i + 1) / (sTransverse(i + 1) - sTransverse(i)) + + (sTransverse(i + 1) - sTransverse(i - 1)) * uVec(n) / 2) / + varBeta(i); } // assert(circle_results.chi2>=0); @@ -389,108 +428,109 @@ namespace BrokenLine { \brief Performs the Broken Line fit in the straight track case (that is, the fit parameters are only the intercepts u). \param hits hits coordinates. - \param hits_cov hits covariance matrix. \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)). - \param B magnetic field in Gev/cm/c. + \param bField magnetic field in GeV/cm/c. \param data PreparedBrokenLineData.
\param line_results struct to be filled with the results in this form: -par parameter of the line in this form: (cot(theta), Zip); \n -cov covariance matrix of the fitted parameter; \n -chi2 value of the cost function in the minimum. - \details The function implements the steps 2 and 3 of the Broken Line fit without the curvature correction.\n - The step 2 is the least square fit, done by imposing the minimum constraint on the cost function and solving the consequent linear system. It determines the fitted parameters u and their covariance matrix. - The step 3 is the correction of the fast pre-fitted parameters for the innermost part of the track. It is first done in a comfortable coordinate system (the one in which the first hit is the origin) and then the parameters and their covariance matrix are transformed to the original coordinate system. - */ - template - __host__ __device__ inline void BL_Line_fit(const M6xN& hits_ge, - const V4& fast_fit, - const double B, - const PreparedBrokenLineData& data, - Rfit::line_fit& line_results) { - constexpr u_int n = N; - u_int i; - + \details The function implements the steps 2 and 3 of the Broken Line fit without + * the curvature correction.\n + * The step 2 is the least square fit, done by imposing the minimum constraint + * on the cost function and solving the consequent linear system. It determines + * the fitted parameters u and their covariance matrix. + * The step 3 is the correction of the fast pre-fitted parameters for the innermost + * part of the track. It is first done in a comfortable coordinate system (the one + * in which the first hit is the origin) and then the parameters and their covariance + * matrix are transformed to the original coordinate system. + */ + template + __host__ __device__ inline void lineFit(const M6xN& hits_ge, + const V4& fast_fit, + const double bField, + const PreparedBrokenLineData& data, + riemannFit::LineFit& line_results) { const auto& radii = data.radii; - const auto& S = data.S; - const auto& Z = data.Z; - const auto& VarBeta = data.VarBeta; - - const double slope = -data.q / fast_fit(3); - Rfit::Matrix2d R = RotationMatrix(slope); - - Rfit::Matrix3d V = Rfit::Matrix3d::Zero(); // covariance matrix XYZ - Rfit::Matrix2x3d JacobXYZtosZ = Rfit::Matrix2x3d::Zero(); // jacobian for computation of the error on s (xyz -> sz) - Rfit::VectorNd w = Rfit::VectorNd::Zero(); - for (i = 0; i < n; i++) { - V(0, 0) = hits_ge.col(i)[0]; // x errors - V(0, 1) = V(1, 0) = hits_ge.col(i)[1]; // cov_xy - V(0, 2) = V(2, 0) = hits_ge.col(i)[3]; // cov_xz - V(1, 1) = hits_ge.col(i)[2]; // y errors - V(2, 1) = V(1, 2) = hits_ge.col(i)[4]; // cov_yz - V(2, 2) = hits_ge.col(i)[5]; // z errors + const auto& sTotal = data.sTotal; + const auto& zInSZplane = data.zInSZplane; + const auto& varBeta = data.varBeta; + + const double slope = -data.qCharge / fast_fit(3); + riemannFit::Matrix2d rotMat = rotationMatrix(slope); + + riemannFit::Matrix3d vMat = riemannFit::Matrix3d::Zero(); // covariance matrix XYZ + riemannFit::Matrix2x3d jacobXYZtosZ = + riemannFit::Matrix2x3d::Zero(); // jacobian for computation of the error on s (xyz -> sz) + riemannFit::VectorNd weights = riemannFit::VectorNd::Zero(); + for (u_int i = 0; i < n; i++) { + vMat(0, 0) = hits_ge.col(i)[0]; // x errors + vMat(0, 1) = vMat(1, 0) = hits_ge.col(i)[1]; // cov_xy + vMat(0, 2) = vMat(2, 0) = hits_ge.col(i)[3]; // cov_xz + vMat(1, 1) = hits_ge.col(i)[2]; // y errors + vMat(2, 1) = vMat(1, 2) = hits_ge.col(i)[4]; // cov_yz + vMat(2, 2) = hits_ge.col(i)[5]; // z errors 
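+          // The 2x3 jacobian filled below maps the xyz covariance into the (s,z) plane:
+          // the s direction is taken orthogonal to the radius (hence the normalized
+          // (radii(1, i), -radii(0, i)) entries) while z passes through unchanged;
+          // rotMat then rotates the frame by the pre-fitted slope, so that entry (1,1)
+          // of the propagated covariance is the variance orthogonal to the track.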
auto tmp = 1. / radii.block(0, i, 2, 1).norm(); - JacobXYZtosZ(0, 0) = radii(1, i) * tmp; - JacobXYZtosZ(0, 1) = -radii(0, i) * tmp; - JacobXYZtosZ(1, 2) = 1.; - w(i) = 1. / ((R * JacobXYZtosZ * V * JacobXYZtosZ.transpose() * R.transpose())( - 1, 1)); // compute the orthogonal weight point by point + jacobXYZtosZ(0, 0) = radii(1, i) * tmp; + jacobXYZtosZ(0, 1) = -radii(0, i) * tmp; + jacobXYZtosZ(1, 2) = 1.; + weights(i) = 1. / ((rotMat * jacobXYZtosZ * vMat * jacobXYZtosZ.transpose() * rotMat.transpose())( + 1, 1)); // compute the orthogonal weight point by point } - Rfit::VectorNd r_u; - for (i = 0; i < n; i++) { - r_u(i) = w(i) * Z(i); + riemannFit::VectorNd r_u; + for (u_int i = 0; i < n; i++) { + r_u(i) = weights(i) * zInSZplane(i); } #ifdef CPP_DUMP - std::cout << "CU4\n" << MatrixC_u(w, S, VarBeta) << std::endl; + std::cout << "CU4\n" << matrixC_u(w, sTotal, varBeta) << std::endl; #endif - Rfit::MatrixNd I; - math::cholesky::invert(MatrixC_u(w, S, VarBeta), I); - // Rfit::MatrixNd I=MatrixC_u(w,S,VarBeta).inverse(); + riemannFit::MatrixNd iMat; + math::cholesky::invert(matrixC_u(weights, sTotal, varBeta), iMat); #ifdef CPP_DUMP - std::cout << "I4\n" << I << std::endl; + std::cout << "I4\n" << iMat << std::endl; #endif - Rfit::VectorNd u = I * r_u; // obtain the fitted parameters by solving the linear system + riemannFit::VectorNd uVec = iMat * r_u; // obtain the fitted parameters by solving the linear system // line parameters in the system in which the first hit is the origin and with axis along SZ - line_results.par << (u(1) - u(0)) / (S(1) - S(0)), u(0); - auto idiff = 1. / (S(1) - S(0)); - line_results.cov << (I(0, 0) - 2 * I(0, 1) + I(1, 1)) * Rfit::sqr(idiff) + - MultScatt(S(1) - S(0), B, fast_fit(2), 2, slope), - (I(0, 1) - I(0, 0)) * idiff, (I(0, 1) - I(0, 0)) * idiff, I(0, 0); + line_results.par << (uVec(1) - uVec(0)) / (sTotal(1) - sTotal(0)), uVec(0); + auto idiff = 1. / (sTotal(1) - sTotal(0)); + line_results.cov << (iMat(0, 0) - 2 * iMat(0, 1) + iMat(1, 1)) * riemannFit::sqr(idiff) + + multScatt(sTotal(1) - sTotal(0), bField, fast_fit(2), 2, slope), + (iMat(0, 1) - iMat(0, 0)) * idiff, (iMat(0, 1) - iMat(0, 0)) * idiff, iMat(0, 0); // translate to the original SZ system - Rfit::Matrix2d jacobian; + riemannFit::Matrix2d jacobian; jacobian(0, 0) = 1.; jacobian(0, 1) = 0; - jacobian(1, 0) = -S(0); + jacobian(1, 0) = -sTotal(0); jacobian(1, 1) = 1.; - line_results.par(1) += -line_results.par(0) * S(0); + line_results.par(1) += -line_results.par(0) * sTotal(0); line_results.cov = jacobian * line_results.cov * jacobian.transpose(); // rotate to the original sz system - auto tmp = R(0, 0) - line_results.par(0) * R(0, 1); + auto tmp = rotMat(0, 0) - line_results.par(0) * rotMat(0, 1); jacobian(1, 1) = 1. 
/ tmp; jacobian(0, 0) = jacobian(1, 1) * jacobian(1, 1); jacobian(0, 1) = 0; - jacobian(1, 0) = line_results.par(1) * R(0, 1) * jacobian(0, 0); + jacobian(1, 0) = line_results.par(1) * rotMat(0, 1) * jacobian(0, 0); line_results.par(1) = line_results.par(1) * jacobian(1, 1); - line_results.par(0) = (R(0, 1) + line_results.par(0) * R(0, 0)) * jacobian(1, 1); + line_results.par(0) = (rotMat(0, 1) + line_results.par(0) * rotMat(0, 0)) * jacobian(1, 1); line_results.cov = jacobian * line_results.cov * jacobian.transpose(); // compute chi2 line_results.chi2 = 0; - for (i = 0; i < n; i++) { - line_results.chi2 += w(i) * Rfit::sqr(Z(i) - u(i)); + for (u_int i = 0; i < n; i++) { + line_results.chi2 += weights(i) * riemannFit::sqr(zInSZplane(i) - uVec(i)); if (i > 0 && i < n - 1) - line_results.chi2 += Rfit::sqr(u(i - 1) / (S(i) - S(i - 1)) - - u(i) * (S(i + 1) - S(i - 1)) / ((S(i + 1) - S(i)) * (S(i) - S(i - 1))) + - u(i + 1) / (S(i + 1) - S(i))) / - VarBeta(i); + line_results.chi2 += riemannFit::sqr(uVec(i - 1) / (sTotal(i) - sTotal(i - 1)) - + uVec(i) * (sTotal(i + 1) - sTotal(i - 1)) / + ((sTotal(i + 1) - sTotal(i)) * (sTotal(i) - sTotal(i - 1))) + + uVec(i + 1) / (sTotal(i + 1) - sTotal(i))) / + varBeta(i); } - - // assert(line_results.chi2>=0); } /*! @@ -519,7 +559,7 @@ namespace BrokenLine { |(x1,z2)|(x2,z2)|(x3,z2)|(x4,z2)|.|(y1,z2)|(y2,z2)|(y3,z2)|(y4,z2)|.|(z1,z2)|(z2,z2)|(z3,z2)|(z4,z2)| \n |(x1,z3)|(x2,z3)|(x3,z3)|(x4,z3)|.|(y1,z3)|(y2,z3)|(y3,z3)|(y4,z3)|.|(z1,z3)|(z2,z3)|(z3,z3)|(z4,z3)| \n |(x1,z4)|(x2,z4)|(x3,z4)|(x4,z4)|.|(y1,z4)|(y2,z4)|(y3,z4)|(y4,z4)|.|(z1,z4)|(z2,z4)|(z3,z4)|(z4,z4)| - \param B magnetic field in the center of the detector in Gev/cm/c, in order to perform the p_t calculation. + \param bField magnetic field in the center of the detector in GeV/cm/c, in order to perform the p_t calculation. \warning see circleFit(), lineFit() and fastFit() warnings. @@ -527,39 +567,40 @@ \return (phi,Tip,p_t,cot(theta),Zip), their covariance matrix and the chi2's of the circle and line fits.
*/ - template - inline Rfit::helix_fit BL_Helix_fit(const Rfit::Matrix3xNd& hits, - const Eigen::Matrix& hits_ge, - const double B) { - Rfit::helix_fit helix; - Rfit::Vector4d fast_fit; - BL_Fast_fit(hits, fast_fit); - - PreparedBrokenLineData data; + template + inline riemannFit::HelixFit helixFit(const riemannFit::Matrix3xNd& hits, + const Eigen::Matrix& hits_ge, + const double bField) { + riemannFit::HelixFit helix; + riemannFit::Vector4d fast_fit; + fastFit(hits, fast_fit); + + PreparedBrokenLineData data; karimaki_circle_fit circle; - Rfit::line_fit line; - Rfit::Matrix3d jacobian; + riemannFit::LineFit line; + riemannFit::Matrix3d jacobian; - prepareBrokenLineData(hits, fast_fit, B, data); - BL_Line_fit(hits_ge, fast_fit, B, data, line); - BL_Circle_fit(hits, hits_ge, fast_fit, B, data, circle); + prepareBrokenLineData(hits, fast_fit, bField, data); + lineFit(hits_ge, fast_fit, bField, data, line); + circleFit(hits, hits_ge, fast_fit, bField, data, circle); // the circle fit gives k, but here we want p_t, so let's change the parameter and the covariance matrix - jacobian << 1., 0, 0, 0, 1., 0, 0, 0, -std::abs(circle.par(2)) * B / (Rfit::sqr(circle.par(2)) * circle.par(2)); - circle.par(2) = B / std::abs(circle.par(2)); + jacobian << 1., 0, 0, 0, 1., 0, 0, 0, + -std::abs(circle.par(2)) * bField / (riemannFit::sqr(circle.par(2)) * circle.par(2)); + circle.par(2) = bField / std::abs(circle.par(2)); circle.cov = jacobian * circle.cov * jacobian.transpose(); helix.par << circle.par, line.par; - helix.cov = Rfit::MatrixXd::Zero(5, 5); + helix.cov = riemannFit::MatrixXd::Zero(5, 5); helix.cov.block(0, 0, 3, 3) = circle.cov; helix.cov.block(3, 3, 2, 2) = line.cov; - helix.q = circle.q; + helix.qCharge = circle.qCharge; helix.chi2_circle = circle.chi2; helix.chi2_line = line.chi2; return helix; } -} // namespace BrokenLine +} // namespace brokenline #endif // RecoPixelVertexing_PixelTrackFitting_interface_BrokenLine_h diff --git a/src/cudadev/plugin-PixelTriplets/BrokenLineFitOnGPU.cc b/src/cudadev/plugin-PixelTriplets/BrokenLineFitOnGPU.cc index cc5865d97..bebfe0e08 100644 --- a/src/cudadev/plugin-PixelTriplets/BrokenLineFitOnGPU.cc +++ b/src/cudadev/plugin-PixelTriplets/BrokenLineFitOnGPU.cc @@ -1,67 +1,69 @@ #include "BrokenLineFitOnGPU.h" void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitsView const* hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples) { - assert(tuples_d); + assert(tuples_); // Fit internals - auto hitsGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); - auto hits_geGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); + auto hitsGPU_ = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double)); + auto hits_geGPU_ = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float)); auto fast_fit_resultsGPU_ = - std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double)); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // fit triplets - kernelBLFastFit<3>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); + kernel_BLFastFit<3>( + tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); - kernelBLFit<3>(tupleMultiplicity_d, - bField_, - 
outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 3, - offset); + kernel_BLFit<3>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 3, + offset); // fit quads - kernelBLFastFit<4>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); + kernel_BLFastFit<4>( + tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); - kernelBLFit<4>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 4, - offset); + kernel_BLFit<4>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 4, + offset); if (fit5as4_) { // fit penta (only first 4) - kernelBLFastFit<4>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + kernel_BLFastFit<4>( + tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); - kernelBLFit<4>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); + kernel_BLFit<4>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); } else { // fit penta (all 5) - kernelBLFastFit<5>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + kernel_BLFastFit<5>( + tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); - kernelBLFit<5>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); + kernel_BLFit<5>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); } } // loop on concurrent fits diff --git a/src/cudadev/plugin-PixelTriplets/BrokenLineFitOnGPU.cu b/src/cudadev/plugin-PixelTriplets/BrokenLineFitOnGPU.cu index c1ba97c29..deadb1a3c 100644 --- a/src/cudadev/plugin-PixelTriplets/BrokenLineFitOnGPU.cu +++ b/src/cudadev/plugin-PixelTriplets/BrokenLineFitOnGPU.cu @@ -5,79 +5,79 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples, cudaStream_t stream) { - assert(tuples_d); + assert(tuples_); auto blockSize = 64; auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; // Fit internals auto hitsGPU_ = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double), stream); auto hits_geGPU_ = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float), stream); auto fast_fit_resultsGPU_ = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), stream); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // fit triplets - kernelBLFastFit<3><<>>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 
3, offset); + kernel_BLFastFit<3><<>>( + tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<3><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 3, - offset); + kernel_BLFit<3><<>>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 3, + offset); cudaCheck(cudaGetLastError()); // fit quads - kernelBLFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); + kernel_BLFastFit<4><<>>( + tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<4><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 4, - offset); + kernel_BLFit<4><<>>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 4, + offset); cudaCheck(cudaGetLastError()); if (fit5as4_) { // fit penta (only first 4) - kernelBLFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + kernel_BLFastFit<4><<>>( + tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<4><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); + kernel_BLFit<4><<>>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); cudaCheck(cudaGetLastError()); } else { // fit penta (all 5) - kernelBLFastFit<5><<>>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + kernel_BLFastFit<5><<>>( + tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); cudaCheck(cudaGetLastError()); - kernelBLFit<5><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); + kernel_BLFit<5><<>>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); cudaCheck(cudaGetLastError()); } diff --git a/src/cudadev/plugin-PixelTriplets/BrokenLineFitOnGPU.h b/src/cudadev/plugin-PixelTriplets/BrokenLineFitOnGPU.h index a30c251b7..67831af89 100644 --- a/src/cudadev/plugin-PixelTriplets/BrokenLineFitOnGPU.h +++ b/src/cudadev/plugin-PixelTriplets/BrokenLineFitOnGPU.h @@ -8,7 +8,7 @@ #include -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" #include "CUDACore/cudaCheck.h" #include "CUDACore/cuda_assert.h" #include "CondFormats/pixelCPEforGPU.h" @@ -23,14 +23,14 @@ using OutputSoA = pixelTrack::TrackSoA; // #define BL_DUMP_HITS template -__global__ void kernelBLFastFit(Tuples const *__restrict__ foundNtuplets, - CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, - HitsOnGPU const *__restrict__ hhp, - double *__restrict__ phits, - float *__restrict__ phits_ge, - double *__restrict__ pfast_fit, - uint32_t nHits, - uint32_t offset) { +__global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, + 
caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + HitsOnGPU const *__restrict__ hhp, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit, + uint32_t nHits, + uint32_t offset) { constexpr uint32_t hitsInFit = N; assert(hitsInFit <= nHits); @@ -45,12 +45,12 @@ __global__ void kernelBLFastFit(Tuples const *__restrict__ foundNtuplets, #ifdef BROKENLINE_DEBUG if (0 == local_start) { - printf("%d total Ntuple\n", foundNtuplets->nbins()); + printf("%d total Ntuple\n", foundNtuplets->nOnes()); printf("%d Ntuple of size %d for %d hits to fit\n", tupleMultiplicity->size(nHits), nHits, hitsInFit); } #endif - for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; + for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt; local_idx += gridDim.x * blockDim.x) { auto tuple_idx = local_idx + offset; if (tuple_idx >= tupleMultiplicity->size(nHits)) @@ -58,13 +58,13 @@ __global__ void kernelBLFastFit(Tuples const *__restrict__ foundNtuplets, // get it from the ntuple container (one to one to helix) auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); - assert(tkid < foundNtuplets->nbins()); + assert(tkid < foundNtuplets->nOnes()); assert(foundNtuplets->size(tkid) == nHits); - Rfit::Map3xNd hits(phits + local_idx); - Rfit::Map4d fast_fit(pfast_fit + local_idx); - Rfit::Map6xNf hits_ge(phits_ge + local_idx); + riemannFit::Map3xNd hits(phits + local_idx); + riemannFit::Map4d fast_fit(pfast_fit + local_idx); + riemannFit::Map6xNf hits_ge(phits_ge + local_idx); #ifdef BL_DUMP_HITS __shared__ int done; @@ -105,7 +105,7 @@ __global__ void kernelBLFastFit(Tuples const *__restrict__ foundNtuplets, hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; } - BrokenLine::BL_Fast_fit(hits, fast_fit); + brokenline::fastFit(hits, fast_fit); // no NaN here.... 
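      // a NaN is the only value that compares unequal to itself, so the
      // self-comparison below doubles as a device-friendly isnan() check on the pre-fit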
assert(fast_fit(0) == fast_fit(0)); @@ -116,14 +116,14 @@ __global__ void kernelBLFastFit(Tuples const *__restrict__ foundNtuplets, } template -__global__ void kernelBLFit(CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, - double B, - OutputSoA *results, - double *__restrict__ phits, - float *__restrict__ phits_ge, - double *__restrict__ pfast_fit, - uint32_t nHits, - uint32_t offset) { +__global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + double bField, + OutputSoA *results, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit, + uint32_t nHits, + uint32_t offset) { assert(N <= nHits); assert(results); @@ -133,7 +133,7 @@ __global__ void kernelBLFit(CAConstants::TupleMultiplicity const *__restrict__ t // look in bin for this hit multiplicity auto local_start = blockIdx.x * blockDim.x + threadIdx.x; - for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; + for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt; local_idx += gridDim.x * blockDim.x) { auto tuple_idx = local_idx + offset; if (tuple_idx >= tupleMultiplicity->size(nHits)) @@ -142,22 +142,21 @@ __global__ void kernelBLFit(CAConstants::TupleMultiplicity const *__restrict__ t // get it for the ntuple container (one to one to helix) auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); - Rfit::Map3xNd hits(phits + local_idx); - Rfit::Map4d fast_fit(pfast_fit + local_idx); - Rfit::Map6xNf hits_ge(phits_ge + local_idx); + riemannFit::Map3xNd hits(phits + local_idx); + riemannFit::Map4d fast_fit(pfast_fit + local_idx); + riemannFit::Map6xNf hits_ge(phits_ge + local_idx); - BrokenLine::PreparedBrokenLineData data; - Rfit::Matrix3d Jacob; + brokenline::PreparedBrokenLineData data; - BrokenLine::karimaki_circle_fit circle; - Rfit::line_fit line; + brokenline::karimaki_circle_fit circle; + riemannFit::LineFit line; - BrokenLine::prepareBrokenLineData(hits, fast_fit, B, data); - BrokenLine::BL_Line_fit(hits_ge, fast_fit, B, data, line); - BrokenLine::BL_Circle_fit(hits, hits_ge, fast_fit, B, data, circle); + brokenline::prepareBrokenLineData(hits, fast_fit, bField, data); + brokenline::lineFit(hits_ge, fast_fit, bField, data, line); + brokenline::circleFit(hits, hits_ge, fast_fit, bField, data, circle); - results->stateAtBS.copyFromCircle(circle.par, circle.cov, line.par, line.cov, 1.f / float(B), tkid); - results->pt(tkid) = float(B) / float(std::abs(circle.par(2))); + results->stateAtBS.copyFromCircle(circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid); + results->pt(tkid) = float(bField) / float(std::abs(circle.par(2))); results->eta(tkid) = asinhf(line.par(0)); results->chi2(tkid) = (circle.chi2 + line.chi2) / (2 * N - 5); diff --git a/src/cudadev/plugin-PixelTriplets/CAConstants.h b/src/cudadev/plugin-PixelTriplets/CAConstants.h index b063d0f6e..010ffadfd 100644 --- a/src/cudadev/plugin-PixelTriplets/CAConstants.h +++ b/src/cudadev/plugin-PixelTriplets/CAConstants.h @@ -10,60 +10,73 @@ #include "CUDACore/VecArray.h" #include "CUDADataFormats/gpuClusteringConstants.h" -// #define ONLY_PHICUT +//#define ONLY_PHICUT -namespace CAConstants { +// Cellular automaton constants +namespace caConstants { // constants -#ifndef ONLY_PHICUT +#ifdef ONLY_PHICUT + constexpr uint32_t maxCellNeighbors = 64; + constexpr uint32_t maxCellTracks = 64; + constexpr uint32_t maxNumberOfTuples = 48 * 1024; + constexpr uint32_t maxNumberOfDoublets = 2 * 
1024 * 1024; + constexpr uint32_t maxCellsPerHit = 8 * 128; +#else // ONLY_PHICUT + constexpr uint32_t maxCellNeighbors = 36; + constexpr uint32_t maxCellTracks = 48; #ifdef GPU_SMALL_EVENTS - constexpr uint32_t maxNumberOfTuples() { return 3 * 1024; } -#else - constexpr uint32_t maxNumberOfTuples() { return 24 * 1024; } -#endif -#else - constexpr uint32_t maxNumberOfTuples() { return 48 * 1024; } -#endif - constexpr uint32_t maxNumberOfQuadruplets() { return maxNumberOfTuples(); } -#ifndef ONLY_PHICUT -#ifndef GPU_SMALL_EVENTS - constexpr uint32_t maxNumberOfDoublets() { return 512 * 1024; } - constexpr uint32_t maxCellsPerHit() { return 128; } -#else - constexpr uint32_t maxNumberOfDoublets() { return 128 * 1024; } - constexpr uint32_t maxCellsPerHit() { return 128 / 2; } -#endif -#else - constexpr uint32_t maxNumberOfDoublets() { return 2 * 1024 * 1024; } - constexpr uint32_t maxCellsPerHit() { return 8 * 128; } -#endif - constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets() / 8; } + // kept for testing and debugging + constexpr uint32_t maxNumberOfTuples = 3 * 1024; + constexpr uint32_t maxNumberOfDoublets = 128 * 1024; + constexpr uint32_t maxCellsPerHit = 128 / 2; +#else // GPU_SMALL_EVENTS + // tested on MC events with 55-75 pileup events + constexpr uint32_t maxNumberOfTuples = 24 * 1024; + constexpr uint32_t maxNumberOfDoublets = 512 * 1024; + constexpr uint32_t maxCellsPerHit = 128; +#endif // GPU_SMALL_EVENTS +#endif // ONLY_PHICUT + constexpr uint32_t maxNumOfActiveDoublets = maxNumberOfDoublets / 8; + constexpr uint32_t maxNumberOfQuadruplets = maxNumberOfTuples; - constexpr uint32_t maxNumberOfLayerPairs() { return 20; } - constexpr uint32_t maxNumberOfLayers() { return 10; } - constexpr uint32_t maxTuples() { return maxNumberOfTuples(); } + constexpr uint32_t maxNumberOfLayerPairs = 20; + constexpr uint32_t maxNumberOfLayers = 10; + constexpr uint32_t maxTuples = maxNumberOfTuples; + + // Modules constants + constexpr uint32_t max_ladder_bpx0 = 12; + constexpr uint32_t first_ladder_bpx0 = 0; + constexpr float module_length_bpx0 = 6.7f; + constexpr float module_tolerance_bpx0 = 0.4f; // projection to cylinder is inaccurate on BPIX1 + constexpr uint32_t max_ladder_bpx4 = 64; + constexpr uint32_t first_ladder_bpx4 = 84; + constexpr float radius_even_ladder = 15.815f; + constexpr float radius_odd_ladder = 16.146f; + constexpr float module_length_bpx4 = 6.7f; + constexpr float module_tolerance_bpx4 = 0.2f; + constexpr float barrel_z_length = 26.f; + constexpr float forward_z_begin = 32.f; + + // Last indexes + constexpr uint32_t last_bpix1_detIndex = 96; + constexpr uint32_t last_barrel_detIndex = 1184; // types - using hindex_type = uint16_t; // FIXME from siPixelRecHitsHeterogeneousProduct - using tindex_type = uint16_t; // for tuples + using hindex_type = uint32_t; // FIXME from siPixelRecHitsHeterogeneousProduct + using tindex_type = uint16_t; // for tuples -#ifndef ONLY_PHICUT - using CellNeighbors = cms::cuda::VecArray; - using CellTracks = cms::cuda::VecArray; -#else - using CellNeighbors = cms::cuda::VecArray; - using CellTracks = cms::cuda::VecArray; -#endif + using CellNeighbors = cms::cuda::VecArray; + using CellTracks = cms::cuda::VecArray; using CellNeighborsVector = cms::cuda::SimpleVector; using CellTracksVector = cms::cuda::SimpleVector; - using OuterHitOfCell = cms::cuda::VecArray; - using TuplesContainer = cms::cuda::OneToManyAssoc; - using HitToTuple = - cms::cuda::OneToManyAssoc; // 3.5 should be enough - using TupleMultiplicity = 
cms::cuda::OneToManyAssoc; + using OuterHitOfCell = cms::cuda::VecArray; + using TuplesContainer = cms::cuda::OneToManyAssoc; + using HitToTuple = cms::cuda::OneToManyAssoc; // 3.5 should be enough + using TupleMultiplicity = cms::cuda::OneToManyAssoc; -} // namespace CAConstants +} // namespace caConstants #endif // RecoPixelVertexing_PixelTriplets_plugins_CAConstants_h diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc b/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc index 57baea007..94085d784 100644 --- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc +++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc @@ -10,7 +10,7 @@ #include "CAHitNtupletGeneratorOnGPU.h" #include "CUDADataFormats/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" class CAHitNtupletCUDA : public edm::EDProducer { public: diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc index 4ea687af3..f2805d018 100644 --- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc +++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc @@ -18,19 +18,21 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; #endif - // in principle we can use "nhits" to heuristically dimension the workspace... - // overkill to use template here (std::make_unique would suffice) - // device_isOuterHitOfCell_ = Traits:: template make_unique(cs, std::max(1U,nhits), stream); - device_isOuterHitOfCell_.reset( - (GPUCACell::OuterHitOfCell *)malloc(std::max(1U, nhits) * sizeof(GPUCACell::OuterHitOfCell))); + // use "nhits" to heuristically dimension the workspace + + // no need to use the Traits allocations, since we know this is being compiled for the CPU + //device_isOuterHitOfCell_ = Traits::template make_unique(std::max(1U, nhits), stream); + device_isOuterHitOfCell_ = std::make_unique(std::max(1U, nhits)); assert(device_isOuterHitOfCell_.get()); - cellStorage_.reset((unsigned char *)malloc(CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) + - CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks))); + auto cellStorageSize = caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) + + caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks); + // no need to use the Traits allocations, since we know this is being compiled for the CPU + //cellStorage_ = Traits::template make_unique(cellStorageSize, stream); + cellStorage_ = std::make_unique(cellStorageSize); device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); - device_theCellTracksContainer_ = - (GPUCACell::CellTracks *)(cellStorage_.get() + - CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors)); + device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets * + sizeof(GPUCACell::CellNeighbors)); gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(), nhits, @@ -39,17 +41,21 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr device_theCellTracks_.get(), device_theCellTracksContainer_); - // device_theCells_ = Traits:: template make_unique(cs, m_params.maxNumberOfDoublets_, stream); - device_theCells_.reset((GPUCACell *)malloc(sizeof(GPUCACell) * 
m_params.maxNumberOfDoublets_)); + // no need to use the Traits allocations, since we know this is being compiled for the CPU + //device_theCells_ = Traits::template make_unique(params_.maxNumberOfDoublets_, stream); + device_theCells_ = std::make_unique(params_.maxNumberOfDoublets_); if (0 == nhits) return; // protect against empty events - // FIXME avoid magic numbers + // take all layer pairs into account auto nActualPairs = gpuPixelDoublets::nPairs; - if (!m_params.includeJumpingForwardDoublets_) - nActualPairs = 15; - if (m_params.minHitsPerNtuplet_ > 3) { - nActualPairs = 13; + if (not params_.includeJumpingForwardDoublets_) { + // exclude forward "jumping" layer pairs + nActualPairs = gpuPixelDoublets::nPairsForTriplets; + } + if (params_.minHitsPerNtuplet_ > 3) { + // for quadruplets, exclude all "jumping" layer pairs + nActualPairs = gpuPixelDoublets::nPairsForQuadruplets; } assert(nActualPairs <= gpuPixelDoublets::nPairs); @@ -60,17 +66,17 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr hh.view(), device_isOuterHitOfCell_.get(), nActualPairs, - m_params.idealConditions_, - m_params.doClusterCut_, - m_params.doZ0Cut_, - m_params.doPtCut_, - m_params.maxNumberOfDoublets_); + params_.idealConditions_, + params_.doClusterCut_, + params_.doZ0Cut_, + params_.doPtCut_, + params_.maxNumberOfDoublets_); } template <> void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { auto *tuples_d = &tracks_d->hitIndices; - auto *quality_d = (Quality *)(&tracks_d->m_quality); + auto *quality_d = tracks_d->qualityData(); assert(tuples_d && quality_d); @@ -78,7 +84,6 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * cms::cuda::launchZero(tuples_d, cudaStream); auto nhits = hh.nHits(); - assert(nhits <= pixelGPUConstants::maxNumberOfHits); // std::cout << "N hits " << nhits << std::endl; // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; @@ -94,14 +99,14 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_nCells_, device_theCellNeighbors_.get(), device_isOuterHitOfCell_.get(), - m_params.hardCurvCut_, - m_params.ptmin_, - m_params.CAThetaCutBarrel_, - m_params.CAThetaCutForward_, - m_params.dcaCutInnerTriplet_, - m_params.dcaCutOuterTriplet_); - - if (nhits > 1 && m_params.earlyFishbone_) { + params_.hardCurvCut_, + params_.ptmin_, + params_.CAThetaCutBarrel_, + params_.CAThetaCutForward_, + params_.dcaCutInnerTriplet_, + params_.dcaCutOuterTriplet_); + + if (nhits > 1 && params_.earlyFishbone_) { gpuPixelDoublets::fishbone( hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); } @@ -113,8 +118,8 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * tuples_d, device_hitTuple_apc_, quality_d, - m_params.minHitsPerNtuplet_); - if (m_params.doStats_) + params_.minHitsPerNtuplet_); + if (params_.doStats_) kernel_mark_used(hh.view(), device_theCells_.get(), device_nCells_); cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d); @@ -126,14 +131,15 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); - if (nhits > 1 && m_params.lateFishbone_) { + if (nhits > 1 && params_.lateFishbone_) { gpuPixelDoublets::fishbone( hh.view(), device_theCells_.get(), device_nCells_, 
device_isOuterHitOfCell_.get(), nhits, true); } - if (m_params.doStats_) { + if (params_.doStats_) { kernel_checkOverflows(tuples_d, device_tupleMultiplicity_.get(), + device_hitToTuple_.get(), device_hitTuple_apc_, device_theCells_.get(), device_nCells_, @@ -141,7 +147,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_theCellTracks_.get(), device_isOuterHitOfCell_.get(), nhits, - m_params.maxNumberOfDoublets_, + params_.maxNumberOfDoublets_, counters_); } } @@ -149,12 +155,12 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * template <> void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = (Quality *)(&tracks_d->m_quality); + auto *quality_d = tracks_d->qualityData(); // classify tracks based on kinematics - kernel_classifyTracks(tuples_d, tracks_d, m_params.cuts_, quality_d); + kernel_classifyTracks(tuples_d, tracks_d, params_.cuts_, quality_d); - if (m_params.lateFishbone_) { + if (params_.lateFishbone_) { // apply fishbone cleaning to good tracks kernel_fishboneCleaner(device_theCells_.get(), device_nCells_, quality_d); } @@ -163,14 +169,19 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, tracks_d); // fill hit->track "map" - kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); - cms::cuda::launchFinalize(device_hitToTuple_.get(), cudaStream); - kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); + if (params_.doSharedHitCut_ || params_.doStats_) { + kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); + cms::cuda::launchFinalize(hitToTupleView_, cudaStream); + kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); + } // remove duplicates (tracks that share a hit) - kernel_tripletCleaner(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); + if (params_.doSharedHitCut_) { + kernel_sharedHitCleaner( + hh.view(), tuples_d, tracks_d, quality_d, params_.minHitsForSharingCut_, device_hitToTuple_.get()); + } - if (m_params.doStats_) { + if (params_.doStats_) { // counters (add flag???) kernel_doStatsForHitInTracks(device_hitToTuple_.get(), counters_); kernel_doStatsForTracks(tuples_d, quality_d, counters_); diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu index 08474c1af..edc1eb49b 100644 --- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu +++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu @@ -3,7 +3,7 @@ template <> void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream) { auto blockSize = 128; - auto numberOfBlocks = (HitContainer::capacity() + blockSize - 1) / blockSize; + auto numberOfBlocks = (HitContainer::ctCapacity() + blockSize - 1) / blockSize; kernel_fillHitDetIndices<<>>( &tracks_d->hitIndices, hv, &tracks_d->detIndices); @@ -18,16 +18,18 @@ template <> void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { // these are pointer on GPU! 
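  // tracks_d itself holds a device address, so taking &tracks_d->hitIndices below is
  // plain pointer arithmetic on the host that yields another device address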
auto *tuples_d = &tracks_d->hitIndices; - auto *quality_d = (Quality *)(&tracks_d->m_quality); + auto *quality_d = tracks_d->qualityData(); // zero tuples cms::cuda::launchZero(tuples_d, cudaStream); auto nhits = hh.nHits(); - assert(nhits <= pixelGPUConstants::maxNumberOfHits); - // std::cout << "N hits " << nhits << std::endl; - // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; +#ifdef NTUPLE_DEBUG + std::cout << "start tuple building. N hits " << nhits << std::endl; + if (nhits < 2) + std::cout << "too few hits " << nhits << std::endl; +#endif // // applying conbinatoric cleaning such as fishbone at this stage is too expensive @@ -36,10 +38,10 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * auto nthTot = 64; auto stride = 4; auto blockSize = nthTot / stride; - auto numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + auto numberOfBlocks = nDoubletBlocks(blockSize); auto rescale = numberOfBlocks / 65536; blockSize *= (rescale + 1); - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + numberOfBlocks = nDoubletBlocks(blockSize); assert(numberOfBlocks < 65536); assert(blockSize > 0 && 0 == blockSize % 16); dim3 blks(1, numberOfBlocks, 1); @@ -53,15 +55,15 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_nCells_, device_theCellNeighbors_.get(), device_isOuterHitOfCell_.get(), - m_params.hardCurvCut_, - m_params.ptmin_, - m_params.CAThetaCutBarrel_, - m_params.CAThetaCutForward_, - m_params.dcaCutInnerTriplet_, - m_params.dcaCutOuterTriplet_); + params_.hardCurvCut_, + params_.ptmin_, + params_.CAThetaCutBarrel_, + params_.CAThetaCutForward_, + params_.dcaCutInnerTriplet_, + params_.dcaCutOuterTriplet_); cudaCheck(cudaGetLastError()); - if (nhits > 1 && m_params.earlyFishbone_) { + if (nhits > 1 && params_.earlyFishbone_) { auto nthTot = 128; auto stride = 16; auto blockSize = nthTot / stride; @@ -74,7 +76,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * } blockSize = 64; - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + numberOfBlocks = (3 * params_.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; kernel_find_ntuplets<<>>(hh.view(), device_theCells_.get(), device_nCells_, @@ -82,10 +84,10 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * tuples_d, device_hitTuple_apc_, quality_d, - m_params.minHitsPerNtuplet_); + params_.minHitsPerNtuplet_); cudaCheck(cudaGetLastError()); - if (m_params.doStats_) + if (params_.doStats_) kernel_mark_used<<>>(hh.view(), device_theCells_.get(), device_nCells_); cudaCheck(cudaGetLastError()); @@ -95,17 +97,17 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * #endif blockSize = 128; - numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize; + numberOfBlocks = (HitContainer::ctNOnes() + blockSize - 1) / blockSize; cms::cuda::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); // remove duplicates (tracks that share a doublet) - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + numberOfBlocks = nDoubletBlocks(blockSize); kernel_earlyDuplicateRemover<<>>( device_theCells_.get(), device_nCells_, tuples_d, quality_d); cudaCheck(cudaGetLastError()); blockSize = 128; - numberOfBlocks = (3 * CAConstants::maxTuples() / 4 + blockSize - 1) / blockSize; + numberOfBlocks = (3 * caConstants::maxTuples / 4 + 
blockSize - 1) / blockSize; kernel_countMultiplicity<<>>( tuples_d, quality_d, device_tupleMultiplicity_.get()); cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); @@ -113,7 +115,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * tuples_d, quality_d, device_tupleMultiplicity_.get()); cudaCheck(cudaGetLastError()); - if (nhits > 1 && m_params.lateFishbone_) { + if (nhits > 1 && params_.lateFishbone_) { auto nthTot = 128; auto stride = 16; auto blockSize = nthTot / stride; @@ -125,21 +127,6 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * cudaCheck(cudaGetLastError()); } - if (m_params.doStats_) { - numberOfBlocks = (std::max(nhits, m_params.maxNumberOfDoublets_) + blockSize - 1) / blockSize; - kernel_checkOverflows<<>>(tuples_d, - device_tupleMultiplicity_.get(), - device_hitTuple_apc_, - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_.get(), - device_theCellTracks_.get(), - device_isOuterHitOfCell_.get(), - nhits, - m_params.maxNumberOfDoublets_, - counters_); - cudaCheck(cudaGetLastError()); - } #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); @@ -151,7 +138,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * template <> void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { - auto nhits = hh.nHits(); + int32_t nhits = hh.nHits(); #ifdef NTUPLE_DEBUG std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; @@ -163,22 +150,21 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr #endif // in principle we can use "nhits" to heuristically dimension the workspace... - device_isOuterHitOfCell_ = cms::cuda::make_device_unique(std::max(1U, nhits), stream); + device_isOuterHitOfCell_ = cms::cuda::make_device_unique(std::max(1, nhits), stream); assert(device_isOuterHitOfCell_.get()); cellStorage_ = cms::cuda::make_device_unique( - CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) + - CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks), + caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) + + caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks), stream); device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); - device_theCellTracksContainer_ = - (GPUCACell::CellTracks *)(cellStorage_.get() + - CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors)); + device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets * + sizeof(GPUCACell::CellNeighbors)); { int threadsPerBlock = 128; // at least one block! 
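    // the grid size below is the usual ceiling division (n + threadsPerBlock - 1) / threadsPerBlock;
    // clamping nhits to at least one keeps the launch configuration valid for empty events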
- int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock; + int blocks = (std::max(1, nhits) + threadsPerBlock - 1) / threadsPerBlock; gpuPixelDoublets::initDoublets<<>>(device_isOuterHitOfCell_.get(), nhits, device_theCellNeighbors_.get(), @@ -188,7 +174,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr cudaCheck(cudaGetLastError()); } - device_theCells_ = cms::cuda::make_device_unique(m_params.maxNumberOfDoublets_, stream); + device_theCells_ = cms::cuda::make_device_unique(params_.maxNumberOfDoublets_, stream); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -198,12 +184,15 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr if (0 == nhits) return; // protect against empty events - // FIXME avoid magic numbers + // take all layer pairs into account auto nActualPairs = gpuPixelDoublets::nPairs; - if (!m_params.includeJumpingForwardDoublets_) - nActualPairs = 15; - if (m_params.minHitsPerNtuplet_ > 3) { - nActualPairs = 13; + if (not params_.includeJumpingForwardDoublets_) { + // exclude forward "jumping" layer pairs + nActualPairs = gpuPixelDoublets::nPairsForTriplets; + } + if (params_.minHitsPerNtuplet_ > 3) { + // for quadruplets, exclude all "jumping" layer pairs + nActualPairs = gpuPixelDoublets::nPairsForQuadruplets; } assert(nActualPairs <= gpuPixelDoublets::nPairs); @@ -219,11 +208,11 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr hh.view(), device_isOuterHitOfCell_.get(), nActualPairs, - m_params.idealConditions_, - m_params.doClusterCut_, - m_params.doZ0Cut_, - m_params.doPtCut_, - m_params.maxNumberOfDoublets_); + params_.idealConditions_, + params_.doClusterCut_, + params_.doZ0Cut_, + params_.doPtCut_, + params_.maxNumberOfDoublets_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG @@ -236,54 +225,83 @@ template <> void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { // these are pointer on GPU! 
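  // the cleanup proceeds in stages: kinematic quality cuts, optional fishbone cleaning,
  // doublet-level duplicate removal, then (when enabled) the hit->track map is filled and
  // used to drop tracks that share hits, and finally overflow checks and statistics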
auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = (Quality *)(&tracks_d->m_quality); + auto *quality_d = tracks_d->qualityData(); + + int32_t nhits = hh.nHits(); auto blockSize = 64; // classify tracks based on kinematics - auto numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; - kernel_classifyTracks<<>>(tuples_d, tracks_d, m_params.cuts_, quality_d); + auto numberOfBlocks = nQuadrupletBlocks(blockSize); + kernel_classifyTracks<<>>(tuples_d, tracks_d, params_.cuts_, quality_d); cudaCheck(cudaGetLastError()); - if (m_params.lateFishbone_) { + if (params_.lateFishbone_) { // apply fishbone cleaning to good tracks - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + numberOfBlocks = nDoubletBlocks(blockSize); kernel_fishboneCleaner<<>>( device_theCells_.get(), device_nCells_, quality_d); cudaCheck(cudaGetLastError()); } // remove duplicates (tracks that share a doublet) - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + numberOfBlocks = nDoubletBlocks(blockSize); kernel_fastDuplicateRemover<<>>( device_theCells_.get(), device_nCells_, tuples_d, tracks_d); cudaCheck(cudaGetLastError()); +#ifdef GPU_DEBUG + cudaCheck(cudaDeviceSynchronize()); +#endif - if (m_params.minHitsPerNtuplet_ < 4 || m_params.doStats_) { + if (params_.minHitsPerNtuplet_ < 4 || params_.doStats_) { // fill hit->track "map" - numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; + assert(hitToTupleView_.offSize > nhits); + numberOfBlocks = nQuadrupletBlocks(blockSize); kernel_countHitInTracks<<>>( tuples_d, quality_d, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); - cms::cuda::launchFinalize(device_hitToTuple_.get(), cudaStream); + assert((hitToTupleView_.assoc == device_hitToTuple_.get()) && + (hitToTupleView_.offStorage == device_hitToTupleStorage_.get()) && (hitToTupleView_.offSize > 0)); + cms::cuda::launchFinalize(hitToTupleView_, cudaStream); cudaCheck(cudaGetLastError()); kernel_fillHitInTracks<<>>(tuples_d, quality_d, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); +#ifdef GPU_DEBUG + cudaCheck(cudaDeviceSynchronize()); +#endif } - if (m_params.minHitsPerNtuplet_ < 4) { + + if (params_.doSharedHitCut_) { // remove duplicates (tracks that share a hit) - numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; - kernel_tripletCleaner<<>>( - hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); + numberOfBlocks = (hitToTupleView_.offSize + blockSize - 1) / blockSize; + kernel_sharedHitCleaner<<>>( + hh.view(), tuples_d, tracks_d, quality_d, params_.minHitsForSharingCut_, device_hitToTuple_.get()); + cudaCheck(cudaGetLastError()); + } + + if (params_.doStats_) { + numberOfBlocks = (std::max(nhits, int(params_.maxNumberOfDoublets_)) + blockSize - 1) / blockSize; + kernel_checkOverflows<<>>(tuples_d, + device_tupleMultiplicity_.get(), + device_hitToTuple_.get(), + device_hitTuple_apc_, + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), + device_isOuterHitOfCell_.get(), + nhits, + params_.maxNumberOfDoublets_, + counters_); cudaCheck(cudaGetLastError()); } - if (m_params.doStats_) { + if (params_.doStats_) { // counters (add flag???) 
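    // two monitoring kernels: one walks the hit->tuple map, the other the track list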
- numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; + numberOfBlocks = (hitToTupleView_.offSize + blockSize - 1) / blockSize; kernel_doStatsForHitInTracks<<>>(device_hitToTuple_.get(), counters_); cudaCheck(cudaGetLastError()); - numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; + numberOfBlocks = (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; kernel_doStatsForTracks<<>>(tuples_d, quality_d, counters_); cudaCheck(cudaGetLastError()); } diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h index 3c3e3d447..dd87597a4 100644 --- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h +++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h @@ -1,6 +1,8 @@ #ifndef RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h #define RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h +// #define GPU_DEBUG + #include "CUDADataFormats/PixelTrackHeterogeneous.h" #include "GPUCACell.h" @@ -26,8 +28,8 @@ namespace cAHitNtupletGenerator { using HitsView = TrackingRecHit2DSOAView; using HitsOnGPU = TrackingRecHit2DSOAView; - using HitToTuple = CAConstants::HitToTuple; - using TupleMultiplicity = CAConstants::TupleMultiplicity; + using HitToTuple = caConstants::HitToTuple; + using TupleMultiplicity = caConstants::TupleMultiplicity; using Quality = pixelTrack::Quality; using TkSoA = pixelTrack::TrackSoA; @@ -39,14 +41,14 @@ namespace cAHitNtupletGenerator { float chi2MaxPt; // GeV float chi2Scale; - struct region { + struct Region { float maxTip; // cm float minPt; // GeV float maxZip; // cm }; - region triplet; - region quadruplet; + Region triplet; + Region quadruplet; }; // params @@ -54,6 +56,7 @@ namespace cAHitNtupletGenerator { Params(bool onGPU, uint32_t minHitsPerNtuplet, uint32_t maxNumberOfDoublets, + uint16_t minHitsForSharingCuts, bool useRiemannFit, bool fit5as4, bool includeJumpingForwardDoublets, @@ -64,16 +67,19 @@ namespace cAHitNtupletGenerator { bool doClusterCut, bool doZ0Cut, bool doPtCut, + bool doSharedHitCut, float ptmin, float CAThetaCutBarrel, float CAThetaCutForward, float hardCurvCut, float dcaCutInnerTriplet, float dcaCutOuterTriplet, + QualityCuts const& cuts) : onGPU_(onGPU), minHitsPerNtuplet_(minHitsPerNtuplet), maxNumberOfDoublets_(maxNumberOfDoublets), + minHitsForSharingCut_(minHitsForSharingCuts), useRiemannFit_(useRiemannFit), fit5as4_(fit5as4), includeJumpingForwardDoublets_(includeJumpingForwardDoublets), @@ -84,6 +90,7 @@ namespace cAHitNtupletGenerator { doClusterCut_(doClusterCut), doZ0Cut_(doZ0Cut), doPtCut_(doPtCut), + doSharedHitCut_(doSharedHitCut), ptmin_(ptmin), CAThetaCutBarrel_(CAThetaCutBarrel), CAThetaCutForward_(CAThetaCutForward), @@ -95,6 +102,7 @@ namespace cAHitNtupletGenerator { const bool onGPU_; const uint32_t minHitsPerNtuplet_; const uint32_t maxNumberOfDoublets_; + const uint16_t minHitsForSharingCut_; const bool useRiemannFit_; const bool fit5as4_; const bool includeJumpingForwardDoublets_; @@ -105,6 +113,7 @@ namespace cAHitNtupletGenerator { const bool doClusterCut_; const bool doZ0Cut_; const bool doPtCut_; + const bool doSharedHitCut_; const float ptmin_; const float CAThetaCutBarrel_; const float CAThetaCutForward_; @@ -152,14 +161,15 @@ class CAHitNtupletGeneratorKernels { using HitsOnGPU = TrackingRecHit2DSOAView; using HitsOnCPU = TrackingRecHit2DHeterogeneous; - using HitToTuple = 
CAConstants::HitToTuple; - using TupleMultiplicity = CAConstants::TupleMultiplicity; + using HitToTuple = caConstants::HitToTuple; + using TupleMultiplicity = caConstants::TupleMultiplicity; using Quality = pixelTrack::Quality; using TkSoA = pixelTrack::TrackSoA; using HitContainer = pixelTrack::HitContainer; - CAHitNtupletGeneratorKernels(Params const& params) : m_params(params) {} + CAHitNtupletGeneratorKernels(Params const& params) + : params_(params), paramsMaxDoubletes3Quarters_(3 * params.maxNumberOfDoublets_ / 4) {} ~CAHitNtupletGeneratorKernels() = default; TupleMultiplicity const* tupleMultiplicity() const { return device_tupleMultiplicity_.get(); } @@ -171,25 +181,30 @@ class CAHitNtupletGeneratorKernels { void fillHitDetIndices(HitsView const* hv, TkSoA* tuples_d, cudaStream_t cudaStream); void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); - void allocateOnGPU(cudaStream_t stream); + void allocateOnGPU(int32_t nHits, cudaStream_t stream); void cleanup(cudaStream_t cudaStream); static void printCounters(Counters const* counters); - Counters* counters_ = nullptr; + void setCounters(Counters* counters) { counters_ = counters; } private: + Counters* counters_ = nullptr; + // workspace unique_ptr cellStorage_; - unique_ptr device_theCellNeighbors_; - CAConstants::CellNeighbors* device_theCellNeighborsContainer_; - unique_ptr device_theCellTracks_; - CAConstants::CellTracks* device_theCellTracksContainer_; + unique_ptr device_theCellNeighbors_; + caConstants::CellNeighbors* device_theCellNeighborsContainer_; + unique_ptr device_theCellTracks_; + caConstants::CellTracks* device_theCellTracksContainer_; unique_ptr device_theCells_; unique_ptr device_isOuterHitOfCell_; uint32_t* device_nCells_ = nullptr; unique_ptr device_hitToTuple_; + unique_ptr device_hitToTupleStorage_; + HitToTuple::View hitToTupleView_; + cms::cuda::AtomicPairCounter* device_hitToTuple_apc_ = nullptr; cms::cuda::AtomicPairCounter* device_hitTuple_apc_ = nullptr; @@ -198,7 +213,20 @@ class CAHitNtupletGeneratorKernels { unique_ptr device_storage_; // params - Params const& m_params; + Params const& params_; + /// Intermediate result avoiding repeated computations. + const uint32_t paramsMaxDoubletes3Quarters_; + /// Compute the number of doublet blocks for block size + inline uint32_t nDoubletBlocks(uint32_t blockSize) { + // We want (3 * params_.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize, but first part is pre-computed. 
+ return (paramsMaxDoubletes3Quarters_ + blockSize - 1) / blockSize; + } + + /// Compute the number of quadruplet blocks for block size + inline uint32_t nQuadrupletBlocks(uint32_t blockSize) { + // caConstants::maxNumberOfQuadruplets is a constexpr, so the compiler will pre compute the 3*max/4 + return (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; + } }; using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels; diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.h b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.h index fb505b126..929677a44 100644 --- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.h +++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsAlloc.h @@ -4,18 +4,28 @@ template <> #ifdef __CUDACC__ -void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(cudaStream_t stream) { +void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(int32_t nHits, cudaStream_t stream) { #else -void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) { +void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(int32_t nHits, cudaStream_t stream) { #endif ////////////////////////////////////////////////////////// // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) ////////////////////////////////////////////////////////// - device_theCellNeighbors_ = Traits::template make_unique(stream); - device_theCellTracks_ = Traits::template make_unique(stream); + device_theCellNeighbors_ = Traits::template make_unique(stream); + device_theCellTracks_ = Traits::template make_unique(stream); +#ifdef GPU_DEBUG + std::cout << "Allocation for tuple building. N hits " << nHits << std::endl; +#endif + + nHits++; // storage requires one more counter; + assert(nHits > 0); device_hitToTuple_ = Traits::template make_unique(stream); + device_hitToTupleStorage_ = Traits::template make_unique(nHits, stream); + hitToTupleView_.assoc = device_hitToTuple_.get(); + hitToTupleView_.offStorage = device_hitToTupleStorage_.get(); + hitToTupleView_.offSize = nHits; device_tupleMultiplicity_ = Traits::template make_unique(stream); @@ -25,15 +35,16 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) { device_hitToTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get() + 1; device_nCells_ = (uint32_t*)(device_storage_.get() + 2); - if -#ifndef __CUDACC__ - constexpr -#endif - (std::is_same::value) { + // FIXME: consider collapsing these 3 in one adhoc kernel + if constexpr (std::is_same::value) { cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream)); } else { *device_nCells_ = 0; } cms::cuda::launchZero(device_tupleMultiplicity_.get(), stream); - cms::cuda::launchZero(device_hitToTuple_.get(), stream); // we may wish to keep it in the edm... 
+ cms::cuda::launchZero(hitToTupleView_, stream); // we may wish to keep it in the edm +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif } diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsImpl.h b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsImpl.h index e35e20be9..6506a104b 100644 --- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsImpl.h +++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsImpl.h @@ -3,6 +3,7 @@ // // #define NTUPLE_DEBUG +// #define GPU_DEBUG #include #include @@ -22,22 +23,23 @@ using HitsOnGPU = TrackingRecHit2DSOAView; using HitsOnCPU = TrackingRecHit2DCUDA; -using HitToTuple = CAConstants::HitToTuple; -using TupleMultiplicity = CAConstants::TupleMultiplicity; +using HitToTuple = caConstants::HitToTuple; +using TupleMultiplicity = caConstants::TupleMultiplicity; using Quality = pixelTrack::Quality; using TkSoA = pixelTrack::TrackSoA; using HitContainer = pixelTrack::HitContainer; __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, - CAConstants::TupleMultiplicity *tupleMultiplicity, + caConstants::TupleMultiplicity const *tupleMultiplicity, + CAHitNtupletGeneratorKernelsGPU::HitToTuple const *hitToTuple, cms::cuda::AtomicPairCounter *apc, GPUCACell const *__restrict__ cells, uint32_t const *__restrict__ nCells, gpuPixelDoublets::CellNeighborsVector const *cellNeighbors, gpuPixelDoublets::CellTracksVector const *cellTracks, GPUCACell::OuterHitOfCell const *__restrict__ isOuterHitOfCell, - uint32_t nHits, + int32_t nHits, uint32_t maxNumberOfDoublets, CAHitNtupletGeneratorKernelsGPU::Counters *counters) { auto first = threadIdx.x + blockIdx.x * blockDim.x; @@ -54,28 +56,29 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, #ifdef NTUPLE_DEBUG if (0 == first) { - printf("number of found cells %d, found tuples %d with total hits %d out of %d\n", + printf("number of found cells %d, found tuples %d with total hits %d out of %d %d\n", *nCells, apc->get().m, apc->get().n, - nHits); - if (apc->get().m < CAConstants::maxNumberOfQuadruplets()) { + nHits, + hitToTuple->totOnes()); + if (apc->get().m < caConstants::maxNumberOfQuadruplets()) { assert(foundNtuplets->size(apc->get().m) == 0); assert(foundNtuplets->size() == apc->get().n); } } - for (int idx = first, nt = foundNtuplets->nbins(); idx < nt; idx += gridDim.x * blockDim.x) { + for (int idx = first, nt = foundNtuplets->nOnes(); idx < nt; idx += gridDim.x * blockDim.x) { if (foundNtuplets->size(idx) > 5) printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); assert(foundNtuplets->size(idx) < 6); for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) - assert(*ih < nHits); + assert(int(*ih) < nHits); } #endif if (0 == first) { - if (apc->get().m >= CAConstants::maxNumberOfQuadruplets()) + if (apc->get().m >= caConstants::maxNumberOfQuadruplets) printf("Tuples overflow\n"); if (*nCells >= maxNumberOfDoublets) printf("Cells overflow\n"); @@ -83,19 +86,21 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, printf("cellNeighbors overflow\n"); if (cellTracks && cellTracks->full()) printf("cellTracks overflow\n"); + if (int(hitToTuple->nOnes()) < nHits) + printf("ERROR hitToTuple overflow %d %d\n", hitToTuple->nOnes(), nHits); } for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { auto const &thisCell = cells[idx]; if (thisCell.outerNeighbors().full()) //++tooManyNeighbors[thisCell.theLayerPairId]; - 
printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.theLayerPairId); + printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.layerPairId()); if (thisCell.tracks().full()) //++tooManyTracks[thisCell.theLayerPairId]; - printf("Tracks overflow %d in %d\n", idx, thisCell.theLayerPairId); - if (thisCell.theDoubletId < 0) + printf("Tracks overflow %d in %d\n", idx, thisCell.layerPairId()); + if (thisCell.isKilled()) atomicAdd(&c.nKilledCells, 1); - if (0 == thisCell.theUsed) + if (thisCell.unused()) atomicAdd(&c.nEmptyCells, 1); - if (thisCell.tracks().empty()) + if (0 == hitToTuple->size(thisCell.inner_hit_id()) && 0 == hitToTuple->size(thisCell.outer_hit_id())) atomicAdd(&c.nZeroTrackCells, 1); } @@ -106,12 +111,12 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, } __global__ void kernel_fishboneCleaner(GPUCACell const *cells, uint32_t const *__restrict__ nCells, Quality *quality) { - constexpr auto bad = trackQuality::bad; + constexpr auto bad = pixelTrack::Quality::bad; auto first = threadIdx.x + blockIdx.x * blockDim.x; for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { auto const &thisCell = cells[idx]; - if (thisCell.theDoubletId >= 0) + if (!thisCell.isKilled()) continue; for (auto it : thisCell.tracks()) @@ -124,7 +129,7 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, HitContainer *foundNtuplets, Quality *quality) { // constexpr auto bad = trackQuality::bad; - constexpr auto dup = trackQuality::dup; + constexpr auto dup = pixelTrack::Quality::dup; // constexpr auto loose = trackQuality::loose; assert(nCells); @@ -156,9 +161,9 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, uint32_t const *__restrict__ nCells, HitContainer const *__restrict__ foundNtuplets, TkSoA *__restrict__ tracks) { - constexpr auto bad = trackQuality::bad; - constexpr auto dup = trackQuality::dup; - constexpr auto loose = trackQuality::loose; + constexpr auto bad = pixelTrack::Quality::bad; + constexpr auto dup = pixelTrack::Quality::dup; + constexpr auto loose = pixelTrack::Quality::loose; assert(nCells); @@ -177,7 +182,7 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, // return tracks->chi2(it); //chi2 }; - // find min socre + // find min score for (auto it : thisCell.tracks()) { if (tracks->quality(it) == loose && score(it) < mc) { mc = score(it); @@ -219,30 +224,22 @@ __global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1, for (int idx = firstCellIndex, nt = (*nCells); idx < nt; idx += gridDim.y * blockDim.y) { auto cellIndex = idx; auto &thisCell = cells[idx]; - //if (thisCell.theDoubletId < 0 || thisCell.theUsed>1) - // continue; - auto innerHitId = thisCell.get_inner_hit_id(); + auto innerHitId = thisCell.inner_hit_id(); int numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size(); auto vi = isOuterHitOfCell[innerHitId].data(); - constexpr uint32_t last_bpix1_detIndex = 96; - constexpr uint32_t last_barrel_detIndex = 1184; - auto ri = thisCell.get_inner_r(hh); - auto zi = thisCell.get_inner_z(hh); + auto ri = thisCell.inner_r(hh); + auto zi = thisCell.inner_z(hh); - auto ro = thisCell.get_outer_r(hh); - auto zo = thisCell.get_outer_z(hh); - auto isBarrel = thisCell.get_inner_detIndex(hh) < last_barrel_detIndex; + auto ro = thisCell.outer_r(hh); + auto zo = thisCell.outer_z(hh); + auto isBarrel = thisCell.inner_detIndex(hh) < caConstants::last_barrel_detIndex; for (int j = first; j < numberOfPossibleNeighbors; j += stride) { auto 
otherCell = __ldg(vi + j); auto &oc = cells[otherCell]; - // if (cells[otherCell].theDoubletId < 0 || - // cells[otherCell].theUsed>1 ) - // continue; - auto r1 = oc.get_inner_r(hh); - auto z1 = oc.get_inner_z(hh); - // auto isBarrel = oc.get_outer_detIndex(hh) < last_barrel_detIndex; + auto r1 = oc.inner_r(hh); + auto z1 = oc.inner_z(hh); bool aligned = GPUCACell::areAlignedRZ( r1, z1, @@ -252,14 +249,14 @@ __global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1, zo, ptmin, isBarrel ? CAThetaCutBarrel : CAThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts - if (aligned && - thisCell.dcaCut(hh, - oc, - oc.get_inner_detIndex(hh) < last_bpix1_detIndex ? dcaCutInnerTriplet : dcaCutOuterTriplet, - hardCurvCut)) { // FIXME tune cuts + if (aligned && thisCell.dcaCut(hh, + oc, + oc.inner_detIndex(hh) < caConstants::last_bpix1_detIndex ? dcaCutInnerTriplet + : dcaCutOuterTriplet, + hardCurvCut)) { // FIXME tune cuts oc.addOuterNeighbor(cellIndex, *cellNeighbors); - thisCell.theUsed |= 1; - oc.theUsed |= 1; + thisCell.setUsedBit(1); + oc.setUsedBit(1); } } // loop on inner cells } // loop on outer cells @@ -279,10 +276,10 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, auto first = threadIdx.x + blockIdx.x * blockDim.x; for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { auto const &thisCell = cells[idx]; - if (thisCell.theDoubletId < 0) + if (thisCell.isKilled()) continue; // cut by earlyFishbone - auto pid = thisCell.theLayerPairId; + auto pid = thisCell.layerPairId(); auto doit = minHitsPerNtuplet > 3 ? pid < 3 : pid < 8 || pid > 12; if (doit) { GPUCACell::TmpTuple stack; @@ -297,48 +294,47 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, __global__ void kernel_mark_used(GPUCACell::Hits const *__restrict__ hhp, GPUCACell *__restrict__ cells, uint32_t const *nCells) { - // auto const &hh = *hhp; auto first = threadIdx.x + blockIdx.x * blockDim.x; for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { auto &thisCell = cells[idx]; if (!thisCell.tracks().empty()) - thisCell.theUsed |= 2; + thisCell.setUsedBit(2); } } __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets, Quality const *__restrict__ quality, - CAConstants::TupleMultiplicity *tupleMultiplicity) { + caConstants::TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int it = first, nt = foundNtuplets->nbins(); it < nt; it += gridDim.x * blockDim.x) { + for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { auto nhits = foundNtuplets->size(it); if (nhits < 3) continue; - if (quality[it] == trackQuality::dup) + if (quality[it] == pixelTrack::Quality::dup) continue; - assert(quality[it] == trackQuality::bad); + assert(quality[it] == pixelTrack::Quality::bad); if (nhits > 5) printf("wrong mult %d %d\n", it, nhits); assert(nhits < 8); - tupleMultiplicity->countDirect(nhits); + tupleMultiplicity->count(nhits); } } __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, Quality const *__restrict__ quality, - CAConstants::TupleMultiplicity *tupleMultiplicity) { + caConstants::TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int it = first, nt = foundNtuplets->nbins(); it < nt; it += gridDim.x * blockDim.x) { + for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { auto nhits = 
foundNtuplets->size(it); if (nhits < 3) continue; - if (quality[it] == trackQuality::dup) + if (quality[it] == pixelTrack::Quality::dup) continue; - assert(quality[it] == trackQuality::bad); + assert(quality[it] == pixelTrack::Quality::bad); if (nhits > 5) printf("wrong mult %d %d\n", it, nhits); assert(nhits < 8); - tupleMultiplicity->fillDirect(nhits, it); + tupleMultiplicity->fill(nhits, it); } } @@ -347,16 +343,16 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts, Quality *__restrict__ quality) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int it = first, nt = tuples->nbins(); it < nt; it += gridDim.x * blockDim.x) { + for (int it = first, nt = tuples->nOnes(); it < nt; it += gridDim.x * blockDim.x) { auto nhits = tuples->size(it); if (nhits == 0) break; // guard // if duplicate: not even fit - if (quality[it] == trackQuality::dup) + if (quality[it] == pixelTrack::Quality::dup) continue; - assert(quality[it] == trackQuality::bad); + assert(quality[it] == pixelTrack::Quality::bad); // mark doublets as bad if (nhits < 3) @@ -385,7 +381,7 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, (cuts.chi2Coeff[0] + pt * (cuts.chi2Coeff[1] + pt * (cuts.chi2Coeff[2] + pt * cuts.chi2Coeff[3]))); // above number were for Quads not normalized so for the time being just multiple by ndof for Quads (triplets to be understood) if (3.f * tracks->chi2(it) >= chi2Cut) { -#ifdef NTUPLE_DEBUG +#ifdef NTUPLE_FIT_DEBUG printf("Bad fit %d size %d pt %f eta %f chi2 %f\n", it, tuples->size(it), @@ -406,7 +402,7 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, (std::abs(tracks->zip(it)) < region.maxZip); if (isOk) - quality[it] = trackQuality::loose; + quality[it] = pixelTrack::Quality::loose; } } @@ -414,10 +410,10 @@ __global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, Quality const *__restrict__ quality, CAHitNtupletGeneratorKernelsGPU::Counters *counters) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { if (tuples->size(idx) == 0) break; //guard - if (quality[idx] != trackQuality::loose) + if (quality[idx] != pixelTrack::Quality::loose) continue; atomicAdd(&(counters->nGoodTracks), 1); } @@ -427,13 +423,13 @@ __global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, Quality const *__restrict__ quality, CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { if (tuples->size(idx) == 0) break; // guard - if (quality[idx] != trackQuality::loose) + if (quality[idx] != pixelTrack::Quality::loose) continue; for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) - hitToTuple->countDirect(*h); + hitToTuple->count(*h); } } @@ -441,13 +437,13 @@ __global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, Quality const *__restrict__ quality, CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + for (int idx = first, 
ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { if (tuples->size(idx) == 0) break; // guard - if (quality[idx] != trackQuality::loose) + if (quality[idx] != pixelTrack::Quality::loose) continue; for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) - hitToTuple->fillDirect(*h, idx); + hitToTuple->fill(*h, idx); } } @@ -456,15 +452,15 @@ __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples HitContainer *__restrict__ hitDetIndices) { int first = blockDim.x * blockIdx.x + threadIdx.x; // copy offsets - for (int idx = first, ntot = tuples->totbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + for (int idx = first, ntot = tuples->totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { hitDetIndices->off[idx] = tuples->off[idx]; } // fill hit indices auto const &hh = *hhp; auto nhits = hh.nHits(); for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) { - assert(tuples->bins[idx] < nhits); - hitDetIndices->bins[idx] = hh.detectorIndex(tuples->bins[idx]); + assert(tuples->content[idx] < nhits); + hitDetIndices->content[idx] = hh.detectorIndex(tuples->content[idx]); } } @@ -472,7 +468,7 @@ __global__ void kernel_doStatsForHitInTracks(CAHitNtupletGeneratorKernelsGPU::Hi CAHitNtupletGeneratorKernelsGPU::Counters *counters) { auto &c = *counters; int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = hitToTuple->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + for (int idx = first, ntot = hitToTuple->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { if (hitToTuple->size(idx) == 0) continue; // SHALL NOT BE break atomicAdd(&c.nUsedHits, 1); @@ -481,24 +477,25 @@ __global__ void kernel_doStatsForHitInTracks(CAHitNtupletGeneratorKernelsGPU::Hi } } -__global__ void kernel_tripletCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp, - HitContainer const *__restrict__ ptuples, - TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, - CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { - constexpr auto bad = trackQuality::bad; - constexpr auto dup = trackQuality::dup; +__global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp, + HitContainer const *__restrict__ ptuples, + TkSoA const *__restrict__ ptracks, + Quality *__restrict__ quality, + uint16_t nmin, + CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { + constexpr auto bad = pixelTrack::Quality::bad; + constexpr auto dup = pixelTrack::Quality::dup; // constexpr auto loose = trackQuality::loose; auto &hitToTuple = *phitToTuple; auto const &foundNtuplets = *ptuples; auto const &tracks = *ptracks; - // auto const & hh = *hhp; - // auto l1end = hh.hitsLayerStart_d[1]; + auto const &hh = *hhp; + int l1end = hh.hitsLayerStart()[1]; int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = hitToTuple.nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { if (hitToTuple.size(idx) < 2) continue; @@ -514,13 +511,17 @@ __global__ void kernel_tripletCleaner(TrackingRecHit2DSOAView const *__restrict_ // kill all tracks shorter than maxHn (only triplets???) 
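// For each hit shared by at least two tracks, every track shorter than the
// longest one sharing it (maxNh hits) is flagged as a duplicate; tracks with
// more than nmin hits are spared when the shared hit is on BPIX1 (idx < l1end).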
for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { uint32_t nh = foundNtuplets.size(*it); + + //checking if shared hit is on bpix1 and if the tuple is short enough + if (idx < l1end and nh > nmin) + continue; + if (maxNh != nh) quality[*it] = dup; } if (maxNh > 3) continue; - // if (idx>=l1end) continue; // only for layer 1 // for triplets choose best tip! for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; @@ -543,12 +544,12 @@ __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__res TkSoA const *__restrict__ ptracks, Quality const *__restrict__ quality, CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple, - uint32_t maxPrint, + int32_t maxPrint, int iev) { auto const &foundNtuplets = *ptuples; auto const &tracks = *ptracks; int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int i = first, np = std::min(maxPrint, foundNtuplets.nbins()); i < np; i += blockDim.x * gridDim.x) { + for (int i = first, np = std::min(maxPrint, foundNtuplets.nOnes()); i < np; i += blockDim.x * gridDim.x) { auto nh = foundNtuplets.size(i); if (nh < 3) continue; diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc index d0e428da6..714748cc1 100644 --- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc +++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc @@ -2,6 +2,8 @@ // Original Author: Felice Pantaleo, CERN // +// #define GPU_DEBUG + #include #include #include @@ -46,16 +48,18 @@ CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(edm::ProductRegistry& reg : m_params(true, // onGPU 3, // minHitsPerNtuplet, 458752, // maxNumberOfDoublets - false, //useRiemannFit + 5, // minHitsForSharingCut + false, // useRiemannFit true, // fit5as4, - true, //includeJumpingForwardDoublets + true, // includeJumpingForwardDoublets true, // earlyFishbone false, // lateFishbone true, // idealConditions - false, //fillStatistics + false, // doStats true, // doClusterCut true, // doZ0Cut true, // doPtCut + true, // doSharedHitCut 0.899999976158, // ptmin 0.00200000009499, // CAThetaCutBarrel 0.00300000002608, // CAThetaCutForward @@ -112,11 +116,11 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream)); auto* soa = tracks.get(); + assert(soa); CAHitNtupletGeneratorKernelsGPU kernels(m_params); - kernels.counters_ = m_counters; - - kernels.allocateOnGPU(stream); + kernels.setCounters(m_counters); + kernels.allocateOnGPU(hits_d.nHits(), stream); kernels.buildDoublets(hits_d, stream); kernels.launchKernels(hits_d, soa, stream); @@ -125,12 +129,18 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH HelixFitOnGPU fitter(bfield, m_params.fit5as4_); fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); if (m_params.useRiemannFit_) { - fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); + fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); } else { - fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); + fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); } kernels.classifyTuples(hits_d, soa, stream); +#ifdef GPU_DEBUG + 
cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); + std::cout << "finished building pixel tracks on GPU" << std::endl; +#endif + return tracks; } @@ -141,8 +151,8 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC assert(soa); CAHitNtupletGeneratorKernelsCPU kernels(m_params); - kernels.counters_ = m_counters; - kernels.allocateOnGPU(nullptr); + kernels.setCounters(m_counters); + kernels.allocateOnGPU(hits_d.nHits(), nullptr); kernels.buildDoublets(hits_d, nullptr); kernels.launchKernels(hits_d, soa, nullptr); @@ -156,12 +166,16 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); if (m_params.useRiemannFit_) { - fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); + fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); } else { - fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); + fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); } kernels.classifyTuples(hits_d, soa, nullptr); +#ifdef GPU_DEBUG + std::cout << "finished building pixel tracks on CPU" << std::endl; +#endif + return tracks; } diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h index 823987658..f42bb301b 100644 --- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h +++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h @@ -5,7 +5,7 @@ #include "CUDACore/SimpleVector.h" #include "CUDADataFormats/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" #include "CAHitNtupletGeneratorKernels.h" #include "GPUCACell.h" diff --git a/src/cudadev/plugin-PixelTriplets/CircleEq.h b/src/cudadev/plugin-PixelTriplets/CircleEq.h index dfe7da010..986fe2e29 100644 --- a/src/cudadev/plugin-PixelTriplets/CircleEq.h +++ b/src/cudadev/plugin-PixelTriplets/CircleEq.h @@ -80,16 +80,6 @@ constexpr void CircleEq::compute(T x1, T y1, T x2, T y2, T x3, T y3) { auto num = x1p * y3p - y1p * x3p; // num also gives correct sign for CT auto det = d12 * y3p - d32 * y1p; - /* - auto ct = num/det; - auto sn = det>0 ? T(1.) : T(-1.); - auto st2 = (d12*x3p-d32*x1p)/det; - auto seq = T(1.) +st2*st2; - auto al2 = sn/std::sqrt(seq); - auto be2 = -st2*al2; - ct *= T(2.)*al2; - */ - auto st2 = (d12 * x3p - d32 * x1p); auto seq = det * det + st2 * st2; auto al2 = T(1.) 
/ std::sqrt(seq); diff --git a/src/cudadev/plugin-PixelTriplets/FitResult.h b/src/cudadev/plugin-PixelTriplets/FitResult.h index b97dda4e6..01497719d 100644 --- a/src/cudadev/plugin-PixelTriplets/FitResult.h +++ b/src/cudadev/plugin-PixelTriplets/FitResult.h @@ -8,7 +8,7 @@ #include #include -namespace Rfit { +namespace riemannFit { using Vector2d = Eigen::Vector2d; using Vector3d = Eigen::Vector3d; @@ -23,7 +23,7 @@ namespace Rfit { template using Matrix3xNd = Eigen::Matrix; // used for inputs hits - struct circle_fit { + struct CircleFit { Vector3d par; //!< parameter: (X0,Y0,R) Matrix3d cov; /*!< covariance matrix: \n @@ -31,11 +31,11 @@ namespace Rfit { |cov(X0,Y0)|cov(Y0,Y0)|cov( R,Y0)| \n |cov(X0, R)|cov(Y0, R)|cov( R, R)| */ - int32_t q; //!< particle charge + int32_t qCharge; //!< particle charge float chi2; }; - struct line_fit { + struct LineFit { Vector2d par; //!<(cotan(theta),Zip) Matrix2d cov; /*!< @@ -45,7 +45,7 @@ namespace Rfit { double chi2; }; - struct helix_fit { + struct HelixFit { Vector5d par; //!<(phi,Tip,pt,cotan(theta)),Zip) Matrix5d cov; /*!< ()->cov() \n @@ -58,8 +58,8 @@ namespace Rfit { float chi2_circle; float chi2_line; // Vector4d fast_fit; - int32_t q; //!< particle charge - }; // __attribute__((aligned(16))); + int32_t qCharge; //!< particle charge + }; // __attribute__((aligned(16))); -} // namespace Rfit +} // namespace riemannFit #endif diff --git a/src/cudadev/plugin-PixelTriplets/FitUtils.h b/src/cudadev/plugin-PixelTriplets/FitUtils.h index d69e03194..62492451e 100644 --- a/src/cudadev/plugin-PixelTriplets/FitUtils.h +++ b/src/cudadev/plugin-PixelTriplets/FitUtils.h @@ -6,9 +6,9 @@ #include "choleskyInversion.h" #include "FitResult.h" -namespace Rfit { +namespace riemannFit { - constexpr double d = 1.e-4; //!< used in numerical derivative (J2 in Circle_fit()) + constexpr double epsilon = 1.e-4; //!< used in numerical derivative (J2 in Circle_fit()) using VectorXd = Eigen::VectorXd; using MatrixXd = Eigen::MatrixXd; @@ -50,13 +50,11 @@ namespace Rfit { using Vector4f = Eigen::Vector4f; using Vector6f = Eigen::Matrix; - using u_int = unsigned int; - template __host__ __device__ void printIt(C* m, const char* prefix = "") { #ifdef RFIT_DEBUG - for (u_int r = 0; r < m->rows(); ++r) { - for (u_int c = 0; c < m->cols(); ++c) { + for (uint r = 0; r < m->rows(); ++r) { + for (uint c = 0; c < m->cols(); ++c) { printf("%s Matrix(%d,%d) = %g\n", prefix, r, c, (*m)(r, c)); } } @@ -103,19 +101,19 @@ namespace Rfit { // | 3 4 5 | constexpr uint32_t hits_in_fit = M6xNf::ColsAtCompileTime; for (uint32_t i = 0; i < hits_in_fit; ++i) { - auto ge_idx = 0; - auto j = 0; - auto l = 0; - hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; - ge_idx = 2; - j = 1; - l = 1; - hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; - ge_idx = 1; - j = 1; - l = 0; - hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = - ge.col(i)[ge_idx]; + { + constexpr uint32_t ge_idx = 0, j = 0, l = 0; + hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + } + { + constexpr uint32_t ge_idx = 2, j = 1, l = 1; + hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + } + { + constexpr uint32_t ge_idx = 1, j = 1, l = 0; + hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = + ge.col(i)[ge_idx]; + } } } @@ -135,33 +133,33 @@ namespace Rfit { // | 3 4 5 | constexpr uint32_t hits_in_fit = M6xNf::ColsAtCompileTime; for 
(uint32_t i = 0; i < hits_in_fit; ++i) { - auto ge_idx = 0; - auto j = 0; - auto l = 0; - hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; - ge_idx = 2; - j = 1; - l = 1; - hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; - ge_idx = 5; - j = 2; - l = 2; - hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; - ge_idx = 1; - j = 1; - l = 0; - hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = - ge.col(i)[ge_idx]; - ge_idx = 3; - j = 2; - l = 0; - hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = - ge.col(i)[ge_idx]; - ge_idx = 4; - j = 2; - l = 1; - hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = - ge.col(i)[ge_idx]; + { + constexpr uint32_t ge_idx = 0, j = 0, l = 0; + hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + } + { + constexpr uint32_t ge_idx = 2, j = 1, l = 1; + hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + } + { + constexpr uint32_t ge_idx = 5, j = 2, l = 2; + hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + } + { + constexpr uint32_t ge_idx = 1, j = 1, l = 0; + hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = + ge.col(i)[ge_idx]; + } + { + constexpr uint32_t ge_idx = 3, j = 2, l = 0; + hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = + ge.col(i)[ge_idx]; + } + { + constexpr uint32_t ge_idx = 4, j = 2, l = 1; + hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = + ge.col(i)[ge_idx]; + } } } @@ -173,19 +171,19 @@ namespace Rfit { \param B magnetic field in Gev/cm/c unit. \param error flag for errors computation. */ - __host__ __device__ inline void par_uvrtopak(circle_fit& circle, const double B, const bool error) { + __host__ __device__ inline void par_uvrtopak(CircleFit& circle, const double B, const bool error) { Vector3d par_pak; const double temp0 = circle.par.head(2).squaredNorm(); const double temp1 = sqrt(temp0); - par_pak << atan2(circle.q * circle.par(0), -circle.q * circle.par(1)), circle.q * (temp1 - circle.par(2)), - circle.par(2) * B; + par_pak << atan2(circle.qCharge * circle.par(0), -circle.qCharge * circle.par(1)), + circle.qCharge * (temp1 - circle.par(2)), circle.par(2) * B; if (error) { const double temp2 = sqr(circle.par(0)) * 1. / temp0; - const double temp3 = 1. / temp1 * circle.q; - Matrix3d J4; - J4 << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., circle.par(0) * temp3, - circle.par(1) * temp3, -circle.q, 0., 0., B; - circle.cov = J4 * circle.cov * J4.transpose(); + const double temp3 = 1. / temp1 * circle.qCharge; + Matrix3d j4Mat; + j4Mat << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., circle.par(0) * temp3, + circle.par(1) * temp3, -circle.qCharge, 0., 0., B; + circle.cov = j4Mat * circle.cov * j4Mat.transpose(); } circle.par = par_pak; } @@ -196,19 +194,19 @@ namespace Rfit { \param circle_uvr parameter (X0,Y0,R), covariance matrix to be transformed and particle charge. 
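The covariance is propagated through the reparametrization as
cov -> J * cov * J^T, where J is the Jacobian of the (X0,Y0,R) -> perigee
transformation built in the function body.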
*/ - __host__ __device__ inline void fromCircleToPerigee(circle_fit& circle) { + __host__ __device__ inline void fromCircleToPerigee(CircleFit& circle) { Vector3d par_pak; const double temp0 = circle.par.head(2).squaredNorm(); const double temp1 = sqrt(temp0); - par_pak << atan2(circle.q * circle.par(0), -circle.q * circle.par(1)), circle.q * (temp1 - circle.par(2)), - circle.q / circle.par(2); + par_pak << atan2(circle.qCharge * circle.par(0), -circle.qCharge * circle.par(1)), + circle.qCharge * (temp1 - circle.par(2)), circle.qCharge / circle.par(2); const double temp2 = sqr(circle.par(0)) * 1. / temp0; - const double temp3 = 1. / temp1 * circle.q; - Matrix3d J4; - J4 << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., circle.par(0) * temp3, - circle.par(1) * temp3, -circle.q, 0., 0., -circle.q / (circle.par(2) * circle.par(2)); - circle.cov = J4 * circle.cov * J4.transpose(); + const double temp3 = 1. / temp1 * circle.qCharge; + Matrix3d j4Mat; + j4Mat << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., circle.par(0) * temp3, + circle.par(1) * temp3, -circle.qCharge, 0., 0., -circle.qCharge / (circle.par(2) * circle.par(2)); + circle.cov = j4Mat * circle.cov * j4Mat.transpose(); circle.par = par_pak; } @@ -229,18 +227,18 @@ namespace Rfit { op(3) = ip(1); op(4) = -ip(4); - Matrix5d J = Matrix5d::Zero(); + Matrix5d jMat = Matrix5d::Zero(); - J(0, 2) = sinTheta; - J(0, 3) = -sinTheta2 * cosTheta * ip(2); - J(1, 0) = 1.; - J(2, 3) = -1.; - J(3, 1) = 1.; - J(4, 4) = -1; + jMat(0, 2) = sinTheta; + jMat(0, 3) = -sinTheta2 * cosTheta * ip(2); + jMat(1, 0) = 1.; + jMat(2, 3) = -1.; + jMat(3, 1) = 1.; + jMat(4, 4) = -1; - ocov = J * icov * J.transpose(); + ocov = jMat * icov * jMat.transpose(); } -} // namespace Rfit +} // namespace riemannFit #endif // RecoPixelVertexing_PixelTrackFitting_interface_FitUtils_h diff --git a/src/cudadev/plugin-PixelTriplets/GPUCACell.h b/src/cudadev/plugin-PixelTriplets/GPUCACell.h index df4354e59..9b1ddc05f 100644 --- a/src/cudadev/plugin-PixelTriplets/GPUCACell.h +++ b/src/cudadev/plugin-PixelTriplets/GPUCACell.h @@ -13,21 +13,21 @@ #include "CUDACore/VecArray.h" #include "CUDACore/cuda_assert.h" #include "CUDADataFormats/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" #include "CAConstants.h" #include "CircleEq.h" class GPUCACell { public: - using ptrAsInt = unsigned long long; + using PtrAsInt = unsigned long long; - static constexpr int maxCellsPerHit = CAConstants::maxCellsPerHit(); - using OuterHitOfCell = CAConstants::OuterHitOfCell; - using CellNeighbors = CAConstants::CellNeighbors; - using CellTracks = CAConstants::CellTracks; - using CellNeighborsVector = CAConstants::CellNeighborsVector; - using CellTracksVector = CAConstants::CellTracksVector; + static constexpr auto maxCellsPerHit = caConstants::maxCellsPerHit; + using OuterHitOfCell = caConstants::OuterHitOfCell; + using CellNeighbors = caConstants::CellNeighbors; + using CellTracks = caConstants::CellTracks; + using CellNeighborsVector = caConstants::CellNeighborsVector; + using CellTracksVector = caConstants::CellTracksVector; using Hits = TrackingRecHit2DSOAView; using hindex_type = Hits::hindex_type; @@ -35,8 +35,8 @@ class GPUCACell { using TmpTuple = cms::cuda::VecArray; using HitContainer = pixelTrack::HitContainer; - using Quality = trackQuality::Quality; - static constexpr auto bad = trackQuality::bad; + using Quality = 
pixelTrack::Quality; + static constexpr auto bad = pixelTrack::Quality::bad; GPUCACell() = default; @@ -49,9 +49,9 @@ class GPUCACell { hindex_type outerHitId) { theInnerHitId = innerHitId; theOuterHitId = outerHitId; - theDoubletId = doubletId; - theLayerPairId = layerPairId; - theUsed = 0; + theDoubletId_ = doubletId; + theLayerPairId_ = layerPairId; + theUsed_ = 0; // optimization that depends on access pattern theInnerZ = hh.zGlobal(innerHitId); @@ -67,14 +67,14 @@ class GPUCACell { __device__ __forceinline__ int addOuterNeighbor(CellNeighbors::value_t t, CellNeighborsVector& cellNeighbors) { // use smart cache if (outerNeighbors().empty()) { - auto i = cellNeighbors.extend(); // maybe waisted.... + auto i = cellNeighbors.extend(); // maybe wasted.... if (i > 0) { cellNeighbors[i].reset(); #ifdef __CUDACC__ - auto zero = (ptrAsInt)(&cellNeighbors[0]); - atomicCAS((ptrAsInt*)(&theOuterNeighbors), + auto zero = (PtrAsInt)(&cellNeighbors[0]); + atomicCAS((PtrAsInt*)(&theOuterNeighbors), zero, - (ptrAsInt)(&cellNeighbors[i])); // if fails we cannot give "i" back... + (PtrAsInt)(&cellNeighbors[i])); // if fails we cannot give "i" back... #else theOuterNeighbors = &cellNeighbors[i]; #endif @@ -87,12 +87,12 @@ class GPUCACell { __device__ __forceinline__ int addTrack(CellTracks::value_t t, CellTracksVector& cellTracks) { if (tracks().empty()) { - auto i = cellTracks.extend(); // maybe waisted.... + auto i = cellTracks.extend(); // maybe wasted.... if (i > 0) { cellTracks[i].reset(); #ifdef __CUDACC__ - auto zero = (ptrAsInt)(&cellTracks[0]); - atomicCAS((ptrAsInt*)(&theTracks), zero, (ptrAsInt)(&cellTracks[i])); // if fails we cannot give "i" back... + auto zero = (PtrAsInt)(&cellTracks[0]); + atomicCAS((PtrAsInt*)(&theTracks), zero, (PtrAsInt)(&cellTracks[i])); // if fails we cannot give "i" back... 
#else theTracks = &cellTracks[i]; #endif @@ -107,30 +107,30 @@ class GPUCACell { __device__ __forceinline__ CellTracks const& tracks() const { return *theTracks; } __device__ __forceinline__ CellNeighbors& outerNeighbors() { return *theOuterNeighbors; } __device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return *theOuterNeighbors; } - __device__ __forceinline__ float get_inner_x(Hits const& hh) const { return hh.xGlobal(theInnerHitId); } - __device__ __forceinline__ float get_outer_x(Hits const& hh) const { return hh.xGlobal(theOuterHitId); } - __device__ __forceinline__ float get_inner_y(Hits const& hh) const { return hh.yGlobal(theInnerHitId); } - __device__ __forceinline__ float get_outer_y(Hits const& hh) const { return hh.yGlobal(theOuterHitId); } - __device__ __forceinline__ float get_inner_z(Hits const& hh) const { return theInnerZ; } + __device__ __forceinline__ float inner_x(Hits const& hh) const { return hh.xGlobal(theInnerHitId); } + __device__ __forceinline__ float outer_x(Hits const& hh) const { return hh.xGlobal(theOuterHitId); } + __device__ __forceinline__ float inner_y(Hits const& hh) const { return hh.yGlobal(theInnerHitId); } + __device__ __forceinline__ float outer_y(Hits const& hh) const { return hh.yGlobal(theOuterHitId); } + __device__ __forceinline__ float inner_z(Hits const& hh) const { return theInnerZ; } // { return hh.zGlobal(theInnerHitId); } // { return theInnerZ; } - __device__ __forceinline__ float get_outer_z(Hits const& hh) const { return hh.zGlobal(theOuterHitId); } - __device__ __forceinline__ float get_inner_r(Hits const& hh) const { return theInnerR; } + __device__ __forceinline__ float outer_z(Hits const& hh) const { return hh.zGlobal(theOuterHitId); } + __device__ __forceinline__ float inner_r(Hits const& hh) const { return theInnerR; } // { return hh.rGlobal(theInnerHitId); } // { return theInnerR; } - __device__ __forceinline__ float get_outer_r(Hits const& hh) const { return hh.rGlobal(theOuterHitId); } + __device__ __forceinline__ float outer_r(Hits const& hh) const { return hh.rGlobal(theOuterHitId); } - __device__ __forceinline__ auto get_inner_iphi(Hits const& hh) const { return hh.iphi(theInnerHitId); } - __device__ __forceinline__ auto get_outer_iphi(Hits const& hh) const { return hh.iphi(theOuterHitId); } + __device__ __forceinline__ auto inner_iphi(Hits const& hh) const { return hh.iphi(theInnerHitId); } + __device__ __forceinline__ auto outer_iphi(Hits const& hh) const { return hh.iphi(theOuterHitId); } - __device__ __forceinline__ float get_inner_detIndex(Hits const& hh) const { return hh.detectorIndex(theInnerHitId); } - __device__ __forceinline__ float get_outer_detIndex(Hits const& hh) const { return hh.detectorIndex(theOuterHitId); } + __device__ __forceinline__ float inner_detIndex(Hits const& hh) const { return hh.detectorIndex(theInnerHitId); } + __device__ __forceinline__ float outer_detIndex(Hits const& hh) const { return hh.detectorIndex(theOuterHitId); } - constexpr unsigned int get_inner_hit_id() const { return theInnerHitId; } - constexpr unsigned int get_outer_hit_id() const { return theOuterHitId; } + constexpr unsigned int inner_hit_id() const { return theInnerHitId; } + constexpr unsigned int outer_hit_id() const { return theOuterHitId; } __device__ void print_cell() const { printf("printing cell: %d, on layerPair: %d, innerHitId: %d, outerHitId: %d \n", - theDoubletId, - theLayerPairId, + theDoubletId_, + theLayerPairId_, theInnerHitId, theOuterHitId); } @@ -139,24 +139,22 @@ class GPUCACell { 
GPUCACell const& otherCell, const float ptmin, const float hardCurvCut, - const float CAThetaCutBarrel, - const float CAThetaCutForward, + const float caThetaCutBarrel, + const float caThetaCutForward, const float dcaCutInnerTriplet, const float dcaCutOuterTriplet) const { // detIndex of the layerStart for the Phase1 Pixel Detector: // [BPX1, BPX2, BPX3, BPX4, FP1, FP2, FP3, FN1, FN2, FN3, LAST_VALID] // [ 0, 96, 320, 672, 1184, 1296, 1408, 1520, 1632, 1744, 1856] - constexpr uint32_t last_bpix1_detIndex = 96; - constexpr uint32_t last_barrel_detIndex = 1184; - auto ri = get_inner_r(hh); - auto zi = get_inner_z(hh); + auto ri = inner_r(hh); + auto zi = inner_z(hh); - auto ro = get_outer_r(hh); - auto zo = get_outer_z(hh); + auto ro = outer_r(hh); + auto zo = outer_z(hh); - auto r1 = otherCell.get_inner_r(hh); - auto z1 = otherCell.get_inner_z(hh); - auto isBarrel = otherCell.get_outer_detIndex(hh) < last_barrel_detIndex; + auto r1 = otherCell.inner_r(hh); + auto z1 = otherCell.inner_z(hh); + auto isBarrel = otherCell.outer_detIndex(hh) < caConstants::last_barrel_detIndex; bool aligned = areAlignedRZ(r1, z1, ri, @@ -164,12 +162,12 @@ class GPUCACell { ro, zo, ptmin, - isBarrel ? CAThetaCutBarrel : CAThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts - return (aligned && - dcaCut(hh, - otherCell, - otherCell.get_inner_detIndex(hh) < last_bpix1_detIndex ? dcaCutInnerTriplet : dcaCutOuterTriplet, - hardCurvCut)); // FIXME tune cuts + isBarrel ? caThetaCutBarrel : caThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts + return (aligned && dcaCut(hh, + otherCell, + otherCell.inner_detIndex(hh) < caConstants::last_bpix1_detIndex ? dcaCutInnerTriplet + : dcaCutOuterTriplet, + hardCurvCut)); // FIXME tune cuts } __device__ __forceinline__ static bool areAlignedRZ( @@ -188,14 +186,14 @@ class GPUCACell { GPUCACell const& otherCell, const float region_origin_radius_plus_tolerance, const float maxCurv) const { - auto x1 = otherCell.get_inner_x(hh); - auto y1 = otherCell.get_inner_y(hh); + auto x1 = otherCell.inner_x(hh); + auto y1 = otherCell.inner_y(hh); - auto x2 = get_inner_x(hh); - auto y2 = get_inner_y(hh); + auto x2 = inner_x(hh); + auto y2 = inner_y(hh); - auto x3 = get_outer_x(hh); - auto y3 = get_outer_y(hh); + auto x3 = outer_x(hh); + auto y3 = outer_y(hh); CircleEq eq(x1, y1, x2, y2, x3, y3); @@ -222,52 +220,48 @@ class GPUCACell { } __device__ inline bool hole0(Hits const& hh, GPUCACell const& innerCell) const { - constexpr uint32_t max_ladder_bpx0 = 12; - constexpr uint32_t first_ladder_bpx0 = 0; - constexpr float module_length = 6.7f; - constexpr float module_tolerance = 0.4f; // projection to cylinder is inaccurate on BPIX1 - int p = innerCell.get_inner_iphi(hh); + using caConstants::first_ladder_bpx0; + using caConstants::max_ladder_bpx0; + using caConstants::module_length_bpx0; + using caConstants::module_tolerance_bpx0; + int p = innerCell.inner_iphi(hh); if (p < 0) p += std::numeric_limits::max(); p = (max_ladder_bpx0 * p) / std::numeric_limits::max(); p %= max_ladder_bpx0; auto il = first_ladder_bpx0 + p; auto r0 = hh.averageGeometry().ladderR[il]; - auto ri = innerCell.get_inner_r(hh); - auto zi = innerCell.get_inner_z(hh); - auto ro = get_outer_r(hh); - auto zo = get_outer_z(hh); + auto ri = innerCell.inner_r(hh); + auto zi = innerCell.inner_z(hh); + auto ro = outer_r(hh); + auto zo = outer_z(hh); auto z0 = zi + (r0 - ri) * (zo - zi) / (ro - ri); auto z_in_ladder = std::abs(z0 - hh.averageGeometry().ladderZ[il]); - auto z_in_module = z_in_ladder - module_length * 
int(z_in_ladder / module_length); - auto gap = z_in_module < module_tolerance || z_in_module > (module_length - module_tolerance); + auto z_in_module = z_in_ladder - module_length_bpx0 * int(z_in_ladder / module_length_bpx0); + auto gap = z_in_module < module_tolerance_bpx0 || z_in_module > (module_length_bpx0 - module_tolerance_bpx0); return gap; } __device__ inline bool hole4(Hits const& hh, GPUCACell const& innerCell) const { - constexpr uint32_t max_ladder_bpx4 = 64; - constexpr uint32_t first_ladder_bpx4 = 84; - // constexpr float radius_even_ladder = 15.815f; - // constexpr float radius_odd_ladder = 16.146f; - constexpr float module_length = 6.7f; - constexpr float module_tolerance = 0.2f; - // constexpr float barrel_z_length = 26.f; - // constexpr float forward_z_begin = 32.f; - int p = get_outer_iphi(hh); + using caConstants::first_ladder_bpx4; + using caConstants::max_ladder_bpx4; + using caConstants::module_length_bpx4; + using caConstants::module_tolerance_bpx4; + int p = outer_iphi(hh); if (p < 0) p += std::numeric_limits::max(); p = (max_ladder_bpx4 * p) / std::numeric_limits::max(); p %= max_ladder_bpx4; auto il = first_ladder_bpx4 + p; auto r4 = hh.averageGeometry().ladderR[il]; - auto ri = innerCell.get_inner_r(hh); - auto zi = innerCell.get_inner_z(hh); - auto ro = get_outer_r(hh); - auto zo = get_outer_z(hh); + auto ri = innerCell.inner_r(hh); + auto zi = innerCell.inner_z(hh); + auto ro = outer_r(hh); + auto zo = outer_z(hh); auto z4 = zo + (r4 - ro) * (zo - zi) / (ro - ri); auto z_in_ladder = std::abs(z4 - hh.averageGeometry().ladderZ[il]); - auto z_in_module = z_in_ladder - module_length * int(z_in_ladder / module_length); - auto gap = z_in_module < module_tolerance || z_in_module > (module_length - module_tolerance); + auto z_in_module = z_in_ladder - module_length_bpx4 * int(z_in_ladder / module_length_bpx4); + auto gap = z_in_module < module_tolerance_bpx4 || z_in_module > (module_length_bpx4 - module_tolerance_bpx4); auto holeP = z4 > hh.averageGeometry().ladderMaxZ[il] && z4 < hh.averageGeometry().endCapZ[0]; auto holeN = z4 < hh.averageGeometry().ladderMinZ[il] && z4 > hh.averageGeometry().endCapZ[1]; return gap || holeP || holeN; @@ -290,13 +284,12 @@ class GPUCACell { // the ntuplets is then saved if the number of hits it contains is greater // than a threshold - tmpNtuplet.push_back_unsafe(theDoubletId); + tmpNtuplet.push_back_unsafe(theDoubletId_); assert(tmpNtuplet.size() <= 4); bool last = true; - for (int j = 0; j < outerNeighbors().size(); ++j) { - auto otherCell = outerNeighbors()[j]; - if (cells[otherCell].theDoubletId < 0) + for (unsigned int otherCell : outerNeighbors()) { + if (cells[otherCell].theDoubletId_ < 0) continue; // killed by earlyFishbone last = false; cells[otherCell].find_ntuplets( @@ -329,16 +322,23 @@ class GPUCACell { assert(tmpNtuplet.size() < 4); } + // Cell status management + __device__ __forceinline__ void kill() { theDoubletId_ = -1; } + __device__ __forceinline__ bool isKilled() const { return theDoubletId_ < 0; } + + __device__ __forceinline__ int16_t layerPairId() const { return theLayerPairId_; } + + __device__ __forceinline__ bool unused() const { return !theUsed_; } + __device__ __forceinline__ void setUsedBit(uint16_t bit) { theUsed_ |= bit; } + private: CellNeighbors* theOuterNeighbors; CellTracks* theTracks; -public: - int32_t theDoubletId; - int16_t theLayerPairId; - uint16_t theUsed; // tbd + int32_t theDoubletId_; + int16_t theLayerPairId_; + uint16_t theUsed_; // tbd -private: float theInnerZ; float theInnerR; 
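// (theInnerZ / theInnerR cache the inner hit's z and r at init() time, the
// access-pattern optimization noted there: inner_z() / inner_r() read the
// cached copy instead of going back to the hit SoA.)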
hindex_type theInnerHitId; diff --git a/src/cudadev/plugin-PixelTriplets/HelixFitOnGPU.cc b/src/cudadev/plugin-PixelTriplets/HelixFitOnGPU.cc index bae8a88b6..1b9d292b8 100644 --- a/src/cudadev/plugin-PixelTriplets/HelixFitOnGPU.cc +++ b/src/cudadev/plugin-PixelTriplets/HelixFitOnGPU.cc @@ -4,13 +4,13 @@ void HelixFitOnGPU::allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *helix_fit_results) { - tuples_d = tuples; - tupleMultiplicity_d = tupleMultiplicity; - outputSoa_d = helix_fit_results; + tuples_ = tuples; + tupleMultiplicity_ = tupleMultiplicity; + outputSoa_ = helix_fit_results; - assert(tuples_d); - assert(tupleMultiplicity_d); - assert(outputSoa_d); + assert(tuples_); + assert(tupleMultiplicity_); + assert(outputSoa_); } void HelixFitOnGPU::deallocateOnGPU() {} diff --git a/src/cudadev/plugin-PixelTriplets/HelixFitOnGPU.h b/src/cudadev/plugin-PixelTriplets/HelixFitOnGPU.h index 77ce7719d..fee0f8dae 100644 --- a/src/cudadev/plugin-PixelTriplets/HelixFitOnGPU.h +++ b/src/cudadev/plugin-PixelTriplets/HelixFitOnGPU.h @@ -1,35 +1,35 @@ -#ifndef RecoPixelVertexing_PixelTrackFitting_plugins_HelixFitOnGPU_h -#define RecoPixelVertexing_PixelTrackFitting_plugins_HelixFitOnGPU_h +#ifndef RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h +#define RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h #include "CUDADataFormats/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" #include "CAConstants.h" #include "FitResult.h" -namespace Rfit { +namespace riemannFit { // in case of memory issue can be made smaller - constexpr uint32_t maxNumberOfConcurrentFits() { return CAConstants::maxNumberOfTuples(); } - constexpr uint32_t stride() { return maxNumberOfConcurrentFits(); } + constexpr uint32_t maxNumberOfConcurrentFits = caConstants::maxNumberOfTuples; + constexpr uint32_t stride = maxNumberOfConcurrentFits; using Matrix3x4d = Eigen::Matrix; - using Map3x4d = Eigen::Map >; + using Map3x4d = Eigen::Map >; using Matrix6x4f = Eigen::Matrix; - using Map6x4f = Eigen::Map >; + using Map6x4f = Eigen::Map >; // hits template using Matrix3xNd = Eigen::Matrix; template - using Map3xNd = Eigen::Map, 0, Eigen::Stride<3 * stride(), stride()> >; + using Map3xNd = Eigen::Map, 0, Eigen::Stride<3 * stride, stride> >; // errors template using Matrix6xNf = Eigen::Matrix; template - using Map6xNf = Eigen::Map, 0, Eigen::Stride<6 * stride(), stride()> >; + using Map6xNf = Eigen::Map, 0, Eigen::Stride<6 * stride, stride> >; // fast fit - using Map4d = Eigen::Map >; + using Map4d = Eigen::Map >; -} // namespace Rfit +} // namespace riemannFit class HelixFitOnGPU { public: @@ -38,7 +38,7 @@ class HelixFitOnGPU { using Tuples = pixelTrack::HitContainer; using OutputSoA = pixelTrack::TrackSoA; - using TupleMultiplicity = CAConstants::TupleMultiplicity; + using TupleMultiplicity = caConstants::TupleMultiplicity; explicit HelixFitOnGPU(float bf, bool fit5as4) : bField_(bf), fit5as4_(fit5as4) {} ~HelixFitOnGPU() { deallocateOnGPU(); } @@ -54,15 +54,15 @@ class HelixFitOnGPU { void deallocateOnGPU(); private: - static constexpr uint32_t maxNumberOfConcurrentFits_ = Rfit::maxNumberOfConcurrentFits(); + static constexpr uint32_t maxNumberOfConcurrentFits_ = riemannFit::maxNumberOfConcurrentFits; // fowarded - Tuples const *tuples_d = nullptr; - TupleMultiplicity const *tupleMultiplicity_d = nullptr; - OutputSoA *outputSoa_d; + Tuples const *tuples_ = nullptr; + TupleMultiplicity const 
*tupleMultiplicity_ = nullptr; + OutputSoA *outputSoa_; float bField_; const bool fit5as4_; }; -#endif // RecoPixelVertexing_PixelTrackFitting_plugins_HelixFitOnGPU_h +#endif // RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h diff --git a/src/cudadev/plugin-PixelTriplets/RiemannFit.h b/src/cudadev/plugin-PixelTriplets/RiemannFit.h index 994b1dcf9..4968b1777 100644 --- a/src/cudadev/plugin-PixelTriplets/RiemannFit.h +++ b/src/cudadev/plugin-PixelTriplets/RiemannFit.h @@ -3,12 +3,12 @@ #include "FitUtils.h" -namespace Rfit { +namespace riemannFit { /*! Compute the Radiation length in the uniform hypothesis * - * The Pixel detector, barrel and forward, is considered as an omogeneous - * cilinder of material, whose radiation lengths has been derived from the TDR + * The Pixel detector, barrel and forward, is considered as a homogeneous + * cylinder of material, whose radiation length has been derived from the TDR * plot that shows that 16cm correspond to 0.06 radiation lengths. Therefore * one radiation length corresponds to 16cm/0.06 =~ 267 cm. All radiation * lengths are computed using this unique number, in both regions, barrel and * endcap. * * NB: no angle corrections nor projections are computed inside this routine. * It is therefore the responsibility of the caller to supply the proper - * lengths in input. These lenghts are the path travelled by the particle along + * lengths in input. These lengths are the path traveled by the particle along * its trajectory, namely the so called S of the helix in 3D space. * * \param length_values vector of incremental distances that will be translated * into radiation length equivalent. Each radiation length i is computed - * incrementally with respect to the previous length i-1. The first lenght has + * incrementally with respect to the previous length i-1. The first length has * no reference point (i.e. it has the dca). * * \return incremental radiation lengths that correspond to each segment. @@ -31,11 +31,11 @@ namespace Rfit { __host__ __device__ inline void computeRadLenUniformMaterial(const VNd1& length_values, VNd2& rad_lengths) { // Radiation length of the pixel detector in the uniform assumption, with // 0.06 rad_len at 16 cm - constexpr double XX_0_inv = 0.06 / 16.; - u_int n = length_values.rows(); - rad_lengths(0) = length_values(0) * XX_0_inv; - for (u_int j = 1; j < n; ++j) { - rad_lengths(j) = std::abs(length_values(j) - length_values(j - 1)) * XX_0_inv; + constexpr double xx_0_inv = 0.06 / 16.; + uint n = length_values.rows(); + rad_lengths(0) = length_values(0) * xx_0_inv; + for (uint j = 1; j < n; ++j) { + rad_lengths(j) = std::abs(length_values(j) - length_values(j - 1)) * xx_0_inv; } } @@ -59,41 +59,41 @@ namespace Rfit { */ template - __host__ __device__ inline auto Scatter_cov_line(Matrix2d const* cov_sz, - const V4& fast_fit, - VNd1 const& s_arcs, - VNd2 const& z_values, - const double theta, - const double B, - MatrixNd& ret) { + __host__ __device__ inline auto scatterCovLine(Matrix2d const* cov_sz, + const V4& fast_fit, + VNd1 const& s_arcs, + VNd2 const& z_values, + const double theta, + const double bField, + MatrixNd& ret) { #ifdef RFIT_DEBUG - Rfit::printIt(&s_arcs, "Scatter_cov_line - s_arcs: "); + riemannFit::printIt(&s_arcs, "Scatter_cov_line - s_arcs: "); #endif - constexpr u_int n = N; - double p_t = std::min(20., fast_fit(2) * B); // limit pt to avoid too small error!!! - double p_2 = p_t * p_t * (1. + 1.
/ (fast_fit(3) * fast_fit(3))); + constexpr uint n = N; + double p_t = std::min(20., fast_fit(2) * bField); // limit pt to avoid too small error!!! + double p_2 = p_t * p_t * (1. + 1. / sqr(fast_fit(3))); VectorNd rad_lengths_S; // See documentation at http://eigen.tuxfamily.org/dox/group__TutorialArrayClass.html // Basically, to perform cwise operations on Matrices and Vectors, you need // to transform them into Array-like objects. - VectorNd S_values = s_arcs.array() * s_arcs.array() + z_values.array() * z_values.array(); - S_values = S_values.array().sqrt(); - computeRadLenUniformMaterial(S_values, rad_lengths_S); + VectorNd s_values = s_arcs.array() * s_arcs.array() + z_values.array() * z_values.array(); + s_values = s_values.array().sqrt(); + computeRadLenUniformMaterial(s_values, rad_lengths_S); VectorNd sig2_S; sig2_S = .000225 / p_2 * (1. + 0.038 * rad_lengths_S.array().log()).abs2() * rad_lengths_S.array(); #ifdef RFIT_DEBUG - Rfit::printIt(cov_sz, "Scatter_cov_line - cov_sz: "); + riemannFit::printIt(cov_sz, "Scatter_cov_line - cov_sz: "); #endif Matrix2Nd tmp = Matrix2Nd::Zero(); - for (u_int k = 0; k < n; ++k) { + for (uint k = 0; k < n; ++k) { tmp(k, k) = cov_sz[k](0, 0); tmp(k + n, k + n) = cov_sz[k](1, 1); tmp(k, k + n) = tmp(k + n, k) = cov_sz[k](0, 1); } - for (u_int k = 0; k < n; ++k) { - for (u_int l = k; l < n; ++l) { - for (u_int i = 0; i < std::min(k, l); ++i) { - tmp(k + n, l + n) += std::abs(S_values(k) - S_values(i)) * std::abs(S_values(l) - S_values(i)) * sig2_S(i); + for (uint k = 0; k < n; ++k) { + for (uint l = k; l < n; ++l) { + for (uint i = 0; i < std::min(k, l); ++i) { + tmp(k + n, l + n) += std::abs(s_values(k) - s_values(i)) * std::abs(s_values(l) - s_values(i)) * sig2_S(i); } tmp(l + n, k + n) = tmp(k + n, l + n); } @@ -101,7 +101,7 @@ namespace Rfit { // We are interested only in the errors orthogonal to the rotated s-axis // which, in our formalism, are in the lower square matrix. #ifdef RFIT_DEBUG - Rfit::printIt(&tmp, "Scatter_cov_line - tmp: "); + riemannFit::printIt(&tmp, "Scatter_cov_line - tmp: "); #endif ret = tmp.block(n, n, n, n); } @@ -120,41 +120,41 @@ namespace Rfit { negligible). */ template - __host__ __device__ inline MatrixNd Scatter_cov_rad(const M2xN& p2D, + __host__ __device__ inline MatrixNd scatter_cov_rad(const M2xN& p2D, const V4& fast_fit, VectorNd const& rad, double B) { - constexpr u_int n = N; + constexpr uint n = N; double p_t = std::min(20., fast_fit(2) * B); // limit pt to avoid too small error!!! - double p_2 = p_t * p_t * (1. + 1. / (fast_fit(3) * fast_fit(3))); + double p_2 = p_t * p_t * (1. + 1. / sqr(fast_fit(3))); double theta = atan(fast_fit(3)); theta = theta < 0. ? theta + M_PI : theta; VectorNd s_values; VectorNd rad_lengths; - const Vector2d o(fast_fit(0), fast_fit(1)); + const Vector2d oVec(fast_fit(0), fast_fit(1)); // associated Jacobian, used in weights and errors computation - for (u_int i = 0; i < n; ++i) { // x - Vector2d p = p2D.block(0, i, 2, 1) - o; - const double cross = cross2D(-o, p); - const double dot = (-o).dot(p); - const double atan2_ = atan2(cross, dot); - s_values(i) = std::abs(atan2_ * fast_fit(2)); + for (uint i = 0; i < n; ++i) { // x + Vector2d pVec = p2D.block(0, i, 2, 1) - oVec; + const double cross = cross2D(-oVec, pVec); + const double dot = (-oVec).dot(pVec); + const double tempAtan2 = atan2(cross, dot); + s_values(i) = std::abs(tempAtan2 * fast_fit(2)); } - computeRadLenUniformMaterial(s_values * sqrt(1. + 1. 
/ (fast_fit(3) * fast_fit(3))), rad_lengths); + computeRadLenUniformMaterial(s_values * sqrt(1. + 1. / sqr(fast_fit(3))), rad_lengths); MatrixNd scatter_cov_rad = MatrixNd::Zero(); VectorNd sig2 = (1. + 0.038 * rad_lengths.array().log()).abs2() * rad_lengths.array(); sig2 *= 0.000225 / (p_2 * sqr(sin(theta))); - for (u_int k = 0; k < n; ++k) { - for (u_int l = k; l < n; ++l) { - for (u_int i = 0; i < std::min(k, l); ++i) { + for (uint k = 0; k < n; ++k) { + for (uint l = k; l < n; ++l) { + for (uint i = 0; i < std::min(k, l); ++i) { scatter_cov_rad(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2(i); } scatter_cov_rad(l, k) = scatter_cov_rad(k, l); } } #ifdef RFIT_DEBUG - Rfit::printIt(&scatter_cov_rad, "Scatter_cov_rad - scatter_cov_rad: "); + riemannFit::printIt(&scatter_cov_rad, "Scatter_cov_rad - scatter_cov_rad: "); #endif return scatter_cov_rad; } @@ -175,12 +175,12 @@ namespace Rfit { printf("Address of p2D: %p\n", &p2D); #endif printIt(&p2D, "cov_radtocart - p2D:"); - constexpr u_int n = N; + constexpr uint n = N; Matrix2Nd cov_cart = Matrix2Nd::Zero(); VectorNd rad_inv = rad.cwiseInverse(); printIt(&rad_inv, "cov_radtocart - rad_inv:"); - for (u_int i = 0; i < n; ++i) { - for (u_int j = i; j < n; ++j) { + for (uint i = 0; i < n; ++i) { + for (uint j = i; j < n; ++j) { cov_cart(i, j) = cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(1, j) * rad_inv(j); cov_cart(i + n, j + n) = cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(0, j) * rad_inv(j); cov_cart(i, j + n) = -cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(0, j) * rad_inv(j); @@ -208,10 +208,10 @@ namespace Rfit { __host__ __device__ inline VectorNd cov_carttorad(const M2xN& p2D, const Matrix2Nd& cov_cart, const VectorNd& rad) { - constexpr u_int n = N; + constexpr uint n = N; VectorNd cov_rad; const VectorNd rad_inv2 = rad.cwiseInverse().array().square(); - for (u_int i = 0; i < n; ++i) { + for (uint i = 0; i < n; ++i) { //!< in case you have (0,0) to avoid dividing by 0 radius if (rad(i) < 1.e-4) cov_rad(i) = cov_cart(i, i); @@ -240,9 +240,9 @@ namespace Rfit { const Matrix2Nd& cov_cart, V4& fast_fit, const VectorNd& rad) { - constexpr u_int n = N; + constexpr uint n = N; VectorNd cov_rad; - for (u_int i = 0; i < n; ++i) { + for (uint i = 0; i < n; ++i) { //!< in case you have (0,0) to avoid dividing by 0 radius if (rad(i) < 1.e-4) cov_rad(i) = cov_cart(i, i); // TO FIX @@ -272,7 +272,7 @@ namespace Rfit { */ template - __host__ __device__ inline VectorNd Weight_circle(const MatrixNd& cov_rad_inv) { + __host__ __device__ inline VectorNd weightCircle(const MatrixNd& cov_rad_inv) { return cov_rad_inv.colwise().sum().transpose(); } @@ -285,7 +285,7 @@ namespace Rfit { \return q int 1 or -1. */ template - __host__ __device__ inline int32_t Charge(const M2xN& p2D, const Vector3d& par_uvr) { + __host__ __device__ inline int32_t charge(const M2xN& p2D, const Vector3d& par_uvr) { return ((p2D(0, 1) - p2D(0, 0)) * (par_uvr.y() - p2D(1, 0)) - (p2D(1, 1) - p2D(1, 0)) * (par_uvr.x() - p2D(0, 0)) > 0) ? -1 @@ -342,7 +342,7 @@ namespace Rfit { /*! \brief 2D version of min_eigen3D(). - \param A the Matrix you want to know eigenvector and eigenvalue. + \param aMat the Matrix you want to know eigenvector and eigenvalue. \param chi2 the double were the chi2-related quantity will be stored \return the eigenvector associated to the minimum eigenvalue. \detail The computedDirect() method of SelfAdjointEigenSolver for 2x2 Matrix @@ -350,9 +350,9 @@ namespace Rfit { significantly in single precision. 
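The sign convention in the renamed `charge()` helper reduces to the z-component of a 2D cross product: the first chord of the track crossed with the vector from the first hit to the fitted circle centre fixes the sense of rotation. A scalar restatement (names are illustrative):

```cpp
// Sketch of the riemannFit::charge() criterion: p0, p1 are the first two
// hits in the transverse plane, (x0, y0) the fitted circle centre.
inline int trackCharge(double p0x, double p0y, double p1x, double p1y, double x0, double y0) {
  // z-component of (p1 - p0) x (centre - p0); a positive value (centre to
  // the left of the first chord) maps to charge -1 in this convention
  const double cross = (p1x - p0x) * (y0 - p0y) - (p1y - p0y) * (x0 - p0x);
  return cross > 0 ? -1 : 1;
}
```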
*/ - __host__ __device__ inline Vector2d min_eigen2D(const Matrix2d& A, double& chi2) { + __host__ __device__ inline Vector2d min_eigen2D(const Matrix2d& aMat, double& chi2) { Eigen::SelfAdjointEigenSolver solver(2); - solver.computeDirect(A); + solver.computeDirect(aMat); int min_index; chi2 = solver.eigenvalues().minCoeff(&min_index); return solver.eigenvectors().col(min_index); @@ -372,48 +372,48 @@ namespace Rfit { */ template - __host__ __device__ inline void Fast_fit(const M3xN& hits, V4& result) { + __host__ __device__ inline void fastFit(const M3xN& hits, V4& result) { constexpr uint32_t N = M3xN::ColsAtCompileTime; constexpr auto n = N; // get the number of hits printIt(&hits, "Fast_fit - hits: "); // CIRCLE FIT // Make segments between middle-to-first(b) and last-to-first(c) hits - const Vector2d b = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); - const Vector2d c = hits.block(0, n - 1, 2, 1) - hits.block(0, 0, 2, 1); - printIt(&b, "Fast_fit - b: "); - printIt(&c, "Fast_fit - c: "); + const Vector2d bVec = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); + const Vector2d cVec = hits.block(0, n - 1, 2, 1) - hits.block(0, 0, 2, 1); + printIt(&bVec, "Fast_fit - b: "); + printIt(&cVec, "Fast_fit - c: "); // Compute their lengths - auto b2 = b.squaredNorm(); - auto c2 = c.squaredNorm(); + auto b2 = bVec.squaredNorm(); + auto c2 = cVec.squaredNorm(); // The algebra has been verified (MR). The usual approach has been followed: // * use an orthogonal reference frame passing from the first point. // * build the segments (chords) // * build orthogonal lines through mid points // * make a system and solve for X0 and Y0. // * add the initial point - bool flip = abs(b.x()) < abs(b.y()); - auto bx = flip ? b.y() : b.x(); - auto by = flip ? b.x() : b.y(); - auto cx = flip ? c.y() : c.x(); - auto cy = flip ? c.x() : c.y(); + bool flip = abs(bVec.x()) < abs(bVec.y()); + auto bx = flip ? bVec.y() : bVec.x(); + auto by = flip ? bVec.x() : bVec.y(); + auto cx = flip ? cVec.y() : cVec.x(); + auto cy = flip ? cVec.x() : cVec.y(); //!< in case b.x is 0 (2 hits with same x) auto div = 2. * (cx * by - bx * cy); // if aligned TO FIX - auto Y0 = (cx * b2 - bx * c2) / div; - auto X0 = (0.5 * b2 - Y0 * by) / bx; - result(0) = hits(0, 0) + (flip ? Y0 : X0); - result(1) = hits(1, 0) + (flip ? X0 : Y0); - result(2) = sqrt(sqr(X0) + sqr(Y0)); + auto y0 = (cx * b2 - bx * c2) / div; + auto x0 = (0.5 * b2 - y0 * by) / bx; + result(0) = hits(0, 0) + (flip ? y0 : x0); + result(1) = hits(1, 0) + (flip ? x0 : y0); + result(2) = sqrt(sqr(x0) + sqr(y0)); printIt(&result, "Fast_fit - result: "); // LINE FIT - const Vector2d d = hits.block(0, 0, 2, 1) - result.head(2); - const Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2); - printIt(&e, "Fast_fit - e: "); - printIt(&d, "Fast_fit - d: "); + const Vector2d dVec = hits.block(0, 0, 2, 1) - result.head(2); + const Vector2d eVec = hits.block(0, n - 1, 2, 1) - result.head(2); + printIt(&eVec, "Fast_fit - e: "); + printIt(&dVec, "Fast_fit - d: "); // Compute the arc-length between first and last point: L = R * theta = R * atan (tan (Theta) ) - auto dr = result(2) * atan2(cross2D(d, e), d.dot(e)); + auto dr = result(2) * atan2(cross2D(dVec, eVec), dVec.dot(eVec)); // Simple difference in Z between last and first hit auto dz = hits(2, n - 1) - hits(2, 0); @@ -432,7 +432,7 @@ namespace Rfit { \param hits_cov2D covariance matrix of 2D points. \param fast_fit pre-fit result in this form: (X0,Y0,R,tan(theta)). (tan(theta) is not used). 
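A usage sketch of the renamed `fastFit()` entry point on a compile-time triplet; the hit coordinates are invented, and `RiemannFit.h` plus Eigen are assumed to be on the include path:

```cpp
#include <Eigen/Core>
// assumes "RiemannFit.h" (this file) has been included

void fastFitExample() {
  Eigen::Matrix<double, 3, 3> hits;  // columns = hits, rows = x, y, z (made-up values)
  hits << 1.0, 4.0, 7.0,
          2.0, 4.1, 5.9,
          0.1, 1.0, 2.0;
  Eigen::Vector4d fast_fit;             // filled with (X0, Y0, R, tan(theta))
  riemannFit::fastFit(hits, fast_fit);  // pre-fit used to seed circleFit()/lineFit()
}
```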
- \param B magnetic field + \param bField magnetic field \param error flag for error computation. \param scattering flag for multiple scattering \return circle circle_fit: @@ -452,18 +452,18 @@ namespace Rfit { scattering. */ template - __host__ __device__ inline circle_fit Circle_fit(const M2xN& hits2D, - const Matrix2Nd& hits_cov2D, - const V4& fast_fit, - const VectorNd& rad, - const double B, - const bool error) { + __host__ __device__ inline CircleFit circleFit(const M2xN& hits2D, + const Matrix2Nd& hits_cov2D, + const V4& fast_fit, + const VectorNd& rad, + const double bField, + const bool error) { #ifdef RFIT_DEBUG printf("circle_fit - enter\n"); #endif // INITIALIZATION - Matrix2Nd V = hits_cov2D; - constexpr u_int n = N; + Matrix2Nd vMat = hits_cov2D; + constexpr uint n = N; printIt(&hits2D, "circle_fit - hits2D:"); printIt(&hits_cov2D, "circle_fit - hits_cov2D:"); @@ -472,25 +472,25 @@ namespace Rfit { #endif // WEIGHT COMPUTATION VectorNd weight; - MatrixNd G; + MatrixNd gMat; double renorm; { - MatrixNd cov_rad = cov_carttorad_prefit(hits2D, V, fast_fit, rad).asDiagonal(); - MatrixNd scatter_cov_rad = Scatter_cov_rad(hits2D, fast_fit, rad, B); - printIt(&scatter_cov_rad, "circle_fit - scatter_cov_rad:"); + MatrixNd cov_rad = cov_carttorad_prefit(hits2D, vMat, fast_fit, rad).asDiagonal(); + MatrixNd scatterCovRadMat = scatter_cov_rad(hits2D, fast_fit, rad, bField); + printIt(&scatterCovRadMat, "circle_fit - scatter_cov_rad:"); printIt(&hits2D, "circle_fit - hits2D bis:"); #ifdef RFIT_DEBUG printf("Address of hits2D: a) %p\n", &hits2D); #endif - V += cov_radtocart(hits2D, scatter_cov_rad, rad); - printIt(&V, "circle_fit - V:"); - cov_rad += scatter_cov_rad; + vMat += cov_radtocart(hits2D, scatterCovRadMat, rad); + printIt(&vMat, "circle_fit - V:"); + cov_rad += scatterCovRadMat; printIt(&cov_rad, "circle_fit - cov_rad:"); - math::cholesky::invert(cov_rad, G); - // G = cov_rad.inverse(); - renorm = G.sum(); - G *= 1. / renorm; - weight = Weight_circle(G); + math::cholesky::invert(cov_rad, gMat); + // gMat = cov_rad.inverse(); + renorm = gMat.sum(); + gMat *= 1. / renorm; + weight = weightCircle(gMat); } printIt(&weight, "circle_fit - weight:"); @@ -503,19 +503,19 @@ namespace Rfit { #ifdef RFIT_DEBUG printf("Address of hits2D: b) %p\n", &hits2D); #endif - const Vector2d h_ = hits2D.rowwise().mean(); // centroid - printIt(&h_, "circle_fit - h_:"); + const Vector2d hCentroid = hits2D.rowwise().mean(); // centroid + printIt(&hCentroid, "circle_fit - h_:"); Matrix3xNd p3D; - p3D.block(0, 0, 2, n) = hits2D.colwise() - h_; + p3D.block(0, 0, 2, n) = hits2D.colwise() - hCentroid; printIt(&p3D, "circle_fit - p3D: a)"); Vector2Nd mc; // centered hits, used in error computation mc << p3D.row(0).transpose(), p3D.row(1).transpose(); printIt(&mc, "circle_fit - mc(centered hits):"); // scale - const double q = mc.squaredNorm(); - const double s = sqrt(n * 1. / q); // scaling factor - p3D *= s; + const double tempQ = mc.squaredNorm(); + const double tempS = sqrt(n * 1. 
/ tempQ); // scaling factor + p3D.block(0, 0, 2, n) *= tempS; // project on paraboloid p3D.row(2) = p3D.block(0, 0, 2, n).colwise().squaredNorm(); @@ -529,22 +529,22 @@ // compute Vector3d r0; r0.noalias() = p3D * weight; // center of gravity - const Matrix3xNd X = p3D.colwise() - r0; - Matrix3d A = X * G * X.transpose(); - printIt(&A, "circle_fit - A:"); + const Matrix3xNd xMat = p3D.colwise() - r0; + Matrix3d aMat = xMat * gMat * xMat.transpose(); + printIt(&aMat, "circle_fit - A:"); #ifdef RFIT_DEBUG printf("circle_fit - MINIMIZE\n"); #endif // minimize double chi2; - Vector3d v = min_eigen3D(A, chi2); + Vector3d vVec = min_eigen3D(aMat, chi2); #ifdef RFIT_DEBUG printf("circle_fit - AFTER MIN_EIGEN\n"); #endif - printIt(&v, "v BEFORE INVERSION"); - v *= (v(2) > 0) ? 1 : -1; // TO FIX dovrebbe essere N(3)>0 - printIt(&v, "v AFTER INVERSION"); + printIt(&vVec, "v BEFORE INVERSION"); + vVec *= (vVec(2) > 0) ? 1 : -1; // TO FIX: should be N(3)>0 + printIt(&vVec, "v AFTER INVERSION"); // This hack to be able to run on GPU where the automatic assignment to a // double from the vector multiplication is not working. #ifdef RFIT_DEBUG @@ -554,12 +554,11 @@ #ifdef RFIT_DEBUG printf("circle_fit - AFTER MIN_EIGEN 2\n"); #endif - cm = -v.transpose() * r0; + cm = -vVec.transpose() * r0; #ifdef RFIT_DEBUG printf("circle_fit - AFTER MIN_EIGEN 3\n"); #endif - const double c = cm(0, 0); - // const double c = -v.transpose() * r0; + const double tempC = cm(0, 0); #ifdef RFIT_DEBUG printf("circle_fit - COMPUTE CIRCLE PARAMETER\n"); @@ -567,20 +566,20 @@ // COMPUTE CIRCLE PARAMETER // auxiliary quantities - const double h = sqrt(1. - sqr(v(2)) - 4. * c * v(2)); - const double v2x2_inv = 1. / (2. * v(2)); - const double s_inv = 1. / s; - Vector3d par_uvr_; // used in error propagation - par_uvr_ << -v(0) * v2x2_inv, -v(1) * v2x2_inv, h * v2x2_inv; - - circle_fit circle; - circle.par << par_uvr_(0) * s_inv + h_(0), par_uvr_(1) * s_inv + h_(1), par_uvr_(2) * s_inv; - circle.q = Charge(hits2D, circle.par); - circle.chi2 = abs(chi2) * renorm * 1. / sqr(2 * v(2) * par_uvr_(2) * s); + const double tempH = sqrt(1. - sqr(vVec(2)) - 4. * tempC * vVec(2)); + const double v2x2_inv = 1. / (2. * vVec(2)); + const double s_inv = 1.
/ tempS; + Vector3d par_uvr; // used in error propagation + par_uvr << -vVec(0) * v2x2_inv, -vVec(1) * v2x2_inv, tempH * v2x2_inv; + + CircleFit circle; + circle.par << par_uvr(0) * s_inv + hCentroid(0), par_uvr(1) * s_inv + hCentroid(1), par_uvr(2) * s_inv; + circle.qCharge = charge(hits2D, circle.par); + circle.chi2 = abs(chi2) * renorm / sqr(2 * vVec(2) * par_uvr(2) * tempS); printIt(&circle.par, "circle_fit - CIRCLE PARAMETERS:"); printIt(&circle.cov, "circle_fit - CIRCLE COVARIANCE:"); #ifdef RFIT_DEBUG - printf("circle_fit - CIRCLE CHARGE: %d\n", circle.q); + printf("circle_fit - CIRCLE CHARGE: %d\n", circle.qCharge); #endif #ifdef RFIT_DEBUG @@ -591,28 +590,28 @@ namespace Rfit { #ifdef RFIT_DEBUG printf("circle_fit - ERROR PRPAGATION ACTIVATED\n"); #endif - ArrayNd Vcs_[2][2]; // cov matrix of center & scaled points - MatrixNd C[3][3]; // cov matrix of 3D transformed points + ArrayNd vcsMat[2][2]; // cov matrix of center & scaled points + MatrixNd cMat[3][3]; // cov matrix of 3D transformed points #ifdef RFIT_DEBUG printf("circle_fit - ERROR PRPAGATION ACTIVATED 2\n"); #endif { Eigen::Matrix cm; Eigen::Matrix cm2; - cm = mc.transpose() * V * mc; - const double c = cm(0, 0); - Matrix2Nd Vcs; - Vcs.template triangularView() = - (sqr(s) * V + sqr(sqr(s)) * 1. / (4. * q * n) * - (2. * V.squaredNorm() + 4. * c) * // mc.transpose() * V * mc) * - (mc * mc.transpose())); - - printIt(&Vcs, "circle_fit - Vcs:"); - C[0][0] = Vcs.block(0, 0, n, n).template selfadjointView(); - Vcs_[0][1] = Vcs.block(0, n, n, n); - C[1][1] = Vcs.block(n, n, n, n).template selfadjointView(); - Vcs_[1][0] = Vcs_[0][1].transpose(); - printIt(&Vcs, "circle_fit - Vcs:"); + cm = mc.transpose() * vMat * mc; + const double tempC2 = cm(0, 0); + Matrix2Nd tempVcsMat; + tempVcsMat.template triangularView() = + (sqr(tempS) * vMat + sqr(sqr(tempS)) * 1. / (4. * tempQ * n) * + (2. * vMat.squaredNorm() + 4. * tempC2) * // mc.transpose() * V * mc) * + (mc * mc.transpose())); + + printIt(&tempVcsMat, "circle_fit - Vcs:"); + cMat[0][0] = tempVcsMat.block(0, 0, n, n).template selfadjointView(); + vcsMat[0][1] = tempVcsMat.block(0, n, n, n); + cMat[1][1] = tempVcsMat.block(n, n, n, n).template selfadjointView(); + vcsMat[1][0] = vcsMat[0][1].transpose(); + printIt(&tempVcsMat, "circle_fit - Vcs:"); } { @@ -622,137 +621,139 @@ namespace Rfit { const ArrayNd t01 = p3D.row(0).transpose() * p3D.row(1); const ArrayNd t11 = p3D.row(1).transpose() * p3D.row(1); const ArrayNd t10 = t01.transpose(); - Vcs_[0][0] = C[0][0]; - ; - C[0][1] = Vcs_[0][1]; - C[0][2] = 2. * (Vcs_[0][0] * t0 + Vcs_[0][1] * t1); - Vcs_[1][1] = C[1][1]; - C[1][2] = 2. * (Vcs_[1][0] * t0 + Vcs_[1][1] * t1); + vcsMat[0][0] = cMat[0][0]; + cMat[0][1] = vcsMat[0][1]; + cMat[0][2] = 2. * (vcsMat[0][0] * t0 + vcsMat[0][1] * t1); + vcsMat[1][1] = cMat[1][1]; + cMat[1][2] = 2. * (vcsMat[1][0] * t0 + vcsMat[1][1] * t1); MatrixNd tmp; tmp.template triangularView() = - (2. * (Vcs_[0][0] * Vcs_[0][0] + Vcs_[0][0] * Vcs_[0][1] + Vcs_[1][1] * Vcs_[1][0] + - Vcs_[1][1] * Vcs_[1][1]) + - 4. * (Vcs_[0][0] * t00 + Vcs_[0][1] * t01 + Vcs_[1][0] * t10 + Vcs_[1][1] * t11)) + (2. * (vcsMat[0][0] * vcsMat[0][0] + vcsMat[0][0] * vcsMat[0][1] + vcsMat[1][1] * vcsMat[1][0] + + vcsMat[1][1] * vcsMat[1][1]) + + 4. 
* (vcsMat[0][0] * t00 + vcsMat[0][1] * t01 + vcsMat[1][0] * t10 + vcsMat[1][1] * t11)) .matrix(); - C[2][2] = tmp.template selfadjointView(); + cMat[2][2] = tmp.template selfadjointView(); } - printIt(&C[0][0], "circle_fit - C[0][0]:"); + printIt(&cMat[0][0], "circle_fit - C[0][0]:"); - Matrix3d C0; // cov matrix of center of gravity (r0.x,r0.y,r0.z) - for (u_int i = 0; i < 3; ++i) { - for (u_int j = i; j < 3; ++j) { + Matrix3d c0Mat; // cov matrix of center of gravity (r0.x,r0.y,r0.z) + for (uint i = 0; i < 3; ++i) { + for (uint j = i; j < 3; ++j) { Eigen::Matrix tmp; - tmp = weight.transpose() * C[i][j] * weight; - const double c = tmp(0, 0); - C0(i, j) = c; //weight.transpose() * C[i][j] * weight; - C0(j, i) = C0(i, j); + tmp = weight.transpose() * cMat[i][j] * weight; + // Workaround to get things working in GPU + const double tempC = tmp(0, 0); + c0Mat(i, j) = tempC; //weight.transpose() * C[i][j] * weight; + c0Mat(j, i) = c0Mat(i, j); } } - printIt(&C0, "circle_fit - C0:"); + printIt(&c0Mat, "circle_fit - C0:"); - const MatrixNd W = weight * weight.transpose(); - const MatrixNd H = MatrixNd::Identity().rowwise() - weight.transpose(); - const MatrixNx3d s_v = H * p3D.transpose(); - printIt(&W, "circle_fit - W:"); - printIt(&H, "circle_fit - H:"); + const MatrixNd wMat = weight * weight.transpose(); + const MatrixNd hMat = MatrixNd::Identity().rowwise() - weight.transpose(); + const MatrixNx3d s_v = hMat * p3D.transpose(); + printIt(&wMat, "circle_fit - W:"); + printIt(&hMat, "circle_fit - H:"); printIt(&s_v, "circle_fit - s_v:"); - MatrixNd D_[3][3]; // cov(s_v) - { - D_[0][0] = (H * C[0][0] * H.transpose()).cwiseProduct(W); - D_[0][1] = (H * C[0][1] * H.transpose()).cwiseProduct(W); - D_[0][2] = (H * C[0][2] * H.transpose()).cwiseProduct(W); - D_[1][1] = (H * C[1][1] * H.transpose()).cwiseProduct(W); - D_[1][2] = (H * C[1][2] * H.transpose()).cwiseProduct(W); - D_[2][2] = (H * C[2][2] * H.transpose()).cwiseProduct(W); - D_[1][0] = D_[0][1].transpose(); - D_[2][0] = D_[0][2].transpose(); - D_[2][1] = D_[1][2].transpose(); - } - printIt(&D_[0][0], "circle_fit - D_[0][0]:"); - - constexpr u_int nu[6][2] = {{0, 0}, {0, 1}, {0, 2}, {1, 1}, {1, 2}, {2, 2}}; - - Matrix6d E; // cov matrix of the 6 independent elements of A - for (u_int a = 0; a < 6; ++a) { - const u_int i = nu[a][0], j = nu[a][1]; - for (u_int b = a; b < 6; ++b) { - const u_int k = nu[b][0], l = nu[b][1]; + MatrixNd dMat[3][3]; // cov(s_v) + dMat[0][0] = (hMat * cMat[0][0] * hMat.transpose()).cwiseProduct(wMat); + dMat[0][1] = (hMat * cMat[0][1] * hMat.transpose()).cwiseProduct(wMat); + dMat[0][2] = (hMat * cMat[0][2] * hMat.transpose()).cwiseProduct(wMat); + dMat[1][1] = (hMat * cMat[1][1] * hMat.transpose()).cwiseProduct(wMat); + dMat[1][2] = (hMat * cMat[1][2] * hMat.transpose()).cwiseProduct(wMat); + dMat[2][2] = (hMat * cMat[2][2] * hMat.transpose()).cwiseProduct(wMat); + dMat[1][0] = dMat[0][1].transpose(); + dMat[2][0] = dMat[0][2].transpose(); + dMat[2][1] = dMat[1][2].transpose(); + printIt(&dMat[0][0], "circle_fit - D_[0][0]:"); + + constexpr uint nu[6][2] = {{0, 0}, {0, 1}, {0, 2}, {1, 1}, {1, 2}, {2, 2}}; + + Matrix6d eMat; // cov matrix of the 6 independent elements of A + for (uint a = 0; a < 6; ++a) { + const uint i = nu[a][0], j = nu[a][1]; + for (uint b = a; b < 6; ++b) { + const uint k = nu[b][0], l = nu[b][1]; VectorNd t0(n); VectorNd t1(n); if (l == k) { - t0 = 2. * D_[j][l] * s_v.col(l); + t0 = 2. * dMat[j][l] * s_v.col(l); if (i == j) t1 = t0; else - t1 = 2. * D_[i][l] * s_v.col(l); + t1 = 2. 
* dMat[i][l] * s_v.col(l); } else { - t0 = D_[j][l] * s_v.col(k) + D_[j][k] * s_v.col(l); + t0 = dMat[j][l] * s_v.col(k) + dMat[j][k] * s_v.col(l); if (i == j) t1 = t0; else - t1 = D_[i][l] * s_v.col(k) + D_[i][k] * s_v.col(l); + t1 = dMat[i][l] * s_v.col(k) + dMat[i][k] * s_v.col(l); } if (i == j) { Eigen::Matrix cm; cm = s_v.col(i).transpose() * (t0 + t1); - const double c = cm(0, 0); - E(a, b) = 0. + c; + // Workaround to get things working in GPU + const double tempC = cm(0, 0); + eMat(a, b) = 0. + tempC; } else { Eigen::Matrix cm; cm = (s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); - const double c = cm(0, 0); - E(a, b) = 0. + c; //(s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); + // Workaround to get things working in GPU + const double tempC = cm(0, 0); + eMat(a, b) = 0. + tempC; //(s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); } if (b != a) - E(b, a) = E(a, b); + eMat(b, a) = eMat(a, b); } } - printIt(&E, "circle_fit - E:"); - - Eigen::Matrix J2; // Jacobian of min_eigen() (numerically computed) - for (u_int a = 0; a < 6; ++a) { - const u_int i = nu[a][0], j = nu[a][1]; - Matrix3d Delta = Matrix3d::Zero(); - Delta(i, j) = Delta(j, i) = abs(A(i, j) * d); - J2.col(a) = min_eigen3D_fast(A + Delta); - const int sign = (J2.col(a)(2) > 0) ? 1 : -1; - J2.col(a) = (J2.col(a) * sign - v) / Delta(i, j); + printIt(&eMat, "circle_fit - E:"); + + Eigen::Matrix j2Mat; // Jacobian of min_eigen() (numerically computed) + for (uint a = 0; a < 6; ++a) { + const uint i = nu[a][0], j = nu[a][1]; + Matrix3d delta = Matrix3d::Zero(); + delta(i, j) = delta(j, i) = abs(aMat(i, j) * epsilon); + j2Mat.col(a) = min_eigen3D_fast(aMat + delta); + const int sign = (j2Mat.col(a)(2) > 0) ? 1 : -1; + j2Mat.col(a) = (j2Mat.col(a) * sign - vVec) / delta(i, j); } - printIt(&J2, "circle_fit - J2:"); + printIt(&j2Mat, "circle_fit - J2:"); - Matrix4d Cvc; // joint cov matrix of (v0,v1,v2,c) + Matrix4d cvcMat; // joint cov matrix of (v0,v1,v2,c) { - Matrix3d t0 = J2 * E * J2.transpose(); + Matrix3d t0 = j2Mat * eMat * j2Mat.transpose(); Vector3d t1 = -t0 * r0; - Cvc.block(0, 0, 3, 3) = t0; - Cvc.block(0, 3, 3, 1) = t1; - Cvc.block(3, 0, 1, 3) = t1.transpose(); + cvcMat.block(0, 0, 3, 3) = t0; + cvcMat.block(0, 3, 3, 1) = t1; + cvcMat.block(3, 0, 1, 3) = t1.transpose(); Eigen::Matrix cm1; Eigen::Matrix cm3; - cm1 = (v.transpose() * C0 * v); - // cm2 = (C0.cwiseProduct(t0)).sum(); + cm1 = (vVec.transpose() * c0Mat * vVec); + // cm2 = (c0Mat.cwiseProduct(t0)).sum(); cm3 = (r0.transpose() * t0 * r0); - const double c = cm1(0, 0) + (C0.cwiseProduct(t0)).sum() + cm3(0, 0); - Cvc(3, 3) = c; - // (v.transpose() * C0 * v) + (C0.cwiseProduct(t0)).sum() + (r0.transpose() * t0 * r0); + // Workaround to get things working in GPU + const double tempC = cm1(0, 0) + (c0Mat.cwiseProduct(t0)).sum() + cm3(0, 0); + cvcMat(3, 3) = tempC; + // (v.transpose() * c0Mat * v) + (c0Mat.cwiseProduct(t0)).sum() + (r0.transpose() * t0 * r0); } - printIt(&Cvc, "circle_fit - Cvc:"); + printIt(&cvcMat, "circle_fit - Cvc:"); - Eigen::Matrix J3; // Jacobian (v0,v1,v2,c)->(X0,Y0,R) + Eigen::Matrix j3Mat; // Jacobian (v0,v1,v2,c)->(X0,Y0,R) { - const double t = 1. / h; - J3 << -v2x2_inv, 0, v(0) * sqr(v2x2_inv) * 2., 0, 0, -v2x2_inv, v(1) * sqr(v2x2_inv) * 2., 0, - v(0) * v2x2_inv * t, v(1) * v2x2_inv * t, -h * sqr(v2x2_inv) * 2. - (2. * c + v(2)) * v2x2_inv * t, -t; + const double t = 1. 
/ tempH; + j3Mat << -v2x2_inv, 0, vVec(0) * sqr(v2x2_inv) * 2., 0, 0, -v2x2_inv, vVec(1) * sqr(v2x2_inv) * 2., 0, + vVec(0) * v2x2_inv * t, vVec(1) * v2x2_inv * t, + -tempH * sqr(v2x2_inv) * 2. - (2. * tempC + vVec(2)) * v2x2_inv * t, -t; } - printIt(&J3, "circle_fit - J3:"); + printIt(&j3Mat, "circle_fit - J3:"); - const RowVector2Nd Jq = mc.transpose() * s * 1. / n; // var(q) + const RowVector2Nd Jq = mc.transpose() * tempS * 1. / n; // var(q) printIt(&Jq, "circle_fit - Jq:"); - Matrix3d cov_uvr = J3 * Cvc * J3.transpose() * sqr(s_inv) // cov(X0,Y0,R) - + (par_uvr_ * par_uvr_.transpose()) * (Jq * V * Jq.transpose()); + Matrix3d cov_uvr = j3Mat * cvcMat * j3Mat.transpose() * sqr(s_inv) // cov(X0,Y0,R) + + (par_uvr * par_uvr.transpose()) * (Jq * vMat * Jq.transpose()); circle.cov = cov_uvr; } @@ -781,15 +782,15 @@ namespace Rfit { */ template - __host__ __device__ inline line_fit Line_fit(const M3xN& hits, - const M6xN& hits_ge, - const circle_fit& circle, - const V4& fast_fit, - const double B, - const bool error) { + __host__ __device__ inline LineFit lineFit(const M3xN& hits, + const M6xN& hits_ge, + const CircleFit& circle, + const V4& fast_fit, + const double bField, + const bool error) { constexpr uint32_t N = M3xN::ColsAtCompileTime; constexpr auto n = N; - double theta = -circle.q * atan(fast_fit(3)); + double theta = -circle.qCharge * atan(fast_fit(3)); theta = theta < 0. ? theta + M_PI : theta; // Prepare the Rotation Matrix to rotate the points @@ -805,10 +806,10 @@ namespace Rfit { // z values will be ordinary y-values Matrix2xNd p2D = Matrix2xNd::Zero(); - Eigen::Matrix Jx; + Eigen::Matrix jxMat; #ifdef RFIT_DEBUG - printf("Line_fit - B: %g\n", B); + printf("Line_fit - B: %g\n", bField); printIt(&hits, "Line_fit points: "); printIt(&hits_ge, "Line_fit covs: "); printIt(&rot, "Line_fit rot: "); @@ -818,41 +819,41 @@ namespace Rfit { // Slide 11 // a ==> -o i.e. the origin of the circle in XY plane, negative // b ==> p i.e. distances of the points wrt the origin of the circle. - const Vector2d o(circle.par(0), circle.par(1)); + const Vector2d oVec(circle.par(0), circle.par(1)); // associated Jacobian, used in weights and errors computation - Matrix6d Cov = Matrix6d::Zero(); + Matrix6d covMat = Matrix6d::Zero(); Matrix2d cov_sz[N]; - for (u_int i = 0; i < n; ++i) { - Vector2d p = hits.block(0, i, 2, 1) - o; - const double cross = cross2D(-o, p); - const double dot = (-o).dot(p); + for (uint i = 0; i < n; ++i) { + Vector2d pVec = hits.block(0, i, 2, 1) - oVec; + const double cross = cross2D(-oVec, pVec); + const double dot = (-oVec).dot(pVec); // atan2(cross, dot) give back the angle in the transverse plane so tha the // final equation reads: x_i = -q*R*theta (theta = angle returned by atan2) - const double atan2_ = -circle.q * atan2(cross, dot); + const double tempQAtan2 = -circle.qCharge * atan2(cross, dot); // p2D.coeffRef(1, i) = atan2_ * circle.par(2); - p2D(0, i) = atan2_ * circle.par(2); + p2D(0, i) = tempQAtan2 * circle.par(2); // associated Jacobian, used in weights and errors- computation - const double temp0 = -circle.q * circle.par(2) * 1. / (sqr(dot) + sqr(cross)); + const double temp0 = -circle.qCharge * circle.par(2) * 1. 
/ (sqr(dot) + sqr(cross)); double d_X0 = 0., d_Y0 = 0., d_R = 0.; // good approximation for big pt and eta if (error) { - d_X0 = -temp0 * ((p(1) + o(1)) * dot - (p(0) - o(0)) * cross); - d_Y0 = temp0 * ((p(0) + o(0)) * dot - (o(1) - p(1)) * cross); - d_R = atan2_; + d_X0 = -temp0 * ((pVec(1) + oVec(1)) * dot - (pVec(0) - oVec(0)) * cross); + d_Y0 = temp0 * ((pVec(0) + oVec(0)) * dot - (oVec(1) - pVec(1)) * cross); + d_R = tempQAtan2; } - const double d_x = temp0 * (o(1) * dot + o(0) * cross); - const double d_y = temp0 * (-o(0) * dot + o(1) * cross); - Jx << d_X0, d_Y0, d_R, d_x, d_y, 0., 0., 0., 0., 0., 0., 1.; - - Cov.block(0, 0, 3, 3) = circle.cov; - Cov(3, 3) = hits_ge.col(i)[0]; // x errors - Cov(4, 4) = hits_ge.col(i)[2]; // y errors - Cov(5, 5) = hits_ge.col(i)[5]; // z errors - Cov(3, 4) = Cov(4, 3) = hits_ge.col(i)[1]; // cov_xy - Cov(3, 5) = Cov(5, 3) = hits_ge.col(i)[3]; // cov_xz - Cov(4, 5) = Cov(5, 4) = hits_ge.col(i)[4]; // cov_yz - Matrix2d tmp = Jx * Cov * Jx.transpose(); + const double d_x = temp0 * (oVec(1) * dot + oVec(0) * cross); + const double d_y = temp0 * (-oVec(0) * dot + oVec(1) * cross); + jxMat << d_X0, d_Y0, d_R, d_x, d_y, 0., 0., 0., 0., 0., 0., 1.; + + covMat.block(0, 0, 3, 3) = circle.cov; + covMat(3, 3) = hits_ge.col(i)[0]; // x errors + covMat(4, 4) = hits_ge.col(i)[2]; // y errors + covMat(5, 5) = hits_ge.col(i)[5]; // z errors + covMat(3, 4) = covMat(4, 3) = hits_ge.col(i)[1]; // cov_xy + covMat(3, 5) = covMat(5, 3) = hits_ge.col(i)[3]; // cov_xz + covMat(4, 5) = covMat(5, 4) = hits_ge.col(i)[4]; // cov_yz + Matrix2d tmp = jxMat * covMat * jxMat.transpose(); cov_sz[i].noalias() = rot * tmp * rot.transpose(); } // Math of d_{X0,Y0,R,x,y} all verified by hand @@ -861,7 +862,7 @@ namespace Rfit { // The following matrix will contain errors orthogonal to the rotated S // component only, with the Multiple Scattering properly treated!! MatrixNd cov_with_ms; - Scatter_cov_line(cov_sz, fast_fit, p2D.row(0), p2D.row(1), theta, B, cov_with_ms); + scatterCovLine(cov_sz, fast_fit, p2D.row(0), p2D.row(1), theta, bField, cov_with_ms); #ifdef RFIT_DEBUG printIt(cov_sz, "line_fit - cov_sz:"); printIt(&cov_with_ms, "line_fit - cov_with_ms: "); @@ -880,52 +881,54 @@ namespace Rfit { #endif // Build the A Matrix - Matrix2xNd A; - A << MatrixXd::Ones(1, n), p2D_rot.row(0); // rotated s values + Matrix2xNd aMat; + aMat << MatrixXd::Ones(1, n), p2D_rot.row(0); // rotated s values #ifdef RFIT_DEBUG - printIt(&A, "A Matrix:"); + printIt(&aMat, "A Matrix:"); #endif // Build A^T V-1 A, where V-1 is the covariance of only the Y components. - MatrixNd Vy_inv; - math::cholesky::invert(cov_with_ms, Vy_inv); - // MatrixNd Vy_inv = cov_with_ms.inverse(); - Eigen::Matrix Cov_params = A * Vy_inv * A.transpose(); + MatrixNd vyInvMat; + math::cholesky::invert(cov_with_ms, vyInvMat); + // MatrixNd vyInvMat = cov_with_ms.inverse(); + Eigen::Matrix covParamsMat = aMat * vyInvMat * aMat.transpose(); // Compute the Covariance Matrix of the fit parameters - math::cholesky::invert(Cov_params, Cov_params); + math::cholesky::invert(covParamsMat, covParamsMat); // Now Compute the Parameters in the form [2,1] // The first component is q. // The second component is m. - Eigen::Matrix sol = Cov_params * A * Vy_inv * p2D_rot.row(1).transpose(); + Eigen::Matrix sol = covParamsMat * aMat * vyInvMat * p2D_rot.row(1).transpose(); #ifdef RFIT_DEBUG printIt(&sol, "Rotated solutions:"); #endif // We need now to transfer back the results in the original s-z plane - auto common_factor = 1. 
/ (sin(theta) - sol(1, 0) * cos(theta)); - Eigen::Matrix J; - J << 0., common_factor * common_factor, common_factor, sol(0, 0) * cos(theta) * common_factor * common_factor; + const auto sinTheta = sin(theta); + const auto cosTheta = cos(theta); + auto common_factor = 1. / (sinTheta - sol(1, 0) * cosTheta); + Eigen::Matrix jMat; + jMat << 0., common_factor * common_factor, common_factor, sol(0, 0) * cosTheta * common_factor * common_factor; - double m = common_factor * (sol(1, 0) * sin(theta) + cos(theta)); - double q = common_factor * sol(0, 0); - auto cov_mq = J * Cov_params * J.transpose(); + double tempM = common_factor * (sol(1, 0) * sinTheta + cosTheta); + double tempQ = common_factor * sol(0, 0); + auto cov_mq = jMat * covParamsMat * jMat.transpose(); - VectorNd res = p2D_rot.row(1).transpose() - A.transpose() * sol; - double chi2 = res.transpose() * Vy_inv * res; + VectorNd res = p2D_rot.row(1).transpose() - aMat.transpose() * sol; + double chi2 = res.transpose() * vyInvMat * res; - line_fit line; - line.par << m, q; + LineFit line; + line.par << tempM, tempQ; line.cov << cov_mq; line.chi2 = chi2; #ifdef RFIT_DEBUG printf("Common_factor: %g\n", common_factor); - printIt(&J, "Jacobian:"); + printIt(&jMat, "Jacobian:"); printIt(&sol, "Rotated solutions:"); - printIt(&Cov_params, "Cov_params:"); + printIt(&covParamsMat, "Cov_params:"); printIt(&cov_mq, "Rotated Covariance Matrix:"); printIt(&(line.par), "Real Parameters:"); printIt(&(line.cov), "Real Covariance Matrix:"); @@ -959,7 +962,7 @@ namespace Rfit { |(x0,z0)|(x1,z0)|(x2,z0)|.|(y0,z0)|(y1,z0)|(y2,z0)|.|(z0,z0)|(z1,z0)|(z2,z0)| \n |(x0,z1)|(x1,z1)|(x2,z1)|.|(y0,z1)|(y1,z1)|(y2,z1)|.|(z0,z1)|(z1,z1)|(z2,z1)| \n |(x0,z2)|(x1,z2)|(x2,z2)|.|(y0,z2)|(y1,z2)|(y2,z2)|.|(z0,z2)|(z1,z2)|(z2,z2)| - \param B magnetic field in the center of the detector in Gev/cm/c + \param bField magnetic field in the center of the detector in Gev/cm/c unit, in order to perform pt calculation. \param error flag for error computation. \param scattering flag for multiple scattering treatment. @@ -969,37 +972,37 @@ namespace Rfit { */ template - inline helix_fit Helix_fit(const Matrix3xNd& hits, - const Eigen::Matrix& hits_ge, - const double B, - const bool error) { - constexpr u_int n = N; + inline HelixFit helixFit(const Matrix3xNd& hits, + const Eigen::Matrix& hits_ge, + const double bField, + const bool error) { + constexpr uint n = N; VectorNd<4> rad = (hits.block(0, 0, 2, n).colwise().norm()); // Fast_fit gives back (X0, Y0, R, theta) w/o errors, using only 3 points. 
Vector4d fast_fit; - Fast_fit(hits, fast_fit); - Rfit::Matrix2Nd hits_cov = MatrixXd::Zero(2 * n, 2 * n); - Rfit::loadCovariance2D(hits_ge, hits_cov); - circle_fit circle = Circle_fit(hits.block(0, 0, 2, n), hits_cov, fast_fit, rad, B, error); - line_fit line = Line_fit(hits, hits_ge, circle, fast_fit, B, error); + fastFit(hits, fast_fit); + riemannFit::Matrix2Nd hits_cov = MatrixXd::Zero(2 * n, 2 * n); + riemannFit::loadCovariance2D(hits_ge, hits_cov); + CircleFit circle = circleFit(hits.block(0, 0, 2, n), hits_cov, fast_fit, rad, bField, error); + LineFit line = lineFit(hits, hits_ge, circle, fast_fit, bField, error); - par_uvrtopak(circle, B, error); + par_uvrtopak(circle, bField, error); - helix_fit helix; + HelixFit helix; helix.par << circle.par, line.par; if (error) { helix.cov = MatrixXd::Zero(5, 5); helix.cov.block(0, 0, 3, 3) = circle.cov; helix.cov.block(3, 3, 2, 2) = line.cov; } - helix.q = circle.q; + helix.qCharge = circle.qCharge; helix.chi2_circle = circle.chi2; helix.chi2_line = line.chi2; return helix; } -} // namespace Rfit +} // namespace riemannFit #endif // RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h diff --git a/src/cudadev/plugin-PixelTriplets/RiemannFitOnGPU.cc b/src/cudadev/plugin-PixelTriplets/RiemannFitOnGPU.cc index 347636286..491dd0df2 100644 --- a/src/cudadev/plugin-PixelTriplets/RiemannFitOnGPU.cc +++ b/src/cudadev/plugin-PixelTriplets/RiemannFitOnGPU.cc @@ -1,110 +1,113 @@ #include "RiemannFitOnGPU.h" void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples) { - assert(tuples_d); + assert(tuples_); // Fit internals - auto hitsGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); - auto hits_geGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); - auto fast_fit_resultsGPU_ = - std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); - auto circle_fit_resultsGPU_holder = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit)); - Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); + auto hitsGPU = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double)); + auto hits_geGPU = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float)); + auto fast_fit_resultsGPU = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double)); + auto circle_fit_resultsGPU_holder = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::CircleFit)); + riemannFit::CircleFit *circle_fit_resultsGPU = (riemannFit::CircleFit *)(circle_fit_resultsGPU_holder.get()); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // triplets - kernelFastFit<3>( - tuples_d, tupleMultiplicity_d, 3, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + kernel_FastFit<3>( + tuples_, tupleMultiplicity_, 3, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); - kernelCircleFit<3>(tupleMultiplicity_d, - 3, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_CircleFit<3>(tupleMultiplicity_, + 3, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); - kernelLineFit<3>(tupleMultiplicity_d, - 3, - bField_, - outputSoa_d, - 
hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<3>(tupleMultiplicity_, + 3, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); // quads - kernelFastFit<4>( - tuples_d, tupleMultiplicity_d, 4, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + kernel_FastFit<4>( + tuples_, tupleMultiplicity_, 4, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); - kernelCircleFit<4>(tupleMultiplicity_d, - 4, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_CircleFit<4>(tupleMultiplicity_, + 4, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); - kernelLineFit<4>(tupleMultiplicity_d, - 4, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<4>(tupleMultiplicity_, + 4, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); if (fit5as4_) { // penta - kernelFastFit<4>( - tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + kernel_FastFit<4>( + tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); - kernelCircleFit<4>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_CircleFit<4>(tupleMultiplicity_, + 5, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); - kernelLineFit<4>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<4>(tupleMultiplicity_, + 5, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); } else { // penta all 5 - kernelFastFit<5>( - tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + kernel_FastFit<5>( + tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); - kernelCircleFit<5>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_CircleFit<5>(tupleMultiplicity_, + 5, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); - kernelLineFit<5>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<5>(tupleMultiplicity_, + 5, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); } } } diff --git a/src/cudadev/plugin-PixelTriplets/RiemannFitOnGPU.cu b/src/cudadev/plugin-PixelTriplets/RiemannFitOnGPU.cu index fe27153ac..8e6061611 100644 --- a/src/cudadev/plugin-PixelTriplets/RiemannFitOnGPU.cu +++ b/src/cudadev/plugin-PixelTriplets/RiemannFitOnGPU.cu @@ -5,126 +5,126 @@ void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t stream) 
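Both the CPU launcher above and the CUDA launcher below repeat the same dispatch pattern per hit multiplicity, with pentuplets optionally downgraded to a four-hit fit. A schematic reduction, where `fitOne<N>()` is a hypothetical stand-in for the `kernel_FastFit`/`kernel_CircleFit`/`kernel_LineFit` triple:

```cpp
#include <cstdint>

template <int N>
void fitOne(uint32_t nHits, uint32_t offset);  // assumed helper wrapping the three kernels

void dispatchFits(bool fit5as4, uint32_t offset) {
  fitOne<3>(3, offset);    // triplets
  fitOne<4>(4, offset);    // quadruplets
  if (fit5as4)
    fitOne<4>(5, offset);  // pentuplets fitted with only 4 of their 5 hits
  else
    fitOne<5>(5, offset);  // pentuplets fitted with all 5 hits
}
```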
{ - assert(tuples_d); + assert(tuples_); auto blockSize = 64; auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; // Fit internals - auto hitsGPU_ = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); - auto hits_geGPU_ = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); - auto fast_fit_resultsGPU_ = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); + auto hitsGPU = cms::cuda::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double), stream); + auto hits_geGPU = cms::cuda::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float), stream); + auto fast_fit_resultsGPU = cms::cuda::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), stream); auto circle_fit_resultsGPU_holder = - cms::cuda::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit), stream); - Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); + cms::cuda::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::CircleFit), stream); + riemannFit::CircleFit *circle_fit_resultsGPU_ = (riemannFit::CircleFit *)(circle_fit_resultsGPU_holder.get()); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // triplets - kernelFastFit<3><<>>( - tuples_d, tupleMultiplicity_d, 3, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + kernel_FastFit<3><<>>( + tuples_, tupleMultiplicity_, 3, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<3><<>>(tupleMultiplicity_d, - 3, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_CircleFit<3><<>>(tupleMultiplicity_, + 3, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - kernelLineFit<3><<>>(tupleMultiplicity_d, - 3, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<3><<>>(tupleMultiplicity_, + 3, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); // quads - kernelFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, 4, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + kernel_FastFit<4><<>>( + tuples_, tupleMultiplicity_, 4, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<4><<>>(tupleMultiplicity_d, - 4, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_CircleFit<4><<>>(tupleMultiplicity_, + 4, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - kernelLineFit<4><<>>(tupleMultiplicity_d, - 4, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<4><<>>(tupleMultiplicity_, + 4, + bField_, + outputSoa_, + hitsGPU.get(), + 
hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); if (fit5as4_) { // penta - kernelFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + kernel_FastFit<4><<>>( + tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<4><<>>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_CircleFit<4><<>>(tupleMultiplicity_, + 5, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - kernelLineFit<4><<>>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<4><<>>(tupleMultiplicity_, + 5, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); } else { // penta all 5 - kernelFastFit<5><<>>( - tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + kernel_FastFit<5><<>>( + tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); cudaCheck(cudaGetLastError()); - kernelCircleFit<5><<>>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_CircleFit<5><<>>(tupleMultiplicity_, + 5, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); - kernelLineFit<5><<>>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); + kernel_LineFit<5><<>>(tupleMultiplicity_, + 5, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); cudaCheck(cudaGetLastError()); } } diff --git a/src/cudadev/plugin-PixelTriplets/RiemannFitOnGPU.h b/src/cudadev/plugin-PixelTriplets/RiemannFitOnGPU.h index 02766b557..12c9856fa 100644 --- a/src/cudadev/plugin-PixelTriplets/RiemannFitOnGPU.h +++ b/src/cudadev/plugin-PixelTriplets/RiemannFitOnGPU.h @@ -6,7 +6,7 @@ #include -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" #include "CUDACore/cudaCheck.h" #include "CUDACore/cuda_assert.h" #include "CondFormats/pixelCPEforGPU.h" @@ -19,14 +19,14 @@ using Tuples = pixelTrack::HitContainer; using OutputSoA = pixelTrack::TrackSoA; template -__global__ void kernelFastFit(Tuples const *__restrict__ foundNtuplets, - CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, - uint32_t nHits, - HitsOnGPU const *__restrict__ hhp, - double *__restrict__ phits, - float *__restrict__ phits_ge, - double *__restrict__ pfast_fit, - uint32_t offset) { +__global__ void kernel_FastFit(Tuples const *__restrict__ foundNtuplets, + caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + uint32_t nHits, + HitsOnGPU const *__restrict__ hhp, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit, + uint32_t offset) { constexpr uint32_t hitsInFit = 
N; assert(hitsInFit <= nHits); @@ -43,7 +43,7 @@ __global__ void kernelFastFit(Tuples const *__restrict__ foundNtuplets, printf("%d Ntuple of size %d for %d hits to fit\n", tupleMultiplicity->size(nHits), nHits, hitsInFit); #endif - for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; + for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt; local_idx += gridDim.x * blockDim.x) { auto tuple_idx = local_idx + offset; if (tuple_idx >= tupleMultiplicity->size(nHits)) @@ -51,13 +51,13 @@ __global__ void kernelFastFit(Tuples const *__restrict__ foundNtuplets, // get it from the ntuple container (one to one to helix) auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); - assert(tkid < foundNtuplets->nbins()); + assert(tkid < foundNtuplets->nOnes()); assert(foundNtuplets->size(tkid) == nHits); - Rfit::Map3xNd hits(phits + local_idx); - Rfit::Map4d fast_fit(pfast_fit + local_idx); - Rfit::Map6xNf hits_ge(phits_ge + local_idx); + riemannFit::Map3xNd hits(phits + local_idx); + riemannFit::Map4d fast_fit(pfast_fit + local_idx); + riemannFit::Map6xNf hits_ge(phits_ge + local_idx); // Prepare data structure auto const *hitId = foundNtuplets->begin(tkid); @@ -73,7 +73,7 @@ __global__ void kernelFastFit(Tuples const *__restrict__ foundNtuplets, hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; } - Rfit::Fast_fit(hits, fast_fit); + riemannFit::fastFit(hits, fast_fit); // no NaN here.... assert(fast_fit(0) == fast_fit(0)); @@ -84,14 +84,14 @@ __global__ void kernelFastFit(Tuples const *__restrict__ foundNtuplets, } template -__global__ void kernelCircleFit(CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, - uint32_t nHits, - double B, - double *__restrict__ phits, - float *__restrict__ phits_ge, - double *__restrict__ pfast_fit_input, - Rfit::circle_fit *circle_fit, - uint32_t offset) { +__global__ void kernel_CircleFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + uint32_t nHits, + double bField, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit_input, + riemannFit::CircleFit *circle_fit, + uint32_t offset) { assert(circle_fit); assert(N <= nHits); @@ -99,22 +99,22 @@ __global__ void kernelCircleFit(CAConstants::TupleMultiplicity const *__restrict // look in bin for this hit multiplicity auto local_start = blockIdx.x * blockDim.x + threadIdx.x; - for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; + for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt; local_idx += gridDim.x * blockDim.x) { auto tuple_idx = local_idx + offset; if (tuple_idx >= tupleMultiplicity->size(nHits)) break; - Rfit::Map3xNd hits(phits + local_idx); - Rfit::Map4d fast_fit(pfast_fit_input + local_idx); - Rfit::Map6xNf hits_ge(phits_ge + local_idx); + riemannFit::Map3xNd hits(phits + local_idx); + riemannFit::Map4d fast_fit(pfast_fit_input + local_idx); + riemannFit::Map6xNf hits_ge(phits_ge + local_idx); - Rfit::VectorNd rad = (hits.block(0, 0, 2, N).colwise().norm()); + riemannFit::VectorNd rad = (hits.block(0, 0, 2, N).colwise().norm()); - Rfit::Matrix2Nd hits_cov = Rfit::Matrix2Nd::Zero(); - Rfit::loadCovariance2D(hits_ge, hits_cov); + riemannFit::Matrix2Nd hits_cov = riemannFit::Matrix2Nd::Zero(); + riemannFit::loadCovariance2D(hits_ge, hits_cov); - circle_fit[local_idx] = Rfit::Circle_fit(hits.block(0, 
0, 2, N), hits_cov, fast_fit, rad, B, true); + circle_fit[local_idx] = riemannFit::circleFit(hits.block(0, 0, 2, N), hits_cov, fast_fit, rad, bField, true); #ifdef RIEMANN_DEBUG // auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); @@ -125,15 +125,15 @@ __global__ void kernelCircleFit(CAConstants::TupleMultiplicity const *__restrict } template -__global__ void kernelLineFit(CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, - uint32_t nHits, - double B, - OutputSoA *results, - double *__restrict__ phits, - float *__restrict__ phits_ge, - double *__restrict__ pfast_fit_input, - Rfit::circle_fit *__restrict__ circle_fit, - uint32_t offset) { +__global__ void kernel_LineFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + uint32_t nHits, + double bField, + OutputSoA *results, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit_input, + riemannFit::CircleFit *__restrict__ circle_fit, + uint32_t offset) { assert(results); assert(circle_fit); assert(N <= nHits); @@ -142,7 +142,7 @@ __global__ void kernelLineFit(CAConstants::TupleMultiplicity const *__restrict__ // look in bin for this hit multiplicity auto local_start = (blockIdx.x * blockDim.x + threadIdx.x); - for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; + for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt; local_idx += gridDim.x * blockDim.x) { auto tuple_idx = local_idx + offset; if (tuple_idx >= tupleMultiplicity->size(nHits)) @@ -151,17 +151,17 @@ __global__ void kernelLineFit(CAConstants::TupleMultiplicity const *__restrict__ // get it for the ntuple container (one to one to helix) auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); - Rfit::Map3xNd hits(phits + local_idx); - Rfit::Map4d fast_fit(pfast_fit_input + local_idx); - Rfit::Map6xNf hits_ge(phits_ge + local_idx); + riemannFit::Map3xNd hits(phits + local_idx); + riemannFit::Map4d fast_fit(pfast_fit_input + local_idx); + riemannFit::Map6xNf hits_ge(phits_ge + local_idx); - auto const &line_fit = Rfit::Line_fit(hits, hits_ge, circle_fit[local_idx], fast_fit, B, true); + auto const &line_fit = riemannFit::lineFit(hits, hits_ge, circle_fit[local_idx], fast_fit, bField, true); - Rfit::fromCircleToPerigee(circle_fit[local_idx]); + riemannFit::fromCircleToPerigee(circle_fit[local_idx]); results->stateAtBS.copyFromCircle( - circle_fit[local_idx].par, circle_fit[local_idx].cov, line_fit.par, line_fit.cov, 1.f / float(B), tkid); - results->pt(tkid) = B / std::abs(circle_fit[local_idx].par(2)); + circle_fit[local_idx].par, circle_fit[local_idx].cov, line_fit.par, line_fit.cov, 1.f / float(bField), tkid); + results->pt(tkid) = bField / std::abs(circle_fit[local_idx].par(2)); results->eta(tkid) = asinhf(line_fit.par(0)); results->chi2(tkid) = (circle_fit[local_idx].chi2 + line_fit.chi2) / (2 * N - 5); diff --git a/src/cudadev/plugin-PixelTriplets/choleskyInversion.h b/src/cudadev/plugin-PixelTriplets/choleskyInversion.h index 2cb4105f8..b84e7cc80 100644 --- a/src/cudadev/plugin-PixelTriplets/choleskyInversion.h +++ b/src/cudadev/plugin-PixelTriplets/choleskyInversion.h @@ -20,13 +20,13 @@ namespace math { namespace cholesky { template - inline constexpr void invert11(M1 const& src, M2& dst) { + inline constexpr void __attribute__((always_inline)) invert11(M1 const& src, M2& dst) { using F = decltype(src(0, 0)); dst(0, 0) = F(1.0) / src(0, 0); } template - inline constexpr void invert22(M1 const& src, M2& dst) 
    template <typename M1, typename M2>
-    inline constexpr void invert22(M1 const& src, M2& dst) {
+    inline constexpr void __attribute__((always_inline)) invert22(M1 const& src, M2& dst) {
       using F = decltype(src(0, 0));
       auto luc0 = F(1.0) / src(0, 0);
       auto luc1 = src(1, 0) * src(1, 0) * luc0;
@@ -40,7 +40,7 @@
     }

     template <typename M1, typename M2>
-    inline constexpr void invert33(M1 const& src, M2& dst) {
+    inline constexpr void __attribute__((always_inline)) invert33(M1 const& src, M2& dst) {
       using F = decltype(src(0, 0));
       auto luc0 = F(1.0) / src(0, 0);
       auto luc1 = src(1, 0);
@@ -64,7 +64,7 @@
     }

     template <typename M1, typename M2>
-    inline constexpr void invert44(M1 const& src, M2& dst) {
+    inline constexpr void __attribute__((always_inline)) invert44(M1 const& src, M2& dst) {
       using F = decltype(src(0, 0));
       auto luc0 = F(1.0) / src(0, 0);
       auto luc1 = src(1, 0);
@@ -100,7 +100,7 @@
     }

     template <typename M1, typename M2>
-    inline constexpr void invert55(M1 const& src, M2& dst) {
+    inline constexpr void __attribute__((always_inline)) invert55(M1 const& src, M2& dst) {
       using F = decltype(src(0, 0));
       auto luc0 = F(1.0) / src(0, 0);
       auto luc1 = src(1, 0);
@@ -155,7 +155,7 @@
     }

     template <typename M1, typename M2>
-    inline __attribute__((always_inline)) constexpr void invert66(M1 const& src, M2& dst) {
+    inline constexpr void __attribute__((always_inline)) invert66(M1 const& src, M2& dst) {
       using F = decltype(src(0, 0));
       auto luc0 = F(1.0) / src(0, 0);
       auto luc1 = src(1, 0);
@@ -297,7 +297,7 @@
     template <typename M1, typename M2>
     struct Inverter<M1, M2, 2> {
-      static constexpr void eval(M1 const& src, M2& dst) {
+      static constexpr void __attribute__((always_inline)) eval(M1 const& src, M2& dst) {
         invert22(src, dst);
         symmetrize22(dst);
       }
@@ -305,7 +305,7 @@
     template <typename M1, typename M2>
     struct Inverter<M1, M2, 3> {
-      static constexpr void eval(M1 const& src, M2& dst) {
+      static constexpr void __attribute__((always_inline)) eval(M1 const& src, M2& dst) {
         invert33(src, dst);
         symmetrize33(dst);
       }
@@ -313,7 +313,7 @@
     template <typename M1, typename M2>
     struct Inverter<M1, M2, 4> {
-      static constexpr void eval(M1 const& src, M2& dst) {
+      static constexpr void __attribute__((always_inline)) eval(M1 const& src, M2& dst) {
         invert44(src, dst);
         symmetrize44(dst);
       }
@@ -321,7 +321,7 @@
     template <typename M1, typename M2>
     struct Inverter<M1, M2, 5> {
-      static constexpr void eval(M1 const& src, M2& dst) {
+      static constexpr void __attribute__((always_inline)) eval(M1 const& src, M2& dst) {
         invert55(src, dst);
         symmetrize55(dst);
       }
@@ -329,7 +329,7 @@
     template <typename M1, typename M2>
     struct Inverter<M1, M2, 6> {
-      static constexpr void eval(M1 const& src, M2& dst) {
+      static constexpr void __attribute__((always_inline)) eval(M1 const& src, M2& dst) {
         invert66(src, dst);
         symmetrize66(dst);
       }
@@ -337,7 +337,8 @@
     // Eigen interface
     template <typename D1, typename D2>
-    inline constexpr void invert(Eigen::DenseBase<D1> const& src, Eigen::DenseBase<D2>& dst) {
+    inline constexpr void __attribute__((always_inline))
+    invert(Eigen::DenseBase<D1> const& src, Eigen::DenseBase<D2>& dst) {
       using M1 = Eigen::DenseBase<D1>;
       using M2 = Eigen::DenseBase<D2>;
       Inverter<M1, M2, M2::ColsAtCompileTime>::eval(src, dst);
diff --git a/src/cudadev/plugin-PixelTriplets/gpuFishbone.h b/src/cudadev/plugin-PixelTriplets/gpuFishbone.h
index 2e2446ea3..4a493f0c9 100644
--- a/src/cudadev/plugin-PixelTriplets/gpuFishbone.h
+++ b/src/cudadev/plugin-PixelTriplets/gpuFishbone.h
@@ -1,5 +1,5 @@
-#ifndef RecoLocalTracker_SiPixelRecHits_plugins_gpuFishbone_h
-#define RecoLocalTracker_SiPixelRecHits_plugins_gpuFishbone_h
+#ifndef RecoPixelVertexing_PixelTriplets_plugins_gpuFishbone_h
+#define RecoPixelVertexing_PixelTriplets_plugins_gpuFishbone_h

 #include
 #include
@@ -27,7 +27,6 @@ namespace gpuPixelDoublets {

   constexpr auto maxCellsPerHit =
GPUCACell::maxCellsPerHit; auto const& hh = *hhp; - // auto layer = [&](uint16_t id) { return hh.cpeParams().layer(id); }; // x run faster... auto firstY = threadIdx.y + blockIdx.y * blockDim.y; @@ -39,28 +38,27 @@ namespace gpuPixelDoublets { for (int idy = firstY, nt = nHits; idy < nt; idy += gridDim.y * blockDim.y) { auto const& vc = isOuterHitOfCell[idy]; - auto s = vc.size(); - if (s < 2) + auto size = vc.size(); + if (size < 2) continue; // if alligned kill one of the two. // in principle one could try to relax the cut (only in r-z?) for jumping-doublets auto const& c0 = cells[vc[0]]; - auto xo = c0.get_outer_x(hh); - auto yo = c0.get_outer_y(hh); - auto zo = c0.get_outer_z(hh); + auto xo = c0.outer_x(hh); + auto yo = c0.outer_y(hh); + auto zo = c0.outer_z(hh); auto sg = 0; - for (int32_t ic = 0; ic < s; ++ic) { + for (int32_t ic = 0; ic < size; ++ic) { auto& ci = cells[vc[ic]]; - if (0 == ci.theUsed) + if (ci.unused()) continue; // for triplets equivalent to next if (checkTrack && ci.tracks().empty()) continue; cc[sg] = vc[ic]; - d[sg] = ci.get_inner_detIndex(hh); - // l[sg] = layer(d[sg]); - x[sg] = ci.get_inner_x(hh) - xo; - y[sg] = ci.get_inner_y(hh) - yo; - z[sg] = ci.get_inner_z(hh) - zo; + d[sg] = ci.inner_detIndex(hh); + x[sg] = ci.inner_x(hh) - xo; + y[sg] = ci.inner_y(hh) - yo; + z[sg] = ci.inner_z(hh) - zo; n[sg] = x[sg] * x[sg] + y[sg] * y[sg] + z[sg] * z[sg]; ++sg; } @@ -78,10 +76,10 @@ namespace gpuPixelDoublets { if (d[ic] != d[jc] && cos12 * cos12 >= 0.99999f * n[ic] * n[jc]) { // alligned: kill farthest (prefer consecutive layers) if (n[ic] > n[jc]) { - ci.theDoubletId = -1; + ci.kill(); break; } else { - cj.theDoubletId = -1; + cj.kill(); } } } //cj @@ -90,4 +88,4 @@ namespace gpuPixelDoublets { } } // namespace gpuPixelDoublets -#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuFishbone_h +#endif // RecoPixelVertexing_PixelTriplets_plugins_gpuFishbone_h diff --git a/src/cudadev/plugin-PixelTriplets/gpuPixelDoublets.h b/src/cudadev/plugin-PixelTriplets/gpuPixelDoublets.h index e906f85f1..cbb374698 100644 --- a/src/cudadev/plugin-PixelTriplets/gpuPixelDoublets.h +++ b/src/cudadev/plugin-PixelTriplets/gpuPixelDoublets.h @@ -1,5 +1,5 @@ -#ifndef RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoublets_h -#define RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoublets_h +#ifndef RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoublets_h +#define RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoublets_h #include "gpuPixelDoubletsAlgos.h" @@ -7,15 +7,17 @@ namespace gpuPixelDoublets { - constexpr int nPairs = 13 + 2 + 4; - static_assert(nPairs <= CAConstants::maxNumberOfLayerPairs()); + constexpr int nPairsForQuadruplets = 13; // quadruplets require hits in all layers + constexpr int nPairsForTriplets = nPairsForQuadruplets + 2; // include barrel "jumping" layer pairs + constexpr int nPairs = nPairsForTriplets + 4; // include forward "jumping" layer pairs + static_assert(nPairs <= caConstants::maxNumberOfLayerPairs); // start constants // clang-format off CONSTANT_VAR const uint8_t layerPairs[2 * nPairs] = { 0, 1, 0, 4, 0, 7, // BPIX1 (3) - 1, 2, 1, 4, 1, 7, // BPIX2 (5) + 1, 2, 1, 4, 1, 7, // BPIX2 (6) 4, 5, 7, 8, // FPIX1 (8) 2, 3, 2, 4, 2, 7, 5, 6, 8, 9, // BPIX3 & FPIX2 (13) 0, 2, 1, 3, // Jumping Barrel (15) @@ -58,10 +60,10 @@ namespace gpuPixelDoublets { // end constants // clang-format on - using CellNeighbors = CAConstants::CellNeighbors; - using CellTracks = CAConstants::CellTracks; - using CellNeighborsVector = CAConstants::CellNeighborsVector; - using 
CellTracksVector = CAConstants::CellTracksVector; + using CellNeighbors = caConstants::CellNeighbors; + using CellTracks = caConstants::CellTracks; + using CellNeighborsVector = caConstants::CellNeighborsVector; + using CellTracksVector = caConstants::CellTracksVector; __global__ void initDoublets(GPUCACell::OuterHitOfCell* isOuterHitOfCell, int nHits, @@ -75,8 +77,8 @@ namespace gpuPixelDoublets { isOuterHitOfCell[i].reset(); if (0 == first) { - cellNeighbors->construct(CAConstants::maxNumOfActiveDoublets(), cellNeighborsContainer); - cellTracks->construct(CAConstants::maxNumOfActiveDoublets(), cellTracksContainer); + cellNeighbors->construct(caConstants::maxNumOfActiveDoublets, cellNeighborsContainer); + cellTracks->construct(caConstants::maxNumOfActiveDoublets, cellTracksContainer); auto i = cellNeighbors->extend(); assert(0 == i); (*cellNeighbors)[0].reset(); @@ -127,4 +129,4 @@ namespace gpuPixelDoublets { } // namespace gpuPixelDoublets -#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDouplets_h +#endif // RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoublets_h diff --git a/src/cudadev/plugin-PixelTriplets/gpuPixelDoubletsAlgos.h b/src/cudadev/plugin-PixelTriplets/gpuPixelDoubletsAlgos.h index 6d6a62c88..0dcc65203 100644 --- a/src/cudadev/plugin-PixelTriplets/gpuPixelDoubletsAlgos.h +++ b/src/cudadev/plugin-PixelTriplets/gpuPixelDoubletsAlgos.h @@ -1,5 +1,5 @@ -#ifndef RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoubletsAlgos_h -#define RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoubletsAlgos_h +#ifndef RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoubletsAlgos_h +#define RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoubletsAlgos_h #include #include @@ -7,7 +7,7 @@ #include #include -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" #include "DataFormats/approx_atan2.h" #include "CUDACore/VecArray.h" #include "CUDACore/cuda_assert.h" @@ -17,10 +17,10 @@ namespace gpuPixelDoublets { - using CellNeighbors = CAConstants::CellNeighbors; - using CellTracks = CAConstants::CellTracks; - using CellNeighborsVector = CAConstants::CellNeighborsVector; - using CellTracksVector = CAConstants::CellTracksVector; + using CellNeighbors = caConstants::CellNeighbors; + using CellTracks = caConstants::CellTracks; + using CellNeighborsVector = caConstants::CellNeighborsVector; + using CellTracksVector = caConstants::CellTracksVector; __device__ __forceinline__ void doubletsFromHisto(uint8_t const* __restrict__ layerPairs, uint32_t nPairs, @@ -50,9 +50,9 @@ namespace gpuPixelDoublets { bool isOuterLadder = ideal_cond; - using Hist = TrackingRecHit2DSOAView::Hist; + using PhiBinner = TrackingRecHit2DSOAView::PhiBinner; - auto const& __restrict__ hist = hh.phiBinner(); + auto const& __restrict__ phiBinner = hh.phiBinner(); uint32_t const* __restrict__ offsets = hh.hitsLayerStart(); assert(offsets); @@ -61,7 +61,7 @@ namespace gpuPixelDoublets { // nPairsMax to be optimized later (originally was 64). // If it should be much bigger, consider using a block-wide parallel prefix scan, // e.g. 
see https://nvlabs.github.io/cub/classcub_1_1_warp_scan.html - const int nPairsMax = CAConstants::maxNumberOfLayerPairs(); + const int nPairsMax = caConstants::maxNumberOfLayerPairs; assert(nPairs <= nPairsMax); __shared__ uint32_t innerLayerCumulativeSize[nPairsMax]; __shared__ uint32_t ntot; @@ -93,7 +93,7 @@ namespace gpuPixelDoublets { uint8_t outer = layerPairs[2 * pairLayerId + 1]; assert(outer > inner); - auto hoff = Hist::histOff(outer); + auto hoff = PhiBinner::histOff(outer); auto i = (0 == pairLayerId) ? j : j - innerLayerCumulativeSize[pairLayerId - 1]; i += offsets[inner]; @@ -105,7 +105,7 @@ namespace gpuPixelDoublets { // found hit corresponding to our cuda thread, now do the job auto mi = hh.detectorIndex(i); - if (mi > 2000) + if (mi > gpuClustering::maxNumModules) continue; // invalid /* maybe clever, not effective when zoCut is on @@ -142,8 +142,8 @@ namespace gpuPixelDoublets { // all cuts: true if fails constexpr float z0cut = 12.f; // cm constexpr float hardPtCut = 0.5f; // GeV - constexpr float minRadius = - hardPtCut * 87.78f; // cm (1 GeV track has 1 GeV/c / (e * 3.8T) ~ 87 cm radius in a 3.8T field) + // cm (1 GeV track has 1 GeV/c / (e * 3.8T) ~ 87 cm radius in a 3.8T field) + constexpr float minRadius = hardPtCut * 87.78f; constexpr float minRadius2T4 = 4.f * minRadius * minRadius; auto ptcut = [&](int j, int16_t idphi) { auto r2t4 = minRadius2T4; @@ -175,10 +175,9 @@ namespace gpuPixelDoublets { auto iphicut = phicuts[pairLayerId]; - auto kl = Hist::bin(int16_t(mep - iphicut)); - auto kh = Hist::bin(int16_t(mep + iphicut)); - auto incr = [](auto& k) { return k = (k + 1) % Hist::nbins(); }; - // bool piWrap = std::abs(kh-kl) > Hist::nbins()/2; + auto kl = PhiBinner::bin(int16_t(mep - iphicut)); + auto kh = PhiBinner::bin(int16_t(mep + iphicut)); + auto incr = [](auto& k) { return k = (k + 1) % PhiBinner::nbins(); }; #ifdef GPU_DEBUG int tot = 0; @@ -191,17 +190,17 @@ namespace gpuPixelDoublets { for (auto kk = kl; kk != khh; incr(kk)) { #ifdef GPU_DEBUG if (kk != kl && kk != kh) - nmin += hist.size(kk + hoff); + nmin += phiBinner.size(kk + hoff); #endif - auto const* __restrict__ p = hist.begin(kk + hoff); - auto const* __restrict__ e = hist.end(kk + hoff); + auto const* __restrict__ p = phiBinner.begin(kk + hoff); + auto const* __restrict__ e = phiBinner.end(kk + hoff); p += first; for (; p < e; p += stride) { auto oi = __ldg(p); assert(oi >= offsets[outer]); assert(oi < offsets[outer + 1]); auto mo = hh.detectorIndex(oi); - if (mo > 2000) + if (mo > gpuClustering::maxNumModules) continue; // invalid if (doZ0Cut && z0cutoff(oi)) @@ -241,4 +240,4 @@ namespace gpuPixelDoublets { } // namespace gpuPixelDoublets -#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoupletsAlgos_h +#endif // RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoubletsAlgos_h diff --git a/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc b/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc index 15e3c486e..723f7eb4c 100644 --- a/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc +++ b/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc @@ -10,41 +10,45 @@ #include "gpuVertexFinder.h" +#undef PIXVERTEX_DEBUG_PRODUCE + class PixelVertexProducerCUDA : public edm::EDProducer { public: explicit PixelVertexProducerCUDA(edm::ProductRegistry& reg); ~PixelVertexProducerCUDA() override = default; private: + void produceOnGPU(edm::Event& iEvent, const edm::EventSetup& iSetup); + void produceOnCPU(edm::Event& iEvent, const 
edm::EventSetup& iSetup);
   void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override;

-  bool m_OnGPU;
+  bool onGPU_;

   edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenGPUTrack_;
   edm::EDPutTokenT<cms::cuda::Product<ZVertexHeterogeneous>> tokenGPUVertex_;
   edm::EDGetTokenT<PixelTrackHeterogeneous> tokenCPUTrack_;
   edm::EDPutTokenT<ZVertexHeterogeneous> tokenCPUVertex_;

-  const gpuVertexFinder::Producer m_gpuAlgo;
+  const gpuVertexFinder::Producer gpuAlgo_;

   // Tracking cuts before sending tracks to vertex algo
-  const float m_ptMin;
+  const float ptMin_;
 };

 PixelVertexProducerCUDA::PixelVertexProducerCUDA(edm::ProductRegistry& reg)
-    : m_OnGPU(true),
-      m_gpuAlgo(true,   // oneKernel
-                true,   // useDensity
-                false,  // useDBSCAN
-                false,  // useIterative
-                2,      // minT
-                0.07,   // eps
-                0.01,   // errmax
-                9       // chi2max
-                ),
-      m_ptMin(0.5)  // 0.5 GeV
+    : onGPU_(true),
+      gpuAlgo_(true,   // oneKernel
+               true,   // useDensity
+               false,  // useDBSCAN
+               false,  // useIterative
+               2,      // minT
+               0.07,   // eps
+               0.01,   // errmax
+               9       // chi2max
+               ),
+      ptMin_(0.5)  // 0.5 GeV
 {
-  if (m_OnGPU) {
+  if (onGPU_) {
     tokenGPUTrack_ = reg.consumes<cms::cuda::Product<PixelTrackHeterogeneous>>();
     tokenGPUVertex_ = reg.produces<cms::cuda::Product<ZVertexHeterogeneous>>();
   } else {
@@ -53,37 +57,45 @@ PixelVertexProducerCUDA::PixelVertexProducerCUDA(edm::ProductRegistry& reg)
   }
 }

-void PixelVertexProducerCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) {
-  if (m_OnGPU) {
-    auto const& ptracks = iEvent.get(tokenGPUTrack_);
+void PixelVertexProducerCUDA::produceOnGPU(edm::Event& iEvent, const edm::EventSetup& iSetup) {
+  auto const& ptracks = iEvent.get(tokenGPUTrack_);
+
+  cms::cuda::ScopedContextProduce ctx{ptracks};
+  auto const* tracks = ctx.get(ptracks).get();
+
+  assert(tracks);

-    cms::cuda::ScopedContextProduce ctx{ptracks};
-    auto const* tracks = ctx.get(ptracks).get();
+  ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks, ptMin_));
+}

-    assert(tracks);
+void PixelVertexProducerCUDA::produceOnCPU(edm::Event& iEvent, const edm::EventSetup& iSetup) {
+  auto const* tracks = iEvent.get(tokenCPUTrack_).get();
+  assert(tracks);
+
+#ifdef PIXVERTEX_DEBUG_PRODUCE
+  auto const& tsoa = *tracks;
+  auto maxTracks = tsoa.stride();
+  std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl;
+
+  int32_t nt = 0;
+  for (int32_t it = 0; it < maxTracks; ++it) {
+    auto nHits = tsoa.nHits(it);
+    assert(nHits == int(tsoa.hitIndices.size(it)));
+    if (nHits == 0)
+      break;  // this is a guard: maybe we need to move to nTracks...
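Both the new `PIXVERTEX_DEBUG_PRODUCE` block above and the commented-out version being deleted just below rely on the same convention: the track SoA is filled contiguously, so the first entry with `nHits == 0` marks the end of the valid data. A minimal sketch of that guard-based scan, with a hypothetical `TrackSoA` standing in for the concrete type:

```cpp
#include <cstdint>

// Illustrative sketch: count the filled entries of a fixed-stride SoA that
// uses nHits() == 0 as an end-of-data guard. TrackSoA is a hypothetical
// stand-in; only stride() and nHits(i) mirror the interface used above.
template <typename TrackSoA>
int32_t countValidTracks(TrackSoA const& tsoa) {
  int32_t nt = 0;
  for (int32_t it = 0, end = tsoa.stride(); it < end; ++it) {
    if (tsoa.nHits(it) == 0)
      break;  // entries are packed from the front: the first empty slot ends the scan
    ++nt;
  }
  return nt;
}
```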
- nt++; - } - std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl; - */ - - iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make(tracks, m_ptMin)); + produceOnCPU(iEvent, iSetup); } } diff --git a/src/cudadev/plugin-PixelVertexFinding/gpuClusterTracksByDensity.h b/src/cudadev/plugin-PixelVertexFinding/gpuClusterTracksByDensity.h index 201971770..9e9a8d4c0 100644 --- a/src/cudadev/plugin-PixelVertexFinding/gpuClusterTracksByDensity.h +++ b/src/cudadev/plugin-PixelVertexFinding/gpuClusterTracksByDensity.h @@ -1,5 +1,5 @@ -#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksByDensity_h -#define RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksByDensity_h +#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h +#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h #include #include @@ -59,7 +59,7 @@ namespace gpuVertexFinder { if (verbose && 0 == threadIdx.x) printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt); - assert(nt <= hist.capacity()); + assert((int)nt <= hist.capacity()); // fill hist (bin shall be wider than "eps") for (auto i = threadIdx.x; i < nt; i += blockDim.x) { @@ -231,4 +231,4 @@ namespace gpuVertexFinder { } // namespace gpuVertexFinder -#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksByDensity_h +#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h diff --git a/src/cudadev/plugin-PixelVertexFinding/gpuClusterTracksDBSCAN.h b/src/cudadev/plugin-PixelVertexFinding/gpuClusterTracksDBSCAN.h index 504ee4cf2..33403307a 100644 --- a/src/cudadev/plugin-PixelVertexFinding/gpuClusterTracksDBSCAN.h +++ b/src/cudadev/plugin-PixelVertexFinding/gpuClusterTracksDBSCAN.h @@ -1,5 +1,5 @@ -#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksDBSCAN_h -#define RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksDBSCAN_h +#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h +#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h #include #include @@ -55,14 +55,13 @@ namespace gpuVertexFinder { if (verbose && 0 == threadIdx.x) printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt); - assert(nt <= hist.capacity()); + assert((int)nt <= hist.capacity()); // fill hist (bin shall be wider than "eps") for (auto i = threadIdx.x; i < nt; i += blockDim.x) { assert(i < ZVertices::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 - // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only - iz = std::min(std::max(iz, INT8_MIN), INT8_MAX); + iz = std::clamp(iz, INT8_MIN, INT8_MAX); izt[i] = iz - INT8_MIN; assert(iz - INT8_MIN >= 0); assert(iz - INT8_MIN < 256); @@ -239,4 +238,4 @@ namespace gpuVertexFinder { } // namespace gpuVertexFinder -#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksDBSCAN_h +#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h diff --git a/src/cudadev/plugin-PixelVertexFinding/gpuClusterTracksIterative.h b/src/cudadev/plugin-PixelVertexFinding/gpuClusterTracksIterative.h index 6e7da0efd..010f77932 100644 --- a/src/cudadev/plugin-PixelVertexFinding/gpuClusterTracksIterative.h +++ b/src/cudadev/plugin-PixelVertexFinding/gpuClusterTracksIterative.h @@ -1,5 +1,5 @@ -#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksIterative_h -#define 
RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksIterative_h +#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h +#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h #include #include @@ -55,14 +55,13 @@ namespace gpuVertexFinder { if (verbose && 0 == threadIdx.x) printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt); - assert(nt <= hist.capacity()); + assert((int)nt <= hist.capacity()); // fill hist (bin shall be wider than "eps") for (auto i = threadIdx.x; i < nt; i += blockDim.x) { assert(i < ZVertices::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 - // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only - iz = std::min(std::max(iz, INT8_MIN), INT8_MAX); + iz = std::clamp(iz, INT8_MIN, INT8_MAX); izt[i] = iz - INT8_MIN; assert(iz - INT8_MIN >= 0); assert(iz - INT8_MIN < 256); @@ -210,4 +209,4 @@ namespace gpuVertexFinder { } // namespace gpuVertexFinder -#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksIterative_h +#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h diff --git a/src/cudadev/plugin-PixelVertexFinding/gpuFitVertices.h b/src/cudadev/plugin-PixelVertexFinding/gpuFitVertices.h index 3840a3f99..2a72ce840 100644 --- a/src/cudadev/plugin-PixelVertexFinding/gpuFitVertices.h +++ b/src/cudadev/plugin-PixelVertexFinding/gpuFitVertices.h @@ -1,5 +1,5 @@ -#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuFitVertices_h -#define RecoPixelVertexing_PixelVertexFinding_src_gpuFitVertices_h +#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuFitVertices_h +#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuFitVertices_h #include #include @@ -110,4 +110,4 @@ namespace gpuVertexFinder { } // namespace gpuVertexFinder -#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuFitVertices_h +#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuFitVertices_h diff --git a/src/cudadev/plugin-PixelVertexFinding/gpuSortByPt2.h b/src/cudadev/plugin-PixelVertexFinding/gpuSortByPt2.h index 9fa98f9e4..211046420 100644 --- a/src/cudadev/plugin-PixelVertexFinding/gpuSortByPt2.h +++ b/src/cudadev/plugin-PixelVertexFinding/gpuSortByPt2.h @@ -1,5 +1,5 @@ -#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuSortByPt2_h -#define RecoPixelVertexing_PixelVertexFinding_src_gpuSortByPt2_h +#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuSortByPt2_h +#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuSortByPt2_h #include #include @@ -70,4 +70,4 @@ namespace gpuVertexFinder { } // namespace gpuVertexFinder -#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuSortByPt2_h +#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuSortByPt2_h diff --git a/src/cudadev/plugin-PixelVertexFinding/gpuSplitVertices.h b/src/cudadev/plugin-PixelVertexFinding/gpuSplitVertices.h index 7c779b75b..07b606a81 100644 --- a/src/cudadev/plugin-PixelVertexFinding/gpuSplitVertices.h +++ b/src/cudadev/plugin-PixelVertexFinding/gpuSplitVertices.h @@ -1,5 +1,5 @@ -#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuSplitVertices_h -#define RecoPixelVertexing_PixelVertexFinding_src_gpuSplitVertices_h +#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuSplitVertices_h +#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuSplitVertices_h #include #include @@ -136,4 +136,4 @@ namespace gpuVertexFinder { } // namespace gpuVertexFinder -#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuSplitVertices_h +#endif 
// RecoPixelVertexing_PixelVertexFinding_plugins_gpuSplitVertices_h diff --git a/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.cc b/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.cc index 084763385..608d4efc0 100644 --- a/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.cc +++ b/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.cc @@ -1 +1,187 @@ -#include "gpuVertexFinderImpl.h" +#include "CUDACore/cudaCheck.h" + +#include "gpuClusterTracksByDensity.h" +#include "gpuClusterTracksDBSCAN.h" +#include "gpuClusterTracksIterative.h" +#include "gpuFitVertices.h" +#include "gpuSortByPt2.h" +#include "gpuSplitVertices.h" + +#undef PIXVERTEX_DEBUG_PRODUCE + +namespace gpuVertexFinder { + + // reject outlier tracks that contribute more than this to the chi2 of the vertex fit + constexpr float maxChi2ForFirstFit = 50.f; + constexpr float maxChi2ForFinalFit = 5000.f; + + // split vertices with a chi2/NDoF greater than this + constexpr float maxChi2ForSplit = 9.f; + + __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin) { + assert(ptracks); + assert(soa); + auto const& tracks = *ptracks; + auto const& fit = tracks.stateAtBS; + auto const* quality = tracks.qualityData(); + + auto first = blockIdx.x * blockDim.x + threadIdx.x; + for (int idx = first, nt = TkSoA::stride(); idx < nt; idx += gridDim.x * blockDim.x) { + auto nHits = tracks.nHits(idx); + if (nHits == 0) + break; // this is a guard: maybe we need to move to nTracks... + + // initialize soa... + soa->idv[idx] = -1; + + if (nHits < 4) + continue; // no triplets + if (quality[idx] != pixelTrack::Quality::loose) + continue; + + auto pt = tracks.pt(idx); + + if (pt < ptMin) + continue; + + auto& data = *pws; + auto it = atomicAdd(&data.ntrks, 1); + data.itrk[it] = idx; + data.zt[it] = tracks.zip(idx); + data.ezt2[it] = fit.covariance(idx)(14); + data.ptt2[it] = pt * pt; + } + } + +// #define THREE_KERNELS +#ifndef THREE_KERNELS + __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, + gpuVertexFinder::WorkSpace* pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster, + ) { + clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); + __syncthreads(); + fitVertices(pdata, pws, maxChi2ForFirstFit); + __syncthreads(); + splitVertices(pdata, pws, maxChi2ForSplit); + __syncthreads(); + fitVertices(pdata, pws, maxChi2ForFinalFit); + __syncthreads(); + sortByPt2(pdata, pws); + } +#else + __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata, + gpuVertexFinder::WorkSpace* pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster, + ) { + clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); + __syncthreads(); + fitVertices(pdata, pws, maxChi2ForFirstFit); + } + + __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) { + fitVertices(pdata, pws, maxChi2ForFinalFit); + __syncthreads(); + sortByPt2(pdata, pws); + } +#endif + +#ifdef __CUDACC__ + ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { +#ifdef PIXVERTEX_DEBUG_PRODUCE + std::cout << "producing Vertices on GPU" << std::endl; +#endif // PIXVERTEX_DEBUG_PRODUCE + ZVertexHeterogeneous 
vertices(cms::cuda::make_device_unique<ZVertexSoA>(stream));
+#else
+  ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const {
+#ifdef PIXVERTEX_DEBUG_PRODUCE
+    std::cout << "producing Vertices on CPU" << std::endl;
+#endif  // PIXVERTEX_DEBUG_PRODUCE
+    ZVertexHeterogeneous vertices(std::make_unique<ZVertexSoA>());
+#endif
+    assert(tksoa);
+    auto* soa = vertices.get();
+    assert(soa);
+
+#ifdef __CUDACC__
+    auto ws_d = cms::cuda::make_device_unique<WorkSpace>(stream);
+#else
+    auto ws_d = std::make_unique<WorkSpace>();
+#endif
+
+#ifdef __CUDACC__
+    init<<<1, 1, 0, stream>>>(soa, ws_d.get());
+    auto blockSize = 128;
+    auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize;
+    loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tksoa, soa, ws_d.get(), ptMin);
+    cudaCheck(cudaGetLastError());
+#else
+    init(soa, ws_d.get());
+    loadTracks(tksoa, soa, ws_d.get(), ptMin);
+#endif
+
+#ifdef __CUDACC__
+    // Running too many threads leads to problems when printf is enabled.
+    constexpr int maxThreadsForPrint = 1024 - 256;
+    constexpr int numBlocks = 1024;
+    constexpr int threadsPerBlock = 128;
+
+    if (oneKernel_) {
+      // implemented only for density clusters
+#ifndef THREE_KERNELS
+      vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+#else
+      vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+      cudaCheck(cudaGetLastError());
+      // one block per vertex...
+      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
+      cudaCheck(cudaGetLastError());
+      vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
+#endif
+    } else {  // five kernels
+      if (useDensity_) {
+        clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+      } else if (useDBSCAN_) {
+        clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+      } else if (useIterative_) {
+        clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+      }
+      cudaCheck(cudaGetLastError());
+      fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFirstFit);
+      cudaCheck(cudaGetLastError());
+      // one block per vertex...
+      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
+      cudaCheck(cudaGetLastError());
+      fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFinalFit);
+      cudaCheck(cudaGetLastError());
+      sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
+    }
+    cudaCheck(cudaGetLastError());
+#else  // __CUDACC__
+    if (useDensity_) {
+      clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max);
+    } else if (useDBSCAN_) {
+      clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max);
+    } else if (useIterative_) {
+      clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max);
+    }
+#ifdef PIXVERTEX_DEBUG_PRODUCE
+    std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl;
+#endif  // PIXVERTEX_DEBUG_PRODUCE
+    fitVertices(soa, ws_d.get(), maxChi2ForFirstFit);
+    // one block per vertex!
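The "one block per vertex" remark refers to the GPU launch geometry above (`splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>`); on this CPU branch the same device function is simply called once, and the cudacompat layer reduces the grid to a single serial pass. A sketch of why one body serves both paths, assuming the usual grid-stride idiom and a cudacompat-style emulation that fixes `gridDim.x == blockDim.x == 1` (the kernel below is illustrative, not part of the patch):

```cpp
// Illustrative grid-stride kernel. On the GPU, <<<nBlocks, nThreads>>>
// partitions the index space across threads; under a CPU compatibility layer
// with blockIdx.x == threadIdx.x == 0 and gridDim.x == blockDim.x == 1, the
// same loop degenerates to "for (int i = 0; i < n; ++i)".
__device__ void scaleAll(float* v, int n, float s) {
  int first = blockIdx.x * blockDim.x + threadIdx.x;
  for (int i = first; i < n; i += gridDim.x * blockDim.x)
    v[i] *= s;  // process element i
}
```

The `splitVertices` call that follows is exactly such a case.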
+ splitVertices(soa, ws_d.get(), maxChi2ForSplit); + fitVertices(soa, ws_d.get(), maxChi2ForFinalFit); + sortByPt2(soa, ws_d.get()); +#endif + + return vertices; + } + +} // namespace gpuVertexFinder diff --git a/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.cu b/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.cu index 084763385..9674eac7d 100644 --- a/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.cu +++ b/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.cu @@ -1 +1 @@ -#include "gpuVertexFinderImpl.h" +#include "gpuVertexFinder.cc" diff --git a/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.h b/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.h index d42a5d93a..b9b8b35d7 100644 --- a/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.h +++ b/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinder.h @@ -1,5 +1,5 @@ -#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuVertexFinder_h -#define RecoPixelVertexing_PixelVertexFinding_src_gpuVertexFinder_h +#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinder_h +#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinder_h #include #include @@ -80,4 +80,4 @@ namespace gpuVertexFinder { } // namespace gpuVertexFinder -#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuVertexFinder_h +#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinder_h diff --git a/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinderImpl.h b/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinderImpl.h deleted file mode 100644 index f3260cad7..000000000 --- a/src/cudadev/plugin-PixelVertexFinding/gpuVertexFinderImpl.h +++ /dev/null @@ -1,173 +0,0 @@ -#include "CUDACore/cudaCheck.h" - -#include "gpuClusterTracksByDensity.h" -#include "gpuClusterTracksDBSCAN.h" -#include "gpuClusterTracksIterative.h" -#include "gpuFitVertices.h" -#include "gpuSortByPt2.h" -#include "gpuSplitVertices.h" - -namespace gpuVertexFinder { - - __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin) { - assert(ptracks); - assert(soa); - auto const& tracks = *ptracks; - auto const& fit = tracks.stateAtBS; - auto const* quality = tracks.qualityData(); - - auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int idx = first, nt = TkSoA::stride(); idx < nt; idx += gridDim.x * blockDim.x) { - auto nHits = tracks.nHits(idx); - if (nHits == 0) - break; // this is a guard: maybe we need to move to nTracks... - - // initialize soa... 
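The `loadTracks` body in the deleted listing that continues below (kept essentially unchanged in the new gpuVertexFinder.cc above) filters tracks and compacts the survivors into the workspace by reserving slots with `atomicAdd`. The pattern in isolation, with illustrative names only:

```cpp
#include <cstdint>

// Illustrative filter-and-compact step: each thread that passes the selection
// reserves a unique slot in the dense output via an atomic counter, so the
// output stays packed regardless of which inputs survive the cuts.
__device__ void appendIfSelected(int32_t idx, bool selected, uint32_t* counter, int32_t* out) {
  if (!selected)
    return;
  uint32_t slot = atomicAdd(counter, 1u);  // unique per thread
  out[slot] = idx;
}
```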
- soa->idv[idx] = -1; - - if (nHits < 4) - continue; // no triplets - if (quality[idx] != trackQuality::loose) - continue; - - auto pt = tracks.pt(idx); - - if (pt < ptMin) - continue; - - auto& data = *pws; - auto it = atomicAdd(&data.ntrks, 1); - data.itrk[it] = idx; - data.zt[it] = tracks.zip(idx); - data.ezt2[it] = fit.covariance(idx)(14); - data.ptt2[it] = pt * pt; - } - } - -// #define THREE_KERNELS -#ifndef THREE_KERNELS - __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, - int minT, // min number of neighbours to be "seed" - float eps, // max absolute distance to cluster - float errmax, // max error to be "seed" - float chi2max // max normalized distance to cluster, - ) { - clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); - __syncthreads(); - fitVertices(pdata, pws, 50.); - __syncthreads(); - splitVertices(pdata, pws, 9.f); - __syncthreads(); - fitVertices(pdata, pws, 5000.); - __syncthreads(); - sortByPt2(pdata, pws); - } -#else - __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, - int minT, // min number of neighbours to be "seed" - float eps, // max absolute distance to cluster - float errmax, // max error to be "seed" - float chi2max // max normalized distance to cluster, - ) { - clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); - __syncthreads(); - fitVertices(pdata, pws, 50.); - } - - __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) { - fitVertices(pdata, pws, 5000.); - __syncthreads(); - sortByPt2(pdata, pws); - } -#endif - -#ifdef __CUDACC__ - ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { - // std::cout << "producing Vertices on GPU" << std::endl; - ZVertexHeterogeneous vertices(cms::cuda::make_device_unique(stream)); -#else - ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const { - // std::cout << "producing Vertices on CPU" << std::endl; - ZVertexHeterogeneous vertices(std::make_unique()); -#endif - assert(tksoa); - auto* soa = vertices.get(); - assert(soa); - -#ifdef __CUDACC__ - auto ws_d = cms::cuda::make_device_unique(stream); -#else - auto ws_d = std::make_unique(); -#endif - -#ifdef __CUDACC__ - init<<<1, 1, 0, stream>>>(soa, ws_d.get()); - auto blockSize = 128; - auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize; - loadTracks<<>>(tksoa, soa, ws_d.get(), ptMin); - cudaCheck(cudaGetLastError()); -#else - cms::cudacompat::resetGrid(); - init(soa, ws_d.get()); - loadTracks(tksoa, soa, ws_d.get(), ptMin); -#endif - -#ifdef __CUDACC__ - if (oneKernel_) { - // implemented only for density clustesrs -#ifndef THREE_KERNELS - vertexFinderOneKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); -#else - vertexFinderKernel1<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); - cudaCheck(cudaGetLastError()); - // one block per vertex... 
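The literal cuts in this deleted listing (`50.`, `9.f`, `5000.`) are exactly the values that the replacement file promotes to named constants, so the one-kernel and multi-kernel variants can no longer drift apart; shown together here for ease of comparison:

```cpp
// From the new gpuVertexFinder.cc above; these replace the bare literals
// 50., 9.f and 5000. used throughout the deleted listing.
constexpr float maxChi2ForFirstFit = 50.f;
constexpr float maxChi2ForSplit = 9.f;  // split vertices with chi2/NDoF above this
constexpr float maxChi2ForFinalFit = 5000.f;
```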
- splitVerticesKernel<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f); - cudaCheck(cudaGetLastError()); - vertexFinderKernel2<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get()); -#endif - } else { // five kernels - if (useDensity_) { - clusterTracksByDensityKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); - } else if (useDBSCAN_) { - clusterTracksDBSCAN<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); - } else if (useIterative_) { - clusterTracksIterative<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); - } - cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 50.); - cudaCheck(cudaGetLastError()); - // one block per vertex... - splitVerticesKernel<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f); - cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 5000.); - cudaCheck(cudaGetLastError()); - sortByPt2Kernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get()); - } - cudaCheck(cudaGetLastError()); -#else // __CUDACC__ - if (useDensity_) { - clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max); - } else if (useDBSCAN_) { - clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max); - } else if (useIterative_) { - clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max); - } - // std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl; - fitVertices(soa, ws_d.get(), 50.); - // one block per vertex! - blockIdx.x = 0; - gridDim.x = 1; - splitVertices(soa, ws_d.get(), 9.f); - resetGrid(); - fitVertices(soa, ws_d.get(), 5000.); - sortByPt2(soa, ws_d.get()); -#endif - - return vertices; - } - -} // namespace gpuVertexFinder - -#undef FROM diff --git a/src/cudadev/plugin-SiPixelClusterizer/SiPixelClusterThresholds.h b/src/cudadev/plugin-SiPixelClusterizer/SiPixelClusterThresholds.h new file mode 100644 index 000000000..41dd43026 --- /dev/null +++ b/src/cudadev/plugin-SiPixelClusterizer/SiPixelClusterThresholds.h @@ -0,0 +1,14 @@ +#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_SiPixelClusterThresholds_h +#define RecoLocalTracker_SiPixelClusterizer_plugins_SiPixelClusterThresholds_h + +struct SiPixelClusterThresholds { + inline constexpr int32_t getThresholdForLayerOnCondition(bool isLayer1) const noexcept { + return isLayer1 ? 
layer1 : otherLayers;
+  }
+  const int32_t layer1;
+  const int32_t otherLayers;
+};
+
+constexpr SiPixelClusterThresholds kSiPixelClusterThresholdsDefaultPhase1{.layer1 = 2000, .otherLayers = 4000};
+
+#endif  // RecoLocalTracker_SiPixelClusterizer_plugins_SiPixelClusterThresholds_h
diff --git a/src/cudadev/plugin-SiPixelClusterizer/SiPixelFedCablingMapGPUWrapperESProducer.cc b/src/cudadev/plugin-SiPixelClusterizer/SiPixelROCsStatusAndMappingWrapperESProducer.cc
similarity index 62%
rename from src/cudadev/plugin-SiPixelClusterizer/SiPixelFedCablingMapGPUWrapperESProducer.cc
rename to src/cudadev/plugin-SiPixelClusterizer/SiPixelROCsStatusAndMappingWrapperESProducer.cc
index 263e6d066..c77e1ed35 100644
--- a/src/cudadev/plugin-SiPixelClusterizer/SiPixelFedCablingMapGPUWrapperESProducer.cc
+++ b/src/cudadev/plugin-SiPixelClusterizer/SiPixelROCsStatusAndMappingWrapperESProducer.cc
@@ -1,6 +1,6 @@
 #include "CondFormats/SiPixelFedIds.h"
-#include "CondFormats/SiPixelFedCablingMapGPU.h"
-#include "CondFormats/SiPixelFedCablingMapGPUWrapper.h"
+#include "CondFormats/SiPixelROCsStatusAndMapping.h"
+#include "CondFormats/SiPixelROCsStatusAndMappingWrapper.h"
 #include "Framework/ESProducer.h"
 #include "Framework/EventSetup.h"
 #include "Framework/ESPluginFactory.h"
@@ -8,16 +8,16 @@
 #include
 #include

-class SiPixelFedCablingMapGPUWrapperESProducer : public edm::ESProducer {
+class SiPixelROCsStatusAndMappingWrapperESProducer : public edm::ESProducer {
 public:
-  explicit SiPixelFedCablingMapGPUWrapperESProducer(std::filesystem::path const& datadir) : data_(datadir) {}
+  explicit SiPixelROCsStatusAndMappingWrapperESProducer(std::filesystem::path const& datadir) : data_(datadir) {}
   void produce(edm::EventSetup& eventSetup);

 private:
   std::filesystem::path data_;
 };

-void SiPixelFedCablingMapGPUWrapperESProducer::produce(edm::EventSetup& eventSetup) {
+void SiPixelROCsStatusAndMappingWrapperESProducer::produce(edm::EventSetup& eventSetup) {
   {
     std::ifstream in(data_ / "fedIds.bin", std::ios::binary);
     in.exceptions(std::ifstream::badbit | std::ifstream::failbit | std::ifstream::eofbit);
@@ -30,14 +30,14 @@ void SiPixelFedCablingMapGPUWrapperESProducer::produce(edm::EventSet
   {
     std::ifstream in(data_ / "cablingMap.bin", std::ios::binary);
     in.exceptions(std::ifstream::badbit | std::ifstream::failbit | std::ifstream::eofbit);
-    SiPixelFedCablingMapGPU obj;
-    in.read(reinterpret_cast<char*>(&obj), sizeof(SiPixelFedCablingMapGPU));
+    SiPixelROCsStatusAndMapping obj;
+    in.read(reinterpret_cast<char*>(&obj), sizeof(SiPixelROCsStatusAndMapping));
     unsigned int modToUnpDefSize;
     in.read(reinterpret_cast<char*>(&modToUnpDefSize), sizeof(unsigned int));
     std::vector<unsigned char> modToUnpDefault(modToUnpDefSize);
     in.read(reinterpret_cast<char*>(modToUnpDefault.data()), modToUnpDefSize);
-    eventSetup.put(std::make_unique<SiPixelFedCablingMapGPUWrapper>(obj, std::move(modToUnpDefault)));
+    eventSetup.put(std::make_unique<SiPixelROCsStatusAndMappingWrapper>(obj, std::move(modToUnpDefault)));
   }
 }

-DEFINE_FWK_EVENTSETUP_MODULE(SiPixelFedCablingMapGPUWrapperESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(SiPixelROCsStatusAndMappingWrapperESProducer);
diff --git a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc
index 06624744e..994dd5731 100644
--- a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc
+++ b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc
@@ -1,27 +1,31 @@
+// C++ includes
+#include
+#include
+#include
+
+// CMSSW includes
 #include "CUDACore/Product.h"
+#include "CUDACore/ScopedContext.h"
 #include
"CUDADataFormats/SiPixelClustersCUDA.h" -#include "CUDADataFormats/SiPixelDigisCUDA.h" #include "CUDADataFormats/SiPixelDigiErrorsCUDA.h" -#include "CondFormats/SiPixelGainCalibrationForHLTGPU.h" -#include "CondFormats/SiPixelFedCablingMapGPUWrapper.h" +#include "CUDADataFormats/SiPixelDigisCUDA.h" #include "CondFormats/SiPixelFedIds.h" -#include "DataFormats/PixelErrors.h" +#include "CondFormats/SiPixelGainCalibrationForHLTGPU.h" +#include "CondFormats/SiPixelROCsStatusAndMappingWrapper.h" #include "DataFormats/FEDNumbering.h" #include "DataFormats/FEDRawData.h" #include "DataFormats/FEDRawDataCollection.h" -#include "Framework/EventSetup.h" +#include "DataFormats/SiPixelErrorCompact.h" +#include "Framework/EDProducer.h" #include "Framework/Event.h" +#include "Framework/EventSetup.h" #include "Framework/PluginFactory.h" -#include "Framework/EDProducer.h" -#include "CUDACore/ScopedContext.h" +// local includes #include "ErrorChecker.h" +#include "SiPixelClusterThresholds.h" #include "SiPixelRawToClusterGPUKernel.h" -#include -#include -#include - class SiPixelRawToClusterCUDA : public edm::EDProducerExternalWork { public: explicit SiPixelRawToClusterCUDA(edm::ProductRegistry& reg); @@ -42,11 +46,12 @@ class SiPixelRawToClusterCUDA : public edm::EDProducerExternalWork { pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_; std::unique_ptr wordFedAppender_; - PixelFormatterErrors errors_; + SiPixelFormatterErrors errors_; const bool isRun2_; const bool includeErrors_; const bool useQuality_; + const SiPixelClusterThresholds clusterThresholds_; }; SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(edm::ProductRegistry& reg) @@ -55,7 +60,9 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(edm::ProductRegistry& reg) clusterPutToken_(reg.produces>()), isRun2_(true), includeErrors_(true), - useQuality_(true) { + useQuality_(true), + clusterThresholds_{kSiPixelClusterThresholdsDefaultPhase1.layer1, kSiPixelClusterThresholdsDefaultPhase1.otherLayers} +{ if (includeErrors_) { digiErrorPutToken_ = reg.produces>(); } @@ -68,10 +75,10 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_}; - auto const& hgpuMap = iSetup.get(); + auto const& hgpuMap = iSetup.get(); if (hgpuMap.hasQuality() != useQuality_) { throw std::runtime_error("UseQuality of the module (" + std::to_string(useQuality_) + - ") differs the one from SiPixelFedCablingMapGPUWrapper. Please fix your configuration."); + ") differs the one from SiPixelROCsStatusAndMappingWrapper. 
Please fix your configuration."); } // get the GPU product already here so that the async transfer can begin const auto* gpuMap = hgpuMap.getGPUProductAsync(ctx.stream()); @@ -101,7 +108,7 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, // for GPU // first 150 index stores the fedId and next 150 will store the // start index of word in that fed - assert(fedId >= 1200); + assert(fedId >= FEDNumbering::MINSiPixeluTCAFEDID); fedCounter++; // get event data for this fed @@ -148,6 +155,7 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, } // end of for loop gpuAlgo_.makeClustersAsync(isRun2_, + clusterThresholds_, gpuMap, gpuModulesToUnpack, gpuGains, diff --git a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu index f5070130a..aaa72c5e0 100644 --- a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu +++ b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu @@ -3,30 +3,30 @@ * File Name: RawToClusterGPU.cu * Description: It converts Raw data into Digi Format on GPU * Finaly the Output of RawToDigi data is given to pixelClusterizer - * **/ // C++ includes #include -#include #include #include #include #include #include #include -#include // CUDA includes -#include #include // CMSSW includes -#include "CUDADataFormats/gpuClusteringConstants.h" #include "CUDACore/cudaCheck.h" #include "CUDACore/device_unique_ptr.h" #include "CUDACore/host_unique_ptr.h" -#include "CondFormats/SiPixelFedCablingMapGPU.h" +#include "CUDADataFormats/gpuClusteringConstants.h" +#include "CondFormats/SiPixelROCsStatusAndMapping.h" +#include "DataFormats/DetId.h" +#include "DataFormats/FEDNumbering.h" +#include "DataFormats/PixelSubdetector.h" +#include "DataFormats/SiPixelDigiConstants.h" // local includes #include "SiPixelRawToClusterGPUKernel.h" @@ -49,28 +49,22 @@ namespace pixelgpudetails { const uint32_t *src, unsigned int length) { std::memcpy(word_.get() + wordCounterGPU, src, sizeof(uint32_t) * length); - std::memset(fedId_.get() + wordCounterGPU / 2, fedId - 1200, length / 2); + std::memset(fedId_.get() + wordCounterGPU / 2, fedId - FEDNumbering::MINSiPixeluTCAFEDID, length / 2); } //////////////////// - __device__ uint32_t getLink(uint32_t ww) { - return ((ww >> pixelgpudetails::LINK_shift) & pixelgpudetails::LINK_mask); + __device__ bool isBarrel(uint32_t rawId) { + return (PixelSubdetector::PixelBarrel == ((rawId >> DetId::kSubdetOffset) & DetId::kSubdetMask)); } - __device__ uint32_t getRoc(uint32_t ww) { return ((ww >> pixelgpudetails::ROC_shift) & pixelgpudetails::ROC_mask); } - - __device__ uint32_t getADC(uint32_t ww) { return ((ww >> pixelgpudetails::ADC_shift) & pixelgpudetails::ADC_mask); } - - __device__ bool isBarrel(uint32_t rawId) { return (1 == ((rawId >> 25) & 0x7)); } - - __device__ pixelgpudetails::DetIdGPU getRawId(const SiPixelFedCablingMapGPU *cablingMap, + __device__ pixelgpudetails::DetIdGPU getRawId(const SiPixelROCsStatusAndMapping *cablingMap, uint8_t fed, uint32_t link, uint32_t roc) { uint32_t index = fed * MAX_LINK * MAX_ROC + (link - 1) * MAX_ROC + roc; pixelgpudetails::DetIdGPU detId = { - cablingMap->RawId[index], cablingMap->rocInDet[index], cablingMap->moduleId[index]}; + cablingMap->rawId[index], cablingMap->rocInDet[index], cablingMap->moduleId[index]}; return detId; } @@ -140,16 +134,15 @@ namespace pixelgpudetails { uint32_t gRow = rowOffset + slopeRow * local.row; uint32_t gCol = colOffset + slopeCol * 
local.col; - //printf("Inside frameConversion row: %u, column: %u\n", gRow, gCol); + // inside frameConversion row: gRow, column: gCol pixelgpudetails::Pixel global = {gRow, gCol}; return global; } + // error decoding and handling copied from EventFilter/SiPixelRawToDigi/src/ErrorChecker.cc __device__ uint8_t conversionError(uint8_t fedId, uint8_t status, bool debug = false) { uint8_t errorType = 0; - // debug = true; - switch (status) { case (1): { if (debug) @@ -184,18 +177,19 @@ namespace pixelgpudetails { } __device__ bool rocRowColIsValid(uint32_t rocRow, uint32_t rocCol) { - uint32_t numRowsInRoc = 80; - uint32_t numColsInRoc = 52; - - /// row and collumn in ROC representation - return ((rocRow < numRowsInRoc) & (rocCol < numColsInRoc)); + /// row and column in ROC representation + return ((rocRow < pixelgpudetails::numRowsInRoc) & (rocCol < pixelgpudetails::numColsInRoc)); } __device__ bool dcolIsValid(uint32_t dcol, uint32_t pxid) { return ((dcol < 26) & (2 <= pxid) & (pxid < 162)); } - __device__ uint8_t checkROC( - uint32_t errorWord, uint8_t fedId, uint32_t link, const SiPixelFedCablingMapGPU *cablingMap, bool debug = false) { - uint8_t errorType = (errorWord >> pixelgpudetails::ROC_shift) & pixelgpudetails::ERROR_mask; + // error decoding and handling copied from EventFilter/SiPixelRawToDigi/src/ErrorChecker.cc + __device__ uint8_t checkROC(uint32_t errorWord, + uint8_t fedId, + uint32_t link, + const SiPixelROCsStatusAndMapping *cablingMap, + bool debug = false) { + uint8_t errorType = (errorWord >> sipixelconstants::ROC_shift) & sipixelconstants::ERROR_mask; if (errorType < 25) return 0; bool errorFound = false; @@ -233,7 +227,7 @@ namespace pixelgpudetails { case (29): { if (debug) printf("Timeout on a channel (errorType = 29)\n"); - if ((errorWord >> pixelgpudetails::OMIT_ERR_shift) & pixelgpudetails::OMIT_ERR_mask) { + if ((errorWord >> sipixelconstants::OMIT_ERR_shift) & sipixelconstants::OMIT_ERR_mask) { if (debug) printf("...first errorType=29 error, this gets masked out\n"); } @@ -243,15 +237,15 @@ namespace pixelgpudetails { case (30): { if (debug) printf("TBM error trailer (errorType = 30)\n"); - int StateMatch_bits = 4; - int StateMatch_shift = 8; - uint32_t StateMatch_mask = ~(~uint32_t(0) << StateMatch_bits); - int StateMatch = (errorWord >> StateMatch_shift) & StateMatch_mask; - if (StateMatch != 1 && StateMatch != 8) { + int stateMatch_bits = 4; + int stateMatch_shift = 8; + uint32_t stateMatch_mask = ~(~uint32_t(0) << stateMatch_bits); + int stateMatch = (errorWord >> stateMatch_shift) & stateMatch_mask; + if (stateMatch != 1 && stateMatch != 8) { if (debug) printf("FED error 30 with unexpected State Bits (errorType = 30)\n"); } - if (StateMatch == 1) + if (stateMatch == 1) errorType = 40; // 1=Overflow -> 40, 8=number of ROCs -> 30 errorFound = true; break; @@ -269,10 +263,11 @@ namespace pixelgpudetails { return errorFound ? 
errorType : 0; } + // error decoding and handling copied from EventFilter/SiPixelRawToDigi/src/ErrorChecker.cc __device__ uint32_t getErrRawID(uint8_t fedId, uint32_t errWord, uint32_t errorType, - const SiPixelFedCablingMapGPU *cablingMap, + const SiPixelROCsStatusAndMapping *cablingMap, bool debug = false) { uint32_t rID = 0xffffffff; @@ -282,13 +277,10 @@ namespace pixelgpudetails { case 31: case 36: case 40: { - //set dummy values for cabling just to get detId from link - //cabling.dcol = 0; - //cabling.pxid = 2; uint32_t roc = 1; - uint32_t link = (errWord >> pixelgpudetails::LINK_shift) & pixelgpudetails::LINK_mask; - uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).RawId; - if (rID_temp != 9999) + uint32_t link = sipixelconstants::getLink(errWord); + uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).rawId; + if (rID_temp != gpuClustering::invalidModuleId) rID = rID_temp; break; } @@ -318,24 +310,19 @@ namespace pixelgpudetails { if ((chanNmbr < 1) || (chanNmbr > 36)) break; // signifies unexpected result - // set dummy values for cabling just to get detId from link if in Barrel - //cabling.dcol = 0; - //cabling.pxid = 2; uint32_t roc = 1; uint32_t link = chanNmbr; - uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).RawId; - if (rID_temp != 9999) + uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).rawId; + if (rID_temp != gpuClustering::invalidModuleId) rID = rID_temp; break; } case 37: case 38: { - //cabling.dcol = 0; - //cabling.pxid = 2; - uint32_t roc = (errWord >> pixelgpudetails::ROC_shift) & pixelgpudetails::ROC_mask; - uint32_t link = (errWord >> pixelgpudetails::LINK_shift) & pixelgpudetails::LINK_mask; - uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).RawId; - if (rID_temp != 9999) + uint32_t roc = sipixelconstants::getROC(errWord); + uint32_t link = sipixelconstants::getLink(errWord); + uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).rawId; + if (rID_temp != gpuClustering::invalidModuleId) rID = rID_temp; break; } @@ -347,7 +334,7 @@ namespace pixelgpudetails { } // Kernel to perform Raw to Digi conversion - __global__ void RawToDigi_kernel(const SiPixelFedCablingMapGPU *cablingMap, + __global__ void RawToDigi_kernel(const SiPixelROCsStatusAndMapping *cablingMap, const unsigned char *modToUnp, const uint32_t wordCounter, const uint32_t *word, @@ -358,7 +345,7 @@ namespace pixelgpudetails { uint32_t *pdigi, uint32_t *rawIdArr, uint16_t *moduleId, - cms::cuda::SimpleVector *err, + cms::cuda::SimpleVector *err, bool useQualityInfo, bool includeErrors, bool debug) { @@ -377,7 +364,7 @@ namespace pixelgpudetails { // initialize (too many continue below) pdigi[gIndex] = 0; rawIdArr[gIndex] = 0; - moduleId[gIndex] = 9999; + moduleId[gIndex] = gpuClustering::invalidModuleId; uint32_t ww = word[gIndex]; // Array containing 32 bit raw data if (ww == 0) { @@ -385,19 +372,19 @@ namespace pixelgpudetails { continue; } - uint32_t link = getLink(ww); // Extract link - uint32_t roc = getRoc(ww); // Extract Roc in link + uint32_t link = sipixelconstants::getLink(ww); // Extract link + uint32_t roc = sipixelconstants::getROC(ww); // Extract Roc in link pixelgpudetails::DetIdGPU detId = getRawId(cablingMap, fedId, link, roc); uint8_t errorType = checkROC(ww, fedId, link, cablingMap, debug); skipROC = (roc < pixelgpudetails::maxROCIndex) ?
false : (errorType != 0); if (includeErrors and skipROC) { uint32_t rID = getErrRawID(fedId, ww, errorType, cablingMap, debug); - err->push_back(PixelErrorCompact{rID, ww, errorType, fedId}); + err->push_back(SiPixelErrorCompact{rID, ww, errorType, fedId}); continue; } - uint32_t rawId = detId.RawId; + uint32_t rawId = detId.rawId; uint32_t rocIdInDetUnit = detId.rocInDet; bool barrel = isBarrel(rawId); @@ -411,8 +398,8 @@ namespace pixelgpudetails { if (skipROC) continue; - uint32_t layer = 0; //, ladder =0; - int side = 0, panel = 0, module = 0; //disk = 0, blade = 0 + uint32_t layer = 0; + int side = 0, panel = 0, module = 0; if (barrel) { layer = (rawId >> pixelgpudetails::layerStartBit) & pixelgpudetails::layerMask; @@ -422,22 +409,20 @@ namespace pixelgpudetails { // endcap ids layer = 0; panel = (rawId >> pixelgpudetails::panelStartBit) & pixelgpudetails::panelMask; - //disk = (rawId >> diskStartBit_) & diskMask_; side = (panel == 1) ? -1 : 1; - //blade = (rawId >> bladeStartBit_) & bladeMask_; } // ***special case of layer 1 to be handled here pixelgpudetails::Pixel localPix; if (layer == 1) { - uint32_t col = (ww >> pixelgpudetails::COL_shift) & pixelgpudetails::COL_mask; - uint32_t row = (ww >> pixelgpudetails::ROW_shift) & pixelgpudetails::ROW_mask; + uint32_t col = sipixelconstants::getCol(ww); + uint32_t row = sipixelconstants::getRow(ww); localPix.row = row; localPix.col = col; if (includeErrors) { if (not rocRowColIsValid(row, col)) { uint8_t error = conversionError(fedId, 3, debug); //use the device function and fill the arrays - err->push_back(PixelErrorCompact{rawId, ww, error, fedId}); + err->push_back(SiPixelErrorCompact{rawId, ww, error, fedId}); if (debug) printf("BPIX1 Error status: %i\n", error); continue; @@ -445,15 +430,15 @@ namespace pixelgpudetails { } } else { // ***conversion rules for dcol and pxid - uint32_t dcol = (ww >> pixelgpudetails::DCOL_shift) & pixelgpudetails::DCOL_mask; - uint32_t pxid = (ww >> pixelgpudetails::PXID_shift) & pixelgpudetails::PXID_mask; + uint32_t dcol = sipixelconstants::getDCol(ww); + uint32_t pxid = sipixelconstants::getPxId(ww); uint32_t row = pixelgpudetails::numRowsInRoc - pxid / 2; uint32_t col = dcol * 2 + pxid % 2; localPix.row = row; localPix.col = col; if (includeErrors and not dcolIsValid(dcol, pxid)) { uint8_t error = conversionError(fedId, 3, debug); - err->push_back(PixelErrorCompact{rawId, ww, error, fedId}); + err->push_back(SiPixelErrorCompact{rawId, ww, error, fedId}); if (debug) printf("Error status: %i %d %d %d %d\n", error, dcol, pxid, fedId, roc); continue; @@ -463,7 +448,7 @@ namespace pixelgpudetails { pixelgpudetails::Pixel globalPix = frameConversion(barrel, side, layer, rocIdInDetUnit, localPix); xx[gIndex] = globalPix.row; // origin shifting by 1 0-159 yy[gIndex] = globalPix.col; // origin shifting by 1 0-415 - adc[gIndex] = getADC(ww); + adc[gIndex] = sipixelconstants::getADC(ww); pdigi[gIndex] = pixelgpudetails::pack(globalPix.row, globalPix.col, adc[gIndex]); moduleId[gIndex] = detId.moduleId; rawIdArr[gIndex] = rawId; @@ -472,22 +457,22 @@ namespace pixelgpudetails { } // end of Raw to Digi kernel __global__ void fillHitsModuleStart(uint32_t const *__restrict__ cluStart, uint32_t *__restrict__ moduleStart) { - assert(gpuClustering::MaxNumModules < 2048); // easy to extend at least till 32*1024 + assert(gpuClustering::maxNumModules < 2048); // easy to extend at least till 32*1024 assert(1 == gridDim.x); assert(0 == blockIdx.x); int first = threadIdx.x; - // limit to MaxHitsInModule; - for (int i =
first, iend = gpuClustering::MaxNumModules; i < iend; i += blockDim.x) { + // limit to maxHitsInModule() + for (int i = first, iend = gpuClustering::maxNumModules; i < iend; i += blockDim.x) { moduleStart[i + 1] = std::min(gpuClustering::maxHitsInModule(), cluStart[i]); } __shared__ uint32_t ws[32]; cms::cuda::blockPrefixScan(moduleStart + 1, moduleStart + 1, 1024, ws); - cms::cuda::blockPrefixScan(moduleStart + 1025, moduleStart + 1025, gpuClustering::MaxNumModules - 1024, ws); + cms::cuda::blockPrefixScan(moduleStart + 1025, moduleStart + 1025, gpuClustering::maxNumModules - 1024, ws); - for (int i = first + 1025, iend = gpuClustering::MaxNumModules + 1; i < iend; i += blockDim.x) { + for (int i = first + 1025, iend = gpuClustering::maxNumModules + 1; i < iend; i += blockDim.x) { moduleStart[i] += moduleStart[1024]; } __syncthreads(); @@ -498,33 +483,27 @@ namespace pixelgpudetails { assert(c0 == moduleStart[1]); assert(moduleStart[1024] >= moduleStart[1023]); assert(moduleStart[1025] >= moduleStart[1024]); - assert(moduleStart[gpuClustering::MaxNumModules] >= moduleStart[1025]); + assert(moduleStart[gpuClustering::maxNumModules] >= moduleStart[1025]); - for (int i = first, iend = gpuClustering::MaxNumModules + 1; i < iend; i += blockDim.x) { + for (int i = first, iend = gpuClustering::maxNumModules + 1; i < iend; i += blockDim.x) { if (0 != i) assert(moduleStart[i] >= moduleStart[i - 1]); // [BPX1, BPX2, BPX3, BPX4, FP1, FP2, FP3, FN1, FN2, FN3, LAST_VALID] // [ 0, 96, 320, 672, 1184, 1296, 1408, 1520, 1632, 1744, 1856] - if (i == 96 || i == 1184 || i == 1744 || i == gpuClustering::MaxNumModules) + if (i == 96 || i == 1184 || i == 1744 || i == gpuClustering::maxNumModules) printf("moduleStart %d %d\n", i, moduleStart[i]); } #endif - - // avoid overflow - constexpr auto MAX_HITS = gpuClustering::MaxNumClusters; - for (int i = first, iend = gpuClustering::MaxNumModules + 1; i < iend; i += blockDim.x) { - if (moduleStart[i] > MAX_HITS) - moduleStart[i] = MAX_HITS; - } } // Interface to outside void SiPixelRawToClusterGPUKernel::makeClustersAsync(bool isRun2, - const SiPixelFedCablingMapGPU *cablingMap, + const SiPixelClusterThresholds clusterThresholds, + const SiPixelROCsStatusAndMapping *cablingMap, const unsigned char *modToUnp, const SiPixelGainForHLTonGPU *gains, const WordFedAppender &wordFed, - PixelFormatterErrors &&errors, + SiPixelFormatterErrors &&errors, const uint32_t wordCounter, const uint32_t fedCounter, bool useQualityInfo, @@ -541,7 +520,7 @@ namespace pixelgpudetails { if (includeErrors) { digiErrors_d = SiPixelDigiErrorsCUDA(pixelgpudetails::MAX_FED_WORDS, std::move(errors), stream); } - clusters_d = SiPixelClustersCUDA(gpuClustering::MaxNumModules, stream); + clusters_d = SiPixelClustersCUDA(gpuClustering::maxNumModules, stream); nModules_Clusters_h = cms::cuda::make_host_unique(2, stream); @@ -594,12 +573,12 @@ namespace pixelgpudetails { using namespace gpuClustering; int threadsPerBlock = 256; int blocks = - (std::max(int(wordCounter), int(gpuClustering::MaxNumModules)) + threadsPerBlock - 1) / threadsPerBlock; + (std::max(int(wordCounter), int(gpuClustering::maxNumModules)) + threadsPerBlock - 1) / threadsPerBlock; gpuCalibPixel::calibDigis<<>>(isRun2, digis_d.moduleInd(), - digis_d.c_xx(), - digis_d.c_yy(), + digis_d.xx(), + digis_d.yy(), digis_d.adc(), gains, wordCounter, @@ -618,7 +597,7 @@ namespace pixelgpudetails { #endif countModules<<>>( - digis_d.c_moduleInd(), clusters_d.moduleStart(), digis_d.clus(), wordCounter); + digis_d.moduleInd(),
clusters_d.moduleStart(), digis_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); // read the number of modules into a data member, used by getProduct() @@ -626,14 +605,14 @@ &(nModules_Clusters_h[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream)); threadsPerBlock = 256; - blocks = MaxNumModules; + blocks = maxNumModules; #ifdef GPU_DEBUG std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; #endif - findClus<<>>(digis_d.c_moduleInd(), - digis_d.c_xx(), - digis_d.c_yy(), - clusters_d.c_moduleStart(), + findClus<<>>(digis_d.moduleInd(), + digis_d.xx(), + digis_d.yy(), + clusters_d.moduleStart(), clusters_d.clusInModule(), clusters_d.moduleId(), digis_d.clus(), @@ -645,11 +624,12 @@ #endif // apply charge cut - clusterChargeCut<<>>(digis_d.moduleInd(), - digis_d.c_adc(), - clusters_d.c_moduleStart(), + clusterChargeCut<<>>(clusterThresholds, + digis_d.moduleInd(), + digis_d.adc(), + clusters_d.moduleStart(), clusters_d.clusInModule(), - clusters_d.c_moduleId(), + clusters_d.moduleId(), digis_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); @@ -660,11 +640,11 @@ // synchronization/ExternalWork // MUST be ONE block - fillHitsModuleStart<<<1, 1024, 0, stream>>>(clusters_d.c_clusInModule(), clusters_d.clusModuleStart()); + fillHitsModuleStart<<<1, 1024, 0, stream>>>(clusters_d.clusInModule(), clusters_d.clusModuleStart()); // last element holds the number of all clusters cudaCheck(cudaMemcpyAsync(&(nModules_Clusters_h[1]), - clusters_d.clusModuleStart() + gpuClustering::MaxNumModules, + clusters_d.clusModuleStart() + gpuClustering::maxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream)); diff --git a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h index 3cbce9e71..04e8b99b9 100644 --- a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h +++ b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h @@ -10,41 +10,33 @@ #include "CUDACore/SimpleVector.h" #include "CUDACore/host_unique_ptr.h" #include "CUDACore/host_noncached_unique_ptr.h" -#include "DataFormats/PixelErrors.h" +#include "DataFormats/SiPixelErrorCompact.h" +#include "DataFormats/SiPixelFormatterErrors.h" -struct SiPixelFedCablingMapGPU; +// local include(s) +#include "SiPixelClusterThresholds.h" + +struct SiPixelROCsStatusAndMapping; class SiPixelGainForHLTonGPU; namespace pixelgpudetails { - // Phase 1 geometry constants - const uint32_t layerStartBit = 20; - const uint32_t ladderStartBit = 12; - const uint32_t moduleStartBit = 2; - - const uint32_t panelStartBit = 10; - const uint32_t diskStartBit = 18; - const uint32_t bladeStartBit = 12; - - const uint32_t layerMask = 0xF; - const uint32_t ladderMask = 0xFF; - const uint32_t moduleMask = 0x3FF; - const uint32_t panelMask = 0x3; - const uint32_t diskMask = 0xF; - const uint32_t bladeMask = 0x3F; - - const uint32_t LINK_bits = 6; - const uint32_t ROC_bits = 5; - const uint32_t DCOL_bits = 5; - const uint32_t PXID_bits = 8; - const uint32_t ADC_bits = 8; - - // special for layer 1 - const uint32_t LINK_bits_l1 = 6; - const uint32_t ROC_bits_l1 = 5; - const uint32_t COL_bits_l1 = 6; - const uint32_t ROW_bits_l1 = 7; - const uint32_t OMIT_ERR_bits = 1; + inline namespace phase1geometry { + const uint32_t layerStartBit = 20; + const uint32_t ladderStartBit = 12; + const uint32_t
moduleStartBit = 2; + + const uint32_t panelStartBit = 10; + const uint32_t diskStartBit = 18; + const uint32_t bladeStartBit = 12; + + const uint32_t layerMask = 0xF; + const uint32_t ladderMask = 0xFF; + const uint32_t moduleMask = 0x3FF; + const uint32_t panelMask = 0x3; + const uint32_t diskMask = 0xF; + const uint32_t bladeMask = 0x3F; + } // namespace phase1geometry const uint32_t maxROCIndex = 8; const uint32_t numRowsInRoc = 80; @@ -52,28 +44,8 @@ namespace pixelgpudetails { const uint32_t MAX_WORD = 2000; - const uint32_t ADC_shift = 0; - const uint32_t PXID_shift = ADC_shift + ADC_bits; - const uint32_t DCOL_shift = PXID_shift + PXID_bits; - const uint32_t ROC_shift = DCOL_shift + DCOL_bits; - const uint32_t LINK_shift = ROC_shift + ROC_bits_l1; - // special for layer 1 ROC - const uint32_t ROW_shift = ADC_shift + ADC_bits; - const uint32_t COL_shift = ROW_shift + ROW_bits_l1; - const uint32_t OMIT_ERR_shift = 20; - - const uint32_t LINK_mask = ~(~uint32_t(0) << LINK_bits_l1); - const uint32_t ROC_mask = ~(~uint32_t(0) << ROC_bits_l1); - const uint32_t COL_mask = ~(~uint32_t(0) << COL_bits_l1); - const uint32_t ROW_mask = ~(~uint32_t(0) << ROW_bits_l1); - const uint32_t DCOL_mask = ~(~uint32_t(0) << DCOL_bits); - const uint32_t PXID_mask = ~(~uint32_t(0) << PXID_bits); - const uint32_t ADC_mask = ~(~uint32_t(0) << ADC_bits); - const uint32_t ERROR_mask = ~(~uint32_t(0) << ROC_bits_l1); - const uint32_t OMIT_ERR_mask = ~(~uint32_t(0) << OMIT_ERR_bits); - struct DetIdGPU { - uint32_t RawId; + uint32_t rawId; uint32_t rocInDet; uint32_t moduleId; }; @@ -168,11 +140,12 @@ namespace pixelgpudetails { SiPixelRawToClusterGPUKernel& operator=(SiPixelRawToClusterGPUKernel&&) = delete; void makeClustersAsync(bool isRun2, - const SiPixelFedCablingMapGPU* cablingMap, + const SiPixelClusterThresholds clusterThresholds, + const SiPixelROCsStatusAndMapping* cablingMap, const unsigned char* modToUnp, const SiPixelGainForHLTonGPU* gains, const WordFedAppender& wordFed, - PixelFormatterErrors&& errors, + SiPixelFormatterErrors&& errors, const uint32_t wordCounter, const uint32_t fedCounter, bool useQualityInfo, @@ -205,19 +178,6 @@ namespace pixelgpudetails { SiPixelDigiErrorsCUDA digiErrors_d; }; - // see RecoLocalTracker/SiPixelClusterizer - // all are runtime const, should be specified in python _cfg.py - struct ADCThreshold { - const int thePixelThreshold = 1000; // default Pixel threshold in electrons - const int theSeedThreshold = 1000; // seed thershold in electrons not used in our algo - const float theClusterThreshold = 4000; // cluster threshold in electron - const int ConversionFactor = 65; // adc to electron conversion factor - - const int theStackADC_ = 255; // the maximum adc count for stack layer - const int theFirstStack_ = 5; // the index of the fits stack layer - const double theElectronPerADCGain_ = 600; // ADC to electron conversion - }; - } // namespace pixelgpudetails #endif // RecoLocalTracker_SiPixelClusterizer_plugins_SiPixelRawToClusterGPUKernel_h diff --git a/src/cudadev/plugin-SiPixelClusterizer/gpuCalibPixel.h b/src/cudadev/plugin-SiPixelClusterizer/gpuCalibPixel.h index da36be6c4..c57ac530d 100644 --- a/src/cudadev/plugin-SiPixelClusterizer/gpuCalibPixel.h +++ b/src/cudadev/plugin-SiPixelClusterizer/gpuCalibPixel.h @@ -6,12 +6,11 @@ #include "CondFormats/SiPixelGainForHLTonGPU.h" #include "CUDACore/cuda_assert.h" - -#include "gpuClusteringConstants.h" +#include "CUDADataFormats/gpuClusteringConstants.h" namespace gpuCalibPixel { - constexpr uint16_t InvId = 9999; // 
must be > MaxNumModules + using gpuClustering::invalidModuleId; // valid for run2 constexpr float VCaltoElectronGain = 47; // L2-4: 47 +- 4.7 @@ -35,12 +34,12 @@ namespace gpuCalibPixel { // zero for next kernels... if (0 == first) clusModuleStart[0] = moduleStart[0] = 0; - for (int i = first; i < gpuClustering::MaxNumModules; i += gridDim.x * blockDim.x) { + for (int i = first; i < gpuClustering::maxNumModules; i += gridDim.x * blockDim.x) { nClustersInModule[i] = 0; } for (int i = first; i < numElements; i += gridDim.x * blockDim.x) { - if (InvId == id[i]) + if (invalidModuleId == id[i]) continue; float conversionFactor = (isRun2) ? (id[i] < 96 ? VCaltoElectronGain_L1 : VCaltoElectronGain) : 1.f; @@ -55,7 +54,7 @@ namespace gpuCalibPixel { float gain = ret.second; // float pedestal = 0; float gain = 1.; if (isDeadColumn | isNoisyColumn) { - id[i] = InvId; + id[i] = invalidModuleId; adc[i] = 0; printf("bad pixel at %d in %d\n", i, id[i]); } else { diff --git a/src/cudadev/plugin-SiPixelClusterizer/gpuClusterChargeCut.h b/src/cudadev/plugin-SiPixelClusterizer/gpuClusterChargeCut.h index d0dd93044..a175e81dc 100644 --- a/src/cudadev/plugin-SiPixelClusterizer/gpuClusterChargeCut.h +++ b/src/cudadev/plugin-SiPixelClusterizer/gpuClusterChargeCut.h @@ -6,118 +6,125 @@ #include "CUDACore/cuda_assert.h" #include "CUDACore/prefixScan.h" +#include "CUDADataFormats/gpuClusteringConstants.h" +#include "Geometry/phase1PixelTopology.h" -#include "gpuClusteringConstants.h" +// local include(s) +#include "SiPixelClusterThresholds.h" namespace gpuClustering { __global__ void clusterChargeCut( - uint16_t* __restrict__ id, // module id of each pixel (modified if bad cluster) - uint16_t const* __restrict__ adc, // charge of each pixel + SiPixelClusterThresholds + clusterThresholds, // charge cut on cluster in electrons (for layer 1 and for other layers) + uint16_t* __restrict__ id, // module id of each pixel (modified if bad cluster) + uint16_t const* __restrict__ adc, // charge of each pixel uint32_t const* __restrict__ moduleStart, // index of the first pixel of each module uint32_t* __restrict__ nClustersInModule, // modified: number of clusters found in each module uint32_t const* __restrict__ moduleId, // module id of each module int32_t* __restrict__ clusterId, // modified: cluster id of each pixel uint32_t numElements) { - if (blockIdx.x >= moduleStart[0]) - return; + __shared__ int32_t charge[maxNumClustersPerModules]; + __shared__ uint8_t ok[maxNumClustersPerModules]; + __shared__ uint16_t newclusId[maxNumClustersPerModules]; + + auto firstModule = blockIdx.x; + auto endModule = moduleStart[0]; + for (auto module = firstModule; module < endModule; module += gridDim.x) { + auto firstPixel = moduleStart[1 + module]; + auto thisModuleId = id[firstPixel]; + assert(thisModuleId < maxNumModules); + assert(thisModuleId == moduleId[module]); + + auto nclus = nClustersInModule[thisModuleId]; + if (nclus == 0) + continue; + + if (threadIdx.x == 0 && nclus > maxNumClustersPerModules) + printf("Warning too many clusters in module %d in block %d: %d > %d\n", + thisModuleId, + blockIdx.x, + nclus, + maxNumClustersPerModules); + + auto first = firstPixel + threadIdx.x; + + if (nclus > maxNumClustersPerModules) { + // remove excess FIXME find a way to cut charge first.... 
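+ // the overflow is simply dropped: any pixel whose clusterId does not fit in the
+ // shared-memory arrays is invalidated (id and clusterId set to invalidModuleId),
+ // and nclus is then clamped to maxNumClustersPerModules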
+ for (auto i = first; i < numElements; i += blockDim.x) { + if (id[i] == invalidModuleId) + continue; // not valid + if (id[i] != thisModuleId) + break; // end of module + if (clusterId[i] >= maxNumClustersPerModules) { + id[i] = invalidModuleId; + clusterId[i] = invalidModuleId; + } + } + nclus = maxNumClustersPerModules; + } + +#ifdef GPU_DEBUG + if (thisModuleId % 100 == 1) + if (threadIdx.x == 0) + printf("start cluster charge cut for module %d in block %d\n", thisModuleId, blockIdx.x); +#endif + + assert(nclus <= maxNumClustersPerModules); + for (auto i = threadIdx.x; i < nclus; i += blockDim.x) { + charge[i] = 0; + } + __syncthreads(); + + for (auto i = first; i < numElements; i += blockDim.x) { + if (id[i] == invalidModuleId) + continue; // not valid + if (id[i] != thisModuleId) + break; // end of module + atomicAdd(&charge[clusterId[i]], adc[i]); + } + __syncthreads(); + + auto chargeCut = + clusterThresholds.getThresholdForLayerOnCondition(thisModuleId < phase1PixelTopology::layerStart[1]); + for (auto i = threadIdx.x; i < nclus; i += blockDim.x) { + newclusId[i] = ok[i] = charge[i] > chargeCut ? 1 : 0; + } + + __syncthreads(); - auto firstPixel = moduleStart[1 + blockIdx.x]; - auto thisModuleId = id[firstPixel]; - assert(thisModuleId < MaxNumModules); - assert(thisModuleId == moduleId[blockIdx.x]); + // renumber + __shared__ uint16_t ws[32]; + cms::cuda::blockPrefixScan(newclusId, nclus, ws); - auto nclus = nClustersInModule[thisModuleId]; - if (nclus == 0) - return; + assert(nclus >= newclusId[nclus - 1]); - if (threadIdx.x == 0 && nclus > MaxNumClustersPerModules) - printf("Warning too many clusters in module %d in block %d: %d > %d\n", - thisModuleId, - blockIdx.x, - nclus, - MaxNumClustersPerModules); + if (nclus == newclusId[nclus - 1]) + continue; - auto first = firstPixel + threadIdx.x; + nClustersInModule[thisModuleId] = newclusId[nclus - 1]; + __syncthreads(); - if (nclus > MaxNumClustersPerModules) { - // remove excess FIXME find a way to cut charge first.... + // mark bad cluster again + for (auto i = threadIdx.x; i < nclus; i += blockDim.x) { + if (0 == ok[i]) + newclusId[i] = invalidModuleId + 1; + } + __syncthreads(); + + // reassign id for (auto i = first; i < numElements; i += blockDim.x) { - if (id[i] == InvId) + if (id[i] == invalidModuleId) continue; // not valid if (id[i] != thisModuleId) break; // end of module - if (clusterId[i] >= MaxNumClustersPerModules) { - id[i] = InvId; - clusterId[i] = InvId; - } + clusterId[i] = newclusId[clusterId[i]] - 1; + if (clusterId[i] == invalidModuleId) + id[i] = invalidModuleId; } - nclus = MaxNumClustersPerModules; - } - -#ifdef GPU_DEBUG - if (thisModuleId % 100 == 1) - if (threadIdx.x == 0) - printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x); -#endif - __shared__ int32_t charge[MaxNumClustersPerModules]; - __shared__ uint8_t ok[MaxNumClustersPerModules]; - __shared__ uint16_t newclusId[MaxNumClustersPerModules]; - - assert(nclus <= MaxNumClustersPerModules); - for (auto i = threadIdx.x; i < nclus; i += blockDim.x) { - charge[i] = 0; - } - __syncthreads(); - - for (auto i = first; i < numElements; i += blockDim.x) { - if (id[i] == InvId) - continue; // not valid - if (id[i] != thisModuleId) - break; // end of module - atomicAdd(&charge[clusterId[i]], adc[i]); - } - __syncthreads(); - - auto chargeCut = thisModuleId < 96 ? 2000 : 4000; // move in constants (calib?) - for (auto i = threadIdx.x; i < nclus; i += blockDim.x) { - newclusId[i] = ok[i] = charge[i] > chargeCut ? 
1 : 0; - } - - __syncthreads(); - - // renumber - __shared__ uint16_t ws[32]; - cms::cuda::blockPrefixScan(newclusId, nclus, ws); - - assert(nclus >= newclusId[nclus - 1]); - - if (nclus == newclusId[nclus - 1]) - return; - - nClustersInModule[thisModuleId] = newclusId[nclus - 1]; - __syncthreads(); - - // mark bad cluster again - for (auto i = threadIdx.x; i < nclus; i += blockDim.x) { - if (0 == ok[i]) - newclusId[i] = InvId + 1; - } - __syncthreads(); - - // reassign id - for (auto i = first; i < numElements; i += blockDim.x) { - if (id[i] == InvId) - continue; // not valid - if (id[i] != thisModuleId) - break; // end of module - clusterId[i] = newclusId[clusterId[i]] - 1; - if (clusterId[i] == InvId) - id[i] = InvId; - } - - //done + //done + } // loop on modules } } // namespace gpuClustering diff --git a/src/cudadev/plugin-SiPixelClusterizer/gpuClustering.h b/src/cudadev/plugin-SiPixelClusterizer/gpuClustering.h index 84609bd10..3679f5b16 100644 --- a/src/cudadev/plugin-SiPixelClusterizer/gpuClustering.h +++ b/src/cudadev/plugin-SiPixelClusterizer/gpuClustering.h @@ -4,11 +4,10 @@ #include #include -#include "Geometry/phase1PixelTopology.h" #include "CUDACore/HistoContainer.h" #include "CUDACore/cuda_assert.h" - -#include "gpuClusteringConstants.h" +#include "CUDADataFormats/gpuClusteringConstants.h" +#include "Geometry/phase1PixelTopology.h" namespace gpuClustering { @@ -23,284 +22,282 @@ namespace gpuClustering { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int i = first; i < numElements; i += gridDim.x * blockDim.x) { clusterId[i] = i; - if (InvId == id[i]) + if (invalidModuleId == id[i]) continue; auto j = i - 1; - while (j >= 0 and id[j] == InvId) + while (j >= 0 and id[j] == invalidModuleId) --j; if (j < 0 or id[j] != id[i]) { // boundary... 
- auto loc = atomicInc(moduleStart, MaxNumModules); + auto loc = atomicInc(moduleStart, maxNumModules); moduleStart[loc + 1] = i; } } } - __global__ - // __launch_bounds__(256,4) - void - findClus(uint16_t const* __restrict__ id, // module id of each pixel - uint16_t const* __restrict__ x, // local coordinates of each pixel - uint16_t const* __restrict__ y, // - uint32_t const* __restrict__ moduleStart, // index of the first pixel of each module - uint32_t* __restrict__ nClustersInModule, // output: number of clusters found in each module - uint32_t* __restrict__ moduleId, // output: module id of each module - int32_t* __restrict__ clusterId, // output: cluster id of each pixel - int numElements) { - if (blockIdx.x >= moduleStart[0]) - return; + __global__ void findClus(uint16_t const* __restrict__ id, // module id of each pixel + uint16_t const* __restrict__ x, // local coordinates of each pixel + uint16_t const* __restrict__ y, // + uint32_t const* __restrict__ moduleStart, // index of the first pixel of each module + uint32_t* __restrict__ nClustersInModule, // output: number of clusters found in each module + uint32_t* __restrict__ moduleId, // output: module id of each module + int32_t* __restrict__ clusterId, // output: cluster id of each pixel + int numElements) { + __shared__ int msize; - auto firstPixel = moduleStart[1 + blockIdx.x]; - auto thisModuleId = id[firstPixel]; - assert(thisModuleId < MaxNumModules); + auto firstModule = blockIdx.x; + auto endModule = moduleStart[0]; + for (auto module = firstModule; module < endModule; module += gridDim.x) { + auto firstPixel = moduleStart[1 + module]; + auto thisModuleId = id[firstPixel]; + assert(thisModuleId < maxNumModules); #ifdef GPU_DEBUG - if (thisModuleId % 100 == 1) - if (threadIdx.x == 0) - printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x); + if (thisModuleId % 100 == 1) + if (threadIdx.x == 0) + printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x); #endif - auto first = firstPixel + threadIdx.x; + auto first = firstPixel + threadIdx.x; - // find the index of the first pixel not belonging to this module (or invalid) - __shared__ int msize; - msize = numElements; - __syncthreads(); + // find the index of the first pixel not belonging to this module (or invalid) + msize = numElements; + __syncthreads(); - // skip threads not associated to an existing pixel - for (int i = first; i < numElements; i += blockDim.x) { - if (id[i] == InvId) // skip invalid pixels - continue; - if (id[i] != thisModuleId) { // find the first pixel in a different module - atomicMin(&msize, i); - break; + // skip threads not associated to an existing pixel + for (int i = first; i < numElements; i += blockDim.x) { + if (id[i] == invalidModuleId) // skip invalid pixels + continue; + if (id[i] != thisModuleId) { // find the first pixel in a different module + atomicMin(&msize, i); + break; + } } - } - //init hist (ymax=416 < 512 : 9bits) - constexpr uint32_t maxPixInModule = 4000; - constexpr auto nbins = phase1PixelTopology::numColsInModule + 2; //2+2; - using Hist = cms::cuda::HistoContainer; - __shared__ Hist hist; - __shared__ typename Hist::Counter ws[32]; - for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) { - hist.off[j] = 0; - } - __syncthreads(); + //init hist (ymax=416 < 512 : 9bits) + constexpr uint32_t maxPixInModule = 4000; + constexpr auto nbins = phase1PixelTopology::numColsInModule + 2; //2+2; + using Hist = cms::cuda::HistoContainer; + __shared__ Hist hist; + 
__shared__ typename Hist::Counter ws[32]; + for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) { + hist.off[j] = 0; + } + __syncthreads(); - assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId))); + assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId))); - // limit to maxPixInModule (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer) - if (0 == threadIdx.x) { - if (msize - firstPixel > maxPixInModule) { - printf("too many pixels in module %d: %d > %d\n", thisModuleId, msize - firstPixel, maxPixInModule); - msize = maxPixInModule + firstPixel; + // limit to maxPixInModule (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer) + if (0 == threadIdx.x) { + if (msize - firstPixel > maxPixInModule) { + printf("too many pixels in module %d: %d > %d\n", thisModuleId, msize - firstPixel, maxPixInModule); + msize = maxPixInModule + firstPixel; + } } - } - __syncthreads(); - assert(msize - firstPixel <= maxPixInModule); + __syncthreads(); + assert(msize - firstPixel <= maxPixInModule); #ifdef GPU_DEBUG - __shared__ uint32_t totGood; - totGood = 0; - __syncthreads(); + __shared__ uint32_t totGood; + totGood = 0; + __syncthreads(); #endif - // fill histo - for (int i = first; i < msize; i += blockDim.x) { - if (id[i] == InvId) // skip invalid pixels - continue; - hist.count(y[i]); + // fill histo + for (int i = first; i < msize; i += blockDim.x) { + if (id[i] == invalidModuleId) // skip invalid pixels + continue; + hist.count(y[i]); #ifdef GPU_DEBUG - atomicAdd(&totGood, 1); + atomicAdd(&totGood, 1); #endif - } - __syncthreads(); - if (threadIdx.x < 32) - ws[threadIdx.x] = 0; // used by prefix scan... - __syncthreads(); - hist.finalize(ws); - __syncthreads(); + } + __syncthreads(); + if (threadIdx.x < 32) + ws[threadIdx.x] = 0; // used by prefix scan... 
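+ // finalize() runs a block-wide prefix scan that turns the per-bin counts in off[]
+ // into cumulative end offsets, e.g. counts {2,0,3,1} become {2,2,5,6}; fill() then
+ // decrements off[bin] to place each pixel index, so off[b]..off[b+1] delimits bin b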
+ __syncthreads(); + hist.finalize(ws); + __syncthreads(); #ifdef GPU_DEBUG - assert(hist.size() == totGood); - if (thisModuleId % 100 == 1) - if (threadIdx.x == 0) - printf("histo size %d\n", hist.size()); + assert(hist.size() == totGood); + if (thisModuleId % 100 == 1) + if (threadIdx.x == 0) + printf("histo size %d\n", hist.size()); #endif - for (int i = first; i < msize; i += blockDim.x) { - if (id[i] == InvId) // skip invalid pixels - continue; - hist.fill(y[i], i - firstPixel); - } + for (int i = first; i < msize; i += blockDim.x) { + if (id[i] == invalidModuleId) // skip invalid pixels + continue; + hist.fill(y[i], i - firstPixel); + } #ifdef __CUDA_ARCH__ - // assume that we can cover the whole module with up to 16 blockDim.x-wide iterations - constexpr int maxiter = 16; + // assume that we can cover the whole module with up to 16 blockDim.x-wide iterations + constexpr int maxiter = 16; #else - auto maxiter = hist.size(); + auto maxiter = hist.size(); #endif - // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event - constexpr int maxNeighbours = 10; - assert((hist.size() / blockDim.x) <= maxiter); - // nearest neighbour - uint16_t nn[maxiter][maxNeighbours]; - uint8_t nnn[maxiter]; // number of nn - for (uint32_t k = 0; k < maxiter; ++k) - nnn[k] = 0; + // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event + constexpr int maxNeighbours = 10; + assert((hist.size() / blockDim.x) <= maxiter); + // nearest neighbour + uint16_t nn[maxiter][maxNeighbours]; + uint8_t nnn[maxiter]; // number of nn + for (uint32_t k = 0; k < maxiter; ++k) + nnn[k] = 0; - __syncthreads(); // for hit filling! + __syncthreads(); // for hit filling! #ifdef GPU_DEBUG - // look for anomalous high occupancy - __shared__ uint32_t n40, n60; - n40 = n60 = 0; - __syncthreads(); - for (auto j = threadIdx.x; j < Hist::nbins(); j += blockDim.x) { - if (hist.size(j) > 60) - atomicAdd(&n60, 1); - if (hist.size(j) > 40) - atomicAdd(&n40, 1); - } - __syncthreads(); - if (0 == threadIdx.x) { - if (n60 > 0) - printf("columns with more than 60 px %d in %d\n", n60, thisModuleId); - else if (n40 > 0) - printf("columns with more than 40 px %d in %d\n", n40, thisModuleId); - } - __syncthreads(); + // look for anomalous high occupancy + __shared__ uint32_t n40, n60; + n40 = n60 = 0; + __syncthreads(); + for (auto j = threadIdx.x; j < Hist::nbins(); j += blockDim.x) { + if (hist.size(j) > 60) + atomicAdd(&n60, 1); + if (hist.size(j) > 40) + atomicAdd(&n40, 1); + } + __syncthreads(); + if (0 == threadIdx.x) { + if (n60 > 0) + printf("columns with more than 60 px %d in %d\n", n60, thisModuleId); + else if (n40 > 0) + printf("columns with more than 40 px %d in %d\n", n40, thisModuleId); + } + __syncthreads(); #endif - // fill NN - for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) { - assert(k < maxiter); - auto p = hist.begin() + j; - auto i = *p + firstPixel; - assert(id[i] != InvId); - assert(id[i] == thisModuleId); // same module - int be = Hist::bin(y[i] + 1); - auto e = hist.end(be); - ++p; - assert(0 == nnn[k]); - for (; p < e; ++p) { - auto m = (*p) + firstPixel; - assert(m != i); - assert(int(y[m]) - int(y[i]) >= 0); - assert(int(y[m]) - int(y[i]) <= 1); - if (std::abs(int(x[m]) - int(x[i])) > 1) - continue; - auto l = nnn[k]++; - assert(l < maxNeighbours); - nn[k][l] = *p; + // fill NN + for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) { + assert(k < maxiter); + 
auto p = hist.begin() + j; + auto i = *p + firstPixel; + assert(id[i] != invalidModuleId); + assert(id[i] == thisModuleId); // same module + int be = Hist::bin(y[i] + 1); + auto e = hist.end(be); + ++p; + assert(0 == nnn[k]); + for (; p < e; ++p) { + auto m = (*p) + firstPixel; + assert(m != i); + assert(int(y[m]) - int(y[i]) >= 0); + assert(int(y[m]) - int(y[i]) <= 1); + if (std::abs(int(x[m]) - int(x[i])) > 1) + continue; + auto l = nnn[k]++; + assert(l < maxNeighbours); + nn[k][l] = *p; + } } - // for each pixel, look at all the pixels until the end of the module; - // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum; - // after the loop, all the pixel in each cluster should have the id equeal to the lowest - // pixel in the cluster ( clus[i] == i ). - bool more = true; - int nloops = 0; - while (__syncthreads_or(more)) { - if (1 == nloops % 2) { - for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) { - auto p = hist.begin() + j; - auto i = *p + firstPixel; - auto m = clusterId[i]; - while (m != clusterId[m]) - m = clusterId[m]; - clusterId[i] = m; + // for each pixel, look at all the pixels until the end of the module; + // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum; + // after the loop, all the pixels in each cluster should have the id equal to the lowest + // pixel in the cluster ( clus[i] == i ). + bool more = true; + int nloops = 0; + while (__syncthreads_or(more)) { + if (1 == nloops % 2) { + for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) { + auto p = hist.begin() + j; + auto i = *p + firstPixel; + auto m = clusterId[i]; + while (m != clusterId[m]) + m = clusterId[m]; + clusterId[i] = m; + } + } else { + more = false; + for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) { + auto p = hist.begin() + j; + auto i = *p + firstPixel; + for (int kk = 0; kk < nnn[k]; ++kk) { + auto l = nn[k][kk]; + auto m = l + firstPixel; + assert(m != i); + auto old = atomicMin(&clusterId[m], clusterId[i]); + if (old != clusterId[i]) { + // end the loop only if no changes were applied + more = true; + } + atomicMin(&clusterId[i], old); + } // nnloop + } // pixel loop } - } else { - more = false; - for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) { - auto p = hist.begin() + j; - auto i = *p + firstPixel; - for (int kk = 0; kk < nnn[k]; ++kk) { - auto l = nn[k][kk]; - auto m = l + firstPixel; - assert(m != i); - auto old = atomicMin(&clusterId[m], clusterId[i]); - if (old != clusterId[i]) { - // end the loop only if no changes were applied - more = true; - } - atomicMin(&clusterId[i], old); - } // nnloop - } // pixel loop - } - ++nloops; - } // end while + ++nloops; + } // end while #ifdef GPU_DEBUG - { - __shared__ int n0; - if (threadIdx.x == 0) - n0 = nloops; - __syncthreads(); - auto ok = n0 == nloops; - assert(__syncthreads_and(ok)); - if (thisModuleId % 100 == 1) + { + __shared__ int n0; if (threadIdx.x == 0) - printf("# loops %d\n", nloops); - } + n0 = nloops; + __syncthreads(); + auto ok = n0 == nloops; + assert(__syncthreads_and(ok)); + if (thisModuleId % 100 == 1) + if (threadIdx.x == 0) + printf("# loops %d\n", nloops); + } #endif - __shared__ unsigned int foundClusters; - foundClusters = 0; - __syncthreads(); + __shared__ unsigned int foundClusters; + foundClusters = 0; + __syncthreads(); - // find the number of different clusters, identified by a pixels with clus[i] == i; - // mark these pixels with a negative id.
- for (int i = first; i < msize; i += blockDim.x) { - if (id[i] == InvId) // skip invalid pixels - continue; - if (clusterId[i] == i) { - auto old = atomicInc(&foundClusters, 0xffffffff); - clusterId[i] = -(old + 1); + // find the number of different clusters, identified by pixels with clus[i] == i; + // mark these pixels with a negative id. + for (int i = first; i < msize; i += blockDim.x) { + if (id[i] == invalidModuleId) // skip invalid pixels + continue; + if (clusterId[i] == i) { + auto old = atomicInc(&foundClusters, 0xffffffff); + clusterId[i] = -(old + 1); + } } - } - __syncthreads(); + __syncthreads(); - // propagate the negative id to all the pixels in the cluster. - for (int i = first; i < msize; i += blockDim.x) { - if (id[i] == InvId) // skip invalid pixels - continue; - if (clusterId[i] >= 0) { - // mark each pixel in a cluster with the same id as the first one - clusterId[i] = clusterId[clusterId[i]]; + // propagate the negative id to all the pixels in the cluster. + for (int i = first; i < msize; i += blockDim.x) { + if (id[i] == invalidModuleId) // skip invalid pixels + continue; + if (clusterId[i] >= 0) { + // mark each pixel in a cluster with the same id as the first one + clusterId[i] = clusterId[clusterId[i]]; + } } - } - __syncthreads(); + __syncthreads(); - // adjust the cluster id to be a positive value starting from 0 - for (int i = first; i < msize; i += blockDim.x) { - if (id[i] == InvId) { // skip invalid pixels - clusterId[i] = -9999; - continue; + // adjust the cluster id to be a positive value starting from 0 + for (int i = first; i < msize; i += blockDim.x) { + if (id[i] == invalidModuleId) { // skip invalid pixels + clusterId[i] = -9999; + continue; + } + clusterId[i] = -clusterId[i] - 1; } - clusterId[i] = -clusterId[i] - 1; - } - __syncthreads(); + __syncthreads(); - if (threadIdx.x == 0) { - nClustersInModule[thisModuleId] = foundClusters; - moduleId[blockIdx.x] = thisModuleId; + if (threadIdx.x == 0) { + nClustersInModule[thisModuleId] = foundClusters; + moduleId[module] = thisModuleId; #ifdef GPU_DEBUG - if (foundClusters > gMaxHit) { - gMaxHit = foundClusters; - if (foundClusters > 8) - printf("max hit %d in %d\n", foundClusters, thisModuleId); - } + if (foundClusters > gMaxHit) { + gMaxHit = foundClusters; + if (foundClusters > 8) + printf("max hit %d in %d\n", foundClusters, thisModuleId); + } #endif #ifdef GPU_DEBUG - if (thisModuleId % 100 == 1) - printf("%d clusters in module %d\n", foundClusters, thisModuleId); + if (thisModuleId % 100 == 1) + printf("%d clusters in module %d\n", foundClusters, thisModuleId); #endif - } + } + } // module loop } - } // namespace gpuClustering #endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h diff --git a/src/cudadev/plugin-SiPixelClusterizer/gpuClusteringConstants.h b/src/cudadev/plugin-SiPixelClusterizer/gpuClusteringConstants.h deleted file mode 100644 index 0bce634ee..000000000 --- a/src/cudadev/plugin-SiPixelClusterizer/gpuClusteringConstants.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusteringConstants_h -#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusteringConstants_h - -#include "CUDADataFormats/gpuClusteringConstants.h" - -#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusteringConstants_h diff --git a/src/cudadev/plugin-SiPixelRecHits/PixelRecHits.cu b/src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.cu similarity index 83% rename from src/cudadev/plugin-SiPixelRecHits/PixelRecHits.cu rename to
src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.cu index 4cd3fc152..ba62da1b5 100644 --- a/src/cudadev/plugin-SiPixelRecHits/PixelRecHits.cu +++ b/src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.cu @@ -8,10 +8,10 @@ // CMSSW headers #include "CUDACore/cudaCheck.h" #include "CUDACore/device_unique_ptr.h" -#include "plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h" // ! -#include "plugin-SiPixelClusterizer/gpuClusteringConstants.h" // ! +#include "CUDADataFormats/gpuClusteringConstants.h" +#include "plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h" -#include "PixelRecHits.h" +#include "PixelRecHitGPUKernel.h" #include "gpuPixelRecHits.h" namespace { @@ -47,30 +47,29 @@ namespace pixelgpudetails { #ifdef GPU_DEBUG std::cout << "launching getHits kernel for " << blocks << " blocks" << std::endl; #endif - if (blocks) // protect from empty events + // protect from empty events + if (blocks) { gpuPixelRecHits::getHits<<>>( cpeParams, bs_d.data(), digis_d.view(), digis_d.nDigis(), clusters_d.view(), hits_d.view()); - cudaCheck(cudaGetLastError()); + cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); #endif + } // assuming full warp of threads is better than a smaller number... if (nHits) { setHitsLayerStart<<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart()); cudaCheck(cudaGetLastError()); - } - if (nHits) { - cms::cuda::fillManyFromVector(hits_d.phiBinner(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256, stream); + cms::cuda::fillManyFromVector( + hits_d.phiBinner(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256, hits_d.phiBinnerStorage(), stream); cudaCheck(cudaGetLastError()); - } #ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); #endif + } return hits_d; } diff --git a/src/cudadev/plugin-SiPixelRecHits/PixelRecHits.h b/src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.h similarity index 78% rename from src/cudadev/plugin-SiPixelRecHits/PixelRecHits.h rename to src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.h index 8f5653fbd..7b0a38a15 100644 --- a/src/cudadev/plugin-SiPixelRecHits/PixelRecHits.h +++ b/src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.h @@ -1,5 +1,5 @@ -#ifndef RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h -#define RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h +#ifndef RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHitGPUKernel_h +#define RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHitGPUKernel_h #include @@ -8,7 +8,7 @@ #include "CUDADataFormats/BeamSpotCUDA.h" #include "CUDADataFormats/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" namespace pixelgpudetails { @@ -30,4 +30,4 @@ namespace pixelgpudetails { }; } // namespace pixelgpudetails -#endif // RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h +#endif // RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHitGPUKernel_h diff --git a/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc b/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc index a82e23eab..413982fc3 100644 --- a/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc +++ b/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc @@ -4,7 +4,7 @@ #include "CUDACore/Product.h" #include "CUDADataFormats/SiPixelClustersCUDA.h" #include 
"CUDADataFormats/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" #include "Framework/EventSetup.h" #include "Framework/Event.h" #include "Framework/PluginFactory.h" @@ -12,7 +12,7 @@ #include "CUDACore/ScopedContext.h" #include "CondFormats/PixelCPEFast.h" -#include "PixelRecHits.h" // TODO : spit product from kernel +#include "PixelRecHitGPUKernel.h" class SiPixelRecHitCUDA : public edm::EDProducer { public: @@ -23,13 +23,11 @@ class SiPixelRecHitCUDA : public edm::EDProducer { void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; // The mess with inputs will be cleaned up when migrating to the new framework - edm::EDGetTokenT> tBeamSpot; - edm::EDGetTokenT> token_; - edm::EDGetTokenT> tokenDigi_; - - edm::EDPutTokenT> tokenHit_; - - pixelgpudetails::PixelRecHitGPUKernel gpuAlgo_; + const edm::EDGetTokenT> tBeamSpot; + const edm::EDGetTokenT> token_; + const edm::EDGetTokenT> tokenDigi_; + const edm::EDPutTokenT> tokenHit_; + const pixelgpudetails::PixelRecHitGPUKernel gpuAlgo_; }; SiPixelRecHitCUDA::SiPixelRecHitCUDA(edm::ProductRegistry& reg) @@ -48,11 +46,6 @@ void SiPixelRecHitCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es) { auto const& digis = ctx.get(iEvent, tokenDigi_); auto const& bs = ctx.get(iEvent, tBeamSpot); - auto nHits = clusters.nClusters(); - if (nHits >= TrackingRecHit2DSOAView::maxHits()) { - std::cout << "Clusters/Hits Overflow " << nHits << " >= " << TrackingRecHit2DSOAView::maxHits() << std::endl; - } - ctx.emplace(iEvent, tokenHit_, gpuAlgo_.makeHitsAsync(digis, clusters, bs, fcpe.getGPUProductAsync(ctx.stream()), ctx.stream())); diff --git a/src/cudadev/plugin-SiPixelRecHits/gpuPixelRecHits.h b/src/cudadev/plugin-SiPixelRecHits/gpuPixelRecHits.h index 433d3b012..adddc8b83 100644 --- a/src/cudadev/plugin-SiPixelRecHits/gpuPixelRecHits.h +++ b/src/cudadev/plugin-SiPixelRecHits/gpuPixelRecHits.h @@ -5,11 +5,12 @@ #include #include -#include "CUDADataFormats/BeamSpotCUDA.h" -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" -#include "DataFormats/approx_atan2.h" #include "CUDACore/cuda_assert.h" +#include "CUDADataFormats/BeamSpotCUDA.h" +#include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/gpuClusteringConstants.h" #include "CondFormats/pixelCPEforGPU.h" +#include "DataFormats/approx_atan2.h" namespace gpuPixelRecHits { @@ -53,7 +54,7 @@ namespace gpuPixelRecHits { } // to be moved in common namespace... 
- constexpr uint16_t InvId = 9999; // must be > MaxNumModules + using gpuClustering::invalidModuleId; constexpr int32_t MaxHitsInIter = pixelCPEforGPU::MaxHitsInIter; using ClusParams = pixelCPEforGPU::ClusParams; @@ -70,7 +71,7 @@ namespace gpuPixelRecHits { #ifdef GPU_DEBUG if (threadIdx.x == 0) { auto k = clusters.moduleStart(1 + blockIdx.x); - while (digis.moduleInd(k) == InvId) + while (digis.moduleInd(k) == invalidModuleId) ++k; assert(digis.moduleInd(k) == me); } @@ -83,8 +84,6 @@ namespace gpuPixelRecHits { #endif for (int startClus = 0, endClus = nclus; startClus < endClus; startClus += MaxHitsInIter) { - auto first = clusters.moduleStart(1 + blockIdx.x); - int nClusInIter = std::min(MaxHitsInIter, endClus - startClus); int lastClus = startClus + nClusInIter; assert(nClusInIter <= nclus); @@ -100,32 +99,30 @@ namespace gpuPixelRecHits { clusParams.minCol[ic] = std::numeric_limits::max(); clusParams.maxCol[ic] = 0; clusParams.charge[ic] = 0; - clusParams.Q_f_X[ic] = 0; - clusParams.Q_l_X[ic] = 0; - clusParams.Q_f_Y[ic] = 0; - clusParams.Q_l_Y[ic] = 0; + clusParams.q_f_X[ic] = 0; + clusParams.q_l_X[ic] = 0; + clusParams.q_f_Y[ic] = 0; + clusParams.q_l_Y[ic] = 0; } - first += threadIdx.x; - __syncthreads(); - // one thead per "digi" - + // one thread per "digi" + auto first = clusters.moduleStart(1 + blockIdx.x) + threadIdx.x; for (int i = first; i < numElements; i += blockDim.x) { auto id = digis.moduleInd(i); - if (id == InvId) + if (id == invalidModuleId) continue; // not valid if (id != me) break; // end of module auto cl = digis.clus(i); if (cl < startClus || cl >= lastClus) continue; - auto x = digis.xx(i); - auto y = digis.yy(i); cl -= startClus; assert(cl >= 0); assert(cl < MaxHitsInIter); + auto x = digis.xx(i); + auto y = digis.yy(i); atomicMin(&clusParams.minRow[cl], x); atomicMax(&clusParams.maxRow[cl], x); atomicMin(&clusParams.minCol[cl], y); @@ -139,7 +136,7 @@ namespace gpuPixelRecHits { auto pixmx = std::numeric_limits::max(); for (int i = first; i < numElements; i += blockDim.x) { auto id = digis.moduleInd(i); - if (id == InvId) + if (id == invalidModuleId) continue; // not valid if (id != me) break; // end of module @@ -154,13 +151,13 @@ namespace gpuPixelRecHits { auto ch = std::min(digis.adc(i), pixmx); atomicAdd(&clusParams.charge[cl], ch); if (clusParams.minRow[cl] == x) - atomicAdd(&clusParams.Q_f_X[cl], ch); + atomicAdd(&clusParams.q_f_X[cl], ch); if (clusParams.maxRow[cl] == x) - atomicAdd(&clusParams.Q_l_X[cl], ch); + atomicAdd(&clusParams.q_l_X[cl], ch); if (clusParams.minCol[cl] == y) - atomicAdd(&clusParams.Q_f_Y[cl], ch); + atomicAdd(&clusParams.q_f_Y[cl], ch); if (clusParams.maxCol[cl] == y) - atomicAdd(&clusParams.Q_l_Y[cl], ch); + atomicAdd(&clusParams.q_l_Y[cl], ch); } __syncthreads(); @@ -168,13 +165,9 @@ namespace gpuPixelRecHits { // next one cluster per thread... first = clusters.clusModuleStart(me) + startClus; - for (int ic = threadIdx.x; ic < nClusInIter; ic += blockDim.x) { auto h = first + ic; // output index in global memory - // this cannot happen anymore - if (h >= TrackingRecHit2DSOAView::maxHits()) - break; // overflow... 
assert(h < hits.nHits()); assert(h < clusters.clusModuleStart(me + 1)); @@ -182,9 +175,7 @@ namespace gpuPixelRecHits { pixelCPEforGPU::errorFromDB(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); // store it - hits.charge(h) = clusParams.charge[ic]; - hits.detectorIndex(h) = me; float xl, yl; diff --git a/src/cudadev/plugin-Validation/HistoValidator.cc b/src/cudadev/plugin-Validation/HistoValidator.cc index 998185ca4..10679ab76 100644 --- a/src/cudadev/plugin-Validation/HistoValidator.cc +++ b/src/cudadev/plugin-Validation/HistoValidator.cc @@ -3,7 +3,7 @@ #include "CUDADataFormats/PixelTrackHeterogeneous.h" #include "CUDADataFormats/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" #include "CUDADataFormats/ZVertexHeterogeneous.h" #include "Framework/EventSetup.h" #include "Framework/Event.h" @@ -151,7 +151,7 @@ void HistoValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) int nTracks = 0; for (int i = 0; i < tracks->stride(); ++i) { - if (tracks->nHits(i) > 0 and tracks->quality(i) >= trackQuality::loose) { + if (tracks->nHits(i) > 0 and tracks->quality(i) >= pixelTrack::Quality::loose) { ++nTracks; histos["track_nhits"].fill(tracks->nHits(i)); histos["track_chi2"].fill(tracks->chi2(i)); @@ -162,7 +162,7 @@ void HistoValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) histos["track_tip_zoom"].fill(tracks->tip(i)); histos["track_zip"].fill(tracks->zip(i)); histos["track_zip_zoom"].fill(tracks->zip(i)); - histos["track_quality"].fill(tracks->quality(i)); + histos["track_quality"].fill((uint8_t) tracks->quality(i)); } } diff --git a/src/cudadev/plugins.txt b/src/cudadev/plugins.txt index 52dfe3102..9af0f9592 100644 --- a/src/cudadev/plugins.txt +++ b/src/cudadev/plugins.txt @@ -3,7 +3,7 @@ BeamSpotToCUDA pluginBeamSpotProducer.so CAHitNtupletCUDA pluginPixelTriplets.so CountValidator pluginValidation.so HistoValidator pluginValidation.so -SiPixelFedCablingMapGPUWrapperESProducer pluginSiPixelClusterizer.so +SiPixelROCsStatusAndMappingWrapperESProducer pluginSiPixelClusterizer.so SiPixelGainCalibrationForHLTGPUESProducer pluginSiPixelClusterizer.so SiPixelRawToClusterCUDA pluginSiPixelClusterizer.so SiPixelDigisSoAFromCUDA pluginSiPixelRawToDigi.so diff --git a/src/cudadev/test/HistoContainer_t.cu b/src/cudadev/test/HistoContainer_t.cu index 15aafe0d3..4f5a4ee54 100644 --- a/src/cudadev/test/HistoContainer_t.cu +++ b/src/cudadev/test/HistoContainer_t.cu @@ -28,9 +28,11 @@ void go() { using Hist = HistoContainer; std::cout << "HistoContainer " << (int)(offsetof(Hist, off)) << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' - << Hist::capacity() << ' ' << offsetof(Hist, bins) - offsetof(Hist, off) << ' ' + << Hist::ctCapacity() << ' ' << offsetof(Hist, content) - offsetof(Hist, off) << ' ' << (std::numeric_limits::max() - std::numeric_limits::min()) / Hist::nbins() << std::endl; + assert(Hist::totbins() == Hist::ctNOnes()); + Hist h; auto h_d = make_device_unique(1, nullptr); @@ -55,6 +57,7 @@ void go() { offsets[8] = 256 * 11 + offsets[7]; offsets[9] = 44 + offsets[8]; offsets[10] = 3297 + offsets[9]; + assert(offsets[10] <= N); } cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (nParts + 1), cudaMemcpyHostToDevice)); @@ -69,7 +72,7 @@ void go() { cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); - fillManyFromVector(h_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, 
0); + fillManyFromVector(h_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, nullptr, 0); cudaCheck(cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost)); assert(0 == h.off[0]); assert(offsets[10] == h.size()); diff --git a/src/cudadev/test/HistoContainer_t_cpu.cc b/src/cudadev/test/HistoContainer_t_cpu.cc index ad1121ef1..b829018e0 100644 --- a/src/cudadev/test/HistoContainer_t_cpu.cc +++ b/src/cudadev/test/HistoContainer_t_cpu.cc @@ -25,51 +25,73 @@ void go() { constexpr int N = 12000; T v[N]; + using HistR = HistoContainer; using Hist = HistoContainer; using Hist4 = HistoContainer; + std::cout << "HistoContainerR " << HistR::nbits() << ' ' << HistR::nbins() << ' ' << HistR::totbins() << ' ' + << HistR::ctNOnes() << ' ' << HistR::ctCapacity() << ' ' << (rmax - rmin) / HistR::nbins() << std::endl; + std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' - << Hist::capacity() << ' ' << (rmax - rmin) / Hist::nbins() << std::endl; + << Hist::ctCapacity() << ' ' << (rmax - rmin) / Hist::nbins() << std::endl; std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; std::cout << "HistoContainer4 " << Hist4::nbits() << ' ' << Hist4::nbins() << ' ' << Hist4::totbins() << ' ' - << Hist4::capacity() << ' ' << (rmax - rmin) / Hist::nbins() << std::endl; + << Hist4::ctCapacity() << ' ' << (rmax - rmin) / Hist::nbins() << std::endl; for (auto nh = 0; nh < 4; ++nh) std::cout << "bins " << int(Hist4::bin(0)) + Hist4::histOff(nh) << ' ' << int(Hist::bin(rmin)) + Hist4::histOff(nh) << ' ' << int(Hist::bin(rmax)) + Hist4::histOff(nh) << std::endl; + uint32_t mem[N]; + HistR hr; + typename HistR::View view{&hr, nullptr, mem, -1, N}; + hr.initStorage(view); + std::cout << "HistoContainerR " << hr.capacity() << std::endl; + assert(hr.capacity() == N); Hist h; Hist4 h4; + assert(h.capacity() == N); + assert(h4.capacity() == N); + for (int it = 0; it < 5; ++it) { for (long long j = 0; j < N; j++) v[j] = rgen(eng); if (it == 2) for (long long j = N / 2; j < N / 2 + N / 4; j++) v[j] = 4; + hr.zero(); h.zero(); h4.zero(); + assert(hr.size() == 0); assert(h.size() == 0); assert(h4.size() == 0); for (long long j = 0; j < N; j++) { + hr.count(v[j]); h.count(v[j]); if (j < 2000) h4.count(v[j], 2); else h4.count(v[j], j % 4); } + assert(hr.size() == 0); assert(h.size() == 0); assert(h4.size() == 0); + hr.finalize(); h.finalize(); h4.finalize(); assert(h.size() == N); assert(h4.size() == N); for (long long j = 0; j < N; j++) { + hr.fill(v[j], j); h.fill(v[j], j); if (j < 2000) h4.fill(v[j], j, 2); else h4.fill(v[j], j, j % 4); } + assert(hr.off[0] == 0); assert(h.off[0] == 0); assert(h4.off[0] == 0); + assert(hr.size() == N); assert(h.size() == N); assert(h4.size() == N); @@ -80,6 +102,11 @@ void go() { std::cout << "for " << i << ':' << v[k] << " failed " << v[t1] << ' ' << v[t2] << std::endl; }; + for (uint32_t i = 0; i < Hist::nbins(); ++i) { + assert(h.size(i) == hr.size(i)); + assert(*h.begin(i) == *hr.begin(i)); + } + for (uint32_t i = 0; i < Hist::nbins(); ++i) { if (0 == h.size(i)) continue; diff --git a/src/cudadev/test/OneHistoContainer_t.cu b/src/cudadev/test/OneHistoContainer_t.cu index 960f77eca..89fed087a 100644 --- a/src/cudadev/test/OneHistoContainer_t.cu +++ b/src/cudadev/test/OneHistoContainer_t.cu @@ -112,7 +112,7 @@ void go() { assert(v_d.get()); using 
Hist = HistoContainer; - std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::capacity() << ' ' + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::ctCapacity() << ' ' << (rmax - rmin) / Hist::nbins() << std::endl; std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; diff --git a/src/cudadev/test/OneToManyAssoc_t.h b/src/cudadev/test/OneToManyAssoc_t.h index 69c3ade3d..50bf6fb38 100644 --- a/src/cudadev/test/OneToManyAssoc_t.h +++ b/src/cudadev/test/OneToManyAssoc_t.h @@ -13,17 +13,23 @@ #include "CUDACore/currentDevice.h" #endif -#include "CUDACore/HistoContainer.h" +#include "CUDACore/OneToManyAssoc.h" using cms::cuda::AtomicPairCounter; constexpr uint32_t MaxElem = 64000; constexpr uint32_t MaxTk = 8000; constexpr uint32_t MaxAssocs = 4 * MaxTk; +#ifdef RUNTIME_SIZE +using Assoc = cms::cuda::OneToManyAssoc; +using SmallAssoc = cms::cuda::OneToManyAssoc; +using Multiplicity = cms::cuda::OneToManyAssoc; +#else using Assoc = cms::cuda::OneToManyAssoc; using SmallAssoc = cms::cuda::OneToManyAssoc; using Multiplicity = cms::cuda::OneToManyAssoc; using TK = std::array; +#endif __global__ void countMultiLocal(TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, int32_t n) { int first = blockDim.x * blockIdx.x + threadIdx.x; @@ -32,7 +38,7 @@ __global__ void countMultiLocal(TK const* __restrict__ tk, Multiplicity* __restr if (threadIdx.x == 0) local.zero(); __syncthreads(); - local.countDirect(2 + i % 4); + local.count(2 + i % 4); __syncthreads(); if (threadIdx.x == 0) assoc->add(local); @@ -42,12 +48,12 @@ __global__ void countMultiLocal(TK const* __restrict__ tk, Multiplicity* __restr __global__ void countMulti(TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, int32_t n) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int i = first; i < n; i += gridDim.x * blockDim.x) - assoc->countDirect(2 + i % 4); + assoc->count(2 + i % 4); } __global__ void verifyMulti(Multiplicity* __restrict__ m1, Multiplicity* __restrict__ m2) { auto first = blockDim.x * blockIdx.x + threadIdx.x; - for (auto i = first; i < Multiplicity::totbins(); i += gridDim.x * blockDim.x) + for (int i = first; i < m1->totOnes(); i += gridDim.x * blockDim.x) assert(m1->off[i] == m2->off[i]); } @@ -60,7 +66,7 @@ __global__ void count(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int3 if (k >= n) return; if (tk[k][j] < MaxElem) - assoc->countDirect(tk[k][j]); + assoc->count(tk[k][j]); } } @@ -73,11 +79,11 @@ __global__ void fill(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32 if (k >= n) return; if (tk[k][j] < MaxElem) - assoc->fillDirect(tk[k][j], k); + assoc->fill(tk[k][j], k); } } -__global__ void verify(Assoc* __restrict__ assoc) { assert(assoc->size() < Assoc::capacity()); } +__global__ void verify(Assoc* __restrict__ assoc) { assert(int(assoc->size()) < assoc->capacity()); } template __global__ void fillBulk(AtomicPairCounter* apc, TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { @@ -90,9 +96,55 @@ __global__ void fillBulk(AtomicPairCounter* apc, TK const* __restrict__ tk, Asso template __global__ void verifyBulk(Assoc const* __restrict__ assoc, AtomicPairCounter const* apc) { - if (apc->get().m >= Assoc::nbins()) - printf("Overflow %d %d\n", apc->get().m, Assoc::nbins()); - assert(assoc->size() < Assoc::capacity()); + if (int(apc->get().m) >= assoc->nOnes()) + printf("Overflow %d %d\n", apc->get().m, assoc->nOnes()); + 
assert(int(assoc->size()) < assoc->capacity()); +} + +template +__global__ void verifyFill(Assoc const* __restrict__ la, int n) { + printf("assoc size %d\n", la->size()); + int imax = 0; + long long ave = 0; + int z = 0; + for (int i = 0; i < n; ++i) { + auto x = la->size(i); + if (x == 0) { + z++; + continue; + } + ave += x; + imax = std::max(imax, int(x)); + } + assert(0 == la->size(n)); + printf("found with %d elements %f %d %d\n", n, double(ave) / n, imax, z); +} + +template +__global__ void verifyFinal(Assoc const* __restrict__ la, int N) { + printf("assoc size %d\n", la->size()); + + int imax = 0; + long long ave = 0; + for (int i = 0; i < N; ++i) { + auto x = la->size(i); + if (!(x == 4 || x == 3)) + printf("%d %d\n", i, x); + assert(x == 4 || x == 3); + ave += x; + imax = std::max(imax, int(x)); + } + assert(0 == la->size(N)); + printf("found with ave occupancy %f %d\n", double(ave) / N, imax); +} + +template +auto make_unique(std::size_t size) { +#ifdef __CUDACC__ + return cms::cuda::make_device_unique(size, 0); +#else + return std::make_unique(size); +#endif } int main() { @@ -118,9 +170,9 @@ int main() { assert(gridDim.z == 1); #endif - std::cout << "OneToManyAssoc " << sizeof(Assoc) << ' ' << Assoc::nbins() << ' ' << Assoc::capacity() << std::endl; - std::cout << "OneToManyAssoc (small) " << sizeof(SmallAssoc) << ' ' << SmallAssoc::nbins() << ' ' - << SmallAssoc::capacity() << std::endl; + std::cout << "OneToManyAssoc " << sizeof(Assoc) << ' ' << Assoc::ctNOnes() << ' ' << Assoc::ctCapacity() << std::endl; + std::cout << "OneToManyAssoc (small) " << sizeof(SmallAssoc) << ' ' << SmallAssoc::ctNOnes() << ' ' + << SmallAssoc::ctCapacity() << std::endl; std::mt19937 eng; @@ -164,19 +216,36 @@ int main() { } std::cout << "filled with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << nz << std::endl; + auto a_d = make_unique(1); + auto sa_d = make_unique(1); #ifdef __CUDACC__ auto v_d = cms::cuda::make_device_unique[]>(N, nullptr); assert(v_d.get()); - auto a_d = cms::cuda::make_device_unique(1, nullptr); - auto sa_d = cms::cuda::make_device_unique(1, nullptr); cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array), cudaMemcpyHostToDevice)); #else - auto a_d = std::make_unique(); - auto sa_d = std::make_unique(); auto v_d = tr.data(); #endif - launchZero(a_d.get(), 0); + Assoc::Counter* a_st = nullptr; + int a_n = MaxElem; + + Assoc::index_type* a_st2 = nullptr; + SmallAssoc::index_type* sa_st2 = nullptr; + int a_n2 = MaxAssocs; + +// storage +#ifdef RUNTIME_SIZE + auto a_st_d = make_unique(a_n); + auto a_st2_d = make_unique(a_n2); + auto sa_st2_d = make_unique(a_n2); + a_st = a_st_d.get(); + a_st2 = a_st2_d.get(); + sa_st2 = sa_st2_d.get(); +#endif + Assoc::View aView = {a_d.get(), a_st, a_st2, a_n, a_n2}; + launchZero(aView, 0); + SmallAssoc::View saView = {sa_d.get(), nullptr, sa_st2, -1, a_n2}; + launchZero(saView, 0); #ifdef __CUDACC__ auto nThreads = 256; @@ -184,41 +253,21 @@ int main() { count<<>>(v_d.get(), a_d.get(), N); - launchFinalize(a_d.get(), 0); + launchFinalize(aView, 0); verify<<<1, 1>>>(a_d.get()); fill<<>>(v_d.get(), a_d.get(), N); + verifyFill<<<1, 1>>>(a_d.get(), n); + #else count(v_d, a_d.get(), N); - launchFinalize(a_d.get()); + launchFinalize(aView); verify(a_d.get()); fill(v_d, a_d.get(), N); -#endif + verifyFill(a_d.get(), n); - Assoc la; - -#ifdef __CUDACC__ - cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost)); -#else - memcpy(&la, a_d.get(), sizeof(Assoc)); // not required, easier #endif - std::cout 
-  std::cout << la.size() << std::endl;
-  imax = 0;
-  ave = 0;
-  z = 0;
-  for (auto i = 0U; i < n; ++i) {
-    auto x = la.size(i);
-    if (x == 0) {
-      z++;
-      continue;
-    }
-    ave += x;
-    imax = std::max(imax, int(x));
-  }
-  assert(0 == la.size(n));
-  std::cout << "found with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << z << std::endl;
-
-  // now the inverse map (actually this is the direct....)
+  // now the inverse map (actually this is the ....)
 
   AtomicPairCounter* dc_d;
   AtomicPairCounter dc(0);
@@ -230,8 +279,8 @@ int main() {
   finalizeBulk<<<nBlocks, nThreads>>>(dc_d, a_d.get());
   verifyBulk<<<1, 1>>>(a_d.get(), dc_d);
 
-  cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost));
   cudaCheck(cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost));
+  verifyFinal<<<1, 1>>>(a_d.get(), N);
 
   cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)));
   fillBulk<<<nBlocks, nThreads>>>(dc_d, v_d.get(), sa_d.get(), N);
@@ -243,7 +292,8 @@ int main() {
   fillBulk(dc_d, v_d, a_d.get(), N);
   finalizeBulk(dc_d, a_d.get());
   verifyBulk(a_d.get(), dc_d);
-  memcpy(&la, a_d.get(), sizeof(Assoc));
+
+  verifyFinal(a_d.get(), N);
 
   AtomicPairCounter sdc(0);
   fillBulk(&sdc, v_d, sa_d.get(), N);
@@ -254,40 +304,35 @@ int main() {
 
   std::cout << "final counter value " << dc.get().n << ' ' << dc.get().m << std::endl;
 
-  std::cout << la.size() << std::endl;
-  imax = 0;
-  ave = 0;
-  for (auto i = 0U; i < N; ++i) {
-    auto x = la.size(i);
-    if (!(x == 4 || x == 3))
-      std::cout << i << ' ' << x << std::endl;
-    assert(x == 4 || x == 3);
-    ave += x;
-    imax = std::max(imax, int(x));
-  }
-  assert(0 == la.size(N));
-  std::cout << "found with ave occupancy " << double(ave) / N << ' ' << imax << std::endl;
-
   // here verify use of block local counters
-#ifdef __CUDACC__
-  auto m1_d = cms::cuda::make_device_unique<Multiplicity[]>(1, nullptr);
-  auto m2_d = cms::cuda::make_device_unique<Multiplicity[]>(1, nullptr);
-#else
-  auto m1_d = std::make_unique<Multiplicity>();
-  auto m2_d = std::make_unique<Multiplicity>();
+  auto m1_d = make_unique<Multiplicity>(1);
+  auto m2_d = make_unique<Multiplicity>(1);
+
+  Multiplicity::index_type* m1_st = nullptr;
+  Multiplicity::index_type* m2_st = nullptr;
+  int m_n = 0;
+
+#ifdef RUNTIME_SIZE
+  m_n = MaxTk;
+  auto m1_st_d = make_unique<Multiplicity::index_type>(m_n);
+  auto m2_st_d = make_unique<Multiplicity::index_type>(m_n);
+  m1_st = m1_st_d.get();
+  m2_st = m2_st_d.get();
 #endif
-  launchZero(m1_d.get(), 0);
-  launchZero(m2_d.get(), 0);
+  Multiplicity::View view1 = {m1_d.get(), nullptr, m1_st, -1, m_n};
+  Multiplicity::View view2 = {m2_d.get(), nullptr, m2_st, -1, m_n};
+  launchZero(view1, 0);
+  launchZero(view2, 0);
 
 #ifdef __CUDACC__
   nBlocks = (4 * N + nThreads - 1) / nThreads;
   countMulti<<<nBlocks, nThreads>>>(v_d.get(), m1_d.get(), N);
   countMultiLocal<<<nBlocks, nThreads>>>(v_d.get(), m2_d.get(), N);
-  verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get());
+  verifyMulti<<<1, Multiplicity::ctNOnes()>>>(m1_d.get(), m2_d.get());
 
-  launchFinalize(m1_d.get(), 0);
-  launchFinalize(m2_d.get(), 0);
-  verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get());
+  launchFinalize(view1, 0);
+  launchFinalize(view2, 0);
+  verifyMulti<<<1, Multiplicity::ctNOnes()>>>(m1_d.get(), m2_d.get());
 
   cudaCheck(cudaGetLastError());
   cudaCheck(cudaDeviceSynchronize());
@@ -296,8 +341,8 @@ int main() {
   countMultiLocal(v_d, m2_d.get(), N);
   verifyMulti(m1_d.get(), m2_d.get());
 
-  launchFinalize(m1_d.get());
-  launchFinalize(m2_d.get());
+  launchFinalize(view1, 0);
+  launchFinalize(view2, 0);
   verifyMulti(m1_d.get(), m2_d.get());
 #endif
   return 0;
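Taken together, the changes above replace the old pointer-based `launchZero(h, stream)` / `launchFinalize(h, stream)` helpers with a `View`-based protocol. Condensed from the CUDA branch of this test, the happy path for one association looks like the sketch below; the meaning of the `View` aggregate, `{object, offset storage, content storage, number of offsets, capacity}`, is inferred from the initializers in this patch, and the storage pointers stay `nullptr` (sizes `-1`) unless `RUNTIME_SIZE` moves them to runtime allocations:

    // Two-pass count/fill through a View (sketch; names as in the test above).
    auto a_d = make_unique<Assoc>(1);                         // helper defined in this test
    Assoc::View aView = {a_d.get(), a_st, a_st2, a_n, a_n2};  // external storage is optional
    launchZero(aView, 0);                                     // reset the offset counters
    count<<<nBlocks, nThreads>>>(v_d.get(), a_d.get(), N);    // pass 1: count() per key
    launchFinalize(aView, 0);                                 // prefix-scan counts into offsets
    fill<<<nBlocks, nThreads>>>(v_d.get(), a_d.get(), N);     // pass 2: fill() into the slots
    verify<<<1, 1>>>(a_d.get());                              // size() must stay below capacity()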
diff --git a/src/cudadev/test/TrackingRecHit2DCUDA_t.cu b/src/cudadev/test/TrackingRecHit2DCUDA_t.cu
index 5f25e4c1a..5f3a26391 100644
--- a/src/cudadev/test/TrackingRecHit2DCUDA_t.cu
+++ b/src/cudadev/test/TrackingRecHit2DCUDA_t.cu
@@ -1,4 +1,4 @@
-#include "CUDADataFormats/TrackingRecHit2DCUDA.h"
+#include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h"
 
 #include "CUDACore/copyAsync.h"
 #include "CUDACore/cudaCheck.h"
diff --git a/src/cudadev/test/TrajectoryStateSOA_t.h b/src/cudadev/test/TrajectoryStateSOA_t.h
index 2fcf9fc09..1b8636ae6 100644
--- a/src/cudadev/test/TrajectoryStateSOA_t.h
+++ b/src/cudadev/test/TrajectoryStateSOA_t.h
@@ -1,4 +1,4 @@
-#include "CUDADataFormats/TrajectoryStateSoA.h"
+#include "CUDADataFormats/TrajectoryStateSoAT.h"
 
 using Vector5d = Eigen::Matrix<double, 5, 1>;
 using Matrix5d = Eigen::Matrix<double, 5, 5>;
@@ -17,7 +17,7 @@ __host__ __device__ Matrix5d loadCov(Vector5d const& e) {
   return cov;
 }
 
-using TS = TrajectoryStateSoA<128>;
+using TS = TrajectoryStateSoAT<128>;
 
 __global__ void testTSSoA(TS* pts, int n) {
   assert(n <= 128);
diff --git a/src/cudadev/test/VertexFinder_t.h b/src/cudadev/test/VertexFinder_t.h
index 53f26d2de..aed660c0d 100644
--- a/src/cudadev/test/VertexFinder_t.h
+++ b/src/cudadev/test/VertexFinder_t.h
@@ -266,10 +266,7 @@ int main() {
     cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
     cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
-    gridDim.x = 1;
-    assert(blockIdx.x == 0);
     splitVertices(onGPU_d.get(), ws_d.get(), 9.f);
-    resetGrid();
     nv = ws_d->nvIntermediate;
 #endif
     std::cout << "after split " << nv << std::endl;
diff --git a/src/cudadev/test/gpuClustering_t.h b/src/cudadev/test/gpuClustering_t.h
index 5388e3499..4f273e57a 100644
--- a/src/cudadev/test/gpuClustering_t.h
+++ b/src/cudadev/test/gpuClustering_t.h
@@ -10,31 +10,32 @@
 #include 
 
 #ifdef __CUDACC__
-
-#include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/cudaCheck.h"
-#include "CUDACore/requireDevices.h"
+#include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/launch.h"
-#endif
+#include "CUDACore/requireDevices.h"
+#endif  // __CUDACC__
 
 // dirty, but works
 #include "plugin-SiPixelClusterizer/gpuClustering.h"
 #include "plugin-SiPixelClusterizer/gpuClusterChargeCut.h"
+#include "plugin-SiPixelClusterizer/SiPixelClusterThresholds.h"
 
 int main(void) {
 #ifdef __CUDACC__
   cms::cudatest::requireDevices();
-#endif
+#endif  // __CUDACC__
 
   using namespace gpuClustering;
 
-  int numElements = 256 * 2000;
+  constexpr int numElements = 256 * maxNumModules;
+  constexpr SiPixelClusterThresholds clusterThresholds(kSiPixelClusterThresholdsDefaultPhase1);
+
   // these in reality are already on GPU
   auto h_id = std::make_unique<uint16_t[]>(numElements);
   auto h_x = std::make_unique<uint16_t[]>(numElements);
   auto h_y = std::make_unique<uint16_t[]>(numElements);
   auto h_adc = std::make_unique<uint16_t[]>(numElements);
-
   auto h_clus = std::make_unique<int[]>(numElements);
 
 #ifdef __CUDACC__
@@ -43,16 +44,14 @@ int main(void) {
   auto d_y = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
   auto d_adc = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
   auto d_clus = cms::cuda::make_device_unique<int[]>(numElements, nullptr);
-  auto d_moduleStart = cms::cuda::make_device_unique<uint32_t[]>(MaxNumModules + 1, nullptr);
-  auto d_clusInModule = cms::cuda::make_device_unique<uint32_t[]>(MaxNumModules, nullptr);
-  auto d_moduleId = cms::cuda::make_device_unique<uint32_t[]>(MaxNumModules, nullptr);
-#else
-
-  auto h_moduleStart = std::make_unique<uint32_t[]>(MaxNumModules + 1);
-  auto h_clusInModule = std::make_unique<uint32_t[]>(MaxNumModules);
-  auto h_moduleId = std::make_unique<uint32_t[]>(MaxNumModules);
-
-#endif
+  auto d_moduleStart = cms::cuda::make_device_unique<uint32_t[]>(maxNumModules + 1, nullptr);
+  auto d_clusInModule = cms::cuda::make_device_unique<uint32_t[]>(maxNumModules, nullptr);
+  auto d_moduleId = cms::cuda::make_device_unique<uint32_t[]>(maxNumModules, nullptr);
+#else  // __CUDACC__
+  auto h_moduleStart = std::make_unique<uint32_t[]>(maxNumModules + 1);
+  auto h_clusInModule = std::make_unique<uint32_t[]>(maxNumModules);
+  auto h_moduleId = std::make_unique<uint32_t[]>(maxNumModules);
+#endif  // __CUDACC__
 
   // later random number
   int n = 0;
@@ -150,7 +149,7 @@ int main(void) {
     ++n;
   }
   ++ncl;
-  h_id[n++] = InvId;  // error
+  h_id[n++] = invalidModuleId;  // error
   // messy
   int xx[5] = {21, 25, 23, 24, 22};
   for (int k = 0; k < 5; ++k) {
@@ -191,7 +190,7 @@ int main(void) {
   // all odd id
   for (int id = 11; id <= 1800; id += 2) {
     if ((id / 20) % 2)
-      h_id[n++] = InvId;  // error
+      h_id[n++] = invalidModuleId;  // error
     for (int x = 0; x < 40; x += 4) {
       ++ncl;
       if ((id / 10) % 2) {
@@ -217,8 +216,8 @@ int main(void) {
       if (y[k] == 3)
        continue;  // hole
      if (id == 51) {
-        h_id[n++] = InvId;
-        h_id[n++] = InvId;
+        h_id[n++] = invalidModuleId;
+        h_id[n++] = invalidModuleId;
      }  // error
      h_id[n] = id;
      h_x[n] = x + 1;
@@ -245,11 +244,11 @@ int main(void) {
    //    size_t size8 = n * sizeof(uint8_t);

    cudaCheck(cudaMemcpy(d_moduleStart.get(), &nModules, sizeof(uint32_t), cudaMemcpyHostToDevice));
-
    cudaCheck(cudaMemcpy(d_id.get(), h_id.get(), size16, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(d_x.get(), h_x.get(), size16, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(d_y.get(), h_y.get(), size16, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(d_adc.get(), h_adc.get(), size16, cudaMemcpyHostToDevice));
+
    // Launch CUDA Kernels
    int threadsPerBlock = (kkk == 5) ? 512 : ((kkk == 3) ? 128 : 256);
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
@@ -258,11 +257,11 @@ int main(void) {
 
    cms::cuda::launch(countModules, {blocksPerGrid, threadsPerBlock}, d_id.get(), d_moduleStart.get(), d_clus.get(), n);
 
-    blocksPerGrid = MaxNumModules;  //nModules;
+    blocksPerGrid = maxNumModules;  //nModules;
    std::cout << "CUDA findModules kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock
              << " threads\n";
-    cudaCheck(cudaMemset(d_clusInModule.get(), 0, MaxNumModules * sizeof(uint32_t)));
+    cudaCheck(cudaMemset(d_clusInModule.get(), 0, maxNumModules * sizeof(uint32_t)));
 
    cms::cuda::launch(findClus,
                      {blocksPerGrid, threadsPerBlock},
@@ -277,21 +276,22 @@ int main(void) {
    cudaDeviceSynchronize();
    cudaCheck(cudaMemcpy(&nModules, d_moduleStart.get(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 
-    uint32_t nclus[MaxNumModules], moduleId[nModules];
-    cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
+    uint32_t nclus[maxNumModules], moduleId[nModules];
+    cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), maxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
 
-    std::cout << "before charge cut found " << std::accumulate(nclus, nclus + MaxNumModules, 0) << " clusters"
+    std::cout << "before charge cut found " << std::accumulate(nclus, nclus + maxNumModules, 0) << " clusters"
              << std::endl;
-    for (auto i = MaxNumModules; i > 0; i--)
+    for (auto i = maxNumModules; i > 0; i--)
      if (nclus[i - 1] > 0) {
        std::cout << "last module is " << i - 1 << ' ' << nclus[i - 1] << std::endl;
        break;
      }
-    if (ncl != std::accumulate(nclus, nclus + MaxNumModules, 0))
+    if (ncl != std::accumulate(nclus, nclus + maxNumModules, 0))
      std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl;
 
    cms::cuda::launch(clusterChargeCut,
                      {blocksPerGrid, threadsPerBlock},
+                      clusterThresholds,
                      d_id.get(),
                      d_adc.get(),
                      d_moduleStart.get(),
@@ -301,58 +301,49 @@ int main(void) {
                      n);
 
    cudaDeviceSynchronize();
-#else
+#else  // __CUDACC__
    h_moduleStart[0] = nModules;
    countModules(h_id.get(), h_moduleStart.get(), h_clus.get(), n);
-    memset(h_clusInModule.get(), 0, MaxNumModules * sizeof(uint32_t));
-    gridDim.x = MaxNumModules;  //not needed in the kernel for this specific case;
-    assert(blockIdx.x == 0);
-    for (; blockIdx.x < gridDim.x; ++blockIdx.x)
-      findClus(h_id.get(),
-               h_x.get(),
-               h_y.get(),
-               h_moduleStart.get(),
-               h_clusInModule.get(),
-               h_moduleId.get(),
-               h_clus.get(),
-               n);
-    resetGrid();
+    memset(h_clusInModule.get(), 0, maxNumModules * sizeof(uint32_t));
+    findClus(
+        h_id.get(), h_x.get(), h_y.get(), h_moduleStart.get(), h_clusInModule.get(), h_moduleId.get(), h_clus.get(), n);
 
    nModules = h_moduleStart[0];
    auto nclus = h_clusInModule.get();
 
-    std::cout << "before charge cut found " << std::accumulate(nclus, nclus + MaxNumModules, 0) << " clusters"
+    std::cout << "before charge cut found " << std::accumulate(nclus, nclus + maxNumModules, 0) << " clusters"
              << std::endl;
-    for (auto i = MaxNumModules; i > 0; i--)
+    for (auto i = maxNumModules; i > 0; i--)
      if (nclus[i - 1] > 0) {
        std::cout << "last module is " << i - 1 << ' ' << nclus[i - 1] << std::endl;
        break;
      }
-    if (ncl != std::accumulate(nclus, nclus + MaxNumModules, 0))
+    if (ncl != std::accumulate(nclus, nclus + maxNumModules, 0))
      std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl;
 
-    gridDim.x = MaxNumModules;  // no needed in the kernel for in this specific case
-    assert(blockIdx.x == 0);
-    for (; blockIdx.x < gridDim.x; ++blockIdx.x)
-      clusterChargeCut(
-          h_id.get(), h_adc.get(), h_moduleStart.get(), h_clusInModule.get(), h_moduleId.get(), h_clus.get(), n);
-    resetGrid();
-
-#endif
+    clusterChargeCut(clusterThresholds,
+                     h_id.get(),
+                     h_adc.get(),
+                     h_moduleStart.get(),
+                     h_clusInModule.get(),
+                     h_moduleId.get(),
+                     h_clus.get(),
+                     n);
+#endif  // __CUDACC__
 
    std::cout << "found " << nModules << " Modules active" << std::endl;
 
 #ifdef __CUDACC__
    cudaCheck(cudaMemcpy(h_id.get(), d_id.get(), size16, cudaMemcpyDeviceToHost));
    cudaCheck(cudaMemcpy(h_clus.get(), d_clus.get(), size32, cudaMemcpyDeviceToHost));
-    cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
+    cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), maxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
    cudaCheck(cudaMemcpy(&moduleId, d_moduleId.get(), nModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
-#endif
+#endif  // __CUDACC__
 
    std::set<unsigned int> clids;
    for (int i = 0; i < n; ++i) {
      assert(h_id[i] != 666);  // only noise
-      if (h_id[i] == InvId)
+      if (h_id[i] == invalidModuleId)
        continue;
      assert(h_clus[i] >= 0);
      assert(h_clus[i] < int(nclus[h_id[i]]));
@@ -388,9 +379,9 @@ int main(void) {
        std::cout << "error " << mid << ": " << nc << ' ' << pnc << std::endl;
    }
 
-    std::cout << "found " << std::accumulate(nclus, nclus + MaxNumModules, 0) << ' ' << clids.size() << " clusters"
+    std::cout << "found " << std::accumulate(nclus, nclus + maxNumModules, 0) << ' ' << clids.size() << " clusters"
              << std::endl;
-    for (auto i = MaxNumModules; i > 0; i--)
+    for (auto i = maxNumModules; i > 0; i--)
      if (nclus[i - 1] > 0) {
        std::cout << "last module is " << i - 1 << ' ' << nclus[i - 1] << std::endl;
        break;
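Besides the `MaxNumModules`/`InvId` → `maxNumModules`/`invalidModuleId` renames and the removal of the fake-grid loop in the serial branch, the substantive change in `gpuClustering_t.h` is that `clusterChargeCut` now takes its thresholds as a parameter instead of hard-coding them. A minimal sketch of the new call pattern, using only names that appear in this patch (the members of `SiPixelClusterThresholds` live in the new header and are not shown here):

    // Built once as constexpr and passed to the kernel by value as its new first argument.
    constexpr SiPixelClusterThresholds clusterThresholds(kSiPixelClusterThresholdsDefaultPhase1);
    #ifdef __CUDACC__
    cms::cuda::launch(clusterChargeCut,
                      {blocksPerGrid, threadsPerBlock},
                      clusterThresholds,  // new first argument
                      d_id.get(), d_adc.get(), d_moduleStart.get(),
                      d_clusInModule.get(), d_moduleId.get(), d_clus.get(), n);
    #else
    clusterChargeCut(clusterThresholds,  // same signature change in the serial build
                     h_id.get(), h_adc.get(), h_moduleStart.get(),
                     h_clusInModule.get(), h_moduleId.get(), h_clus.get(), n);
    #endif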
diff --git a/src/cudadev/test/testEigenGPU.cu b/src/cudadev/test/testEigenGPU.cu
index 9cbcc4c57..0e9c8aebe 100644
--- a/src/cudadev/test/testEigenGPU.cu
+++ b/src/cudadev/test/testEigenGPU.cu
@@ -16,7 +16,7 @@
 
 using namespace Eigen;
 
-namespace Rfit {
+namespace riemannFit {
   constexpr uint32_t maxNumberOfTracks() { return 5 * 1024; }
   constexpr uint32_t stride() { return maxNumberOfTracks(); }
   // hits
@@ -32,32 +32,32 @@
   // fast fit
   using Map4d = Eigen::Map<Vector4d, 0, Eigen::InnerStride<stride()>>;
-}  // namespace Rfit
+}  // namespace riemannFit
 
 template <int N>
 __global__ void kernelPrintSizes(double* __restrict__ phits, float* __restrict__ phits_ge) {
   auto i = blockIdx.x * blockDim.x + threadIdx.x;
-  Rfit::Map3xNd<N> hits(phits + i, 3, 4);
-  Rfit::Map6xNf<N> hits_ge(phits_ge + i, 6, 4);
+  riemannFit::Map3xNd<N> hits(phits + i, 3, 4);
+  riemannFit::Map6xNf<N> hits_ge(phits_ge + i, 6, 4);
   if (i != 0)
     return;
   printf("GPU sizes %lu %lu %lu %lu %lu\n",
          sizeof(hits[i]),
          sizeof(hits_ge[i]),
          sizeof(Vector4d),
-         sizeof(Rfit::line_fit),
-         sizeof(Rfit::circle_fit));
+         sizeof(riemannFit::LineFit),
+         sizeof(riemannFit::CircleFit));
 }
 
 template <int N>
 __global__ void kernelFastFit(double* __restrict__ phits, double* __restrict__ presults) {
   auto i = blockIdx.x * blockDim.x + threadIdx.x;
-  Rfit::Map3xNd<N> hits(phits + i, 3, N);
-  Rfit::Map4d result(presults + i, 4);
+  riemannFit::Map3xNd<N> hits(phits + i, 3, N);
+  riemannFit::Map4d result(presults + i, 4);
 #ifdef USE_BL
-  BrokenLine::BL_Fast_fit(hits, result);
+  brokenline::fastFit(hits, result);
 #else
-  Rfit::Fast_fit(hits, result);
+  riemannFit::fastFit(hits, result);
 #endif
 }
 
@@ -68,24 +68,24 @@ __global__ void kernelBrokenLineFit(double* __restrict__ phits,
                                     float* __restrict__ phits_ge,
                                     double* __restrict__ pfast_fit_input,
                                     double B,
-                                    Rfit::circle_fit* circle_fit,
-                                    Rfit::line_fit* line_fit) {
+                                    riemannFit::CircleFit* circle_fit,
+                                    riemannFit::LineFit* line_fit) {
   auto i = blockIdx.x * blockDim.x + threadIdx.x;
-  Rfit::Map3xNd<N> hits(phits + i, 3, N);
-  Rfit::Map4d fast_fit_input(pfast_fit_input + i, 4);
-  Rfit::Map6xNf<N> hits_ge(phits_ge + i, 6, N);
+  riemannFit::Map3xNd<N> hits(phits + i, 3, N);
+  riemannFit::Map4d fast_fit_input(pfast_fit_input + i, 4);
+  riemannFit::Map6xNf<N> hits_ge(phits_ge + i, 6, N);
 
-  BrokenLine::PreparedBrokenLineData<N> data;
-  Rfit::Matrix3d Jacob;
+  brokenline::PreparedBrokenLineData<N> data;
+  riemannFit::Matrix3d Jacob;
 
   auto& line_fit_results = line_fit[i];
   auto& circle_fit_results = circle_fit[i];
 
-  BrokenLine::prepareBrokenLineData(hits, fast_fit_input, B, data);
-  BrokenLine::BL_Line_fit(hits_ge, fast_fit_input, B, data, line_fit_results);
-  BrokenLine::BL_Circle_fit(hits, hits_ge, fast_fit_input, B, data, circle_fit_results);
+  brokenline::prepareBrokenLineData(hits, fast_fit_input, B, data);
+  brokenline::lineFit(hits_ge, fast_fit_input, B, data, line_fit_results);
+  brokenline::circleFit(hits, hits_ge, fast_fit_input, B, data, circle_fit_results);
   Jacob << 1., 0, 0, 0, 1., 0, 0, 0,
-      -B / std::copysign(Rfit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2));
+      -B / std::copysign(riemannFit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2));
   circle_fit_results.par(2) = B / std::abs(circle_fit_results.par(2));
   circle_fit_results.cov = Jacob * circle_fit_results.cov * Jacob.transpose();
 
@@ -99,21 +99,21 @@ __global__ void kernelBrokenLineFit(double* __restrict__ phits,
 #else
 
 template <int N>
-__global__ void kernelCircleFit(double* __restrict__ phits,
-                                float* __restrict__ phits_ge,
-                                double* __restrict__ pfast_fit_input,
-                                double B,
-                                Rfit::circle_fit* circle_fit_resultsGPU) {
+__global__ void kernel_CircleFit(double* __restrict__ phits,
+                                 float* __restrict__ phits_ge,
+                                 double* __restrict__ pfast_fit_input,
+                                 double B,
+                                 riemannFit::CircleFit* circle_fit_resultsGPU) {
   auto i = blockIdx.x * blockDim.x + threadIdx.x;
-  Rfit::Map3xNd<N> hits(phits + i, 3, N);
-  Rfit::Map4d fast_fit_input(pfast_fit_input + i, 4);
-  Rfit::Map6xNf<N> hits_ge(phits_ge + i, 6, N);
+  riemannFit::Map3xNd<N> hits(phits + i, 3, N);
+  riemannFit::Map4d fast_fit_input(pfast_fit_input + i, 4);
+  riemannFit::Map6xNf<N> hits_ge(phits_ge + i, 6, N);
 
   constexpr auto n = N;
 
-  Rfit::VectorNd<N> rad = (hits.block(0, 0, 2, n).colwise().norm());
-  Rfit::Matrix2Nd<N> hits_cov = MatrixXd::Zero(2 * n, 2 * n);
-  Rfit::loadCovariance2D(hits_ge, hits_cov);
+  riemannFit::VectorNd<N> rad = (hits.block(0, 0, 2, n).colwise().norm());
+  riemannFit::Matrix2Nd<N> hits_cov = MatrixXd::Zero(2 * n, 2 * n);
+  riemannFit::loadCovariance2D(hits_ge, hits_cov);
 
 #ifdef TEST_DEBUG
   if (0 == i) {
@@ -133,7 +133,7 @@ __global__ void kernelCircleFit(double* __restrict__ phits,
     printf("B: %f\n", B);
   }
 #endif
-  circle_fit_resultsGPU[i] = Rfit::Circle_fit(hits.block(0, 0, 2, n), hits_cov, fast_fit_input, rad, B, true);
+  circle_fit_resultsGPU[i] = riemannFit::circleFit(hits.block(0, 0, 2, n), hits_cov, fast_fit_input, rad, B, true);
 #ifdef TEST_DEBUG
   if (0 == i) {
     printf("Circle param %f,%f,%f\n",
@@ -148,14 +148,14 @@ template <int N>
 __global__ void kernelLineFit(double* __restrict__ phits,
                               float* __restrict__ phits_ge,
                               double B,
-                              Rfit::circle_fit* circle_fit,
+                              riemannFit::CircleFit* circle_fit,
                               double* __restrict__ pfast_fit_input,
-                              Rfit::line_fit* line_fit) {
+                              riemannFit::LineFit* line_fit) {
   auto i = blockIdx.x * blockDim.x + threadIdx.x;
-  Rfit::Map3xNd<N> hits(phits + i, 3, N);
-  Rfit::Map4d fast_fit_input(pfast_fit_input + i, 4);
-  Rfit::Map6xNf<N> hits_ge(phits_ge + i, 6, N);
-  line_fit[i] = Rfit::Line_fit(hits, hits_ge, circle_fit[i], fast_fit_input, B, true);
+  riemannFit::Map3xNd<N> hits(phits + i, 3, N);
+  riemannFit::Map4d fast_fit_input(pfast_fit_input + i, 4);
+  riemannFit::Map6xNf<N> hits_ge(phits_ge + i, 6, N);
+  line_fit[i] = riemannFit::lineFit(hits, hits_ge, circle_fit[i], fast_fit_input, B, true);
 }
 #endif
 
@@ -204,8 +204,8 @@ __device__ __host__ void fillHitsAndHitsCov(M3xN& hits, M6xN& hits_ge) {
 
 template <int N>
 __global__ void kernelFillHitsAndHitsCov(double* __restrict__ phits, float* phits_ge) {
   auto i = blockIdx.x * blockDim.x + threadIdx.x;
-  Rfit::Map3xNd<N> hits(phits + i, 3, N);
-  Rfit::Map6xNf<N> hits_ge(phits_ge + i, 6, N);
+  riemannFit::Map3xNd<N> hits(phits + i, 3, N);
+  riemannFit::Map6xNf<N> hits_ge(phits_ge + i, 6, N);
   hits_ge = MatrixXf::Zero(6, N);
   fillHitsAndHitsCov(hits, hits_ge);
 }
@@ -213,22 +213,22 @@ __global__ void kernelFillHitsAndHitsCov(double* __restrict__ phits, float* phit
 template <int N>
 void testFit() {
   constexpr double B = 0.0113921;
-  Rfit::Matrix3xNd<N> hits;
-  Rfit::Matrix6xNf<N> hits_ge = MatrixXf::Zero(6, N);
+  riemannFit::Matrix3xNd<N> hits;
+  riemannFit::Matrix6xNf<N> hits_ge = MatrixXf::Zero(6, N);
   double* hitsGPU = nullptr;
   ;
   float* hits_geGPU = nullptr;
   double* fast_fit_resultsGPU = nullptr;
-  double* fast_fit_resultsGPUret = new double[Rfit::maxNumberOfTracks() * sizeof(Vector4d)];
-  Rfit::circle_fit* circle_fit_resultsGPU = nullptr;
-  Rfit::circle_fit* circle_fit_resultsGPUret = new Rfit::circle_fit();
-  Rfit::line_fit* line_fit_resultsGPU = nullptr;
-  Rfit::line_fit* line_fit_resultsGPUret = new Rfit::line_fit();
+  double* fast_fit_resultsGPUret = new double[riemannFit::maxNumberOfTracks() * sizeof(Vector4d)];
+  riemannFit::CircleFit* circle_fit_resultsGPU = nullptr;
+  riemannFit::CircleFit* circle_fit_resultsGPUret = new riemannFit::CircleFit();
+  riemannFit::LineFit* line_fit_resultsGPU = nullptr;
+  riemannFit::LineFit* line_fit_resultsGPUret = new riemannFit::LineFit();
 
   fillHitsAndHitsCov(hits, hits_ge);
 
   std::cout << "sizes " << N << ' ' << sizeof(hits) << ' ' << sizeof(hits_ge) << ' ' << sizeof(Vector4d) << ' '
-            << sizeof(Rfit::line_fit) << ' ' << sizeof(Rfit::circle_fit) << std::endl;
+            << sizeof(riemannFit::LineFit) << ' ' << sizeof(riemannFit::CircleFit) << std::endl;
 
   std::cout << "Generated hits:\n" << hits << std::endl;
   std::cout << "Generated cov:\n" << hits_ge << std::endl;
 
@@ -236,23 +236,23 @@ void testFit() {
   // FAST_FIT_CPU
 #ifdef USE_BL
   Vector4d fast_fit_results;
-  BrokenLine::BL_Fast_fit(hits, fast_fit_results);
+  brokenline::fastFit(hits, fast_fit_results);
 #else
   Vector4d fast_fit_results;
-  Rfit::Fast_fit(hits, fast_fit_results);
+  riemannFit::fastFit(hits, fast_fit_results);
 #endif
   std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]):\n" << fast_fit_results << std::endl;
 
   // for timing purposes we fit 4096 tracks
   constexpr uint32_t Ntracks = 4096;
-  cudaCheck(cudaMalloc(&hitsGPU, Rfit::maxNumberOfTracks() * sizeof(Rfit::Matrix3xNd<N>)));
-  cudaCheck(cudaMalloc(&hits_geGPU, Rfit::maxNumberOfTracks() * sizeof(Rfit::Matrix6xNf<N>)));
-  cudaCheck(cudaMalloc(&fast_fit_resultsGPU, Rfit::maxNumberOfTracks() * sizeof(Vector4d)));
-  cudaCheck(cudaMalloc(&line_fit_resultsGPU, Rfit::maxNumberOfTracks() * sizeof(Rfit::line_fit)));
-  cudaCheck(cudaMalloc(&circle_fit_resultsGPU, Rfit::maxNumberOfTracks() * sizeof(Rfit::circle_fit)));
+  cudaCheck(cudaMalloc(&hitsGPU, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::Matrix3xNd<N>)));
+  cudaCheck(cudaMalloc(&hits_geGPU, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::Matrix6xNf<N>)));
+  cudaCheck(cudaMalloc(&fast_fit_resultsGPU, riemannFit::maxNumberOfTracks() * sizeof(Vector4d)));
+  cudaCheck(cudaMalloc(&line_fit_resultsGPU, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::LineFit)));
+  cudaCheck(cudaMalloc(&circle_fit_resultsGPU, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::CircleFit)));
 
-  cudaCheck(cudaMemset(fast_fit_resultsGPU, 0, Rfit::maxNumberOfTracks() * sizeof(Vector4d)));
-  cudaCheck(cudaMemset(line_fit_resultsGPU, 0, Rfit::maxNumberOfTracks() * sizeof(Rfit::line_fit)));
+  cudaCheck(cudaMemset(fast_fit_resultsGPU, 0, riemannFit::maxNumberOfTracks() * sizeof(Vector4d)));
+  cudaCheck(cudaMemset(line_fit_resultsGPU, 0, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::LineFit)));
 
   kernelPrintSizes<N><<<Ntracks / 64, 64>>>(hitsGPU, hits_geGPU);
   kernelFillHitsAndHitsCov<N><<<Ntracks / 64, 64>>>(hitsGPU, hits_geGPU);
@@ -263,23 +263,23 @@ void testFit() {
 
   cudaCheck(cudaMemcpy(fast_fit_resultsGPUret,
                        fast_fit_resultsGPU,
-                       Rfit::maxNumberOfTracks() * sizeof(Vector4d),
+                       riemannFit::maxNumberOfTracks() * sizeof(Vector4d),
                        cudaMemcpyDeviceToHost));
-  Rfit::Map4d fast_fit(fast_fit_resultsGPUret + 10, 4);
+  riemannFit::Map4d fast_fit(fast_fit_resultsGPUret + 10, 4);
   std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]): GPU\n" << fast_fit << std::endl;
   assert(isEqualFuzzy(fast_fit_results, fast_fit));
 
 #ifdef USE_BL
   // CIRCLE AND LINE FIT CPU
-  BrokenLine::PreparedBrokenLineData<N> data;
-  BrokenLine::karimaki_circle_fit circle_fit_results;
-  Rfit::line_fit line_fit_results;
-  Rfit::Matrix3d Jacob;
-  BrokenLine::prepareBrokenLineData(hits, fast_fit_results, B, data);
-  BrokenLine::BL_Line_fit(hits_ge, fast_fit_results, B, data, line_fit_results);
-  BrokenLine::BL_Circle_fit(hits, hits_ge, fast_fit_results, B, data, circle_fit_results);
+  brokenline::PreparedBrokenLineData<N> data;
+  brokenline::karimaki_circle_fit circle_fit_results;
+  riemannFit::LineFit line_fit_results;
+  riemannFit::Matrix3d Jacob;
+  brokenline::prepareBrokenLineData(hits, fast_fit_results, B, data);
+  brokenline::lineFit(hits_ge, fast_fit_results, B, data, line_fit_results);
+  brokenline::circleFit(hits, hits_ge, fast_fit_results, B, data, circle_fit_results);
   Jacob << 1., 0, 0, 0, 1., 0, 0, 0,
-      -B / std::copysign(Rfit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2));
+      -B / std::copysign(riemannFit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2));
   circle_fit_results.par(2) = B / std::abs(circle_fit_results.par(2));
   circle_fit_results.cov = Jacob * circle_fit_results.cov * Jacob.transpose();
 
@@ -290,19 +290,20 @@ void testFit() {
 #else
 
   // CIRCLE_FIT CPU
-  Rfit::VectorNd<N> rad = (hits.block(0, 0, 2, N).colwise().norm());
+  riemannFit::VectorNd<N> rad = (hits.block(0, 0, 2, N).colwise().norm());
 
-  Rfit::Matrix2Nd<N> hits_cov = Rfit::Matrix2Nd<N>::Zero();
-  Rfit::loadCovariance2D(hits_ge, hits_cov);
-  Rfit::circle_fit circle_fit_results =
-      Rfit::Circle_fit(hits.block(0, 0, 2, N), hits_cov, fast_fit_results, rad, B, true);
+  riemannFit::Matrix2Nd<N> hits_cov = riemannFit::Matrix2Nd<N>::Zero();
+  riemannFit::loadCovariance2D(hits_ge, hits_cov);
+  riemannFit::CircleFit circle_fit_results =
+      riemannFit::circleFit(hits.block(0, 0, 2, N), hits_cov, fast_fit_results, rad, B, true);
 
   // CIRCLE_FIT GPU
-  kernelCircleFit<N><<<Ntracks / 64, 64>>>(hitsGPU, hits_geGPU, fast_fit_resultsGPU, B, circle_fit_resultsGPU);
+  kernel_CircleFit<N><<<Ntracks / 64, 64>>>(hitsGPU, hits_geGPU, fast_fit_resultsGPU, B, circle_fit_resultsGPU);
   cudaDeviceSynchronize();
 
   // LINE_FIT CPU
-  Rfit::line_fit line_fit_results = Rfit::Line_fit(hits, hits_ge, circle_fit_results, fast_fit_results, B, true);
+  riemannFit::LineFit line_fit_results =
+      riemannFit::lineFit(hits, hits_ge, circle_fit_results, fast_fit_results, B, true);
 
   kernelLineFit<N>
       <<<Ntracks / 64, 64>>>(hitsGPU, hits_geGPU, B, circle_fit_resultsGPU, fast_fit_resultsGPU, line_fit_resultsGPU);
@@ -311,14 +312,15 @@ void testFit() {
 
   std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl;
 
-  cudaCheck(
-      cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(
+      circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(riemannFit::CircleFit), cudaMemcpyDeviceToHost));
   std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl;
   assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par));
 
   std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << std::endl;
   // LINE_FIT GPU
-  cudaCheck(cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost));
+  cudaCheck(
+      cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(riemannFit::LineFit), cudaMemcpyDeviceToHost));
   std::cout << "Fitted values (LineFit) GPU:\n" << line_fit_resultsGPUret->par << std::endl;
 
   assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par, N == 5 ? 1e-4 : 1e-6));  // requires fma on CPU
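Apart from `kernelCircleFit` becoming `kernel_CircleFit`, the churn in the two fit tests is a mechanical rename of the fit namespaces and types. The mapping applied throughout this patch is:

- namespace `Rfit` → `riemannFit`; namespace `BrokenLine` → `brokenline`
- functions `Rfit::Fast_fit` / `Rfit::Circle_fit` / `Rfit::Line_fit` → `riemannFit::fastFit` / `riemannFit::circleFit` / `riemannFit::lineFit`
- result types `Rfit::circle_fit` / `Rfit::line_fit` → `riemannFit::CircleFit` / `riemannFit::LineFit`
- `BrokenLine::BL_Fast_fit` / `BL_Circle_fit` / `BL_Line_fit` → `brokenline::fastFit` / `brokenline::circleFit` / `brokenline::lineFit`, while `prepareBrokenLineData` and `karimaki_circle_fit` keep their names in the new `brokenline` namespace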
diff --git a/src/cudadev/test/testRiemannFit.cc b/src/cudadev/test/testRiemannFit.cc
index 4df7bcf30..3891720d6 100644
--- a/src/cudadev/test/testRiemannFit.cc
+++ b/src/cudadev/test/testRiemannFit.cc
@@ -13,7 +13,7 @@
 
 using namespace Eigen;
 
-namespace Rfit {
+namespace riemannFit {
   constexpr uint32_t maxNumberOfTracks() { return 5 * 1024; }
   constexpr uint32_t stride() { return maxNumberOfTracks(); }
   // hits
@@ -29,7 +29,7 @@ namespace Rfit {
   // fast fit
   using Map4d = Eigen::Map<Vector4d, 0, Eigen::InnerStride<stride()>>;
-}  // namespace Rfit
+}  // namespace riemannFit
 
 /*
 Hit global: 641,0 2: 2.934787,0.773211,-10.980247
@@ -89,8 +89,8 @@ void fillHitsAndHitsCov(M3xN& hits, M6xN& hits_ge) {
 template <int N>
 void testFit() {
   constexpr double B = 0.0113921;
-  Rfit::Matrix3xNd<N> hits;
-  Rfit::Matrix6xNf<N> hits_ge = MatrixXf::Zero(6, N);
+  riemannFit::Matrix3xNd<N> hits;
+  riemannFit::Matrix6xNf<N> hits_ge = MatrixXf::Zero(6, N);
 
   fillHitsAndHitsCov(hits, hits_ge);
 
@@ -102,37 +102,38 @@ void testFit() {
   // FAST_FIT_CPU
 #ifdef USE_BL
   Vector4d fast_fit_results;
-  BrokenLine::BL_Fast_fit(hits, fast_fit_results);
+  brokenline::fastFit(hits, fast_fit_results);
 #else
   Vector4d fast_fit_results;
-  Rfit::Fast_fit(hits, fast_fit_results);
+  riemannFit::fastFit(hits, fast_fit_results);
 #endif
   std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]):\n" << fast_fit_results << std::endl;
 
   // CIRCLE_FIT CPU
 #ifdef USE_BL
-  BrokenLine::PreparedBrokenLineData<N> data;
-  BrokenLine::karimaki_circle_fit circle_fit_results;
-  Rfit::Matrix3d Jacob;
-
-  BrokenLine::prepareBrokenLineData(hits, fast_fit_results, B, data);
-  Rfit::line_fit line_fit_results;
-  BrokenLine::BL_Line_fit(hits_ge, fast_fit_results, B, data, line_fit_results);
-  BrokenLine::BL_Circle_fit(hits, hits_ge, fast_fit_results, B, data, circle_fit_results);
+  brokenline::PreparedBrokenLineData<N> data;
+  brokenline::karimaki_circle_fit circle_fit_results;
+  riemannFit::Matrix3d Jacob;
+
+  brokenline::prepareBrokenLineData(hits, fast_fit_results, B, data);
+  riemannFit::LineFit line_fit_results;
+  brokenline::lineFit(hits_ge, fast_fit_results, B, data, line_fit_results);
+  brokenline::circleFit(hits, hits_ge, fast_fit_results, B, data, circle_fit_results);
   Jacob << 1., 0, 0, 0, 1., 0, 0, 0,
-      -B / std::copysign(Rfit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2));
+      -B / std::copysign(riemannFit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2));
   circle_fit_results.par(2) = B / std::abs(circle_fit_results.par(2));
   circle_fit_results.cov = Jacob * circle_fit_results.cov * Jacob.transpose();
 #else
-  Rfit::VectorNd<N> rad = (hits.block(0, 0, 2, N).colwise().norm());
-  Rfit::Matrix2Nd<N> hits_cov = Rfit::Matrix2Nd<N>::Zero();
-  Rfit::loadCovariance2D(hits_ge, hits_cov);
-  Rfit::circle_fit circle_fit_results =
-      Rfit::Circle_fit(hits.block(0, 0, 2, N), hits_cov, fast_fit_results, rad, B, true);
+  riemannFit::VectorNd<N> rad = (hits.block(0, 0, 2, N).colwise().norm());
+  riemannFit::Matrix2Nd<N> hits_cov = riemannFit::Matrix2Nd<N>::Zero();
+  riemannFit::loadCovariance2D(hits_ge, hits_cov);
+  riemannFit::CircleFit circle_fit_results =
+      riemannFit::circleFit(hits.block(0, 0, 2, N), hits_cov, fast_fit_results, rad, B, true);
 
   // LINE_FIT CPU
-  Rfit::line_fit line_fit_results = Rfit::Line_fit(hits, hits_ge, circle_fit_results, fast_fit_results, B, true);
-  Rfit::par_uvrtopak(circle_fit_results, B, true);
+  riemannFit::LineFit line_fit_results =
+      riemannFit::lineFit(hits, hits_ge, circle_fit_results, fast_fit_results, B, true);
+  riemannFit::par_uvrtopak(circle_fit_results, B, true);
 #endif