diff --git a/CUDADataFormats/TrackingRecHit/interface/SiPixelStatus.h b/CUDADataFormats/TrackingRecHit/interface/SiPixelStatus.h
new file mode 100644
index 0000000000000..75e4468a811b6
--- /dev/null
+++ b/CUDADataFormats/TrackingRecHit/interface/SiPixelStatus.h
@@ -0,0 +1,12 @@
+#ifndef CUDADataFormats_TrackingRecHit_interface_SiPixelStatus_H
+#define CUDADataFormats_TrackingRecHit_interface_SiPixelStatus_H
+
+struct SiPixelStatus {
+  uint8_t isBigX : 1;
+  uint8_t isOneX : 1;
+  uint8_t isBigY : 1;
+  uint8_t isOneY : 1;
+  uint8_t qBin : 3;
+};
+
+#endif
diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
index 73a6daaa4e387..3a1d90b251ee2 100644
--- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
+++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
@@ -14,10 +14,12 @@ class TrackingRecHit2DHeterogeneous {
 
   TrackingRecHit2DHeterogeneous() = default;
 
-  explicit TrackingRecHit2DHeterogeneous(uint32_t nHits,
-                                         pixelCPEforGPU::ParamsOnGPU const* cpeParams,
-                                         uint32_t const* hitsModuleStart,
-                                         cudaStream_t stream);
+  explicit TrackingRecHit2DHeterogeneous(
+      uint32_t nHits,
+      pixelCPEforGPU::ParamsOnGPU const* cpeParams,
+      uint32_t const* hitsModuleStart,
+      cudaStream_t stream,
+      TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input = nullptr);
 
   ~TrackingRecHit2DHeterogeneous() = default;
 
@@ -41,6 +43,9 @@ class TrackingRecHit2DHeterogeneous {
   cms::cuda::host::unique_ptr<uint16_t[]> detIndexToHostAsync(cudaStream_t stream) const;
   cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStartToHostAsync(cudaStream_t stream) const;
 
+  // needs specialization for Host
+  void copyFromGPU(TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input, cudaStream_t stream);
+
 private:
   static constexpr uint32_t n16 = 4;
   static constexpr uint32_t n32 = 9;
@@ -64,26 +69,33 @@ class TrackingRecHit2DHeterogeneous {
   int16_t* m_iphi;
 };
 
+using TrackingRecHit2DGPU = TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits>;
+using TrackingRecHit2DCUDA = TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits>;
+using TrackingRecHit2DCPU = TrackingRecHit2DHeterogeneous<cms::cudacompat::CPUTraits>;
+using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous<cms::cudacompat::HostTraits>;
+
 #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
 template <typename Traits>
-TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nHits,
-                                                                     pixelCPEforGPU::ParamsOnGPU const* cpeParams,
-                                                                     uint32_t const* hitsModuleStart,
-                                                                     cudaStream_t stream)
+TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
+    uint32_t nHits,
+    pixelCPEforGPU::ParamsOnGPU const* cpeParams,
+    uint32_t const* hitsModuleStart,
+    cudaStream_t stream,
+    TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input)
     : m_nHits(nHits), m_hitsModuleStart(hitsModuleStart) {
   auto view = Traits::template make_host_unique<TrackingRecHit2DSOAView>(stream);
 
   view->m_nHits = nHits;
-  m_view = Traits::template make_device_unique<TrackingRecHit2DSOAView>(stream);
-  m_AverageGeometryStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);
+  m_view = Traits::template make_unique<TrackingRecHit2DSOAView>(stream);  // leave it on host and pass it by value?
+  m_AverageGeometryStore = Traits::template make_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);
   view->m_averageGeometry = m_AverageGeometryStore.get();
   view->m_cpeParams = cpeParams;
   view->m_hitsModuleStart = hitsModuleStart;
 
   // if empy do not bother
-  if (0 == nHits) {
+  if (0 == nHits) 
     if constexpr (std::is_same<Traits, cms::cudacompat::GPUTraits>::value) {
       cms::cuda::copyAsync(m_view, view, stream);
     } else {
@@ -97,38 +109,57 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nH
   // if ordering is relevant they may have to be stored phi-ordered by layer or so
   // this will break 1to1 correspondence with cluster and module locality
   // so unless proven VERY inefficient we keep it ordered as generated
-  m_store16 = Traits::template make_device_unique<uint16_t[]>(nHits * n16, stream);
-  m_store32 = Traits::template make_device_unique<float[]>(nHits * n32 + 11, stream);
-  m_HistStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::Hist>(stream);
 
-  auto get16 = [&](int i) { return m_store16.get() + i * nHits; };
+  // host copy is "reduced"  (to be reviewed at some point)
+  if
+#ifdef __cpp_if_constexpr
+      constexpr
+#endif
+      (std::is_same<Traits, cms::cudacompat::HostTraits>::value) {
+    // it has to compile for ALL cases
+    copyFromGPU(input, stream);
+  } else {
+    assert(input == nullptr);
+    m_store16 = Traits::template make_unique<uint16_t[]>(nHits * n16, stream);
+    m_store32 = Traits::template make_unique<float[]>(nHits * n32 + 11, stream);
+    m_HistStore = Traits::template make_unique<TrackingRecHit2DSOAView::Hist>(stream);
+  }
+
   auto get32 = [&](int i) { return m_store32.get() + i * nHits; };
 
   // copy all the pointers
-  m_hist = view->m_hist = m_HistStore.get();
 
   view->m_xl = get32(0);
   view->m_yl = get32(1);
   view->m_xerr = get32(2);
   view->m_yerr = get32(3);
+  view->m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(4));
 
-  view->m_xg = get32(4);
-  view->m_yg = get32(5);
-  view->m_zg = get32(6);
-  view->m_rg = get32(7);
-
-  m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(0));
-
-  view->m_charge = reinterpret_cast<int32_t*>(get32(8));
-  view->m_xsize = reinterpret_cast<int16_t*>(get16(2));
-  view->m_ysize = reinterpret_cast<int16_t*>(get16(3));
-  view->m_detInd = get16(1);
-
-  m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(n32));
+  if
+#ifdef __cpp_if_constexpr
+      constexpr
+#endif
+      (!std::is_same<Traits, cms::cudacompat::HostTraits>::value) {
+    assert(input == nullptr);
+    view->m_xg = get32(5);
+    view->m_yg = get32(6);
+    view->m_zg = get32(7);
+    view->m_rg = get32(8);
+
+    auto get16 = [&](int i) { return m_store16.get() + i * nHits; };
+    m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(1));
+
+    view->m_xsize = reinterpret_cast<int16_t*>(get16(2));
+    view->m_ysize = reinterpret_cast<int16_t*>(get16(3));
+    view->m_detInd = get16(0);
+
+    m_hist = view->m_hist = m_HistStore.get();
+    m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(n32));
+  }
 
   // transfer view
   if
-#ifndef __CUDACC__
+#ifdef __cpp_if_constexpr
       constexpr
 #endif
       (std::is_same<Traits, cms::cudacompat::GPUTraits>::value) {
@@ -138,9 +169,4 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nH
   }
 }
 
-using TrackingRecHit2DGPU = TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits>;
-using TrackingRecHit2DCUDA = TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits>;
-using TrackingRecHit2DCPU = TrackingRecHit2DHeterogeneous<cms::cudacompat::CPUTraits>;
-using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous<cms::cudacompat::HostTraits>;
-
 #endif  // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h
diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h
new file mode 100644
index 0000000000000..263a0dee56dd5
--- /dev/null
+++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h
@@ -0,0 +1,52 @@
+#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReduced_h
+#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReduced_h
+
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h"
+#include "CUDADataFormats/Common/interface/HostProduct.h"
+
+// a reduced (in content and therefore in size) version to be used on CPU for Legacy reconstruction
+class TrackingRecHit2DReduced {
+public:
+  using HLPstorage = HostProduct<float[]>;
+  using HIDstorage = HostProduct<uint16_t[]>;
+
+  template <typename UP32, typename UP16>
+  TrackingRecHit2DReduced(UP32&& istore32, UP16&& istore16, int nhits)
+      : m_store32(std::move(istore32)), m_store16(std::move(istore16)), m_nHits(nhits) {
+    auto get32 = [&](int i) { return const_cast<float*>(m_store32.get()) + i * nhits; };
+
+    // copy all the pointers (better be in sync with the producer store)
+
+    m_view.m_xl = get32(0);
+    m_view.m_yl = get32(1);
+    m_view.m_xerr = get32(2);
+    m_view.m_yerr = get32(3);
+    m_view.m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(4));
+    m_view.m_detInd = const_cast<uint16_t*>(m_store16.get());
+  }
+
+  // view only!
+  TrackingRecHit2DReduced(TrackingRecHit2DSOAView const& iview, int nhits) : m_view(iview), m_nHits(nhits) {}
+
+  TrackingRecHit2DReduced() = default;
+  ~TrackingRecHit2DReduced() = default;
+
+  TrackingRecHit2DReduced(const TrackingRecHit2DReduced&) = delete;
+  TrackingRecHit2DReduced& operator=(const TrackingRecHit2DReduced&) = delete;
+  TrackingRecHit2DReduced(TrackingRecHit2DReduced&&) = default;
+  TrackingRecHit2DReduced& operator=(TrackingRecHit2DReduced&&) = default;
+
+  TrackingRecHit2DSOAView& view() { return m_view; }
+  TrackingRecHit2DSOAView const& view() const { return m_view; }
+
+  auto nHits() const { return m_nHits; }
+
+  TrackingRecHit2DSOAView m_view;
+
+  HLPstorage m_store32;
+  HIDstorage m_store16;
+
+  int m_nHits;
+};
+
+#endif
diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h
index 808feb2a4218f..52bc64dc66ad9 100644
--- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h
+++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h
@@ -7,6 +7,7 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
 #include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h"
+#include "CUDADataFormats/TrackingRecHit/interface/SiPixelStatus.h"
 
 namespace pixelCPEforGPU {
   struct ParamsOnGPU;
@@ -14,6 +15,9 @@ namespace pixelCPEforGPU {
 
 class TrackingRecHit2DSOAView {
 public:
+  using Status = SiPixelStatus;
+  static_assert(sizeof(Status) == sizeof(uint8_t));
+
   static constexpr uint32_t maxHits() { return gpuClustering::MaxNumClusters; }
   using hindex_type = uint16_t;  // if above is <=2^16
 
@@ -24,6 +28,7 @@ class TrackingRecHit2DSOAView {
 
   template <typename>
   friend class TrackingRecHit2DHeterogeneous;
+  friend class TrackingRecHit2DReduced;
 
   __device__ __forceinline__ uint32_t nHits() const { return m_nHits; }
 
@@ -49,8 +54,19 @@ class TrackingRecHit2DSOAView {
   __device__ __forceinline__ int16_t& iphi(int i) { return m_iphi[i]; }
   __device__ __forceinline__ int16_t iphi(int i) const { return __ldg(m_iphi + i); }
 
-  __device__ __forceinline__ int32_t& charge(int i) { return m_charge[i]; }
-  __device__ __forceinline__ int32_t charge(int i) const { return __ldg(m_charge + i); }
+  __device__ __forceinline__ void setChargeAndStatus(int i, uint32_t ich, Status is) {
+    // static_assert(0xffffff == chargeMask());
+    ich = std::min(ich, chargeMask());
+    uint32_t w = *reinterpret_cast<uint8_t*>(&is);
+    ich |= (w << 24);
+    m_chargeAndStatus[i] = ich;
+  }
+
+  __device__ __forceinline__ Status status(int i) const {
+    uint8_t w = __ldg(m_chargeAndStatus + i) >> 24;
+    return *reinterpret_cast<Status*>(&w);
+  }
+  __device__ __forceinline__ uint32_t charge(int i) const { return __ldg(m_chargeAndStatus + i) & chargeMask(); }
   __device__ __forceinline__ int16_t& clusterSizeX(int i) { return m_xsize[i]; }
   __device__ __forceinline__ int16_t clusterSizeX(int i) const { return __ldg(m_xsize + i); }
   __device__ __forceinline__ int16_t& clusterSizeY(int i) { return m_ysize[i]; }
@@ -81,7 +97,8 @@ class TrackingRecHit2DSOAView {
   int16_t* m_iphi;
 
   // cluster properties
-  int32_t* m_charge;
+  static constexpr uint32_t chargeMask() { return (1 << 24) - 1; }
+  uint32_t* m_chargeAndStatus;
   int16_t* m_xsize;
   int16_t* m_ysize;
   uint16_t* m_detInd;
diff --git a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc
index 7b04ed2d530a0..6d994c9cf74c5 100644
--- a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc
+++ b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc
@@ -1,19 +1,33 @@
-#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 
 template <>
-cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DCUDA::localCoordToHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<float[]>(4 * nHits(), stream);
-  cms::cuda::copyAsync(ret, m_store32, 4 * nHits(), stream);
+cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DGPU::localCoordToHostAsync(cudaStream_t stream) const {
+  auto ret = cms::cuda::make_host_unique<float[]>(5 * nHits(), stream);
+  cms::cuda::copyAsync(ret, m_store32, 5 * nHits(), stream);
   return ret;
 }
 
 template <>
-cms::cuda::host::unique_ptr<uint32_t[]> TrackingRecHit2DCUDA::hitsModuleStartToHostAsync(cudaStream_t stream) const {
+cms::cuda::host::unique_ptr<uint16_t[]> TrackingRecHit2DGPU::detIndexToHostAsync(cudaStream_t stream) const {
+  auto ret = cms::cuda::make_host_unique<uint16_t[]>(nHits(), stream);
+  cms::cuda::copyAsync(ret, m_store16, nHits(), stream);
+  return ret;
+}
+
+template <>
+cms::cuda::host::unique_ptr<uint32_t[]> TrackingRecHit2DGPU::hitsModuleStartToHostAsync(cudaStream_t stream) const {
   auto ret = cms::cuda::make_host_unique<uint32_t[]>(2001, stream);
   cudaCheck(cudaMemcpyAsync(ret.get(), m_hitsModuleStart, 4 * 2001, cudaMemcpyDefault, stream));
   return ret;
 }
+
+// the only specialization needed
+template <>
+void TrackingRecHit2DHost::copyFromGPU(TrackingRecHit2DGPU const* input, cudaStream_t stream) {
+  assert(input);
+  m_store32 = input->localCoordToHostAsync(stream);
+}
diff --git a/CUDADataFormats/TrackingRecHit/src/classes.h b/CUDADataFormats/TrackingRecHit/src/classes.h
index 3d40821493c5b..7de2d7621ef32 100644
--- a/CUDADataFormats/TrackingRecHit/src/classes.h
+++ b/CUDADataFormats/TrackingRecHit/src/classes.h
@@ -3,6 +3,7 @@
 
 #include "CUDADataFormats/Common/interface/Product.h"
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h"
 #include "DataFormats/Common/interface/Wrapper.h"
 
 #endif  // CUDADataFormats_SiPixelCluster_src_classes_h
diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp
index 32af6c181ae68..faf3cf887cb4e 100644
--- a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp
+++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp
@@ -15,14 +15,18 @@ int main() {
   cudaStream_t stream;
   cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 
+  uint32_t nHits = 200;
   // inner scope to deallocate memory before destroying the stream
   {
-    auto nHits = 200;
     TrackingRecHit2DCUDA tkhit(nHits, nullptr, nullptr, stream);
 
     testTrackingRecHit2D::runKernels(tkhit.view());
-  }
 
+    TrackingRecHit2DHost tkhitH(nHits, nullptr, nullptr, stream, &tkhit);
+    cudaStreamSynchronize(stream);
+    assert(tkhitH.view());
+    assert(tkhitH.view()->nHits() == nHits);
+  }
   cudaCheck(cudaStreamDestroy(stream));
 
   return 0;
diff --git a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
index a1fc79b41eca6..f4bba39e6b971 100644
--- a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
+++ b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
@@ -2,11 +2,6 @@
 
 #include "DataFormats/CaloRecHit/interface/MultifitComputations.h"
 
-// needed to compile with USER_CXXFLAGS="-DCOMPUTE_TDC_TIME"
-#include "DataFormats/HcalRecHit/interface/HcalSpecialTimes.h"
-// TODO reuse some of the HCAL constats from
-//#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConstants.h"
-// ?
 
 #include "SimpleAlgoGPU.h"
 #include "KernelHelpers.h"
diff --git a/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h b/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h
index 681211b82e1af..f306d421bc0e6 100644
--- a/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h
+++ b/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h
@@ -11,8 +11,11 @@
 #include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
 
+#include "CUDADataFormats/TrackingRecHit/interface/SiPixelStatus.h"
+
 namespace pixelCPEforGPU {
 
+  using Status = SiPixelStatus;
   using Frame = SOAFrame<float>;
   using Rotation = SOARotation<float>;
 
@@ -39,7 +42,11 @@ namespace pixelCPEforGPU {
 
     float x0, y0, z0;  // the vertex in the local coord of the detector
 
-    float sx[3], sy[3];  // the errors...
+    float apeX, apeY;  // ape^2
+    uint8_t sx2, sy1, sy2;
+    uint8_t sigmax[16], sigmax1[16], sigmay[16];  // in micron
+    float xfact[5], yfact[5];
+    int minCh[5];
 
     Frame frame;
   };
@@ -94,8 +101,10 @@ namespace pixelCPEforGPU {
     float xerr[N];
     float yerr[N];
 
-    int16_t xsize[N];  // clipped at 127 if negative is edge....
+    int16_t xsize[N];  // (*8) clipped at 127 if negative is edge....
     int16_t ysize[N];
+
+    Status status[N];
   };
 
   constexpr int32_t MaxHitsInIter = gpuClustering::maxHitsInIter();
@@ -205,8 +214,8 @@ namespace pixelCPEforGPU {
     if (phase1PixelTopology::isBigPixY(cp.maxCol[ic]))
       ++ysize;
 
-    int unbalanceX = 8. * std::abs(float(cp.Q_f_X[ic] - cp.Q_l_X[ic])) / float(cp.Q_f_X[ic] + cp.Q_l_X[ic]);
-    int unbalanceY = 8. * std::abs(float(cp.Q_f_Y[ic] - cp.Q_l_Y[ic])) / float(cp.Q_f_Y[ic] + cp.Q_l_Y[ic]);
+    int unbalanceX = 8.f * std::abs(float(cp.Q_f_X[ic] - cp.Q_l_X[ic])) / float(cp.Q_f_X[ic] + cp.Q_l_X[ic]);
+    int unbalanceY = 8.f * std::abs(float(cp.Q_f_Y[ic] - cp.Q_l_Y[ic])) / float(cp.Q_f_Y[ic] + cp.Q_l_Y[ic]);
     xsize = 8 * xsize - unbalanceX;
     ysize = 8 * ysize - unbalanceY;
 
@@ -284,8 +293,8 @@ namespace pixelCPEforGPU {
     auto sy = cp.maxCol[ic] - cp.minCol[ic];
 
     // is edgy ?
-    bool isEdgeX = cp.minRow[ic] == 0 or cp.maxRow[ic] == phase1PixelTopology::lastRowInModule;
-    bool isEdgeY = cp.minCol[ic] == 0 or cp.maxCol[ic] == phase1PixelTopology::lastColInModule;
+    bool isEdgeX = cp.xsize[ic] < 1;
+    bool isEdgeY = cp.ysize[ic] < 1;
     // is one and big?
     bool isBig1X = (0 == sx) && phase1PixelTopology::isBigPixX(cp.minRow[ic]);
     bool isBig1Y = (0 == sy) && phase1PixelTopology::isBigPixY(cp.minCol[ic]);
@@ -322,19 +331,42 @@ namespace pixelCPEforGPU {
     auto sx = cp.maxRow[ic] - cp.minRow[ic];
     auto sy = cp.maxCol[ic] - cp.minCol[ic];
 
-    // is edgy ?
-    bool isEdgeX = cp.minRow[ic] == 0 or cp.maxRow[ic] == phase1PixelTopology::lastRowInModule;
-    bool isEdgeY = cp.minCol[ic] == 0 or cp.maxCol[ic] == phase1PixelTopology::lastColInModule;
+    // is edgy ?  (size is set negative: see above)
+    bool isEdgeX = cp.xsize[ic] < 1;
+    bool isEdgeY = cp.ysize[ic] < 1;
+
     // is one and big?
-    uint32_t ix = (0 == sx);
-    uint32_t iy = (0 == sy);
-    ix += (0 == sx) && phase1PixelTopology::isBigPixX(cp.minRow[ic]);
-    iy += (0 == sy) && phase1PixelTopology::isBigPixY(cp.minCol[ic]);
+    bool isOneX = (0 == sx);
+    bool isOneY = (0 == sy);
+    bool isBigX = phase1PixelTopology::isBigPixX(cp.minRow[ic]);
+    bool isBigY = phase1PixelTopology::isBigPixY(cp.minCol[ic]);
+
+    auto ch = cp.charge[ic];
+    auto bin = 0;
+    for (; bin < 4; ++bin)
+      if (ch < detParams.minCh[bin + 1])
+        break;
+    assert(bin < 5);
+
+    cp.status[ic].qBin = 4 - bin;
+    cp.status[ic].isOneX = isOneX;
+    cp.status[ic].isBigX = (isOneX & isBigX) | isEdgeX;
+    cp.status[ic].isOneY = isOneY;
+    cp.status[ic].isBigY = (isOneY & isBigY) | isEdgeY;
+
+    auto xoff = 81.f * comParams.thePitchX;
+    int jx = std::min(15, std::max(0, int(16.f * (cp.xpos[ic] + xoff) / (162.f * comParams.thePitchX))));
+
+    auto toCM = [](uint8_t x) { return float(x) * 1.e-4; };
 
     if (not isEdgeX)
-      cp.xerr[ic] = detParams.sx[ix];
+      cp.xerr[ic] = isOneX ? toCM(isBigX ? detParams.sx2 : detParams.sigmax1[jx])
+                           : detParams.xfact[bin] * toCM(detParams.sigmax[jx]);
+
+    auto ey = cp.ysize[ic] > 8 ? detParams.sigmay[std::min(cp.ysize[ic] - 9, 15)] : detParams.sy1;
+
     if (not isEdgeY)
-      cp.yerr[ic] = detParams.sy[iy];
+      cp.yerr[ic] = isOneY ? toCM(isBigY ? detParams.sy2 : detParams.sy1) : detParams.yfact[bin] * toCM(ey);
   }
 
 }  // namespace pixelCPEforGPU
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h
index 17cd5aad4db52..689a324982104 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h
@@ -149,8 +149,9 @@ namespace gpuPixelRecHits {
         assert(cl < MaxHitsInIter);
         auto x = digis.xx(i);
         auto y = digis.yy(i);
-        auto ch = std::min(digis.adc(i), pixmx);
+        auto ch = digis.adc(i);
         atomicAdd(&clusParams.charge[cl], ch);
+        ch = std::min(ch, pixmx);
         if (clusParams.minRow[cl] == x)
           atomicAdd(&clusParams.Q_f_X[cl], ch);
         if (clusParams.maxRow[cl] == x)
@@ -181,7 +182,9 @@ namespace gpuPixelRecHits {
 
         // store it
 
-        hits.charge(h) = clusParams.charge[ic];
+        hits.setChargeAndStatus(h, clusParams.charge[ic], clusParams.status[ic]);
+
+        // printf("charge,status %d %d %d %d %d\n",hits.charge(h),hits.status(h).qBin,hits.chargeAndStatus(h),clusParams.charge[ic],clusParams.status[ic].qBin);
 
         hits.detectorIndex(h) = me;
 
@@ -192,8 +195,9 @@ namespace gpuPixelRecHits {
         hits.clusterSizeX(h) = clusParams.xsize[ic];
         hits.clusterSizeY(h) = clusParams.ysize[ic];
 
-        hits.xerrLocal(h) = clusParams.xerr[ic] * clusParams.xerr[ic];
-        hits.yerrLocal(h) = clusParams.yerr[ic] * clusParams.yerr[ic];
+        hits.xerrLocal(h) = clusParams.xerr[ic] * clusParams.xerr[ic] + cpeParams->detParams(me).apeX;
+        hits.yerrLocal(h) = clusParams.yerr[ic] * clusParams.yerr[ic] + cpeParams->detParams(me).apeY;
+        ;
 
         // keep it local for computations
         float xg, yg, zg;
diff --git a/RecoLocalTracker/SiPixelRecHits/src/PixelCPEFast.cc b/RecoLocalTracker/SiPixelRecHits/src/PixelCPEFast.cc
index f3b3f308fa9d3..6ba4c743a8b40 100644
--- a/RecoLocalTracker/SiPixelRecHits/src/PixelCPEFast.cc
+++ b/RecoLocalTracker/SiPixelRecHits/src/PixelCPEFast.cc
@@ -112,7 +112,8 @@ void PixelCPEFast::fillParamsForGpu() {
   m_commonParamsGPU.thePitchX = m_DetParams[0].thePitchX;
   m_commonParamsGPU.thePitchY = m_DetParams[0].thePitchY;
 
-  // std::cout << "pitch & thickness " <<  m_commonParamsGPU.thePitchX << ' ' << m_commonParamsGPU.thePitchY << "  " << m_commonParamsGPU.theThicknessB << ' ' << m_commonParamsGPU.theThicknessE << std::endl;
+  // std::cout << "pitch & thickness " << m_commonParamsGPU.thePitchX << ' ' << m_commonParamsGPU.thePitchY << "  "
+  //          << m_commonParamsGPU.theThicknessB << ' ' << m_commonParamsGPU.theThicknessE << std::endl;
 
   // zero average geometry
   memset(&m_averageGeometry, 0, sizeof(pixelCPEforGPU::AverageGeometry));
@@ -184,77 +185,136 @@ void PixelCPEFast::fillParamsForGpu() {
     pl += vv.phi();  // (not obvious)
 
     // errors .....
-    ClusterParamGeneric cp;
-    auto gvx = p.theOrigin.x() + 40.f * m_commonParamsGPU.thePitchX;
-    auto gvy = p.theOrigin.y();
-    auto gvz = 1.f / p.theOrigin.z();
-    //--- Note that the normalization is not required as only the ratio used
 
-    // calculate angles
-    cp.cotalpha = gvx * gvz;
-    cp.cotbeta = gvy * gvz;
+    ClusterParamGeneric cp;
 
     cp.with_track_angle = false;
 
     auto lape = p.theDet->localAlignmentError();
     if (lape.invalid())
       lape = LocalError();  // zero....
-
+    g.apeX = lape.xx();
+    g.apeY = lape.yy();
+
+    auto toMicron = [&](float x) { return std::min(511, int(x * 1.e4f + 0.5f)); };
+
+    {
+      // average angle
+      auto gvx = p.theOrigin.x() + 40.f * m_commonParamsGPU.thePitchX;
+      auto gvy = p.theOrigin.y();
+      auto gvz = 1.f / p.theOrigin.z();
+      cp.cotalpha = gvx * gvz;
+      cp.cotbeta = gvy * gvz;
+      errorFromTemplates(p, cp, 20000.);
+    }
 #ifdef DUMP_ERRORS
     auto m = 10000.f;
-    for (float qclus = 15000; qclus < 35000; qclus += 15000) {
-      errorFromTemplates(p, cp, qclus);
+    std::cout << i << ' ' << (g.isBarrel ? 'B' : 'E') << " ape " << m * std::sqrt(lape.xx()) << ' '
+              << m * std::sqrt(lape.yy()) << std::endl;
+    std::cout << i << ' ' << g.rawId << ' ' << cp.cotalpha << ' ' << 20000 << ' ' << cp.qBin_ << ' ' << cp.pixmx << ' '
+              << m * cp.sigmax << ' ' << m * cp.sx1 << ' ' << m * cp.sx2 << ' ' << m * cp.sigmay << ' ' << m * cp.sy1
+              << ' ' << m * cp.sy2 << std::endl;
+#endif
+
+    g.pixmx = std::max(0, cp.pixmx);
+    g.sx2 = toMicron(cp.sx2);
+    g.sy1 = std::max(21, toMicron(cp.sy1));  // for some angles sy1 is very small
+    g.sy2 = std::max(55, toMicron(cp.sy2));  // sometimes sy2 is smaller than others (due to angle?)
+
+    // sample xerr as function of position
+    auto xoff = -81.f * m_commonParamsGPU.thePitchX;
+
+    for (int ix = 0; ix < 16; ++ix) {
+      auto x = xoff + (0.5f + float(ix)) * 162.f * m_commonParamsGPU.thePitchX / 16.f;
+      auto gvx = p.theOrigin.x() - x;
+      auto gvy = p.theOrigin.y();
+      auto gvz = 1.f / p.theOrigin.z();
+      cp.cotbeta = gvy * gvz;
+      cp.cotalpha = gvx * gvz;
+      errorFromTemplates(p, cp, 20000.f);
+      g.sigmax[ix] = toMicron(cp.sigmax);
+      g.sigmax1[ix] = toMicron(cp.sx1);
+#ifdef DUMP_ERRORS
+      std::cout << "sigmax vs x " << i << ' ' << x << ' ' << cp.cotalpha << ' ' << int(g.sigmax[ix]) << ' '
+                << int(g.sigmax1[ix]) << ' ' << 10000.f * cp.sigmay << std::endl;
+#endif
+    }
 
-      std::cout << i << ' ' << qclus << ' ' << cp.pixmx << ' ' << m * cp.sigmax << ' ' << m * cp.sx1 << ' '
-                << m * cp.sx2 << ' ' << m * cp.sigmay << ' ' << m * cp.sy1 << ' ' << m * cp.sy2 << std::endl;
+#ifdef DUMP_ERRORS
+    // sample yerr as function of position
+    auto yoff = -54 * 4.f * m_commonParamsGPU.thePitchY;
+    for (int ix = 0; ix < 16; ++ix) {
+      auto y = yoff + (0.5f + float(ix)) * 432.f * m_commonParamsGPU.thePitchY / 16.f;
+      auto gvx = p.theOrigin.x() + 40.f * m_commonParamsGPU.thePitchY;
+      auto gvy = p.theOrigin.y() - y;
+      auto gvz = 1.f / p.theOrigin.z();
+      cp.cotbeta = gvy * gvz;
+      cp.cotalpha = gvx * gvz;
+      errorFromTemplates(p, cp, 20000.f);
+      std::cout << "sigmay vs y " << i << ' ' << y << ' ' << cp.cotbeta << ' ' << 10000.f * cp.sigmay << std::endl;
     }
-    std::cout << i << ' ' << m * std::sqrt(lape.xx()) << ' ' << m * std::sqrt(lape.yy()) << std::endl;
 #endif
 
-    errorFromTemplates(p, cp, 20000.f);
-    g.pixmx = std::max(0, cp.pixmx);
-    g.sx[0] = cp.sigmax;
-    g.sx[1] = cp.sx1;
-    g.sx[2] = cp.sx2;
-
-    g.sy[0] = cp.sigmay;
-    g.sy[1] = cp.sy1;
-    g.sy[2] = cp.sy2;
-
-    /*
-    // from run1??
-    if (i<96) {
-      g.sx[0] = 0.00120;
-      g.sx[1] = 0.00115;
-      g.sx[2] = 0.0050;
-
-      g.sy[0] = 0.00210;
-      g.sy[1] = 0.00375;
-      g.sy[2] = 0.0085;
-    } else if (g.isBarrel) {
-      g.sx[0] = 0.00120;
-      g.sx[1] = 0.00115;
-      g.sx[2] = 0.0050;
-
-      g.sy[0] = 0.00210;
-      g.sy[1] = 0.00375;
-      g.sy[2] = 0.0085;
-   } else {
-      g.sx[0] = 0.0020;
-      g.sx[1] = 0.0020;
-      g.sx[2] = 0.0050;
-
-      g.sy[0] = 0.0021;
-      g.sy[1] = 0.0021;
-      g.sy[2] = 0.0085;
-   }
-   */
-
-    for (int i = 0; i < 3; ++i) {
-      g.sx[i] = std::sqrt(g.sx[i] * g.sx[i] + lape.xx());
-      g.sy[i] = std::sqrt(g.sy[i] * g.sy[i] + lape.yy());
+    // average angle
+    auto gvx = p.theOrigin.x() + 40.f * m_commonParamsGPU.thePitchX;
+    auto gvy = p.theOrigin.y();
+    auto gvz = 1.f / p.theOrigin.z();
+    //--- Note that the normalization is not required as only the ratio used
+
+    // calculate angles
+    cp.cotalpha = gvx * gvz;
+    cp.cotbeta = gvy * gvz;
+    auto aveCB = cp.cotbeta;
+
+    // sample x by charge
+    int qbin = 5;  // low charge
+    int k = 0;
+    for (int qclus = 1000; qclus < 200000; qclus += 1000) {
+      errorFromTemplates(p, cp, qclus);
+      if (cp.qBin_ == qbin)
+        continue;
+      qbin = cp.qBin_;
+      g.xfact[k] = cp.sigmax;
+      g.yfact[k] = cp.sigmay;
+      g.minCh[k++] = qclus;
+#ifdef DUMP_ERRORS
+      std::cout << i << ' ' << g.rawId << ' ' << cp.cotalpha << ' ' << qclus << ' ' << cp.qBin_ << ' ' << cp.pixmx
+                << ' ' << m * cp.sigmax << ' ' << m * cp.sx1 << ' ' << m * cp.sx2 << ' ' << m * cp.sigmay << ' '
+                << m * cp.sy1 << ' ' << m * cp.sy2 << std::endl;
+#endif
+    }
+    assert(k <= 5);
+    // fill the rest  (sometimes bin 4 is missing)
+    for (int kk = k; kk < 5; ++kk) {
+      g.xfact[kk] = g.xfact[k - 1];
+      g.yfact[kk] = g.yfact[k - 1];
+      g.minCh[kk] = g.minCh[k - 1];
+    }
+    auto detx = 1.f / g.xfact[0];
+    auto dety = 1.f / g.yfact[0];
+    for (int kk = 0; kk < 5; ++kk) {
+      g.xfact[kk] *= detx;
+      g.yfact[kk] *= dety;
     }
-  }
+
+    // sample y in angle
+    float ys = 8.f - 4.f;  // apperent bias of half pixel (see plot)
+    // sample yerr as function of "size"
+    for (int iy = 0; iy < 16; ++iy) {
+      ys += 1.f;  // first bin 0 is for size 9  (and size is in fixed point 2^3)
+      if (15 == iy)
+        ys += 8.f;  // last bin for "overflow"
+      // cp.cotalpha = ys*100.f/(8.f*285.f);
+      cp.cotbeta = std::copysign(ys * 150.f / (8.f * 285.f), aveCB);
+      errorFromTemplates(p, cp, 20000.f);
+      g.sigmay[iy] = toMicron(cp.sigmay);
+#ifdef DUMP_ERRORS
+      std::cout << "sigmax/sigmay " << i << ' ' << (ys + 4.f) / 8.f << ' ' << cp.cotalpha << '/' << cp.cotbeta << ' '
+                << 10000.f * cp.sigmax << '/' << int(g.sigmay[iy]) << std::endl;
+#endif
+    }
+
+  }  // loop over det
 
   // compute ladder baricenter (only in global z) for the barrel
   auto& aveGeom = m_averageGeometry;
@@ -406,13 +466,20 @@ LocalPoint PixelCPEFast::localPosition(DetParam const& theDetParam, ClusterParam
   cp.Q_f_Y[0] = Q_f_Y;
   cp.Q_l_Y[0] = Q_l_Y;
 
+  cp.charge[0] = theClusterParam.theCluster->charge();
+
   auto ind = theDetParam.theDet->index();
   pixelCPEforGPU::position(m_commonParamsGPU, m_detParamsGPU[ind], cp, 0);
   auto xPos = cp.xpos[0];
   auto yPos = cp.ypos[0];
 
-  //  std::cout<<" in PixelCPEFast:localPosition - pos = "<<xPos<<" "<<yPos
-  //           << " size "<< cp.maxRow[0]-cp.minRow[0] << ' ' << cp.maxCol[0]-cp.minCol[0] << std::endl; //dk
+  // set the error  (mind ape....)
+  pixelCPEforGPU::errorFromDB(m_commonParamsGPU, m_detParamsGPU[ind], cp, 0);
+  theClusterParam.sigmax = cp.xerr[0];
+  theClusterParam.sigmay = cp.yerr[0];
+
+  // std::cout<<" in PixelCPEFast:localPosition - pos = "<<xPos<<" "<<yPos
+  //         << " size "<< cp.maxRow[0]-cp.minRow[0] << ' ' << cp.maxCol[0]-cp.minCol[0] << ' ' << cp.charge[0] << std::endl; //dk
 
   //--- Now put the two together
   LocalPoint pos_in_local(xPos, yPos);
@@ -474,116 +541,10 @@ void PixelCPEFast::collect_edge_charges(ClusterParam& theClusterParamBase,  //!<
 LocalError PixelCPEFast::localError(DetParam const& theDetParam, ClusterParam& theClusterParamBase) const {
   ClusterParamGeneric& theClusterParam = static_cast<ClusterParamGeneric&>(theClusterParamBase);
 
-  // Default errors are the maximum error used for edge clusters.
-  // These are determined by looking at residuals for edge clusters
-  float xerr = EdgeClusterErrorX_ * micronsToCm;
-  float yerr = EdgeClusterErrorY_ * micronsToCm;
-
-  // Find if cluster is at the module edge.
-  int maxPixelCol = theClusterParam.theCluster->maxPixelCol();
-  int maxPixelRow = theClusterParam.theCluster->maxPixelRow();
-  int minPixelCol = theClusterParam.theCluster->minPixelCol();
-  int minPixelRow = theClusterParam.theCluster->minPixelRow();
-
-  bool edgex = phase1PixelTopology::isEdgeX(minPixelRow) | phase1PixelTopology::isEdgeX(maxPixelRow);
-  bool edgey = phase1PixelTopology::isEdgeY(minPixelCol) | phase1PixelTopology::isEdgeY(maxPixelCol);
-
-  unsigned int sizex = theClusterParam.theCluster->sizeX();
-  unsigned int sizey = theClusterParam.theCluster->sizeY();
-
-  // Find if cluster contains double (big) pixels.
-  bool bigInX = theDetParam.theRecTopol->containsBigPixelInX(minPixelRow, maxPixelRow);
-  bool bigInY = theDetParam.theRecTopol->containsBigPixelInY(minPixelCol, maxPixelCol);
-
-  if (UseErrorsFromTemplates_) {
-    //
-    // Use template errors
-
-    if (!edgex) {  // Only use this for non-edge clusters
-      if (sizex == 1) {
-        if (!bigInX) {
-          xerr = theClusterParam.sx1;
-        } else {
-          xerr = theClusterParam.sx2;
-        }
-      } else {
-        xerr = theClusterParam.sigmax;
-      }
-    }
-
-    if (!edgey) {  // Only use for non-edge clusters
-      if (sizey == 1) {
-        if (!bigInY) {
-          yerr = theClusterParam.sy1;
-        } else {
-          yerr = theClusterParam.sy2;
-        }
-      } else {
-        yerr = theClusterParam.sigmay;
-      }
-    }
-
-  } else {  // simple errors
-
-    // This are the simple errors, hardcoded in the code
-    //cout << "Track angles are not known " << endl;
-    //cout << "Default angle estimation which assumes track from PV (0,0,0) does not work." << endl;
-
-    if (GeomDetEnumerators::isTrackerPixel(theDetParam.thePart)) {
-      if (GeomDetEnumerators::isBarrel(theDetParam.thePart)) {
-        DetId id = (theDetParam.theDet->geographicalId());
-        int layer = ttopo_.layer(id);
-        if (layer == 1) {
-          if (!edgex) {
-            if (sizex <= xerr_barrel_l1_.size())
-              xerr = xerr_barrel_l1_[sizex - 1];
-            else
-              xerr = xerr_barrel_l1_def_;
-          }
-
-          if (!edgey) {
-            if (sizey <= yerr_barrel_l1_.size())
-              yerr = yerr_barrel_l1_[sizey - 1];
-            else
-              yerr = yerr_barrel_l1_def_;
-          }
-        } else {  // layer 2,3
-          if (!edgex) {
-            if (sizex <= xerr_barrel_ln_.size())
-              xerr = xerr_barrel_ln_[sizex - 1];
-            else
-              xerr = xerr_barrel_ln_def_;
-          }
-
-          if (!edgey) {
-            if (sizey <= yerr_barrel_ln_.size())
-              yerr = yerr_barrel_ln_[sizey - 1];
-            else
-              yerr = yerr_barrel_ln_def_;
-          }
-        }
-
-      } else {  // EndCap
-
-        if (!edgex) {
-          if (sizex <= xerr_endcap_.size())
-            xerr = xerr_endcap_[sizex - 1];
-          else
-            xerr = xerr_endcap_def_;
-        }
-
-        if (!edgey) {
-          if (sizey <= yerr_endcap_.size())
-            yerr = yerr_endcap_[sizey - 1];
-          else
-            yerr = yerr_endcap_def_;
-        }
-      }  // end endcap
-    }
-
-  }  // end
+  auto xerr = theClusterParam.sigmax;
+  auto yerr = theClusterParam.sigmay;
 
-  //   std::cout<<" errors  "<<xerr<<" "<<yerr<<std::endl;  //dk
+  // std::cout<<" errors  "<<xerr<<" "<<yerr<<std::endl;  //dk
 
   auto xerr_sq = xerr * xerr;
   auto yerr_sq = yerr * yerr;
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h
index 7b5c9ea0ce0a4..1455adda721e3 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h
@@ -75,12 +75,40 @@ __global__ void kernelBLFastFit(Tuples const *__restrict__ foundNtuplets,
 
     // Prepare data structure
     auto const *hitId = foundNtuplets->begin(tkid);
+// #define YERR_FROM_DC
+#ifdef YERR_FROM_DC
+    // try to compute more precise error in y
+    auto dx = hhp->xGlobal(hitId[hitsInFit - 1]) - hhp->xGlobal(hitId[0]);
+    auto dy = hhp->yGlobal(hitId[hitsInFit - 1]) - hhp->yGlobal(hitId[0]);
+    auto dz = hhp->zGlobal(hitId[hitsInFit - 1]) - hhp->zGlobal(hitId[0]);
+    float ux, uy, uz;
+#endif
     for (unsigned int i = 0; i < hitsInFit; ++i) {
       auto hit = hitId[i];
       float ge[6];
+#ifdef YERR_FROM_DC
+      auto const &dp = hhp->cpeParams().detParams(hhp->detectorIndex(hit));
+      auto status = hhp->status(hit);
+      int qbin = 4 - status.qBin;
+      assert(qbin >= 0 && qbin < 5);
+      bool nok = (status.isBigY | status.isOneY);
+      // compute cotanbeta and use it to recompute error
+      dp.frame.rotation().multiply(dx, dy, dz, ux, uy, uz);
+      auto cb = std::abs(uy / uz);
+      int bin = int(cb * (285.f / 150.f) * 8.f) - 4;
+      bin = std::max(0, std::min(15, bin));
+      float yerr = dp.sigmay[bin] * 1.e-4f;
+      yerr *= dp.yfact[qbin];  // inflate
+      yerr *= yerr;
+      yerr += dp.apeY;
+      yerr = nok ? hhp->yerrLocal(hit) : yerr;
+      dp.frame.toGlobal(hhp->xerrLocal(hit), 0, yerr, ge);
+#else
       hhp->cpeParams()
           .detParams(hhp->detectorIndex(hit))
           .frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge);
+#endif
+
 #ifdef BL_DUMP_HITS
       if (dump) {
         printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n",
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
index 691395887dddb..9d51acd9e4d76 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -375,16 +375,13 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
     }
 
     // compute a pT-dependent chi2 cut
-    // default parameters:
-    //   - chi2MaxPt = 10 GeV
-    //   - chi2Coeff = { 0.68177776, 0.74609577, -0.08035491, 0.00315399 }
-    //   - chi2Scale = 30 for broken line fit, 45 for Riemann fit
+    // at the moment just a flat cut...
     // (see CAHitNtupletGeneratorGPU.cc)
-    float pt = std::min<float>(tracks->pt(it), cuts.chi2MaxPt);
-    float chi2Cut = cuts.chi2Scale *
-                    (cuts.chi2Coeff[0] + pt * (cuts.chi2Coeff[1] + pt * (cuts.chi2Coeff[2] + pt * cuts.chi2Coeff[3])));
-    // above number were for Quads not normalized so for the time being just multiple by ndof for Quads  (triplets to be understood)
-    if (3.f * tracks->chi2(it) >= chi2Cut) {
+    // float pt = std::min<float>(tracks->pt(it), cuts.chi2MaxPt);
+    // float chi2Cut = cuts.chi2Scale *
+    //                (cuts.chi2Coeff[0] + pt * (cuts.chi2Coeff[1] + pt * (cuts.chi2Coeff[2] + pt * cuts.chi2Coeff[3])));
+    float chi2Cut = cuts.chi2Scale;
+    if (tracks->chi2(it) >= chi2Cut) {
 #ifdef NTUPLE_DEBUG
       printf("Bad fit %d size %d pt %f eta %f chi2 %f\n",
              it,
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
index 464744594e9a6..aa5fbb94b21eb 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
@@ -150,11 +150,11 @@ void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription&
 
   edm::ParameterSetDescription trackQualityCuts;
   trackQualityCuts.add<double>("chi2MaxPt", 10.)->setComment("max pT used to determine the pT-dependent chi2 cut");
-  trackQualityCuts.add<std::vector<double>>("chi2Coeff", {0.68177776, 0.74609577, -0.08035491, 0.00315399})
+  trackQualityCuts.add<std::vector<double>>("chi2Coeff", {1., 0., 0., 0.})
       ->setComment("Polynomial coefficients to derive the pT-dependent chi2 cut");
-  trackQualityCuts.add<double>("chi2Scale", 30.)
+  trackQualityCuts.add<double>("chi2Scale", 25.)
       ->setComment(
-          "Factor to multiply the pT-dependent chi2 cut (currently: 30 for the broken line fit, 45 for the Riemann "
+          "Factor to multiply the pT-dependent chi2 cut (currently: 25 for the broken line fit, ?? for the Riemann "
           "fit)");
   trackQualityCuts.add<double>("tripletMinPt", 0.5)->setComment("Min pT for triplets, in GeV");
   trackQualityCuts.add<double>("tripletMaxTip", 0.3)->setComment("Max |Tip| for triplets, in cm");