diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
index 907b7647a3452..0c3e8bd37cb6a 100644
--- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h
+++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
@@ -5,6 +5,7 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
@@ -21,15 +22,15 @@ class HeterogeneousSoA {
   explicit HeterogeneousSoA(cudautils::device::unique_ptr<T> &&p) : dm_ptr(std::move(p)) {}
   explicit HeterogeneousSoA(cudautils::host::unique_ptr<T> &&p) : hm_ptr(std::move(p)) {}
-  explicit HeterogeneousSoA(std::unique_ptr<T> &&p) : std_ptr(std::move(p)) {}
+  explicit HeterogeneousSoA(cudautils::cpu::unique_ptr<T> &&p) : cm_ptr(std::move(p)) {}

-  auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); }
+  auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : cm_ptr.get()); }

   auto const &operator*() const { return *get(); }

   auto const *operator->() const { return get(); }

-  auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); }
+  auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : cm_ptr.get()); }

   auto &operator*() { return *get(); }
@@ -47,12 +48,15 @@ class HeterogeneousSoA {
   // a union won't do it, a variant will not be more efficient
   cudautils::device::unique_ptr<T> dm_ptr;  //!
   cudautils::host::unique_ptr<T> hm_ptr;    //!
-  std::unique_ptr<T> std_ptr;  //!
+  cudautils::cpu::unique_ptr<T> cm_ptr;  //!
};

namespace cudaCompat {

  struct GPUTraits {
+    static constexpr const char* name = "GPU";
+    static constexpr bool runOnDevice = true;
+
    template <typename T>
    using unique_ptr = cudautils::device::unique_ptr<T>;
@@ -83,6 +87,9 @@ namespace cudaCompat {
  };

  struct HostTraits {
+    static constexpr const char* name = "HOST";
+    static constexpr bool runOnDevice = false;
+
    template <typename T>
    using unique_ptr = cudautils::host::unique_ptr<T>;
@@ -108,32 +115,45 @@ namespace cudaCompat {
  };

  struct CPUTraits {
+    static constexpr const char* name = "CPU";
+    static constexpr bool runOnDevice = false;
+
+    template <typename T>
+    using unique_ptr = cudautils::cpu::unique_ptr<T>;
+
+    template <typename T>
+    static auto make_unique() {
+      return cudautils::make_cpu_unique<T>(cudaStreamDefault);
+    }
+
    template <typename T>
-    using unique_ptr = std::unique_ptr<T>;
+    static auto make_unique(size_t size) {
+      return cudautils::make_cpu_unique<T>(size, cudaStreamDefault);
+    }

    template <typename T>
-    static auto make_unique(cudaStream_t) {
-      return std::make_unique<T>();
+    static auto make_unique(cudaStream_t stream) {
+      return cudautils::make_cpu_unique<T>(stream);
    }

    template <typename T>
-    static auto make_unique(size_t size, cudaStream_t) {
-      return std::make_unique<T>(size);
+    static auto make_unique(size_t size, cudaStream_t stream) {
+      return cudautils::make_cpu_unique<T>(size, stream);
    }

    template <typename T>
-    static auto make_host_unique(cudaStream_t) {
-      return std::make_unique<T>();
+    static auto make_host_unique(cudaStream_t stream) {
+      return cudautils::make_cpu_unique<T>(stream);
    }

    template <typename T>
-    static auto make_device_unique(cudaStream_t) {
-      return std::make_unique<T>();
+    static auto make_device_unique(cudaStream_t stream) {
+      return cudautils::make_cpu_unique<T>(stream);
    }

    template <typename T>
-    static auto make_device_unique(size_t size, cudaStream_t) {
-      return std::make_unique<T>(size);
+    static auto make_device_unique(size_t size, cudaStream_t stream) {
+      return cudautils::make_cpu_unique<T>(size, stream);
    }
  };
diff --git a/CUDADataFormats/Common/interface/HostProduct.h b/CUDADataFormats/Common/interface/HostProduct.h
index 17ad98ba403a4..0a706b6c57927 100644
--- a/CUDADataFormats/Common/interface/HostProduct.h
+++ b/CUDADataFormats/Common/interface/HostProduct.h
@@ -2,6 +2,7 @@
 #define CUDADataFormatsCommonHostProduct_H

 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h"

 // a heterogeneous unique pointer...
 template <typename T>
@@ -13,9 +14,9 @@ class HostProduct {
   HostProduct& operator=(HostProduct&&) = default;

   explicit HostProduct(cudautils::host::unique_ptr<T>&& p) : hm_ptr(std::move(p)) {}
-  explicit HostProduct(std::unique_ptr<T>&& p) : std_ptr(std::move(p)) {}
+  explicit HostProduct(cudautils::cpu::unique_ptr<T>&& p) : cm_ptr(std::move(p)) {}

-  auto const* get() const { return hm_ptr ? hm_ptr.get() : std_ptr.get(); }
+  auto const* get() const { return hm_ptr ? hm_ptr.get() : cm_ptr.get(); }

   auto const& operator*() const { return *get(); }

@@ -23,7 +24,7 @@ class HostProduct {

 private:
   cudautils::host::unique_ptr<T> hm_ptr;  //!
-  std::unique_ptr<T> std_ptr;  //!
+  cudautils::cpu::unique_ptr<T> cm_ptr;  //!
}; #endif diff --git a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h index d3650e164d44e..afb7c5aa37833 100644 --- a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h +++ b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h @@ -2,7 +2,6 @@ #define CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" #include diff --git a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc index e6f223bfec4e3..60d2693fa45b0 100644 --- a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc +++ b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc @@ -1,7 +1,6 @@ #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" template <> diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index 6fbaced1858dd..5f29ad3b81ea8 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -72,7 +72,7 @@ namespace cudautils { inline void launchFinalize(Histo *__restrict__ h, uint8_t *__restrict__ ws #ifndef __CUDACC__ - = cudaStreamDefault + = nullptr #endif , cudaStream_t stream diff --git a/HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h b/HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h new file mode 100644 index 0000000000000..dab7d15f40f92 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h @@ -0,0 +1,86 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_cpu_unique_ptr_h +#define HeterogeneousCore_CUDAUtilities_interface_cpu_unique_ptr_h + +#include +#include + +#include +#include + +namespace cudautils { + namespace cpu { + namespace impl { + // Additional layer of types to distinguish from device:: and host::unique_ptr + class CPUDeleter { + public: + CPUDeleter() = default; + + void operator()(void *ptr) { + ::free(ptr); + } + }; + } // namespace impl + + template + using unique_ptr = std::unique_ptr; + + namespace impl { + template + struct make_cpu_unique_selector { + using non_array = cudautils::cpu::unique_ptr; + }; + template + struct make_cpu_unique_selector { + using unbounded_array = cudautils::cpu::unique_ptr; + }; + template + struct make_cpu_unique_selector { + struct bounded_array {}; + }; + } // namespace impl + } // namespace cpu + + template + typename cpu::impl::make_cpu_unique_selector::non_array make_cpu_unique(cudaStream_t) { + static_assert(std::is_trivially_constructible::value, + "Allocating with non-trivial constructor on the cpu memory is not supported"); + void *mem = ::malloc(sizeof(T)); + return typename cpu::impl::make_cpu_unique_selector::non_array{reinterpret_cast(mem), + cpu::impl::CPUDeleter()}; + } + + template + typename cpu::impl::make_cpu_unique_selector::unbounded_array make_cpu_unique(size_t n, cudaStream_t) { + using element_type = typename std::remove_extent::type; + 
static_assert(std::is_trivially_constructible::value, + "Allocating with non-trivial constructor on the cpu memory is not supported"); + void *mem = ::malloc(n * sizeof(element_type)); + return typename cpu::impl::make_cpu_unique_selector::unbounded_array{reinterpret_cast(mem), + cpu::impl::CPUDeleter()}; + } + + template + typename cpu::impl::make_cpu_unique_selector::bounded_array make_cpu_unique(Args &&...) = delete; + + // No check for the trivial constructor, make it clear in the interface + template + typename cpu::impl::make_cpu_unique_selector::non_array make_cpu_unique_uninitialized(cudaStream_t) { + void *mem = ::malloc(sizeof(T)); + return typename cpu::impl::make_cpu_unique_selector::non_array{reinterpret_cast(mem), + cpu::impl::CPUDeleter()}; + } + + template + typename cpu::impl::make_cpu_unique_selector::unbounded_array make_cpu_unique_uninitialized(size_t n, cudaStream_t) { + using element_type = typename std::remove_extent::type; + void *mem = ::malloc(n * sizeof(element_type)); + return typename cpu::impl::make_cpu_unique_selector::unbounded_array{reinterpret_cast(mem), + cpu::impl::CPUDeleter()}; + } + + template + typename cpu::impl::make_cpu_unique_selector::bounded_array make_cpu_unique_uninitialized(Args &&...) = + delete; +} // namespace cudautils + +#endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h index 32c7d7b481a50..892cb7cf5271a 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h @@ -6,6 +6,10 @@ */ #ifndef __CUDACC__ + #define CUDA_KERNELS_ON_CPU +#endif + +#ifdef CUDA_KERNELS_ON_CPU #include #include @@ -86,18 +90,20 @@ namespace cudaCompat { #define __forceinline__ #endif -// make sure function are inlined to avoid multiple definition #ifndef __CUDA_ARCH__ +using namespace cudaCompat; +#endif + +#endif // CUDA_KERNELS_ON_CPU + + +// make sure function are inlined to avoid multiple definition +#ifndef __CUDACC__ #undef __global__ #define __global__ inline __attribute__((always_inline)) #undef __forceinline__ #define __forceinline__ inline __attribute__((always_inline)) #endif -#ifndef __CUDA_ARCH__ -using namespace cudaCompat; -#endif - -#endif #endif // HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/launch.h b/HeterogeneousCore/CUDAUtilities/interface/launch.h index f3c8240224c0f..dfa48e33be4c5 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/launch.h +++ b/HeterogeneousCore/CUDAUtilities/interface/launch.h @@ -93,8 +93,12 @@ namespace cudautils { // wrappers for cudaLaunchKernel inline void launch(void (*kernel)(), LaunchParameters config) { +#ifdef CUDA_KERNELS_ON_CPU + kernel(); +#else cudaCheck(cudaLaunchKernel( (const void*)kernel, config.gridDim, config.blockDim, nullptr, config.sharedMem, config.stream)); +#endif } template @@ -104,6 +108,9 @@ namespace cudautils { std::enable_if_t >::value> #endif launch(F* kernel, LaunchParameters config, Args&&... 
args) { +#ifdef CUDA_KERNELS_ON_CPU + kernel(args...); +#else using function_type = detail::kernel_traits; typename function_type::argument_type_tuple args_copy(args...); @@ -113,6 +120,7 @@ namespace cudautils { detail::pointer_setter()(pointers, args_copy); cudaCheck(cudaLaunchKernel( (const void*)kernel, config.gridDim, config.blockDim, (void**)pointers, config.sharedMem, config.stream)); +#endif } // wrappers for cudaLaunchCooperativeKernel diff --git a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h index 203aac78a165c..819db79d62497 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h +++ b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h @@ -3,8 +3,10 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h" #include +#include namespace cudautils { template @@ -15,6 +17,11 @@ namespace cudautils { cudaCheck(cudaMemsetAsync(ptr.get(), value, sizeof(T), stream)); } + template + inline void memsetAsync(cudautils::cpu::unique_ptr& ptr, T value, cudaStream_t) { + ::memset(ptr.get(), value, sizeof(T)); + } + /** * The type of `value` is `int` because of `cudaMemsetAsync()` takes * it as an `int`. Note that `cudaMemsetAsync()` sets the value of @@ -25,6 +32,12 @@ namespace cudautils { inline void memsetAsync(cudautils::device::unique_ptr& ptr, int value, size_t nelements, cudaStream_t stream) { cudaCheck(cudaMemsetAsync(ptr.get(), value, nelements * sizeof(T), stream)); } + template + inline void memsetAsync(cudautils::cpu::unique_ptr& ptr, int value, size_t nelements, cudaStream_t) { + ::memset(ptr.get(), value, nelements * sizeof(T)); + } + + } // namespace cudautils #endif diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index a700c0865f0f2..88a12c789ed8f 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -7,6 +7,18 @@ + + + + + + + + + + + + @@ -65,7 +77,7 @@ - + diff --git a/HeterogeneousCore/CUDAUtilities/test/Launch_t.cpp b/HeterogeneousCore/CUDAUtilities/test/Launch_t.cpp new file mode 100644 index 0000000000000..77c824fc833c4 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/Launch_t.cpp @@ -0,0 +1,24 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" + +#include "Launch_t.h" + +#ifdef LaunchInCU + void wrapperInCU(); +#endif + + +int main() { + + requireCUDADevices(); + + printf("in Main\n"); + printEnv(); + + wrapper(); + +#ifdef LaunchInCU + wrapperInCU(); +#endif + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/Launch_t.cu b/HeterogeneousCore/CUDAUtilities/test/Launch_t.cu new file mode 100644 index 0000000000000..90b24fb6fec4f --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/Launch_t.cu @@ -0,0 +1,12 @@ +#ifdef CUDA_KERNELS_ON_CPU +#undef CUDA_KERNELS_ON_CPU +#endif + +#include "Launch_t.h" + +#ifdef LaunchInCU + void wrapperInCU() { + printf("in cu wrapper\n"); + wrapper(); + } +#endif diff --git a/HeterogeneousCore/CUDAUtilities/test/Launch_t.h b/HeterogeneousCore/CUDAUtilities/test/Launch_t.h new file mode 100644 index 0000000000000..3237f6383f47d --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/Launch_t.h @@ -0,0 +1,46 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +#include +#include + +// these are in 
+// cudaCompat: here we do just a minimal test of "launch"
+#ifndef __CUDACC__
+#undef __global__
+#define __global__ inline __attribute__((always_inline))
+#endif
+
+inline constexpr void printEnv() {
+  printf("ENV\n");
+#ifdef CUDA_KERNELS_ON_CPU
+  printf("CUDA_KERNELS_ON_CPU defined\n");
+#endif
+#ifdef __CUDACC__
+  printf("__CUDACC__ defined\n");
+#endif
+#ifdef __CUDA_RUNTIME_H__
+  printf("__CUDA_RUNTIME_H__ defined\n");
+#endif
+#ifdef __CUDA_ARCH__
+  printf("__CUDA_ARCH__ defined\n");
+#endif
+  printf("---\n");
+}
+
+__global__ void hello(float k) {
+  printf("hello from kernel %f\n", k);
+  printEnv();
+}
+
+inline void wrapper() {
+  printf("in Wrapper\n");
+  printEnv();
+
+  cudautils::launch(hello, {1, 1}, 3.14);
+  cudaDeviceSynchronize();
+}
diff --git a/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp b/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp
new file mode 100644
index 0000000000000..e945464c26b74
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp
@@ -0,0 +1,38 @@
+#include "catch.hpp"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h"
+
+TEST_CASE("cpu_unique_ptr", "[cudaMemTools]") {
+  SECTION("Single element") {
+    auto ptr = cudautils::make_cpu_unique<int>(cudaStreamDefault);
+    REQUIRE(ptr != nullptr);
+    *ptr = 1;
+    cudautils::memsetAsync(ptr, 0, cudaStreamDefault);
+    REQUIRE(0 == *ptr);
+  }
+
+  SECTION("Reset") {
+    auto ptr = cudautils::make_cpu_unique<int>(cudaStreamDefault);
+    REQUIRE(ptr != nullptr);
+
+    ptr.reset();
+    REQUIRE(ptr.get() == nullptr);
+  }
+
+  SECTION("Multiple elements") {
+    auto ptr = cudautils::make_cpu_unique<int[]>(10, cudaStreamDefault);
+    REQUIRE(ptr != nullptr);
+    for (int i = 0; i < 10; ++i)
+      ptr[i] = 1;
+    cudautils::memsetAsync(ptr, 0, 10, cudaStreamDefault);
+    int s = 0;
+    for (int i = 0; i < 10; ++i)
+      s += ptr[i];
+    REQUIRE(0 == s);
+  }
+
+  SECTION("Allocating too much") {
+    constexpr size_t maxSize = 1 << 30;  // 2**30
+    auto ptr = cudautils::make_cpu_unique<char[]>(maxSize + 1, cudaStreamDefault);
+    REQUIRE(ptr != nullptr);
+  }
+}
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitConverter.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitConverter.cc
index 945bbb28a3262..608c65eff83d1 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitConverter.cc
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitConverter.cc
@@ -105,7 +105,7 @@ namespace cms {
     const edmNew::DetSetVector<SiPixelCluster>& input = *inputhandle;

     // yes a unique ptr of a unique ptr so edm is happy and the pointer stays still...
-    auto hmsp = std::make_unique<uint32_t[]>(gpuClustering::MaxNumModules + 1);
+    auto hmsp = cudautils::make_cpu_unique<uint32_t[]>(gpuClustering::MaxNumModules + 1, cudaStreamDefault);
     auto hitsModuleStart = hmsp.get();
     auto hms = std::make_unique<HMSstorage>(std::move(hmsp));  // hmsp is gone
     iEvent.put(std::move(hms));  // hms is gone! hitsModuleStart still alive and kicking...
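For review context: the producer hunk above and the two below apply the same std::make_unique to cudautils::make_cpu_unique substitution. A minimal usage sketch, assuming only what this patch adds (cpu_unique_ptr.h plus the new memsetAsync overloads); the helper name fillSketch and the buffer size are illustrative, not part of the patch:

#include <cstdint>
#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h"

// Illustrative sketch, not part of the patch: scalar and array allocations
// use the same stream-taking signatures as the device/host allocators; on
// the CPU path the stream is accepted and ignored, memory comes from
// ::malloc and is released by CPUDeleter (::free) at scope exit.
inline void fillSketch(cudaStream_t stream) {
  auto counter = cudautils::make_cpu_unique<uint32_t>(stream);
  auto starts = cudautils::make_cpu_unique<uint32_t[]>(2001, stream);  // e.g. MaxNumModules + 1 (hypothetical size)
  cudautils::memsetAsync(counter, 0, stream);       // falls through to plain ::memset
  cudautils::memsetAsync(starts, 0, 2001, stream);  // ditto, count given in elements
}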
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc
index a4f19ac276a7a..7e840e1b266e1 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc
@@ -81,7 +81,7 @@ void SiPixelRecHitFromSOA::acquire(edm::Event const& iEvent,
 void SiPixelRecHitFromSOA::produce(edm::Event& iEvent, edm::EventSetup const& es) {
   // yes a unique ptr of a unique ptr so edm is happy
   auto sizeOfHitModuleStart = gpuClustering::MaxNumModules + 1;
-  auto hmsp = std::make_unique<uint32_t[]>(sizeOfHitModuleStart);
+  auto hmsp = cudautils::make_cpu_unique<uint32_t[]>(sizeOfHitModuleStart, cudaStreamDefault);
   std::copy(m_hitsModuleStart.get(), m_hitsModuleStart.get() + sizeOfHitModuleStart, hmsp.get());
   auto hms = std::make_unique<HMSstorage>(std::move(hmsp));  // hmsp is gone
   iEvent.put(std::move(hms));  // hms is gone!
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc
index 7900cf8b2289a..d286583cbc822 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc
@@ -103,7 +103,7 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv
   auto const& input = *hclusters;

   // yes a unique ptr of a unique ptr so edm is happy and the pointer stays still...
-  auto hmsp = std::make_unique<uint32_t[]>(gpuClustering::MaxNumModules + 1);
+  auto hmsp = cudautils::make_cpu_unique<uint32_t[]>(gpuClustering::MaxNumModules + 1, cudaStreamDefault);
   auto hitsModuleStart = hmsp.get();
   auto hms = std::make_unique<HMSstorage>(std::move(hmsp));  // hmsp is gone
   iEvent.put(tokenModuleStart_, std::move(hms));  // hms is gone! hitsModuleStart still alive and kicking...
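The same traits these producers rely on are what the templated kernels later in the patch compile against. A short sketch of the trait-generic allocation pattern this enables, assuming GPUTraits/CPUTraits as extended above and the memsetAsync overloads from this patch; workspaceSketch is illustrative, not part of the patch:

#include <cstdio>
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h"

// Illustrative only: one body serves both backends. With
// Traits = cudaCompat::GPUTraits the workspace lands in device memory;
// with cudaCompat::CPUTraits it is a malloc'ed cpu::unique_ptr and the
// stream is ignored. The new name/runOnDevice members make the chosen
// backend visible for diagnostics and cheap branches.
template <typename Traits>
void workspaceSketch(cudaStream_t stream) {
  auto ws = Traits::template make_unique<double[]>(10, stream);
  cudautils::memsetAsync(ws, 0, 10, stream);
  printf("workspace on %s (runOnDevice=%d)\n", Traits::name, int(Traits::runOnDevice));
}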
diff --git a/RecoLocalTracker/SiPixelRecHits/test/BuildFile.xml b/RecoLocalTracker/SiPixelRecHits/test/BuildFile.xml index d8f6b2a35880a..bae77ee5d8fcf 100644 --- a/RecoLocalTracker/SiPixelRecHits/test/BuildFile.xml +++ b/RecoLocalTracker/SiPixelRecHits/test/BuildFile.xml @@ -21,3 +21,12 @@ + + + + + + + + + diff --git a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cc b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cc new file mode 100644 index 0000000000000..3b2cb0a921e19 --- /dev/null +++ b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.cc @@ -0,0 +1,78 @@ +#include + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "DataFormats/Common/interface/Handle.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDAnalyzer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/Utilities/interface/RunningAverage.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" + +template +void analyzeImpl(TrackingRecHit2DHeterogeneous const & ghits, cudaStream_t stream); + +/* +TrackingRecHit2DCUDA and TrackingRecHit2DCPU are NOT the same type (for tracks and vertices are the same type) +they are templated with the (GPU/CPU)Traits so in principle the whole Analyzer can be (partially) templated as well +they return the same view type though (so the real analyzer algo in the header file above does not need to be templated) +*/ + +class RecHitSoATest : public edm::global::EDAnalyzer<> { +public: + + explicit RecHitSoATest(const edm::ParameterSet& iConfig); + ~RecHitSoATest() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override; + const bool m_onGPU; + edm::EDGetTokenT> tGpuHits; + edm::EDGetTokenT tCpuHits; +}; + +RecHitSoATest::RecHitSoATest(const edm::ParameterSet& iConfig) : m_onGPU(iConfig.getParameter("onGPU")) { + if (m_onGPU) { + tGpuHits = + consumes>(iConfig.getParameter("heterogeneousPixelRecHitSrc")); + } else { + tCpuHits = + consumes(iConfig.getParameter("heterogeneousPixelRecHitSrc")); + } +} + + +void RecHitSoATest::analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const { + if (m_onGPU) { + auto const & gh = iEvent.get(tGpuHits); + CUDAScopedContextProduce ctx{gh}; + + analyzeImpl(ctx.get(gh),ctx.stream()); + + } else { + analyzeImpl(iEvent.get(tCpuHits),cudaStreamDefault); + } +} + +void RecHitSoATest::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("onGPU", true); + desc.add("heterogeneousPixelRecHitSrc", edm::InputTag("siPixelRecHitsCUDAPreSplitting")); + descriptions.add("RecHitSoATest", desc); +} + 
+DEFINE_FWK_MODULE(RecHitSoATest); diff --git a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.h b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.h new file mode 100644 index 0000000000000..1032501abe3fc --- /dev/null +++ b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATest.h @@ -0,0 +1,71 @@ +#include + +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h" + +__global__ +void analyzeHits(TrackingRecHit2DSOAView const* hhp, uint32_t nhits, double * ws) { + + assert(hhp); + assert(ws); + + auto const& hh = *hhp; + auto first = blockIdx.x * blockDim.x + threadIdx.x; + +#ifdef CUDA_KERNELS_ON_CPU + std::cout << "kernel ON CPU!" << std::endl; +#else + if (0 == first) printf("kernel ON GPU!\n"); +#endif + + + if (0 == first) + printf("in analyzeHits SoA\n"); + + __shared__ int nh; + __shared__ double sch; + + nh=0; sch=0; + __syncthreads(); + + for (int i = first, ni=nhits; i < ni; i += gridDim.x * blockDim.x) { + if (hh.charge(i) <=0) printf("a hit with zero charge?\n"); + atomicAdd(&nh,1); + atomicAdd(&sch,(double)(hh.charge(i))); + } + + __syncthreads(); + + if(0==threadIdx.x) { + atomicAdd(&ws[0],nh); + atomicAdd(&ws[1],sch); + atomicAdd(&ws[9],1); + if (gridDim.x==ws[9]) { + printf("in event : found %f hits, average charge is %f\n",ws[0], ws[1]/ws[0]); + } + } + +} + +template +void analyzeImpl(TrackingRecHit2DHeterogeneous const & gHits, cudaStream_t stream) { +#ifdef CUDA_KERNELS_ON_CPU + std::cout << "launching ON CPU!" << std::endl; + assert(!Traits::runOnDevice); +#else + printf("launching ON GPU!\n"); + assert(Traits::runOnDevice); +#endif + + auto nhits = gHits.nHits(); + + if (0 == nhits) return; + + auto ws_d = Traits:: template make_unique(10,stream); + cudautils::memsetAsync(ws_d, 0, 10, stream); + int threadsPerBlock = 256; + int blocks = (nhits + threadsPerBlock - 1) / threadsPerBlock; + cudautils::launch(analyzeHits,{threadsPerBlock,blocks,0,stream} , gHits.view(), nhits, ws_d.get()); +} diff --git a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATestCPU.cc b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATestCPU.cc new file mode 100644 index 0000000000000..aa9ab7168b6d3 --- /dev/null +++ b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATestCPU.cc @@ -0,0 +1,4 @@ +#include "RecHitSoATest.h" +template +void analyzeImpl(TrackingRecHit2DHeterogeneous const & gHits, cudaStream_t stream); + diff --git a/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATestGPU.cu b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATestGPU.cu new file mode 100644 index 0000000000000..0553db894f398 --- /dev/null +++ b/RecoLocalTracker/SiPixelRecHits/test/RecHitSoATestGPU.cu @@ -0,0 +1,4 @@ +#include "RecHitSoATest.h" +template +void analyzeImpl(TrackingRecHit2DHeterogeneous const & gHits, cudaStream_t stream); + diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc index cc5865d97fd95..6c90ab1dd8f6f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc @@ -1,68 +1,7 @@ #include "BrokenLineFitOnGPU.h" -void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitsView const* hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples) { - assert(tuples_d); - - // Fit internals - auto hitsGPU_ = 
std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); - auto hits_geGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); - auto fast_fit_resultsGPU_ = - std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); - - for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { - // fit triplets - kernelBLFastFit<3>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); - - kernelBLFit<3>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 3, - offset); - - // fit quads - kernelBLFastFit<4>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); - - kernelBLFit<4>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 4, - offset); - - if (fit5as4_) { - // fit penta (only first 4) - kernelBLFastFit<4>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); - - kernelBLFit<4>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); - } else { - // fit penta (all 5) - kernelBLFastFit<5>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); - - kernelBLFit<5>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); - } - - } // loop on concurrent fits -} +template +void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, + uint32_t hitsInFit, + uint32_t maxNumberOfTuples, + cudaStream_t stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index 660cf75e1f460..2eba3e9af5965 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -1,85 +1,7 @@ #include "BrokenLineFitOnGPU.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, +template +void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples, - cudaStream_t stream) { - assert(tuples_d); - - auto blockSize = 64; - auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; - - // Fit internals - auto hitsGPU_ = cudautils::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); - auto hits_geGPU_ = cudautils::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); - auto fast_fit_resultsGPU_ = cudautils::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); - - for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { - // fit triplets - kernelBLFastFit<3><<>>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); - cudaCheck(cudaGetLastError()); - - kernelBLFit<3><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 3, - offset); - 
cudaCheck(cudaGetLastError()); - - // fit quads - kernelBLFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); - cudaCheck(cudaGetLastError()); - - kernelBLFit<4><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 4, - offset); - cudaCheck(cudaGetLastError()); - - if (fit5as4_) { - // fit penta (only first 4) - kernelBLFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); - cudaCheck(cudaGetLastError()); - - kernelBLFit<4><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); - cudaCheck(cudaGetLastError()); - } else { - // fit penta (all 5) - kernelBLFastFit<5><<>>( - tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); - cudaCheck(cudaGetLastError()); - - kernelBLFit<5><<>>(tupleMultiplicity_d, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); - cudaCheck(cudaGetLastError()); - } - - } // loop on concurrent fits -} + cudaStream_t stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index 82a5bee443f88..aa7c1e680929e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -185,3 +185,84 @@ __global__ void kernelBLFit(CAConstants::TupleMultiplicity const *__restrict__ t #endif } } + + +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + +template +void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, + uint32_t hitsInFit, + uint32_t maxNumberOfTuples, + cudaStream_t stream) { + assert(tuples_d); + + int blockSize = 64; + int numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; + + // Fit internals + auto hitsGPU_ = Traits:: template make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); + auto hits_geGPU_ = Traits:: template make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); + auto fast_fit_resultsGPU_ = Traits:: template make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); + + for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { + // fit triplets + cudautils::launch(kernelBLFastFit<3>,{numberOfBlocks, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelBLFit<3>,{numberOfBlocks, blockSize, 0, stream},tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 3, + offset); + + // fit quads + cudautils::launch(kernelBLFastFit<4>,{numberOfBlocks / 4, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelBLFit<4>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 4, + offset); + + if (fit5as4_) { + // fit penta (only first 4) + 
cudautils::launch(kernelBLFastFit<4>,{numberOfBlocks / 4, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + + cudautils::launch(kernelBLFit<4>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); + } else { + // fit penta (all 5) + cudautils::launch(kernelBLFastFit<5>,{numberOfBlocks / 4, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + + cudautils::launch(kernelBLFit<5>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); + } + + } // loop on concurrent fits +} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index 11b644d466768..a87b45f47cbe5 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -74,10 +74,10 @@ void CAHitNtupletCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const CUDAScopedContextProduce ctx{*hHits}; auto const& hits = ctx.get(*hHits); - ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); + ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuples(hits, bf, ctx.stream())); } else { auto const& hits = iEvent.get(tokenHitCPU_); - iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits, bf)); + iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits, bf,cudaStreamDefault)); } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 75066458dc170..03a8b927289b8 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -1,174 +1,17 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" -template <> -void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters) { - kernel_printCounters(counters); -} -template <> -void CAHitNtupletGeneratorKernelsCPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t) { - kernel_fillHitDetIndices(&tracks_d->hitIndices, hv, &tracks_d->detIndices); -} +template +void CAHitNtupletGeneratorKernelsCPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream); -template <> -void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { - auto nhits = hh.nHits(); +template +void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream); -#ifdef NTUPLE_DEBUG - std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; -#endif +template +void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream); - // in principle we can use "nhits" to heuristically dimension the workspace... 
- // overkill to use template here (std::make_unique would suffice) - // device_isOuterHitOfCell_ = Traits:: template make_unique(cs, std::max(1U,nhits), stream); - device_isOuterHitOfCell_.reset( - (GPUCACell::OuterHitOfCell *)malloc(std::max(1U, nhits) * sizeof(GPUCACell::OuterHitOfCell))); - assert(device_isOuterHitOfCell_.get()); - gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(), - nhits, - device_theCellNeighbors_, - device_theCellNeighborsContainer_.get(), - device_theCellTracks_, - device_theCellTracksContainer_.get()); +template +void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream); - // device_theCells_ = Traits:: template make_unique(cs, m_params.maxNumberOfDoublets_, stream); - device_theCells_.reset((GPUCACell *)malloc(sizeof(GPUCACell) * m_params.maxNumberOfDoublets_)); - if (0 == nhits) - return; // protect against empty events - - // FIXME avoid magic numbers - auto nActualPairs = gpuPixelDoublets::nPairs; - if (!m_params.includeJumpingForwardDoublets_) - nActualPairs = 15; - if (m_params.minHitsPerNtuplet_ > 3) { - nActualPairs = 13; - } - - assert(nActualPairs <= gpuPixelDoublets::nPairs); - gpuPixelDoublets::getDoubletsFromHisto(device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, - hh.view(), - device_isOuterHitOfCell_.get(), - nActualPairs, - m_params.idealConditions_, - m_params.doClusterCut_, - m_params.doZ0Cut_, - m_params.doPtCut_, - m_params.maxNumberOfDoublets_); -} - -template <> -void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { - auto *tuples_d = &tracks_d->hitIndices; - auto *quality_d = (Quality *)(&tracks_d->m_quality); - - assert(tuples_d && quality_d); - - // zero tuples - cudautils::launchZero(tuples_d, cudaStream); - - auto nhits = hh.nHits(); - assert(nhits <= pixelGPUConstants::maxNumberOfHits); - - // std::cout << "N hits " << nhits << std::endl; - // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; - - // - // applying conbinatoric cleaning such as fishbone at this stage is too expensive - // - - kernel_connect(device_hitTuple_apc_, - device_hitToTuple_apc_, // needed only to be reset, ready for next kernel - hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_isOuterHitOfCell_.get(), - m_params.hardCurvCut_, - m_params.ptmin_, - m_params.CAThetaCutBarrel_, - m_params.CAThetaCutForward_, - m_params.dcaCutInnerTriplet_, - m_params.dcaCutOuterTriplet_); - - if (nhits > 1 && m_params.earlyFishbone_) { - fishbone(hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); - } - - kernel_find_ntuplets(hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellTracks_, - tuples_d, - device_hitTuple_apc_, - quality_d, - m_params.minHitsPerNtuplet_); - if (m_params.doStats_) - kernel_mark_used(hh.view(), device_theCells_.get(), device_nCells_); - - cudautils::finalizeBulk(device_hitTuple_apc_, tuples_d); - - // remove duplicates (tracks that share a doublet) - kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, quality_d); - - kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); - cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); - kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); - - if (nhits > 1 && m_params.lateFishbone_) { - fishbone(hh.view(), 
device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); - } - - if (m_params.doStats_) { - kernel_checkOverflows(tuples_d, - device_tupleMultiplicity_.get(), - device_hitTuple_apc_, - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, - device_isOuterHitOfCell_.get(), - nhits, - m_params.maxNumberOfDoublets_, - counters_); - } -} - -template <> -void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { - auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = (Quality *)(&tracks_d->m_quality); - - // classify tracks based on kinematics - kernel_classifyTracks(tuples_d, tracks_d, m_params.cuts_, quality_d); - - if (m_params.lateFishbone_) { - // apply fishbone cleaning to good tracks - kernel_fishboneCleaner(device_theCells_.get(), device_nCells_, quality_d); - } - - // remove duplicates (tracks that share a doublet) - kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, tracks_d); - - // fill hit->track "map" - kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); - cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); - kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); - - // remove duplicates (tracks that share a hit) - kernel_tripletCleaner(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); - - if (m_params.doStats_) { - // counters (add flag???) - kernel_doStatsForHitInTracks(device_hitToTuple_.get(), counters_); - kernel_doStatsForTracks(tuples_d, quality_d, counters_); - } - -#ifdef DUMP_GPU_TK_TUPLES - static std::atomic iev(0); - ++iev; - kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev); -#endif -} +template +void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index aaf882633f17d..8939acff2ebc2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -1,293 +1,17 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" -template <> -void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream) { - auto blockSize = 128; - auto numberOfBlocks = (HitContainer::capacity() + blockSize - 1) / blockSize; - kernel_fillHitDetIndices<<>>( - &tracks_d->hitIndices, hv, &tracks_d->detIndices); - cudaCheck(cudaGetLastError()); -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif -} +template +void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream); -template <> -void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { - // these are pointer on GPU! 
- auto *tuples_d = &tracks_d->hitIndices; - auto *quality_d = (Quality *)(&tracks_d->m_quality); +template +void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream); - // zero tuples - cudautils::launchZero(tuples_d, cudaStream); +template +void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream); - auto nhits = hh.nHits(); - assert(nhits <= pixelGPUConstants::maxNumberOfHits); +template +void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream); - // std::cout << "N hits " << nhits << std::endl; - // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; - - // - // applying conbinatoric cleaning such as fishbone at this stage is too expensive - // - - auto nthTot = 64; - auto stride = 4; - auto blockSize = nthTot / stride; - auto numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - auto rescale = numberOfBlocks / 65536; - blockSize *= (rescale + 1); - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - assert(numberOfBlocks < 65536); - assert(blockSize > 0 && 0 == blockSize % 16); - dim3 blks(1, numberOfBlocks, 1); - dim3 thrs(stride, blockSize, 1); - - kernel_connect<<>>( - device_hitTuple_apc_, - device_hitToTuple_apc_, // needed only to be reset, ready for next kernel - hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_isOuterHitOfCell_.get(), - m_params.hardCurvCut_, - m_params.ptmin_, - m_params.CAThetaCutBarrel_, - m_params.CAThetaCutForward_, - m_params.dcaCutInnerTriplet_, - m_params.dcaCutOuterTriplet_); - cudaCheck(cudaGetLastError()); - - if (nhits > 1 && m_params.earlyFishbone_) { - auto nthTot = 128; - auto stride = 16; - auto blockSize = nthTot / stride; - auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; - dim3 blks(1, numberOfBlocks, 1); - dim3 thrs(stride, blockSize, 1); - fishbone<<>>( - hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); - cudaCheck(cudaGetLastError()); - } - - blockSize = 64; - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - kernel_find_ntuplets<<>>(hh.view(), - device_theCells_.get(), - device_nCells_, - device_theCellTracks_, - tuples_d, - device_hitTuple_apc_, - quality_d, - m_params.minHitsPerNtuplet_); - cudaCheck(cudaGetLastError()); - - if (m_params.doStats_) - kernel_mark_used<<>>(hh.view(), device_theCells_.get(), device_nCells_); - cudaCheck(cudaGetLastError()); - -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - - blockSize = 128; - numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize; - cudautils::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); - - // remove duplicates (tracks that share a doublet) - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - kernel_earlyDuplicateRemover<<>>( - device_theCells_.get(), device_nCells_, tuples_d, quality_d); - cudaCheck(cudaGetLastError()); - - blockSize = 128; - numberOfBlocks = (3 * CAConstants::maxTuples() / 4 + blockSize - 1) / blockSize; - kernel_countMultiplicity<<>>( - tuples_d, quality_d, device_tupleMultiplicity_.get()); - cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); - kernel_fillMultiplicity<<>>( - tuples_d, quality_d, device_tupleMultiplicity_.get()); - cudaCheck(cudaGetLastError()); - - if 
(nhits > 1 && m_params.lateFishbone_) { - auto nthTot = 128; - auto stride = 16; - auto blockSize = nthTot / stride; - auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; - dim3 blks(1, numberOfBlocks, 1); - dim3 thrs(stride, blockSize, 1); - fishbone<<>>( - hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); - cudaCheck(cudaGetLastError()); - } - - if (m_params.doStats_) { - numberOfBlocks = (std::max(nhits, m_params.maxNumberOfDoublets_) + blockSize - 1) / blockSize; - kernel_checkOverflows<<>>(tuples_d, - device_tupleMultiplicity_.get(), - device_hitTuple_apc_, - device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, - device_isOuterHitOfCell_.get(), - nhits, - m_params.maxNumberOfDoublets_, - counters_); - cudaCheck(cudaGetLastError()); - } -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif -} - -template <> -void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { - auto nhits = hh.nHits(); - -#ifdef NTUPLE_DEBUG - std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; -#endif - -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - - // in principle we can use "nhits" to heuristically dimension the workspace... - device_isOuterHitOfCell_ = cudautils::make_device_unique(std::max(1U, nhits), stream); - assert(device_isOuterHitOfCell_.get()); - { - int threadsPerBlock = 128; - // at least one block! - int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock; - gpuPixelDoublets::initDoublets<<>>(device_isOuterHitOfCell_.get(), - nhits, - device_theCellNeighbors_, - device_theCellNeighborsContainer_.get(), - device_theCellTracks_, - device_theCellTracksContainer_.get()); - cudaCheck(cudaGetLastError()); - } - - device_theCells_ = cudautils::make_device_unique(m_params.maxNumberOfDoublets_, stream); - -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - - if (0 == nhits) - return; // protect against empty events - - // FIXME avoid magic numbers - auto nActualPairs = gpuPixelDoublets::nPairs; - if (!m_params.includeJumpingForwardDoublets_) - nActualPairs = 15; - if (m_params.minHitsPerNtuplet_ > 3) { - nActualPairs = 13; - } - - assert(nActualPairs <= gpuPixelDoublets::nPairs); - int stride = 4; - int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize / stride; - int blocks = (4 * nhits + threadsPerBlock - 1) / threadsPerBlock; - dim3 blks(1, blocks, 1); - dim3 thrs(stride, threadsPerBlock, 1); - gpuPixelDoublets::getDoubletsFromHisto<<>>(device_theCells_.get(), - device_nCells_, - device_theCellNeighbors_, - device_theCellTracks_, - hh.view(), - device_isOuterHitOfCell_.get(), - nActualPairs, - m_params.idealConditions_, - m_params.doClusterCut_, - m_params.doZ0Cut_, - m_params.doPtCut_, - m_params.maxNumberOfDoublets_); - cudaCheck(cudaGetLastError()); - -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif -} - -template <> -void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { - // these are pointer on GPU! 
- auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = (Quality *)(&tracks_d->m_quality); - - auto blockSize = 64; - - // classify tracks based on kinematics - auto numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; - kernel_classifyTracks<<>>(tuples_d, tracks_d, m_params.cuts_, quality_d); - cudaCheck(cudaGetLastError()); - - if (m_params.lateFishbone_) { - // apply fishbone cleaning to good tracks - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - kernel_fishboneCleaner<<>>( - device_theCells_.get(), device_nCells_, quality_d); - cudaCheck(cudaGetLastError()); - } - - // remove duplicates (tracks that share a doublet) - numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - kernel_fastDuplicateRemover<<>>( - device_theCells_.get(), device_nCells_, tuples_d, tracks_d); - cudaCheck(cudaGetLastError()); - - if (m_params.minHitsPerNtuplet_ < 4 || m_params.doStats_) { - // fill hit->track "map" - numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; - kernel_countHitInTracks<<>>( - tuples_d, quality_d, device_hitToTuple_.get()); - cudaCheck(cudaGetLastError()); - cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); - cudaCheck(cudaGetLastError()); - kernel_fillHitInTracks<<>>(tuples_d, quality_d, device_hitToTuple_.get()); - cudaCheck(cudaGetLastError()); - } - if (m_params.minHitsPerNtuplet_ < 4) { - // remove duplicates (tracks that share a hit) - numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; - kernel_tripletCleaner<<>>( - hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); - cudaCheck(cudaGetLastError()); - } - - if (m_params.doStats_) { - // counters (add flag???) 
- numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; - kernel_doStatsForHitInTracks<<>>(device_hitToTuple_.get(), counters_); - cudaCheck(cudaGetLastError()); - numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; - kernel_doStatsForTracks<<>>(tuples_d, quality_d, counters_); - cudaCheck(cudaGetLastError()); - } -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - -#ifdef DUMP_GPU_TK_TUPLES - static std::atomic iev(0); - ++iev; - kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( - hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev); -#endif -} - -template <> -void CAHitNtupletGeneratorKernelsGPU::printCounters(Counters const *counters) { - kernel_printCounters<<<1, 1>>>(counters); -} +template +void CAHitNtupletGeneratorKernelsGPU::printCounters(Counters const *counters); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index 0140958aa9ef2..8213cc9c2af03 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -171,7 +171,7 @@ class CAHitNtupletGeneratorKernels { void fillHitDetIndices(HitsView const* hv, TkSoA* tuples_d, cudaStream_t cudaStream); void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); - void allocateOnGPU(cudaStream_t stream); + void allocate(cudaStream_t stream); void cleanup(cudaStream_t cudaStream); static void printCounters(Counters const* counters); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h index b91911c66924e..e688052f1faf0 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h @@ -4,9 +4,9 @@ template <> #ifdef __CUDACC__ -void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(cudaStream_t stream) { +void CAHitNtupletGeneratorKernelsGPU::allocate(cudaStream_t stream) { #else -void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) { +void CAHitNtupletGeneratorKernelsCPU::allocate(cudaStream_t stream) { #endif ////////////////////////////////////////////////////////// // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 6888e6725bc39..19df8f5e249a2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -601,3 +601,282 @@ __global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *coun c.nEmptyCells / double(c.nCells), c.nZeroTrackCells / double(c.nCells)); } + + + +// launchers + +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + + +template +void CAHitNtupletGeneratorKernels::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream) { + auto blockSize = 128; + int numberOfBlocks = (HitContainer::capacity() + blockSize - 1) / blockSize; + + cudautils::launch(kernel_fillHitDetIndices,{numberOfBlocks, blockSize, 0, cudaStream}, + &tracks_d->hitIndices, hv, &tracks_d->detIndices); +#ifdef GPU_DEBUG + 
cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif +} + +template +void CAHitNtupletGeneratorKernels::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { + // these are pointer on GPU! + auto *tuples_d = &tracks_d->hitIndices; + auto *quality_d = (Quality *)(&tracks_d->m_quality); + + // zero tuples + cudautils::launchZero(tuples_d, cudaStream); + + auto nhits = hh.nHits(); + assert(nhits <= pixelGPUConstants::maxNumberOfHits); + + // std::cout << "N hits " << nhits << std::endl; + // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; + + // + // applying conbinatoric cleaning such as fishbone at this stage is too expensive + // + + auto nthTot = 64; + auto stride = 4; + auto blockSize = nthTot / stride; + int numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + auto rescale = numberOfBlocks / 65536; + blockSize *= (rescale + 1); + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + assert(numberOfBlocks < 65536); + assert(blockSize > 0 && 0 == blockSize % 16); + dim3 blks(1, numberOfBlocks, 1); + dim3 thrs(stride, blockSize, 1); + + cudautils::launch(kernel_connect,{blks, thrs, 0, cudaStream}, + device_hitTuple_apc_, + device_hitToTuple_apc_, // needed only to be reset, ready for next kernel + hh.view(), + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_, + device_isOuterHitOfCell_.get(), + m_params.hardCurvCut_, + m_params.ptmin_, + m_params.CAThetaCutBarrel_, + m_params.CAThetaCutForward_, + m_params.dcaCutInnerTriplet_, + m_params.dcaCutOuterTriplet_); + + if (nhits > 1 && m_params.earlyFishbone_) { + auto nthTot = 128; + auto stride = 16; + int blockSize = nthTot / stride; + int numberOfBlocks = (nhits + blockSize - 1) / blockSize; + dim3 blks(1, numberOfBlocks, 1); + dim3 thrs(stride, blockSize, 1); + cudautils::launch(fishbone,{blks, thrs, 0, cudaStream}, + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); + } + + blockSize = 64; + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_find_ntuplets,{numberOfBlocks, blockSize, 0, cudaStream},hh.view(), + device_theCells_.get(), + device_nCells_, + device_theCellTracks_, + tuples_d, + device_hitTuple_apc_, + quality_d, + m_params.minHitsPerNtuplet_); + + if (m_params.doStats_) + cudautils::launch(kernel_mark_used,{numberOfBlocks, blockSize, 0, cudaStream},hh.view(), device_theCells_.get(), device_nCells_); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + blockSize = 128; + numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize; + cudautils::launch(cudautils::finalizeBulk::type>,{numberOfBlocks, blockSize, 0, cudaStream},device_hitTuple_apc_, tuples_d); + + // remove duplicates (tracks that share a doublet) + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_earlyDuplicateRemover,{numberOfBlocks, blockSize, 0, cudaStream}, + device_theCells_.get(), device_nCells_, tuples_d, quality_d); + + blockSize = 128; + numberOfBlocks = (3 * CAConstants::maxTuples() / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_countMultiplicity,{numberOfBlocks, blockSize, 0, cudaStream}, + tuples_d, quality_d, device_tupleMultiplicity_.get()); + cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); + 
cudautils::launch(kernel_fillMultiplicity,{numberOfBlocks, blockSize, 0, cudaStream}, + tuples_d, quality_d, device_tupleMultiplicity_.get()); + + if (nhits > 1 && m_params.lateFishbone_) { + auto nthTot = 128; + auto stride = 16; + auto blockSize = nthTot / stride; + auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; + dim3 blks(1, numberOfBlocks, 1); + dim3 thrs(stride, blockSize, 1); + cudautils::launch(fishbone,{blks, thrs, 0, cudaStream}, + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); + } + + if (m_params.doStats_) { + numberOfBlocks = (std::max(nhits, m_params.maxNumberOfDoublets_) + blockSize - 1) / blockSize; + cudautils::launch(kernel_checkOverflows,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, + device_tupleMultiplicity_.get(), + device_hitTuple_apc_, + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_, + device_theCellTracks_, + device_isOuterHitOfCell_.get(), + nhits, + m_params.maxNumberOfDoublets_, + counters_); + } +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif +} + +template +void CAHitNtupletGeneratorKernels::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { + auto nhits = hh.nHits(); + +#ifdef NTUPLE_DEBUG + std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; +#endif + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + // in principle we can use "nhits" to heuristically dimension the workspace... + device_isOuterHitOfCell_ = TTraits:: template make_unique(std::max(1U, nhits), stream); + assert(device_isOuterHitOfCell_.get()); + { + int threadsPerBlock = 128; + // at least one block! + int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock; + cudautils::launch(gpuPixelDoublets::initDoublets,{blocks, threadsPerBlock, 0, stream},device_isOuterHitOfCell_.get(), + nhits, + device_theCellNeighbors_, + device_theCellNeighborsContainer_.get(), + device_theCellTracks_, + device_theCellTracksContainer_.get()); + } + + device_theCells_ = TTraits:: template make_unique(m_params.maxNumberOfDoublets_, stream); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + if (0 == nhits) + return; // protect against empty events + + // FIXME avoid magic numbers + auto nActualPairs = gpuPixelDoublets::nPairs; + if (!m_params.includeJumpingForwardDoublets_) + nActualPairs = 15; + if (m_params.minHitsPerNtuplet_ > 3) { + nActualPairs = 13; + } + + assert(nActualPairs <= gpuPixelDoublets::nPairs); + int stride = 4; + int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize / stride; + int blocks = (4 * nhits + threadsPerBlock - 1) / threadsPerBlock; + dim3 blks(1, blocks, 1); + dim3 thrs(stride, threadsPerBlock, 1); + cudautils::launch(gpuPixelDoublets::getDoubletsFromHisto,{blks, thrs, 0, stream},device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_, + device_theCellTracks_, + hh.view(), + device_isOuterHitOfCell_.get(), + nActualPairs, + m_params.idealConditions_, + m_params.doClusterCut_, + m_params.doZ0Cut_, + m_params.doPtCut_, + m_params.maxNumberOfDoublets_); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif +} + +template +void CAHitNtupletGeneratorKernels::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { + // these are pointers on GPU!
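+ // cleaning proceeds in stages: kinematic classification first, then optional + // fishbone cleaning, removal of tracks sharing a doublet and, when triplets + // are kept, a hit-sharing cleanup driven by the hit->track map filled below.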
+ auto const *tuples_d = &tracks_d->hitIndices; + auto *quality_d = (Quality *)(&tracks_d->m_quality); + + int blockSize = 64; + + // classify tracks based on kinematics + int numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_classifyTracks,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, tracks_d, m_params.cuts_, quality_d); + + if (m_params.lateFishbone_) { + // apply fishbone cleaning to good tracks + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_fishboneCleaner,{numberOfBlocks, blockSize, 0, cudaStream}, + device_theCells_.get(), device_nCells_, quality_d); + } + + // remove duplicates (tracks that share a doublet) + numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_fastDuplicateRemover,{numberOfBlocks, blockSize, 0, cudaStream}, + device_theCells_.get(), device_nCells_, tuples_d, tracks_d); + + if (m_params.minHitsPerNtuplet_ < 4 || m_params.doStats_) { + // fill hit->track "map" + numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_countHitInTracks,{numberOfBlocks, blockSize, 0, cudaStream}, + tuples_d, quality_d, device_hitToTuple_.get()); + cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); + cudaCheck(cudaGetLastError()); + cudautils::launch(kernel_fillHitInTracks,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, quality_d, device_hitToTuple_.get()); + } + if (m_params.minHitsPerNtuplet_ < 4) { + // remove duplicates (tracks that share a hit) + numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; + cudautils::launch(kernel_tripletCleaner,{numberOfBlocks, blockSize, 0, cudaStream}, + hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); + } + + if (m_params.doStats_) { + // counters (add flag???) 
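+ // the two kernels below only accumulate statistics into counters_; + // the totals are reported later through printCounters().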
+ numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; + cudautils::launch(kernel_doStatsForHitInTracks,{numberOfBlocks, blockSize, 0, cudaStream},device_hitToTuple_.get(), counters_); + numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; + cudautils::launch(kernel_doStatsForTracks,{numberOfBlocks, blockSize, 0, cudaStream},tuples_d, quality_d, counters_); + } +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + +#ifdef DUMP_GPU_TK_TUPLES + static std::atomic iev(0); + ++iev; + cudautils::launch(kernel_print_found_ntuplets,{1, 32, 0, cudaStream}, + hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev); +#endif +} + +template +void CAHitNtupletGeneratorKernels::printCounters(Counters const *counters) { + cudautils::launch(kernel_printCounters,{1, 1,0, cudaStreamDefault},counters); +} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 2e875caba7130..e6525bd16a49b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -159,61 +159,3 @@ void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& "cuts\" based on the fit results (pT, Tip, Zip)."); } -PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DCUDA const& hits_d, - float bfield, - cudaStream_t stream) const { - PixelTrackHeterogeneous tracks(cudautils::make_device_unique(stream)); - - auto* soa = tracks.get(); - - CAHitNtupletGeneratorKernelsGPU kernels(m_params); - kernels.counters_ = m_counters; - HelixFitOnGPU fitter(bfield, m_params.fit5as4_); - - kernels.allocateOnGPU(stream); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); - - kernels.buildDoublets(hits_d, stream); - kernels.launchKernels(hits_d, soa, stream); - kernels.fillHitDetIndices(hits_d.view(), soa, stream); // in principle needed only if Hits not "available" - if (m_params.useRiemannFit_) { - fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); - } else { - fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); - } - kernels.classifyTuples(hits_d, soa, stream); - - return tracks; -} - -PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { - PixelTrackHeterogeneous tracks(std::make_unique()); - - auto* soa = tracks.get(); - assert(soa); - - CAHitNtupletGeneratorKernelsCPU kernels(m_params); - kernels.counters_ = m_counters; - kernels.allocateOnGPU(nullptr); - - kernels.buildDoublets(hits_d, nullptr); - kernels.launchKernels(hits_d, soa, nullptr); - kernels.fillHitDetIndices(hits_d.view(), soa, nullptr); // in principle needed only if Hits not "available" - - if (0 == hits_d.nHits()) - return tracks; - - // now fit - HelixFitOnGPU fitter(bfield, m_params.fit5as4_); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); - - if (m_params.useRiemannFit_) { - fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); - } else { - fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets()); - } - - kernels.classifyTuples(hits_d, soa, nullptr); - - return tracks; -} diff --git 
a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index de2e1913dd18b..6e92c1b217cfb 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -47,9 +47,8 @@ class CAHitNtupletGeneratorOnGPU { static void fillDescriptions(edm::ParameterSetDescription& desc); static const char* fillDescriptionsLabel() { return "caHitNtupletOnGPU"; } - PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const; - - PixelTrackHeterogeneous makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const; + template + PixelTrackHeterogeneous makeTuples(Hits2D const& hits_d, float bfield, cudaStream_t stream) const; private: void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream) const; @@ -63,4 +62,42 @@ class CAHitNtupletGeneratorOnGPU { Counters* m_counters = nullptr; }; +template +inline +PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(Hits2D const& hits_d, + float bfield, + cudaStream_t stream) const { + PixelTrackHeterogeneous tracks(Traits:: template make_unique(stream)); + + using NTupletGeneratorKernels = CAHitNtupletGeneratorKernels; + + auto* soa = tracks.get(); + + NTupletGeneratorKernels kernels(m_params); + kernels.counters_ = m_counters; + kernels.allocate(stream); + + kernels.buildDoublets(hits_d, stream); + kernels.launchKernels(hits_d, soa, stream); + kernels.fillHitDetIndices(hits_d.view(), soa, stream); // in principle needed only if Hits not "available" + + if constexpr (!Traits::runOnDevice) + if (0 == hits_d.nHits()) + return tracks; + + // now fit + HelixFitOnGPU fitter(bfield, m_params.fit5as4_); + fitter.allocate(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); + if (m_params.useRiemannFit_) { + fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); + } else { + fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); + } + kernels.classifyTuples(hits_d, soa, stream); + + return tracks; +} + + + #endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc index becbd0a1a8540..050a5c83431ce 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc @@ -1,7 +1,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HelixFitOnGPU.h" -void HelixFitOnGPU::allocateOnGPU(Tuples const *tuples, +void HelixFitOnGPU::allocate(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *helix_fit_results) { tuples_d = tuples; @@ -13,4 +13,4 @@ void HelixFitOnGPU::allocateOnGPU(Tuples const *tuples, assert(outputSoa_d); } -void HelixFitOnGPU::deallocateOnGPU() {} +void HelixFitOnGPU::deallocate() {} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 05b399e870f58..cf877dbd11363 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -41,17 +41,18 @@ class HelixFitOnGPU { using TupleMultiplicity = CAConstants::TupleMultiplicity; explicit HelixFitOnGPU(float bf, bool fit5as4) : bField_(bf), 
fit5as4_(fit5as4) {} - ~HelixFitOnGPU() { deallocateOnGPU(); } + ~HelixFitOnGPU() { deallocate(); } void setBField(double bField) { bField_ = bField; } + + template void launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); - void launchBrokenLineKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); - void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); - void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); + template + void launchBrokenLineKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); - void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *outputSoA); - void deallocateOnGPU(); + void allocate(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *outputSoA); + void deallocate(); private: static constexpr uint32_t maxNumberOfConcurrentFits_ = Rfit::maxNumberOfConcurrentFits(); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc index 3476362864a79..f77f9a4fb9a41 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc @@ -1,110 +1,7 @@ #include "RiemannFitOnGPU.h" -void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples) { - assert(tuples_d); - - // Fit internals - auto hitsGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); - auto hits_geGPU_ = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); - auto fast_fit_resultsGPU_ = - std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); - auto circle_fit_resultsGPU_holder = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit)); - Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); - - for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { - // triplets - kernelFastFit<3>( - tuples_d, tupleMultiplicity_d, 3, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - - kernelCircleFit<3>(tupleMultiplicity_d, - 3, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - kernelLineFit<3>(tupleMultiplicity_d, - 3, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - // quads - kernelFastFit<4>( - tuples_d, tupleMultiplicity_d, 4, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - - kernelCircleFit<4>(tupleMultiplicity_d, - 4, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - kernelLineFit<4>(tupleMultiplicity_d, - 4, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - if (fit5as4_) { - // penta - kernelFastFit<4>( - tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - - kernelCircleFit<4>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 
circle_fit_resultsGPU_, - offset); - - kernelLineFit<4>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - } else { - // penta all 5 - kernelFastFit<5>( - tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - - kernelCircleFit<5>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - - kernelLineFit<5>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - } - } -} +template +void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, + uint32_t nhits, + uint32_t maxNumberOfTuples, + cudaStream_t stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu index cb5d32b47aea3..011b7842f9681 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ -1,131 +1,7 @@ #include "RiemannFitOnGPU.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, +template +void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, - cudaStream_t stream) { - assert(tuples_d); - - auto blockSize = 64; - auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; - - // Fit internals - auto hitsGPU_ = cudautils::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); - auto hits_geGPU_ = cudautils::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); - auto fast_fit_resultsGPU_ = cudautils::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); - auto circle_fit_resultsGPU_holder = - cudautils::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit), stream); - Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); - - for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { - // triplets - kernelFastFit<3><<>>( - tuples_d, tupleMultiplicity_d, 3, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - cudaCheck(cudaGetLastError()); - - kernelCircleFit<3><<>>(tupleMultiplicity_d, - 3, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - kernelLineFit<3><<>>(tupleMultiplicity_d, - 3, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - // quads - kernelFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, 4, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - cudaCheck(cudaGetLastError()); - - kernelCircleFit<4><<>>(tupleMultiplicity_d, - 4, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - kernelLineFit<4><<>>(tupleMultiplicity_d, - 4, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 
circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - if (fit5as4_) { - // penta - kernelFastFit<4><<>>( - tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - cudaCheck(cudaGetLastError()); - - kernelCircleFit<4><<>>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - kernelLineFit<4><<>>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - } else { - // penta all 5 - kernelFastFit<5><<>>( - tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); - cudaCheck(cudaGetLastError()); - - kernelCircleFit<5><<>>(tupleMultiplicity_d, - 5, - bField_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - - kernelLineFit<5><<>>(tupleMultiplicity_d, - 5, - bField_, - outputSoa_d, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - circle_fit_resultsGPU_, - offset); - cudaCheck(cudaGetLastError()); - } - } -} + cudaStream_t stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h index b46de519034d5..c6345ba6eab3b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h @@ -187,3 +187,134 @@ __global__ void kernelLineFit(CAConstants::TupleMultiplicity const *__restrict__ #endif } } +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + +template +void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, + uint32_t nhits, + uint32_t maxNumberOfTuples, + cudaStream_t stream) { + assert(tuples_d); + + int blockSize = 64; + int numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; + + // Fit internals + auto hitsGPU_ = Traits:: template make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); + auto hits_geGPU_ = Traits:: template make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); + auto fast_fit_resultsGPU_ = Traits:: template make_unique( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); + auto circle_fit_resultsGPU_holder = + Traits:: template make_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit), stream); + Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); + + for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { + // triplets + cudautils::launch(kernelFastFit<3>,{numberOfBlocks, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, 3, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelCircleFit<3>,{numberOfBlocks, blockSize, 0, stream},tupleMultiplicity_d, + 3, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelLineFit<3>,{numberOfBlocks, blockSize, 0, stream},tupleMultiplicity_d, + 3, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 
circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + // quads + cudautils::launch(kernelFastFit<4>,{numberOfBlocks / 4, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, 4, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelCircleFit<4>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + 4, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelLineFit<4>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + 4, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + if (fit5as4_) { + // penta + cudautils::launch(kernelFastFit<4>,{numberOfBlocks / 4, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelCircleFit<4>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + 5, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelLineFit<4>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + 5, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + } else { + // penta all 5 + cudautils::launch(kernelFastFit<5>,{numberOfBlocks / 4, blockSize, 0, stream}, + tuples_d, tupleMultiplicity_d, 5, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelCircleFit<5>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + 5, + bField_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + cudautils::launch(kernelLineFit<5>,{numberOfBlocks / 4, blockSize, 0, stream},tupleMultiplicity_d, + 5, + bField_, + outputSoa_d, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + } + } +} diff --git a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc index e959abcfef949..347476cf1b6a3 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc @@ -96,7 +96,7 @@ void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent assert(tracks); - ctx.emplace(iEvent, tokenGPUVertex_, m_gpuAlgo.makeAsync(ctx.stream(), tracks, m_ptMin)); + ctx.emplace(iEvent, tokenGPUVertex_, m_gpuAlgo.make(ctx.stream(), tracks, m_ptMin,m_OnGPU)); } else { auto const* tracks = iEvent.get(tokenCPUTrack_).get(); @@ -117,7 +117,8 @@ void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl; */ - iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make(tracks, m_ptMin)); + cudaStream_t stream=cudaStreamDefault; + iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make(stream,tracks, m_ptMin,m_OnGPU)); } } diff --git 
a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cc new file mode 100644 index 0000000000000..e8c6a812d86d2 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cc @@ -0,0 +1,9 @@ +#define CUDA_KERNELS_ON_CPU // not needed yet +#include "gpuVertexFinderImpl.h" +namespace gpuVertexFinder { +template<> + ZVertexHeterogeneous Producer::make(cudaStream_t stream, TkSoA const* tksoa, float ptMin, bool onGPU) const { + assert(!onGPU); + return makeImpl(stream,tksoa,ptMin); + } +} diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cpp b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cpp deleted file mode 100644 index 084763385bdb4..0000000000000 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cpp +++ /dev/null @@ -1 +0,0 @@ -#include "gpuVertexFinderImpl.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu index 084763385bdb4..96739a42c3b0a 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu @@ -1 +1,8 @@ #include "gpuVertexFinderImpl.h" +namespace gpuVertexFinder { +template<> + ZVertexHeterogeneous Producer::make(cudaStream_t stream, TkSoA const* tksoa, float ptMin, bool onGPU) const { + assert(onGPU); + return makeImpl(stream,tksoa,ptMin); + } +} diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h index 6cd86c93a6737..4dc4572da6e7e 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h @@ -63,8 +63,11 @@ namespace gpuVertexFinder { ~Producer() = default; - ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const; - ZVertexHeterogeneous make(TkSoA const* tksoa, float ptMin) const; + template + ZVertexHeterogeneous makeImpl(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const; + + template + ZVertexHeterogeneous make(cudaStream_t stream, TkSoA const* tksoa, float ptMin, bool onGPU) const; private: const bool oneKernel_; diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h index d6e63227ccf85..6fd6e985f25b9 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h @@ -1,12 +1,14 @@ -#include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksByDensity.h" -#include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksDBSCAN.h" -#include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksIterative.h" +#include "gpuClusterTracksByDensity.h" +#include "gpuClusterTracksDBSCAN.h" +#include "gpuClusterTracksIterative.h" #include "gpuFitVertices.h" #include "gpuSortByPt2.h" #include "gpuSplitVertices.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + namespace gpuVertexFinder { @@ -84,87 +86,45 @@ namespace gpuVertexFinder { } #endif -#ifdef __CUDACC__ - ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { - // std::cout << "producing Vertices on GPU" << std::endl; - ZVertexHeterogeneous vertices(cudautils::make_device_unique(stream)); -#else - 
ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const { - // std::cout << "producing Vertices on CPU" << std::endl; - ZVertexHeterogeneous vertices(std::make_unique()); -#endif + template + ZVertexHeterogeneous Producer::makeImpl(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { + // std::cout << "producing Vertices on " << Traits::name << std::endl; + ZVertexHeterogeneous vertices(Traits::template make_unique(stream)); assert(tksoa); auto* soa = vertices.get(); assert(soa); -#ifdef __CUDACC__ - auto ws_d = cudautils::make_device_unique(stream); -#else - auto ws_d = std::make_unique(); -#endif + auto ws_d = Traits::template make_unique(stream); -#ifdef __CUDACC__ - init<<<1, 1, 0, stream>>>(soa, ws_d.get()); auto blockSize = 128; auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize; - loadTracks<<>>(tksoa, soa, ws_d.get(), ptMin); - cudaCheck(cudaGetLastError()); -#else - cudaCompat::resetGrid(); - init(soa, ws_d.get()); - loadTracks(tksoa, soa, ws_d.get(), ptMin); -#endif + cudautils::launch(init,{1,1,0,stream},soa, ws_d.get()); + cudautils::launch(loadTracks, {numberOfBlocks, blockSize, 0, stream}, tksoa, soa, ws_d.get(), ptMin); -#ifdef __CUDACC__ if (oneKernel_) { // implemented only for density clusters #ifndef THREE_KERNELS - vertexFinderOneKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + cudautils::launch(vertexFinderOneKernel,{1, 1024 - 256, 0, stream},soa, ws_d.get(), minT, eps, errmax, chi2max); #else - vertexFinderKernel1<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); - cudaCheck(cudaGetLastError()); + cudautils::launch(vertexFinderKernel1,{1, 1024 - 256, 0, stream},soa, ws_d.get(), minT, eps, errmax, chi2max); // one block per vertex... - splitVerticesKernel<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f); - cudaCheck(cudaGetLastError()); - vertexFinderKernel2<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get()); + cudautils::launch(splitVerticesKernel,{1024, 128, 0, stream},soa, ws_d.get(), 9.f); + cudautils::launch(vertexFinderKernel2,{1, 1024 - 256, 0, stream},soa, ws_d.get()); #endif } else { // five kernels if (useDensity_) { - clusterTracksByDensityKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + cudautils::launch(clusterTracksByDensityKernel,{1, 1024 - 256, 0, stream},soa, ws_d.get(), minT, eps, errmax, chi2max); } else if (useDBSCAN_) { - clusterTracksDBSCAN<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + cudautils::launch(clusterTracksDBSCAN,{1, 1024 - 256, 0, stream},soa, ws_d.get(), minT, eps, errmax, chi2max); } else if (useIterative_) { - clusterTracksIterative<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + cudautils::launch(clusterTracksIterative,{1, 1024 - 256, 0, stream},soa, ws_d.get(), minT, eps, errmax, chi2max); } - cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 50.); - cudaCheck(cudaGetLastError()); + cudautils::launch(fitVerticesKernel,{1, 1024 - 256, 0, stream},soa, ws_d.get(), 50.); // one block per vertex... 
- splitVerticesKernel<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f); - cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 5000.); - cudaCheck(cudaGetLastError()); - sortByPt2Kernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get()); + cudautils::launch(splitVerticesKernel,{1024, 128, 0, stream},soa, ws_d.get(), 9.f); + cudautils::launch(fitVerticesKernel,{1, 1024 - 256, 0, stream},soa, ws_d.get(), 5000.); + cudautils::launch(sortByPt2Kernel,{1, 1024 - 256, 0, stream},soa, ws_d.get()); } - cudaCheck(cudaGetLastError()); -#else // __CUDACC__ - if (useDensity_) { - clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max); - } else if (useDBSCAN_) { - clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max); - } else if (useIterative_) { - clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max); - } - // std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl; - fitVertices(soa, ws_d.get(), 50.); - // one block per vertex! - blockIdx.x = 0; - gridDim.x = 1; - splitVertices(soa, ws_d.get(), 9.f); - resetGrid(); - fitVertices(soa, ws_d.get(), 5000.); - sortByPt2(soa, ws_d.get()); -#endif return vertices; } diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index 5261069a6b283..1bf4e223ab52f 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -22,7 +22,6 @@ #include "RecoPixelVertexing/PixelVertexFinding/src/gpuSplitVertices.h" #ifdef ONE_KERNEL -#ifdef __CUDACC__ __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws, int minT, // min number of neighbours to be "seed" @@ -41,7 +40,6 @@ __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, sortByPt2(pdata, pws); } #endif -#endif using namespace gpuVertexFinder; @@ -56,7 +54,7 @@ struct Event { struct ClusterGenerator { explicit ClusterGenerator(float nvert, float ntrack) - : rgen(-13., 13), errgen(0.005, 0.025), clusGen(nvert), trackGen(ntrack), gauss(0., 1.), ptGen(1.) {} + : rgen(-13., 13), errgen(0.005, 0.025), clusGen(nvert), trackGen(ntrack), gauss(0., 1.), ptGen(0.001,1.) {} void operator()(Event& ev) { int nclus = clusGen(reng); @@ -68,19 +66,28 @@ struct ClusterGenerator { ev.ztrack.clear(); ev.eztrack.clear(); - ev.ivert.clear(); + ev.pttrack.clear(); + ev.ivert.clear(); + float ptMax=0; float pt5=0; for (int iv = 0; iv < nclus; ++iv) { auto nt = trackGen(reng); - ev.itrack[nclus] = nt; + if (iv == 5) nt *= 3; + ev.itrack[iv] = nt; + float ptSum=0; for (int it = 0; it < nt; ++it) { auto err = errgen(reng); // reality is not flat.... ev.ztrack.push_back(ev.zvert[iv] + err * gauss(reng)); ev.eztrack.push_back(err * err); ev.ivert.push_back(iv); - ev.pttrack.push_back((iv == 5 ? 
1.f : 0.5f) + ptGen(reng)); + ev.pttrack.push_back(std::pow(ptGen(reng),iv==5 ?-1.0f:-0.5f)); ev.pttrack.back() *= ev.pttrack.back(); + ptSum += ev.pttrack.back(); } + ptMax = std::max(ptMax,ptSum); + if (iv == 5) pt5 = ptSum; } + std::cout << "PV, ptMax " << std::sqrt(pt5) << ' ' << std::sqrt(ptMax) << std::endl; + // add noise auto nt = 2 * trackGen(reng); for (int it = 0; it < nt; ++it) { @@ -88,7 +95,7 @@ struct ClusterGenerator { ev.ztrack.push_back(rgen(reng)); ev.eztrack.push_back(err * err); ev.ivert.push_back(9999); - ev.pttrack.push_back(0.5f + ptGen(reng)); + ev.pttrack.push_back(std::pow(ptGen(reng),-0.5f)); ev.pttrack.back() *= ev.pttrack.back(); } } @@ -99,7 +106,7 @@ struct ClusterGenerator { std::poisson_distribution clusGen; std::poisson_distribution trackGen; std::normal_distribution gauss; - std::exponential_distribution ptGen; + std::uniform_real_distribution ptGen; // becomes a power law }; // a macro SORRY @@ -112,8 +119,14 @@ __global__ void print(ZVertices const* pdata, WorkSpace const* pws) { printf("nt,nv %d %d,%d\n", ws.ntrks, data.nvFinal, ws.nvIntermediate); } +__global__ void linit(ZVertices * pdata, WorkSpace * pws, int nt) { + auto & __restrict__ data = *pdata; + auto & __restrict__ ws = *pws; + for (int i=0; i(1, nullptr); @@ -127,28 +140,36 @@ int main() { float eps = 0.1f; std::array par{{eps, 0.01f, 9.0f}}; - for (int nav = 30; nav < 80; nav += 20) { + for (int nav = 30; nav < 100; nav += 20) { ClusterGenerator gen(nav, 10); - for (int i = 8; i < 20; ++i) { - auto kk = i / 4; // M param + for (int iii = 8; iii < 20; ++iii) { + auto kk = iii / 4; // M param gen(ev); -#ifdef __CUDACC__ + std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl; + int nt = ev.ztrack.size(); + int nvori = ev.zvert.size(); + int ntori = nt; + assert(ntori>>(onGPU_d.get(), ws_d.get()); + linit<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get(),ntori); #else onGPU_d->init(); ws_d->init(); + for (int16_t i=0; iitrk[i]=i; onGPU_d->idv[i] = -1;} // FIXME do the same on GPU.... 
#endif - std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl; - auto nt = ev.ztrack.size(); -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); + #else ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t)); ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); @@ -156,19 +177,19 @@ int main() { ::memcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size()); #endif - std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i % 4) << std::endl; + std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (iii % 4) << std::endl; - if ((i % 4) == 0) + if ((iii % 4) == 0) par = {{eps, 0.02f, 12.0f}}; - if ((i % 4) == 1) + if ((iii % 4) == 1) par = {{eps, 0.02f, 9.0f}}; - if ((i % 4) == 2) + if ((iii % 4) == 2) par = {{eps, 0.01f, 9.0f}}; - if ((i % 4) == 3) + if ((iii % 4) == 3) par = {{0.7f * eps, 0.01f, 9.0f}}; uint32_t nv = 0; -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); cudaCheck(cudaGetLastError()); cudaDeviceSynchronize(); @@ -200,6 +221,7 @@ int main() { continue; } + int16_t * idv = nullptr; float* zv = nullptr; float* wv = nullptr; float* ptv2 = nullptr; @@ -209,19 +231,22 @@ int main() { // keep chi2 separated... float chi2[2 * nv]; // make space for splitting... -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU + int16_t hidv[16000]; float hzv[2 * nv]; float hwv[2 * nv]; float hptv2[2 * nv]; int32_t hnn[2 * nv]; uint16_t hind[2 * nv]; + idv = hidv; zv = hzv; wv = hwv; ptv2 = hptv2; nn = hnn; ind = hind; #else + idv = onGPU_d->idv; zv = onGPU_d->zv; wv = onGPU_d->wv; ptv2 = onGPU_d->ptv2; @@ -229,13 +254,71 @@ int main() { ind = onGPU_d->sortInd; #endif -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); #else memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); #endif + auto verifyMatch = [&]() { + + // matching-merging metrics + constexpr int MAXMA = 32; + struct Match { Match() {for (auto&e:vid)e=-1; for (auto&e:nt)e=0;} std::array vid; std::array nt; }; + + auto nnn=0; + Match matches[nv]; for (auto kv = 0U; kv < nv; ++kv) { matches[kv] = Match();} + auto iPV = ind[nv - 1]; + for (int it=0; it9990) continue; + assert(iv9990) continue; + ++nnn; + for (int i=0; i=0 && mx0.75f) ++nok; + if (frac[kv]<0.5f) ++nmess; + auto ldz = std::abs(zv[kv] - ev.zvert[itv]); + dz = std::max(dz,ldz); + int nm5=0; int nm7=0; + int ntt=0; + for (int i=0; i0.5f) ++nm5; + if (f>0.75f) ++nm7; + } + if (nm5>1) ++merged50; + if (nm7>1) ++merged75; + if (kv == iPV ) std::cout << "PV " << itv << ' ' << std::sqrt(ptv2[kv]) << ' ' << float(ntt)/float(ev.itrack[itv]) << '/' << frac[kv] << '/' << nm5 << '/' << nm7 << ' ' << dz << std::endl; + } + // for (auto f: frac) std::cout << f << ' '; + // std::cout << std::endl; + std::cout << "ori/tot/matched/merged5//merged7/random/dz " + << nvori << '/' << nv << '/' << nok << '/' << merged50 << '/' << merged75 << '/' << nmess + << '/' << dz << std::endl; + }; // verifyMatch + 
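// verifyMatch() is invoked once per generated event, after the final fit and + // sort below, to compare the reconstructed vertices with the generated ones. +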
+ for (auto j = 0U; j < nv; ++j) if (nn[j] > 0) chi2[j] /= float(nn[j]); @@ -244,7 +327,7 @@ int main() { std::cout << "after fit nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl; } -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); @@ -263,7 +346,7 @@ int main() { std::cout << "before splitting nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl; } -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU // one vertex per block!!! cudautils::launch(splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f); cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost)); @@ -276,12 +359,9 @@ int main() { #endif std::cout << "after split " << nv << std::endl; -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f); - cudaCheck(cudaGetLastError()); - cudautils::launch(sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get()); - cudaCheck(cudaGetLastError()); cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else fitVertices(onGPU_d.get(), ws_d.get(), 5000.f); @@ -295,14 +375,16 @@ int main() { continue; } -#ifdef __CUDACC__ +#ifndef CUDA_KERNELS_ON_CPU cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(idv, LOC_ONGPU(idv), nt * sizeof(uint16_t), cudaMemcpyDeviceToHost)); #endif + for (auto j = 0U; j < nv; ++j) if (nn[j] > 0) chi2[j] /= float(nn[j]); @@ -333,7 +415,7 @@ int main() { } dd[kv] = md; } - if (i == 6) { + if (iii == 6) { for (auto d : dd) std::cout << d << ' '; std::cout << std::endl; @@ -345,6 +427,10 @@ int main() { rms = std::sqrt(rms) / (nv - 1); std::cout << "min max rms " << *mx.first << ' ' << *mx.second << ' ' << rms << std::endl; + + verifyMatch(); + + } // loop on events } // loop on ave vert
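Note on cudautils::launch, on which essentially every hunk above relies: below is a minimal sketch of the dispatch it is expected to perform, assuming the interface declared in HeterogeneousCore/CUDAUtilities/interface/launch.h keeps the (kernel, {grid, block, sharedMem, stream}, args...) call shape used throughout this patch. The type name LaunchParameters and the exact signatures are illustrative only, not the real implementation. The point is that a single call site either becomes a genuine <<<...>>> launch under nvcc or a plain function call in a CPU-only translation unit, which is what lets GPUTraits, HostTraits and CPUTraits share the same kernel code.

#include <cstddef>  // std::size_t
#include <utility>  // std::forward

namespace cudautils {
  // grid / block / shared-memory / stream bundle; dim3 converts implicitly
  // from an int, so both {blocks, threads, 0, stream} and {blks, thrs, 0, stream}
  // spellings seen above are covered by aggregate initialization
  struct LaunchParameters {
    dim3 gridDim;
    dim3 blockDim;
    std::size_t sharedMemBytes = 0;
    cudaStream_t stream = nullptr;
  };

#ifdef __CUDACC__
  // GPU build: a genuine kernel launch, followed by an immediate error check
  template <typename F, typename... Args>
  void launch(F* kernel, LaunchParameters config, Args&&... args) {
    kernel<<<config.gridDim, config.blockDim, config.sharedMemBytes, config.stream>>>(
        std::forward<Args>(args)...);
    cudaCheck(cudaGetLastError());
  }
#else
  // CPU build: the launch configuration is ignored and the "kernel" runs
  // inline, with cudaCompat emulating threadIdx/blockIdx as a single thread
  // (dim3, cudaStream_t and cudaCheck are assumed to come from the
  // compatibility headers in a CPU-only translation unit)
  template <typename F, typename... Args>
  void launch(F* kernel, LaunchParameters, Args&&... args) {
    kernel(std::forward<Args>(args)...);
  }
#endif
}  // namespace cudautils

Under this shape a trivial grid such as the one used for kernel_printCounters runs as cudautils::launch(kernel_printCounters, {1, 1, 0, cudaStreamDefault}, counters) in both builds, which is exactly how the printCounters code above uses it.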